In [None]:
import sys,os
import pandas as pd 	#for CSV or large data files
import numpy as np
from tqdm import tqdm 
import matplotlib.pyplot as plt, seaborn as sns	#for ploting or visual representation of data

sys.path.append(r"D:\code\repo\M.tech\sem1\DA\LAB\contest")
from FILE_DIR import *

# embedding models
from sentence_transformers import SentenceTransformer

# utils
from sklearn.utils import resample
from sklearn.cross_decomposition import PLSRegression

from sklearn.metrics.pairwise import cosine_similarity
import torch, torch.nn as nn

tqdm.pandas()

<h3 style="color:orange">Hyperparameters</h3>

In [None]:
# Encode on the GPU whenever torch can see one; otherwise stay on the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Batch size handed to the encoder; index 2 of the candidate sizes selects 32.
BATCH_SIZE = (8, 16, 32, 64, 128)[2]

def cos_sim(C1, C2):
    """Return the cosine similarity between two embedding vectors as a float.

    Parameters
    ----------
    C1, C2 : array-like
        Vectors with the same number of elements. Each input is flattened to
        1-D, so a (d,) or (1, d) array is accepted.

    Returns
    -------
    float
        dot(C1, C2) / (||C1|| * ||C2||), in [-1, 1]. If either vector has zero
        norm the similarity is undefined and 0.0 is returned instead of NaN.

    Raises
    ------
    ValueError
        If the two flattened vectors differ in length (raised by ``np.dot``).
    """
    # Each embedding is ONE sample with d features. Treating it as d samples of
    # one feature (a column vector) yields a d x d matrix of +/-1 sign products
    # rather than the similarity of the two vectors.
    u = np.asarray(C1, dtype=np.float64).reshape(-1)
    v = np.asarray(C2, dtype=np.float64).reshape(-1)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0.0:
        return 0.0
    return float(np.dot(u, v) / denom)

# Create the scratch directory tree used to cache embeddings.
# makedirs creates "../temp" as a parent when needed, and exist_ok=True makes the
# cell safe to re-run. This also covers the case where "../temp" already exists
# but "../temp/embeddings" does not.
os.makedirs("../temp/embeddings", exist_ok=True)

<h3 style="color:orange">Load data</h3>

In [None]:
# Read both splits from the JSON files named by TRAIN_DATA / TEST_DATA
# (these names are defined outside this cell; presumably by the FILE_DIR star import).
data_train = pd.read_json(TRAIN_DATA)
data_test = pd.read_json(TEST_DATA)

for _frame in (data_train, data_test):
    print(_frame.shape)

# Replace missing values in the three text columns with empty strings so every
# row carries a string in each of them.
_text_columns = ["user_prompt", "response", "system_prompt"]
for _frame in (data_train, data_test):
    _frame[_text_columns] = _frame[_text_columns].fillna("")

print("\nafter preprocess")
for _frame in (data_train, data_test):
    print(_frame.shape)
# Remaining missing-value counts per column, train first and then test.
print(data_train.isna().sum(), end="\n\n")
print(data_test.isna().sum())

<h3 style="color:orange">Load embedding model</h3>

In [None]:
# Candidate (model id, short tag) pairs; the trailing index picks the one used
# for this run (index 2 -> "intfloat/e5-base-v2", tag "e5v2").
arcitecture = [
    ("google/embeddinggemma-300m", "gemma"),
    ("all-mpnet-base-v2", "mpnetv2"),
    ("intfloat/e5-base-v2", "e5v2"),
    ("BAAI/bge-m3", "bgem3"),
][2]

# Access token for the gemma entry (used only by the commented-out call below).
# It is read from the HF_TOKEN environment variable so that no credential is
# stored in the notebook; the value is None when the variable is not set.
# SECURITY: the token that used to be hardcoded here must be treated as leaked
# and revoked.
hugging_face_token = os.environ.get("HF_TOKEN")
# emb_model = SentenceTransformer(arcitecture[0],device=DEVICE,token=hugging_face_token)
emb_model = SentenceTransformer(arcitecture[0], device=DEVICE)

# Folder name under ../temp/embeddings that holds this model's cached vectors.
emb_model_name = f"{arcitecture[1]}_split"
# emb_model.half()

print(emb_model.device)
print(emb_model_name)

# Make sure the per-model train/ and test/ output folders exist.
# exist_ok=True keeps the cell re-runnable and also repairs a partially created
# tree (e.g. the model folder exists but one of its sub-folders is missing).
for _split in ("train", "test"):
    os.makedirs(f"../temp/embeddings/{emb_model_name}/{_split}", exist_ok=True)

<h5 style="color:cyan">for train data</h5>

In [None]:
# Encode every text column of the training set and cache each set of vectors
# in its own parquet file under ../temp/embeddings/<model>/train/.
# The column is handed to encode() as a plain list rather than a pandas Series,
# so the result cannot depend on the DataFrame's index labels.
for _column in ("user_prompt", "response", "system_prompt"):
    data_train[f"{_column}_emb"] = list(
        emb_model.encode(
            data_train[_column].tolist(),
            show_progress_bar=True,
            batch_size=BATCH_SIZE,
            device=DEVICE,
        )
    )
    data_train[[f"{_column}_emb"]].to_parquet(
        f"../temp/embeddings/{emb_model_name}/train/{_column}.parquet"
    )

In [None]:
# Add pairwise cosine-similarity features between the three embedding columns.
def _rowwise_cosine(left, right):
    """Return the cosine similarity between matching rows of two embedding columns.

    left, right: pandas Series of equal length whose elements are 1-D vectors of
    a common dimension (assumed to be what the encoding cell stored in the
    ``*_emb`` columns).

    Returns a 1-D float64 numpy array with one similarity per row. Rows where
    either vector has zero norm get 0.0 instead of NaN.
    """
    a = np.stack(left.to_numpy()).astype(np.float64)
    b = np.stack(right.to_numpy()).astype(np.float64)
    dots = (a * b).sum(axis=1)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    # Divide only where the denominator is non-zero; other rows keep the 0.0 fill.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)


# (feature suffix, left embedding column, right embedding column)
_cosine_pairs = (
    ("ur", "response_emb", "user_prompt_emb"),
    ("us", "system_prompt_emb", "user_prompt_emb"),
    ("rs", "response_emb", "system_prompt_emb"),
)
for _suffix, _left, _right in _cosine_pairs:
    data_train[f"cos_{_suffix}"] = _rowwise_cosine(data_train[_left], data_train[_right])
    data_train[[f"cos_{_suffix}"]].to_parquet(
        f"../temp/embeddings/{emb_model_name}/train/cossim_{_suffix}.parquet"
    )

# Mean similarity per pair, as a quick sanity check on the new features.
print(f"ur: {np.mean(data_train['cos_ur'].values)}")
print(f"us: {np.mean(data_train['cos_us'].values)}")
print(f"rs: {np.mean(data_train['cos_rs'].values)}")

<h5 style="color:cyan">for test data</h5>

In [None]:
# Encode every text column of the test set and cache each set of vectors
# in its own parquet file under ../temp/embeddings/<model>/test/.
# The column is handed to encode() as a plain list rather than a pandas Series,
# so the result cannot depend on the DataFrame's index labels.
for _column in ("user_prompt", "response", "system_prompt"):
    data_test[f"{_column}_emb"] = list(
        emb_model.encode(
            data_test[_column].tolist(),
            show_progress_bar=True,
            batch_size=BATCH_SIZE,
            device=DEVICE,
        )
    )
    data_test[[f"{_column}_emb"]].to_parquet(
        f"../temp/embeddings/{emb_model_name}/test/{_column}.parquet"
    )