In [3]:
import glob
from tqdm import tqdm
from datasets import load_dataset
import json
from sentence_transformers import SentenceTransformer, losses, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
import gzip
import os
from sentence_transformers import InputExample
from datasets import Dataset
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def get_data(path, data_type):
    cnt = 0
    data_dict = {
        'anchor': [],
        'positive': []
    }
    results_path = glob.glob(path + '/*.json')
    for sample_result_path in tqdm(results_path):
        turn_num = int(sample_result_path.split('/')[-1].split('.')[0].split('_')[-1])
        sample_result = json.load(open(sample_result_path))
        target_item_title = sample_result['rec'][0]
        dialog_dict = sample_result['simulator_dialog']['context']
        # assert (len(dialog_dict) - turn_num) % 2 == 0.0, f"dialog_dict: {len(dialog_dict)}, turn_num: {turn_num}"
        if (len(dialog_dict) - turn_num) % 2 != 0.0: continue
        iter_num = (len(dialog_dict) - turn_num) // 2
        gt_item_list = sample_result['rec']
        for gt_item in gt_item_list:
            passage = gt_item
            if data_type == 'total':
                for idx in range(iter_num):
                    query_dialog = dialog_dict[: -2*idx]
                    query = ""
                    for context in query_dialog:
                        query += f"{context['role']}: {context['content']} "
                    data_dict['anchor'].append(query)
                    data_dict['positive'].append(passage)
            else:
                query_dialog = dialog_dict[: -2*iter_num]
                query = ""
                for context in query_dialog:
                    query += f"{context['role']}: {context['content']} "
                data_dict['anchor'].append(query)
                data_dict['positive'].append(passage)
            
    data = Dataset.from_dict(data_dict)
    return data

data_type = 'total'
# embedding_model = "nomic-ai/nomic-embed-text-v1"
embedding_model = "nomic-ai/nomic-embed-text-v1"
rec_model = "Llama-3.2-1B-Instruct"
# adapter_model = "Llama-3.2-1B-Instruct-DPO-tuned-new-reward"
# adapter_model = "Llama-3.2-1B-Instruct-DPO-tuned-nomic-no-tuned"
adapter_model = None
topK= 10
history = "full"
split = "train"
user_model = "Llama-3.1-8B-Instruct"

# Train the model
num_epochs = 7
lr = 5e-5

# temp = embedding_model
# embedding_model = "Snowflake/snowflake-arctic-embed-s"
if adapter_model == None:
    train_dir = f'/home/shchoi/iEvaLM-CRS/save_5/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/train'
    eval_dir = f'/home/shchoi/iEvaLM-CRS/save_5/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/valid' 
else:
    train_dir = f'/home/shchoi/iEvaLM-CRS/save_5/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_adapter_{adapter_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/train'
    eval_dir = f'/home/shchoi/iEvaLM-CRS/save_5/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_adapter_{adapter_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/test' 
# embedding_model = temp
print(train_dir)
training_data = get_data(train_dir, data_type)
eval_data = get_data(eval_dir, data_type)
print(eval_dir)
# print(gt_dir)

/home/shchoi/iEvaLM-CRS/save_5/user_Llama-3.1-8B-Instruct/emb_nomic-ai/nomic-embed-text-v1/openmodel_Llama-3.2-1B-Instruct_top10_full_history/opendialkg_eval/full_non_repeated/train


100%|██████████| 3560/3560 [00:00<00:00, 18616.27it/s]
100%|██████████| 758/758 [00:00<00:00, 22440.52it/s]

/home/shchoi/iEvaLM-CRS/save_5/user_Llama-3.1-8B-Instruct/emb_nomic-ai/nomic-embed-text-v1/openmodel_Llama-3.2-1B-Instruct_top10_full_history/opendialkg_eval/full_non_repeated/valid





In [4]:
training_data

Dataset({
    features: ['anchor', 'positive'],
    num_rows: 10416
})

In [5]:
from torch.utils.data import DataLoader
import wandb

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction, InformationRetrievalEvaluator
from sentence_transformers.training_args import BatchSamplers
# embedding_model = "nomic-ai/nomic-embed-text-v1-already-tuned-lr_te-5"
# embedding_model = "nomic-ai/nomic-embed-text-v1-after-tuned"
model = SentenceTransformer(embedding_model, cache_folder = "/data1/shchoi/LLM_ckp/hub", device="cuda:0", trust_remote_code=True)
# model = SentenceTransformer("/home/shchoi/iEvaLM-CRS/experiment_code/output/total/user_Llama-3.1-8B-Instruct/emb_nomic-ai/nomic-embed-text-v1/openmodel_Llama-3.2-1B-Instruct_top10_full_history/opendialkg_eval/full_non_repeated/train/checkpoint-65", cache_folder = "/data1/shchoi/LLM_ckp/hub", device="cuda:0", trust_remote_code=True)
embedding_model = "nomic-ai/nomic-embed-text-v1-true-total-tuned"
# Initialize loss
train_loss = losses.MultipleNegativesRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir=f'output/{data_type}/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/{split}',
    seed = 42,
    # Optional training parameters:
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=4, 
    warmup_ratio=0.1,
    fp16=True,  # Set to False if GPU can't handle FP16
    bf16=False,  # Set to True if GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=5,
    save_strategy="steps",
    save_steps=5,
    save_total_limit=5,
    logging_steps=5,
    eval_on_start=True,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=training_data,
    eval_dataset=eval_data,
    loss=train_loss,
    # evaluator=evaluator,
)

trainer.train()

  state_dict = loader(resolved_archive_file)
<All keys matched successfully>
[34m[1mwandb[0m: Currently logged in as: [33mdexrf1[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
0,No log,0.111814
5,15.510500,0.107833
10,13.813900,0.088218
15,11.155700,0.07403


KeyboardInterrupt: 

In [7]:
trainer.save_model(f'output/{data_type}/user_{user_model}/emb_{embedding_model}/openmodel_{rec_model}_top{topK}_{history}_history/opendialkg_eval/full_non_repeated/{split}')