In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Device string that later cells move input tensors and the embedding model to.
device = "cuda"

# Hub id of the base causal LM and the local directory holding the PEFT adapter.
base_model_name = "GAI-LLM/Yi-Ko-6B-mixed-v15"
adapter_model_name = "../model/GAI-LLM-Yi-Ko-6B-mixed-v15-sft-qlora-v1"

# The tokenizer comes from the base checkpoint, not from the adapter directory.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the base weights (placement decided by device_map="auto") and wrap
# them with the adapter in a single expression.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto"),
    adapter_model_name,
)

In [None]:
# Read the evaluation CSV; later cells use its '질문' and '답변' columns.
eval_csv_path = '../data/eval_v1.csv'
data = pd.read_csv(eval_csv_path)

In [None]:
# Draw at most 2000 rows for candidate generation.
# - random_state makes the subset reproducible across kernel restarts (the
#   train/dev split further down uses the same seed value).
# - min(...) avoids the ValueError that sample() raises when the CSV holds
#   fewer than 2000 rows.
data = data.sample(n=min(2000, len(data)), random_state=42)

In [None]:
# Preview the first two sampled rows.
data.iloc[:2]

In [None]:
# Number of sampled rows.
data.shape[0]

In [None]:
def extract_text(input_string, marker='<|assistant|>'):
    """Return the part of ``input_string`` that follows the first ``marker``.

    Args:
        input_string: Decoded text expected to contain ``marker``.
        marker: Delimiter that precedes the text to extract. Defaults to the
            assistant tag used by the prompt template.

    Returns:
        Everything after the first occurrence of ``marker`` (may be empty).

    Raises:
        ValueError: If ``marker`` does not occur in ``input_string``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    # partition() splits on the first occurrence, like str.find() did before.
    _, found, tail = input_string.partition(marker)
    if not found:
        raise ValueError(f"marker {marker!r} not found in: {input_string[:80]!r}")
    return tail

In [None]:
# Generate NUM_CANDIDATES answers for every sampled question
# (original note: 2000 x 10).
NUM_CANDIDATES = 10  # single source of truth for num_return_sequences
PROMPT_TEMPLATE = '<|user|>{question}{sep_token}<|assistant|>'

questions = []
inference_results = []

for question in tqdm(data['질문'].tolist()):
    prompt = PROMPT_TEMPLATE.format(question=question,
                                    sep_token=tokenizer.eos_token)
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Group (diverse) beam search: 10 beams split into 5 groups, with a
    # penalty for groups that repeat each other's tokens.
    # NOTE(review): max_length counts prompt tokens too, so long questions
    # leave fewer tokens for the answer; confirm this is intended.
    outputs = model.generate(input_ids=inputs,
                             max_length=512,
                             num_beams=10,
                             repetition_penalty=1.5,
                             diversity_penalty=0.5,
                             num_beam_groups=5,
                             num_return_sequences=NUM_CANDIDATES)

    # Each returned sequence still contains the prompt; keep only the text
    # after the assistant tag.
    candidates = [
        extract_text(tokenizer.decode(sequence, skip_special_tokens=True))
        for sequence in outputs
    ]

    questions.append(question)
    inference_results.append(candidates)

In [None]:
# Both lists should have one entry per processed question.
tuple(len(collected) for collected in (questions, inference_results))

In [None]:
import json

# Save every generated candidate (all results), keyed by its question, so the
# raw generations can be re-scored without re-running inference.
sft_inf_results = {
    'data': [
        {'question': q, 'result': result}
        for q, result in zip(questions, inference_results)
    ]
}

# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of relying on the platform default.
with open("../data/dpo/GAI-LLM-Yi-Ko-6B-mixed-v15-qlora-v1-dpo-raw.json", "w", encoding="utf-8") as json_file:
    json.dump(sft_inf_results, json_file, ensure_ascii=False, indent=4)


# Embedding comparison

In [None]:
from sentence_transformers import SentenceTransformer 

# Load the multilingual sentence encoder and move it to `device` in one step.
embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1').to(device)

In [None]:
def cosine_similarity(a, b):
    """Cosine similarity between two vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0`` when either vector has zero norm.
    """
    numerator = np.dot(a, b)
    length_a, length_b = np.linalg.norm(a), np.linalg.norm(b)

    # A zero-length vector has no direction; report 0 instead of dividing by 0.
    if length_a == 0 or length_b == 0:
        return 0
    return numerator / (length_a * length_b)

In [None]:
# Build DPO preference pairs. For each question, the candidate whose embedding
# is closest to the reference answer becomes `chosen` and the farthest one
# becomes `rejected`.
# NOTE(review): rows are matched by position, so `data` must still be the same
# sample, in the same order, that produced `inference_results`.
prompt = []
chosen = []
rejected = []

for i in tqdm(range(len(inference_results))):
    row = data.iloc[i]
    preds = inference_results[i]

    gt_embed = embed_model.encode(row['답변'])
    # Encode all candidates in one batched call instead of one call each.
    pred_embeds = embed_model.encode(preds)
    scores = [cosine_similarity(gt_embed, pred_embed) for pred_embed in pred_embeds]

    # argmax/argmin return the first index on ties, like list.index(max(...)).
    best = preds[int(np.argmax(scores))]
    worst = preds[int(np.argmin(scores))]

    # Identical chosen/rejected texts give no preference signal, so skip them.
    if best == worst:
        continue

    prompt.append(row['질문'])
    chosen.append(best)
    rejected.append(worst)


In [None]:
# Assemble the three parallel lists into one frame, one preference pair per row.
dpo_columns = {'prompt': prompt, 'chosen': chosen, 'rejected': rejected}
dpo_df = pd.DataFrame(dpo_columns)
dpo_df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Split the pairs into train and dev; the fixed seed keeps the split stable.
train_ratio = 0.9
split_options = dict(test_size=1-train_ratio, random_state=42)
train_df, dev_df = train_test_split(dpo_df, **split_options)
# Original note read "13xx" -- presumably the row count of an earlier run.
print(train_df.shape)

In [None]:
# Write each split to its CSV file (to_csv defaults, so the index is included).
for split_df, csv_path in (
    (train_df, '../data/dpo/GAI-LLM-Yi-Ko-6B-mixed-v15-qlora-v1-dpo-train.csv'),
    (dev_df, '../data/dpo/GAI-LLM-Yi-Ko-6B-mixed-v15-qlora-v1-dpo-eval.csv'),
):
    split_df.to_csv(csv_path)