In [11]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import numpy as np
from sentence_transformers import SentenceTransformer


import torch
import random
import torch.backends.cudnn as cudnn

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(0)

scoring_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [12]:
# MODEL = "nlpai-lab/kullm-polyglot-5.8b-v2"
# MODEL = "KT-AI/midm-bitext-S-7B-inst-v1"
# MODEL = "EleutherAI/polyglot-ko-1.3b"
# MODEL = "EleutherAI/polyglot-ko-1.3b_sft10"
MODEL = "EleutherAI/polyglot-ko-1.3b_f-sft_5"
MODEL = "model/model/further_train/checkpoint-last_f-sft_5/checkpoint-last"
# MODEL = "facebook/xglm-564M"
model = AutoModelForCausalLM.from_pretrained(
    # "model/" + MODEL + "/checkpoint-last",
    MODEL,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
).to(device=f"cuda")
model.eval()
# tokenizer = AutoTokenizer.from_pretrained("model/" + MODEL + "/checkpoint-last")
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Embedding vector 변환
def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b) if norm_a != 0 and norm_b != 0 else 0

def compute_cosine(pred, gt):
    
    # 생성된 답변 내용을 512 Embedding Vector로 변환
    pred_embed = scoring_model.encode(pred)
    gt_embed = scoring_model.encode(gt)
    
    sample_score = cosine_similarity(gt_embed, pred_embed)
    # Cosine Similarity Score가 0보다 작으면 0으로 간주
    sample_score = max(sample_score, 0)
    print('예측 : ', pred)
    print('정답 : ', gt)
    print('Cosine Similarity Score : ', sample_score)
    print('-'*20)

    return sample_score

In [14]:
train_df = pd.read_csv('./data/train.csv')

In [15]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# prompter = Prompter("kullm")

def infer(instruction="", input_text=""):
    prompt = instruction + input_text
    output = pipe(prompt, max_length=512, temperature=0.2, num_beams=5, eos_token_id=2)
    s = output[0]["generated_text"].split('답변:')[-1].strip()
    result = s

    return result

# f"""###질문: {example['quenstion'][i]}
# ###범주: {example['category'][i]}
# ###답변: {example['answer'][i]}"""

In [16]:
# import random
# idx = random.randint(0, 664)
# data = train_df.iloc[1, 1:3]
# ans = train_df.iloc[1, 4]
# q1, q2 = data
# instruction = "건축, 도배에 관한 질의응답입니다. 간단하게 대답해주세요."
# instruction = ''
# result1 = infer(instruction, "###Q: " + q1 + "\n###A: ")
# result2 = infer(instruction, q2)
# print("Q",q1)
# print("A",result1)
# print()
# print(q2)
# print(result2)

In [17]:
# compute_cosine(result1, ans), compute_cosine(result2, ans)

In [18]:
test_df = pd.read_csv('data/preproc_test.csv')

In [19]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
submission = pd.read_csv('data/sample_submission.csv')

for i, text in tqdm(enumerate(test_df['question'])):
    # pred = infer(text).split('.')[0]
    pred = infer("###질문: " + text + "\n###답변:")
    pred = scoring_model.encode(pred)
    submission.iloc[i, 1:] = pred


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1it [00:16, 16.56s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
2it [00:35, 17.89s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
3it [00:51, 17.22s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
4it [01:08, 16.95s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
5it [01:25, 16.93s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
6it [01:43, 17.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
7it [01:59, 17.06s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
8it [02:16, 16.97s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
9it [02:35, 17.38s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
10it [02:51, 17.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
11it [03:08, 17.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
12it [03

In [20]:
submission.to_csv(f"data/submission/polyglot-ko-1.3b_f-sft_5.csv", index=False)
pd.read_csv(f"data/submission/polyglot-ko-1.3b_f-sft_5.csv")

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.002573,0.035210,0.012646,0.005216,0.066983,0.025870,-0.044902,0.007480,0.008723,...,-0.018339,-0.039620,-0.029461,-0.032569,0.000073,-0.006292,0.024488,0.006801,0.050308,0.039636
1,TEST_001,-0.006316,0.043964,-0.012013,-0.000516,0.072565,-0.021178,0.022248,-0.040333,0.030803,...,-0.022531,-0.015690,0.026113,-0.031141,-0.036512,0.043442,-0.025938,0.027305,-0.021794,0.034515
2,TEST_002,-0.008436,-0.011499,-0.042170,0.001840,0.105926,-0.049753,-0.051349,-0.045694,0.056298,...,-0.032728,0.002324,0.064047,-0.040778,0.012211,0.008556,-0.041107,-0.023482,-0.049628,0.069997
3,TEST_003,-0.010799,0.012507,0.015290,0.030082,0.057094,-0.059067,-0.060367,-0.007579,0.000650,...,-0.033620,-0.024804,0.029809,-0.051397,0.005900,0.005190,-0.054066,-0.049180,-0.034041,0.061599
4,TEST_004,-0.023186,-0.017924,-0.015720,-0.013581,0.098607,-0.019275,0.078659,0.005730,-0.018426,...,-0.016727,-0.061035,0.040050,0.007069,-0.039191,0.021982,0.036501,0.022402,0.000076,0.039522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TEST_125,0.009847,-0.042760,0.009235,0.009850,0.108200,-0.007113,0.077466,0.030668,0.021609,...,0.014816,0.016811,0.040374,-0.047482,0.042004,0.028202,0.018279,-0.037097,-0.032359,0.013112
126,TEST_126,0.017769,-0.047265,-0.036992,-0.006113,0.080631,-0.038806,0.035686,0.025429,0.013350,...,-0.025982,-0.020841,0.057414,-0.018425,0.010714,0.012469,0.004307,-0.011002,0.004503,0.084766
127,TEST_127,-0.023581,0.004912,-0.024375,0.053496,0.097232,-0.008287,-0.009340,0.026250,-0.032130,...,-0.034932,-0.121293,0.017691,-0.008162,-0.038281,0.039590,0.046264,-0.011574,0.013584,-0.000163
128,TEST_128,0.039401,-0.013772,-0.066872,0.013240,0.107638,-0.065605,0.010459,-0.007505,0.013110,...,-0.035605,0.001200,0.013389,-0.029797,-0.007267,0.010359,-0.004484,-0.021936,0.032521,-0.005766


In [21]:
# submission.to_csv(f"data/submission/{MODEL.split('/')[-1]}.csv", index=False)
# pd.read_csv(f"data/submission/{MODEL.split('/')[-1]}.csv")