In [1]:
import numpy as np
import pandas as pd

# base model

In [None]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face checkpoint that is loaded as-is, without any adapter on top.
base_model_name = "GAI-LLM/Yi-Ko-6B-mixed-v15"

# Device the generation cells move their input tensors to.
device = "cuda"

# Tokenizer and weights come from the same checkpoint and load independently;
# device_map="auto" leaves the placement of the weights to the library.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto")

In [9]:
# Smoke test of the base model with a bare greeting (no chat markers).
inputs = tokenizer.encode("안녕.", return_tensors="pt").to(device)

# max_length bounds the prompt and the generated continuation together.
outputs = model.generate(input_ids=inputs, max_length=512)

# Special tokens are kept in the decoded text here.
print(tokenizer.decode(outputs[0]))

<|startoftext|> 안녕..............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................


# fine-tuned model

In [2]:
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub checkpoint of the base model and the local path of the PEFT adapter applied on top of it.
base_model_name = "GAI-LLM/Yi-Ko-6B-mixed-v15"
adapter_model_name = "../model/GAI-LLM-Yi-Ko-6B-mixed-v15-sft-qlora-dpo-v1"

# Device the generation cells move their input tensors to.
device = "cuda"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the base weights and wrap them with the adapter in one expression;
# `model` is bound to the resulting PeftModel.
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model_name, device_map="auto"),
    adapter_model_name,
)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.74s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Single-turn chat prompt: user turn, the tokenizer's EOS token, then the
# assistant marker that the model is expected to continue from.
inputs = tokenizer.encode(
    f"<|user|>방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요?{tokenizer.eos_token}<|assistant|>",
    return_tensors="pt",
).to(device)

# Beam search with 5 beams; max_length bounds prompt and continuation together.
outputs = model.generate(
    input_ids=inputs,
    max_length=512,
    num_beams=5,
)

# Decode the returned sequence with special tokens removed.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

 <|user|>방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요? <|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm sorry, sir, but I'm afraid you'll have to wait.죄송합니다, 선생님, 기다리셔야 할 것 같습니다.<|assistant|>I'm

# inference

In [10]:
def extract_text(input_string):
    """Return the part of ``input_string`` that follows the first ``<|assistant|>`` marker.

    Args:
        input_string: Decoded text that is expected to contain the marker.

    Returns:
        Everything after the first occurrence of the marker, unmodified
        (an empty string when the marker is the last thing in the text).

    Raises:
        ValueError: If the marker does not occur in ``input_string``.
    """
    marker = '<|assistant|>'
    # partition splits on the first occurrence; `found` is '' when the marker is absent.
    _, found, result = input_string.partition(marker)
    if not found:
        # ValueError is a subclass of Exception, so callers catching Exception still work,
        # but the failure now says what was missing and in which text.
        raise ValueError(
            f"marker {marker!r} not found in text starting with {input_string[:80]!r}"
        )
    return result

In [8]:
# Read the test set and preview its first three rows.
test = pd.read_csv(filepath_or_buffer='../data/test_raw.csv')
test.head(n=3)

Unnamed: 0,id,질문
0,TEST_000,"방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용..."
1,TEST_001,도배지에 녹은 자국이 발생하는 주된 원인과 그 해결 방법은 무엇인가요?
2,TEST_002,"큐블럭의 단점을 알려주세요. 또한, 압출법 단열판을 사용하는 것의 장점은 무엇인가요?"


In [11]:
# Import the callable, not the module: `import tqdm` followed by `tqdm(...)`
# fails with "TypeError: 'module' object is not callable".
from tqdm.auto import tqdm

# Generated answers, in the same order as the rows of `test`.
generated_sent = []

# One generation per question in the '질문' column.
for q in tqdm(test['질문'], desc='generating'):
    # Same layout as the demo prompt: user turn, EOS token, assistant marker.
    prompt = f'<|user|>{q}{tokenizer.eos_token}<|assistant|>'

    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
    # max_length bounds the prompt and the generated continuation together.
    outputs = model.generate(input_ids=inputs, max_length=512)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded text still contains the prompt; keep only what follows the marker.
    response = extract_text(response)

    generated_sent.append(response)

# submission

In [14]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1).
m = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Submission template; its vec_0..vec_511 columns are filled in by a later cell.
sub = pd.read_csv(filepath_or_buffer='../data/sample_submission.csv')

modules.json: 100%|██████████| 341/341 [00:00<00:00, 1.07MB/s]
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 523kB/s]
README.md: 100%|██████████| 2.45k/2.45k [00:00<00:00, 10.2MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 234kB/s]
config.json: 100%|██████████| 556/556 [00:00<00:00, 2.28MB/s]
pytorch_model.bin: 100%|██████████| 539M/539M [00:10<00:00, 50.8MB/s] 
tokenizer_config.json: 100%|██████████| 452/452 [00:00<00:00, 1.64MB/s]
vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 1.78MB/s]
tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 2.10MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 481kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 546kB/s]
2_Dense/config.json: 100%|██████████| 114/114 [00:00<00:00, 405kB/s]
pytorch_model.bin: 100%|██████████| 1.58M/1.58M [00:01<00:00, 1.40MB/s]


In [15]:
# Encode every generated answer with the embedding model, one sentence per call.
encode_list = [m.encode(sentence) for sentence in generated_sent]

In [None]:
# Insert the vectors into the submission frame: row label k receives encode_list[k].
for row_label, vector in enumerate(encode_list):
    sub.loc[row_label, 'vec_0':'vec_511'] = vector

In [17]:
# Preview the first two filled rows.
sub.head(n=2)

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,0.044647,0.044686,0.019709,0.003137,0.059527,0.014705,-0.013557,0.025025,0.02693,...,0.018252,-0.041767,-0.011106,-0.032668,-0.028045,0.010286,0.053333,0.019077,0.004254,0.030388
1,TEST_001,0.001539,-0.014106,0.006531,0.013342,0.069398,-0.013717,-0.008829,-0.02308,-0.008041,...,-0.025134,-0.016303,0.043635,-0.045904,-0.025309,0.035036,-0.005466,-0.009464,0.036712,-0.004162


In [19]:
# Move `id` from a column into the index.
# The guard and the non-in-place call keep this cell re-runnable: calling
# set_index('id') again after `id` is already the index would raise KeyError.
if sub.index.name != 'id':
    sub = sub.set_index('id')

In [20]:
# Preview the first two rows of the re-indexed frame.
sub.head(n=2)

Unnamed: 0_level_0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TEST_000,0.044647,0.044686,0.019709,0.003137,0.059527,0.014705,-0.013557,0.025025,0.02693,0.023581,...,0.018252,-0.041767,-0.011106,-0.032668,-0.028045,0.010286,0.053333,0.019077,0.004254,0.030388
TEST_001,0.001539,-0.014106,0.006531,0.013342,0.069398,-0.013717,-0.008829,-0.02308,-0.008041,0.01686,...,-0.025134,-0.016303,0.043635,-0.045904,-0.025309,0.035036,-0.005466,-0.009464,0.036712,-0.004162


In [21]:
# Write the submission frame (index included) to the result directory.
sub.to_csv(path_or_buf='../result/GAI-LLM-Yi-Ko-6B-mixed-v15-qlora-v1.csv')

In [33]:
# Dump the generated answers as plain text, joined by newlines.
# - The path no longer ends in a stray space after ".txt".
# - UTF-8 is set explicitly so the Korean text does not depend on the platform's default encoding.
# - Mode 'w' is enough; the file is only written, never read back here.
with open('../result/GAI-LLM-Yi-Ko-6B-mixed-v15-qlora-v1.txt', 'w', encoding='utf-8') as answers_file:
    answers_file.write('\n'.join(generated_sent))