In [2]:
import argparse
import logging
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import loggers as pl_loggers
from torch.utils.data import DataLoader, Dataset
from transformers import (BartForConditionalGeneration,
                          PreTrainedTokenizerFast)
from transformers import AutoTokenizer, AutoModel
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup


# SFT

In [2]:
model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v2", device_map="auto")
tokenizer = PreTrainedTokenizerFast.from_pretrained( "gogamza/kobart-base-v2", bos_token="<s>", eos_token="</s>",unk_token='<unk>',pad_token='<pad>',mask_token='<mask>')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [4]:
train_df = pd.read_csv('../data/train_v3.csv')
eval_df = pd.read_csv('../data/eval_v3.csv')
train_df.shape

(185796, 3)

In [6]:
class ChatDataset(Dataset):
    def __init__(self, filepath, max_seq_len=128) -> None:
        self.filepath = filepath
        self.dataset = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "gogamza/kobart-base-v2",
            bos_token="<s>",
            eos_token="</s>",
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>'
        )
    def __len__(self):
        return len(self.dataset)

    def make_input_id_mask(self, tokens, index):
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        record = self.dataset.iloc[index]
        q, a = record['질문'], record['답변']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        if len(labels) < self.max_seq_len:
            while len(labels) < self.max_seq_len:
                # for cross entropy loss masking
                labels += [-100]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(encoder_attention_mask, dtype=np.float_),
                'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
                'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float_),
                'labels': np.array(labels, dtype=np.int_)}

In [7]:
train_data= ChatDataset('../data/train_v3.csv',512)
val_data= ChatDataset('../data/eval_v3.csv',512)
train = DataLoader(train_data,batch_size=64,num_workers=8, shuffle=True)
val=DataLoader(val_data, batch_size=64,num_workers=8, shuffle=False)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [10]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="../model/kobart",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

In [None]:
trainer.train()

# SFT-Inference

In [1]:
from transformers import (BartForConditionalGeneration,
                          PreTrainedTokenizerFast)
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
#model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v2", device_map="auto")
model = BartForConditionalGeneration.from_pretrained('../model/kobart/checkpoint-2800')
tokenizer = PreTrainedTokenizerFast.from_pretrained( "gogamza/kobart-base-v2", bos_token="<s>", eos_token="</s>",unk_token='<unk>',pad_token='<pad>',mask_token='<mask>')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [10]:
import torch

def get_response(text):
    device = "cuda"

    input_ids =  [tokenizer.bos_token_id] + tokenizer.encode(text) + [tokenizer.eos_token_id]
    res_ids = model.generate(torch.tensor([input_ids]).to(device),
                                                max_length=512,
                                                num_beams=5,
                                                eos_token_id=tokenizer.eos_token_id,
                                                bad_words_ids=[[tokenizer.unk_token_id]])
    a = tokenizer.batch_decode(res_ids.tolist(), skip_special_tokens=True)[0]
    return a

In [11]:
text = "방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요?"
print(get_response(text))

방청 페인트의 종류로는 광명단페인트, 방청산화철페인트, 알미늄페인트, 역청질페인트, 워시프라이머, 크롬산아연페인트, 규산염페인트 등이 있습니다. 이러한 페인트들은 각각의 특성과 용도에 맞게 선택하여 사용할 수 있습니다. 광명단페인트는 반사율이 높아서 더 밝은 공간으로 만들어주며, 방청산화철페인트는 방청제가 첨가되어 방청 효과를 내는 페인트입니다. 알미늄페인트는 금속 또는 철강 표면에 사용되어 내화, 방청 효과를 가지는 페인트입니다. 역청질페인트는 강철, 철판, 강관 등 강철재에 방청처리용으로 사용되는 페인트로, 방청제의 첨가와 염모처리에 의해 방청효과가 향상되었습니다. 워시프라이머는 철강, 알루미늄, 유리섬유, 금속, 플라스틱 등에 사용되는 중요한 물품의 부착용으로서 용도에 맞게 선정할 수 있습니다. 크롬산아연페인트와 규산염페인트도 각각의 특성과 용도에 따라 선택하여 사용할 수 있습니다.원목사이딩의 장점은 있으나, 단점으로는 가격대가 높고 관리가 어려우며 습기에 약해 뒤틀림, 부서짐, 수축/팽창이 생길 수 있는 점이 있습니다. 추가적으로 원목사이딩은 색상이 변색될 수 있고 해충 침입에 취약하다는 점도 고려해야 합니다.


# SFT inference

In [15]:
generated_sent=[]
test=pd.read_csv('../data/test_raw.csv')

for i in range(len(test)):
  row = test.iloc[i]
  question = row['질문']
  response = get_response(question)
  generated_sent.append(response)

In [16]:
print(generated_sent[-1])

카페트의 기대수명은 일반적으로 6년입니다. 다만, 사용 빈도, 청소 방식, 햇빛 노출 정도 등 사용 환경과 관리에 따라 수명이 달라질 수 있으므로 주기적으로 청소 및 관리를 해야 합니다. 또한, 질 좋은 카페트를 구매하고 깔끔하게 관리하여 수명을 연장할 수 있습니다.오리지널징크의 장점은 다른 징크에 비해 수명이 길며 다양한 패턴과 디자인이 가능하다는 점입니다. 또한, 친환경적이고 금속 부식에 대한 내식성이 뛰어나 유지보수가 용이하다는 특징이 있습니다. 이러한 이점들로 인해 오리지널징크는 건물 외벽 및 지붕재로 널리 사용되고 있습니다.


In [12]:
import numpy as np
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
m = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [13]:
sub=pd.read_csv('../data/sample_submission.csv')

In [17]:
encode_list=[]
for i in range(len(generated_sent)):
  embed=m.encode(generated_sent[i]) #주어진 모델로 인코딩
  encode_list.append(embed)

In [None]:
for i in range(len(encode_list)):
  sub.loc[i, 'vec_0':'vec_511']=encode_list[i]

sub.set_index('id',inplace=True)

In [19]:
sub.head(2)

Unnamed: 0_level_0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TEST_000,0.005784,0.036606,0.007475,0.006188,0.062858,0.028712,-0.04588,0.01161,0.009886,0.033085,...,-0.019034,-0.041623,-0.028136,-0.034332,0.00191,-0.007698,0.029485,0.005473,0.050413,0.038683
TEST_001,-0.009441,0.023115,-0.015673,0.002355,0.09169,0.012363,0.014662,-0.00303,-0.020069,0.027384,...,-9.5e-05,-0.014458,-0.008928,0.009849,-0.068741,0.058391,0.003188,-0.015291,-0.004628,0.040544


In [20]:
sub.to_csv('../result/kobart-v3.csv')

In [21]:
with open('../result/kobart-v3.txt ', 'w+') as file:
    file.write('\n'.join(generated_sent))

# PPO

In [3]:
from trl import AutoModelForCausalLMWithValueHead
model = AutoModelForCausalLMWithValueHead.from_pretrained('../model/kobart/checkpoint-2800/')
tokenizer = PreTrainedTokenizerFast.from_pretrained( "gogamza/kobart-base-v2", bos_token="<s>", eos_token="</s>",unk_token='<unk>',pad_token='<pad>',mask_token='<mask>')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Some weights of BartForCausalLM were not initialized from the model checkpoint at ../model/kobart/checkpoint-2800/ and are newly initialized: ['model.decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [91]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [5]:
train_df = pd.read_csv('../data/train_v3.csv')
eval_df = pd.read_csv('../data/eval_v3.csv')

In [6]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,질문,답변
0,177338,유성발수제의 장점이 뭐야?인테리어에서 현대적 감각을 나타내기 위해 어떤 디자인 요소...,"유성발수제의 장점은 소재에 물의 침투를 차단하여 소재의 수명을 연장하고, 동결, 염..."
1,245286,속건형 유성 발수제의 장점이 뭐야?어떤 종류의 인테리어 조명이 있죠?,속건형 유성 발수제의 장점은 다양한 건축자재의 표면에 발라 빗물과 기타 물질의 침투...
2,213874,어떻게 다이닝 룸을 정통적으로 꾸밀 수 있을까요?비닐사이딩은 뭔가요?,다이닝 룸을 정통적으로 꾸미기 위해서는 대형 식탁과 고풍스러운 의자를 선택하는 것이...
3,285334,규산염페인트가 뭐야?천연벽지의 장점과 단점에 대해 알고 계신가요?,규산염페인트는 교상의 규산염과 방청안료를 주원료로 장유성바니쉬를 혼합한 것을 의미합...
4,206342,리얼징크가 뭐야?층간소음이 발생하는 기준은 어떤 것인가요?,"리얼징크는 철판에 부식을 방지하기 위해 아연 도금을 한 후 페인트를 칠한 외장재로,..."


In [43]:
import torch

class PPODataset(torch.utils.data.Dataset):
    def __init__(self, path1, path2=None):
        df1 = pd.read_csv(path1)
        if path2:
            df2 = pd.read_csv(path2)
            df = pd.concat((df1, df2))
        else:
            df = df1
            
        self.dataset = df1
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "gogamza/kobart-base-v2",
            bos_token="<s>",
            eos_token="</s>",
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>'
        )
        self.max_seq_len = 256
        self.bos_token = '<s>'
        self.eos_token = '</s>'

    def __len__(self):
        return len(self.dataset)

    def make_input_id_mask(self, tokens, index):
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            while len(input_id) < self.max_seq_len:
                input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            # logging.warning(f'exceed max_seq_len for given article : {index}')
            input_id = input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        record = self.dataset.iloc[index]
        q, a = record['질문'], record['답변']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        if len(labels) < self.max_seq_len:
            while len(labels) < self.max_seq_len:
                # for cross entropy loss masking
                labels += [-100]
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(encoder_attention_mask, dtype=np.float_),
                'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
                'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float_),
                'labels': np.array(labels, dtype=np.int_)}

    def __len__(self):
        return len(self.dataset)

In [44]:
train_dataset=PPODataset('../data/train_v3.csv','../data/eval_v3.csv')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [45]:
from trl import PPOTrainer
from trl import PPOConfig

config = PPOConfig(
    model_name="gogamza/kobart-base-v2",
    learning_rate=1e-4,
    batch_size=64,
    gradient_accumulation_steps=2,
    ppo_epochs=4,
)

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=train_dataset,
    tokenizer=tokenizer
)


In [97]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_length":512
}

In [98]:
a = [tokenizer.decode(l.squeeze(), skip_special_tokens=True) for l in [train_dataset[1]['decoder_input_ids'], train_dataset[0]['decoder_input_ids']]]

In [107]:
response_texts = ['안녕?', '반가워']
labels = ['저리가.', '반가웡']

def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b) if norm_a != 0 and norm_b != 0 else 0

def get_rewards(embed_model, response_texts, labels):
    rewards = []
    for pred, label in zip(response_texts, labels):
        pred_embed = embed_model.encode(pred)
        label_embed = embed_model.encode(label)
    
        sample_score = cosine_similarity(label_embed, pred_embed)
        sample_score = torch.tensor(sample_score)
        rewards.append(sample_score)

    return rewards

print(get_rewards(embed_model, response_texts, labels))

[tensor(0.5478), tensor(0.6613)]


In [108]:
from tqdm import tqdm
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader): 
        query_tensors = batch["input_ids"]
        query_tensors_for_input = []
        response_tensors = []
        for query_tensor in query_tensors:
            #### Get response from SFTModel
            response_tensor = ppo_trainer.generate(query_tensor, **generation_kwargs)
            response_tensors.append(response_tensor)
            query_tensors_for_input.append(query_tensor)
        
        #batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
        response_texts = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]
        labels =[tokenizer.decode(l.squeeze(), skip_special_tokens=True) for l in batch['decoder_input_ids']]
        
        #### Compute reward score
        rewards = get_rewards(embed_model, response_texts, labels)

        #pipe_outputs = reward_model(texts)
        #rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors_for_input, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)



  0%|          | 0/6033 [00:55<?, ?it/s]?it/s]
epoch:   0%|          | 0/400 [00:55<?, ?it/s]


RuntimeError: Tensors must have same number of dimensions: got 1 and 2

In [None]:
#### Save model
ppo_trainer.save_model("../model/kobart-ppo")