In [2]:
import argparse
import logging
import os

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning import loggers as pl_loggers
from torch.utils.data import DataLoader, Dataset
from transformers import (BartForConditionalGeneration,
                          PreTrainedTokenizerFast)
from transformers import AutoTokenizer, AutoModel
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

import random
import numpy as np

# One shared seed for Python's `random`, NumPy and PyTorch; `random_seed` is
# also passed to PPOConfig further down.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
# Kept as the last expression of the cell: the torch Generator it returns is
# what the cell echoes as its output.
torch.manual_seed(random_seed)


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


<torch._C.Generator at 0x7f9b13946d30>

# SFT

In [2]:
# Load the KoBART base checkpoint and its tokenizer; the tokenizer's special
# tokens are given explicitly.
KOBART_CHECKPOINT = "gogamza/kobart-base-v2"
kobart_special_tokens = dict(
    bos_token="<s>",
    eos_token="</s>",
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)
model = BartForConditionalGeneration.from_pretrained(KOBART_CHECKPOINT, device_map="auto")
tokenizer = PreTrainedTokenizerFast.from_pretrained(KOBART_CHECKPOINT, **kobart_special_tokens)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [4]:
# Read the training and evaluation CSVs, then display the training frame's shape.
train_df, eval_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_v3.csv', '../data/eval_v3.csv')
)
train_df.shape

(185796, 3)

In [6]:
class ChatDataset(Dataset):
    """Question/answer pairs from a CSV file, encoded as fixed-length model inputs.

    The CSV must provide the columns '질문' (question) and '답변' (answer).
    Each item is a dict of NumPy arrays of length ``max_seq_len``:
    ``input_ids`` / ``attention_mask`` for the question, and
    ``decoder_input_ids`` / ``decoder_attention_mask`` / ``labels`` for the
    answer.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        max_seq_len: Length every returned sequence is padded or cut to.
    """

    def __init__(self, filepath, max_seq_len=128) -> None:
        self.filepath = filepath
        self.dataset = pd.read_csv(self.filepath)
        self.bos_token = '<s>'
        self.eos_token = '</s>'
        self.max_seq_len = max_seq_len
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained(
            "gogamza/kobart-base-v2",
            bos_token="<s>",
            eos_token="</s>",
            unk_token='<unk>',
            pad_token='<pad>',
            mask_token='<mask>'
        )

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.dataset)

    def make_input_id_mask(self, tokens, index):
        """Convert ``tokens`` to ids and bring them to exactly ``max_seq_len``.

        Shorter sequences are right-padded with the pad id (mask 0 on the
        padding). Sequences of ``max_seq_len`` or more are cut to
        ``max_seq_len - 1`` ids and terminated with the EOS id.

        Args:
            tokens: List of token strings.
            index: Row index of the record; currently unused.

        Returns:
            Tuple ``(input_id, attention_mask)`` of two equal-length lists.
        """
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            pad_len = self.max_seq_len - len(input_id)
            input_id = input_id + [self.tokenizer.pad_token_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
        else:
            input_id = input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        """Encode row ``index`` into encoder inputs, decoder inputs and labels."""
        record = self.dataset.iloc[index]
        q, a = record['질문'], record['답변']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        # Labels are the answer tokens shifted left by one (BOS dropped).
        # NOTE(review): when the answer has more than max_seq_len tokens the
        # decoder input is cut and re-terminated with EOS, but the labels are
        # sliced from the uncut tokens, so the last two label positions no
        # longer line up with the decoder input - confirm this is intended.
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        # -100 marks positions to be masked out of the cross-entropy loss.
        labels = labels + [-100] * (self.max_seq_len - len(labels))
        # np.float64 replaces the np.float_ alias, which NumPy 2.0 removed.
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(encoder_attention_mask, dtype=np.float64),
                'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
                'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float64),
                'labels': np.array(labels, dtype=np.int_)}

In [7]:
# Build the SFT train / validation datasets (sequences of 512 tokens) and a
# DataLoader for each; only the training loader shuffles.
SFT_MAX_SEQ_LEN = 512
loader_options = dict(batch_size=64, num_workers=8)

train_data = ChatDataset('../data/train_v3.csv', SFT_MAX_SEQ_LEN)
val_data = ChatDataset('../data/eval_v3.csv', SFT_MAX_SEQ_LEN)
train = DataLoader(train_data, shuffle=True, **loader_options)
val = DataLoader(val_data, shuffle=False, **loader_options)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [10]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Hyper-parameters for supervised fine-tuning, collected in one place and
# handed to TrainingArguments; evaluation, logging and checkpointing all
# happen every 500 steps.
sft_hyperparams = dict(
    output_dir="../model/kobart",
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
)
training_args = TrainingArguments(**sft_hyperparams)

# The Trainer is given the datasets directly.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_data,
    train_dataset=train_data,
)

In [None]:
# Run supervised fine-tuning with the Trainer configured above.
trainer.train()

# SFT-Inference

In [2]:
from transformers import (BartForConditionalGeneration,
                          PreTrainedTokenizerFast)
import pandas as pd
import numpy as np
from trl import AutoModelForSeq2SeqLMWithValueHead

# Device the inference model is moved to and that `get_response` sends its inputs to.
device = "cuda:1"

[2024-02-16 07:49:05,289] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Commented-out alternatives kept for reference:
#   BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v2", device_map="auto")
#   BartForConditionalGeneration.from_pretrained('../model/kobart/checkpoint-2800').to(device)
SFT_CHECKPOINT_DIR = '../model/kobart/checkpoint-6400/'
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(SFT_CHECKPOINT_DIR).to(device)

# Tokenizer comes from the base KoBART repo with explicit special tokens.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "gogamza/kobart-base-v2",
    bos_token="<s>",
    eos_token="</s>",
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [4]:
import torch

def get_response(text):
    """Generate a reply for ``text`` with beam search and return it as a string.

    The text is encoded, wrapped in BOS/EOS ids, sent to the notebook-level
    ``device`` and passed to ``model.generate`` (10 beams, at most 512 tokens,
    the unknown token is banned). The first decoded sequence is returned with
    special tokens stripped.
    """
    token_ids = [tokenizer.bos_token_id, *tokenizer.encode(text), tokenizer.eos_token_id]
    encoder_input = torch.tensor([token_ids]).to(device)
    generated = model.generate(
        encoder_input,
        max_length=512,
        num_beams=10,
        eos_token_id=tokenizer.eos_token_id,
        bad_words_ids=[[tokenizer.unk_token_id]],
    )
    decoded = tokenizer.batch_decode(generated.tolist(), skip_special_tokens=True)
    return decoded[0]

In [5]:
# Smoke test: generate and print the reply to a single sample question.
text = "방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요?"
print(get_response(text))

방청페인트의 종류는 광명단페인트, 방청산화철페인트, 알미늄페인트, 역청질페인트, 워시프라이머, 크롬산아연페인트, 규산염페인트가 있습니다.원목사이딩의 단점은 가격대가 높고 관리가 어려우며 습기에 약해 뒤틀림, 부서짐, 수축/팽장이 생길 수 있다는 점입니다.


In [6]:
# Generate one response per test question, in row order.
test = pd.read_csv('../data/test_raw.csv')

generated_sent = []
for question in test['질문']:
    generated_sent.append(get_response(question))

In [7]:
# Show the response generated for the last test question.
print(generated_sent[-1])

카페트의 기대수명은 6년입니다.오리지널징크는 다른 징크에 비해 수명이 길고 다양한 패턴과 디자인이 가능하며 친환경적이고 금속 부식에 대한 내식성이 뛰어나 유지보수가 용이하다는 장점이 있습니다.


In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
m = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [9]:
# Submission template; its 'id' and vec_0..vec_511 columns are filled in below.
sub=pd.read_csv('../data/sample_submission.csv')

In [10]:
# Encode every generated sentence with the sentence-embedding model, one call per sentence.
encode_list = [m.encode(sentence) for sentence in generated_sent]

In [None]:
# Write the i-th embedding into columns vec_0..vec_511 of the row labelled i,
# then make 'id' the index of the submission frame.
for row_label, vector in enumerate(encode_list):
    sub.loc[row_label, 'vec_0':'vec_511'] = vector

sub.set_index('id', inplace=True)

In [12]:
# Preview the first two filled submission rows.
sub.head(2)

Unnamed: 0_level_0,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,vec_9,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TEST_000,0.032679,-0.022694,0.015984,0.022094,0.06113,0.01772,-0.002545,-0.011087,-0.005358,0.053613,...,-0.01573,0.000518,-0.029576,-0.0352,-0.002526,0.023186,0.032703,0.036993,0.007826,0.013723
TEST_001,-0.001152,-0.008816,0.014305,0.000614,0.038565,0.002583,-0.011721,-0.002723,-0.000382,0.018276,...,0.0167,-0.010989,0.01479,-0.023875,-0.059724,0.054009,0.001227,0.009057,0.010631,0.029853


In [13]:
# Persist the SFT submission (indexed by 'id').
sub.to_csv('../result/kobart-v4.csv')

In [14]:
import json

# Pair every test question with the response generated for the same row.
dataset = {'data': [
    {'question': question, 'response': generated_sent[i]}
    for i, question in enumerate(test['질문'])
]}

# The path previously ended in a stray space ('kobart-v4.json '), which created
# a file whose name has a trailing blank. The encoding is set explicitly because
# ensure_ascii=False writes the Korean text as-is.
with open('../result/kobart-v4.json', 'w', encoding='utf-8') as file:
    json.dump(dataset, file, indent=4, ensure_ascii=False)

# PPO

In [3]:
from trl import AutoModelForSeq2SeqLMWithValueHead

# Order GPUs by PCI bus id and expose only device "3" to CUDA.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

# Wrap the SFT checkpoint in TRL's seq2seq value-head model for PPO.
PPO_INIT_CHECKPOINT = '../model/kobart/checkpoint-6400/'
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(PPO_INIT_CHECKPOINT)
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "gogamza/kobart-base-v2",
    bos_token="<s>",
    eos_token="</s>",
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

[2024-02-16 08:55:54,551] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [4]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [7]:
import torch

# KoBART tokenizer with its special tokens spelled out; handed to PPODataset
# and PPOTrainer below.
ppo_special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": '<unk>',
    "pad_token": '<pad>',
    "mask_token": '<mask>',
}
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v2", **ppo_special_tokens)
class PPODataset(torch.utils.data.Dataset):
    """Question/answer pairs from a CSV file, encoded as fixed-length model inputs.

    The CSV must provide the columns '질문' (question) and '답변' (answer).
    Each item is a dict of NumPy arrays of length ``max_seq_len``:
    ``input_ids`` / ``attention_mask`` for the question, and
    ``decoder_input_ids`` / ``decoder_attention_mask`` / ``labels`` for the
    answer.

    Args:
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``pad_token_id`` and ``eos_token_id``.
        path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        max_seq_len: Length every returned sequence is padded or cut to.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, tokenizer, path, max_seq_len=512):
        df = pd.read_csv(path)
        self.dataset = df
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.bos_token = '<s>'
        self.eos_token = '</s>'

    # __len__ used to be defined twice with identical bodies; one copy is kept.
    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.dataset)

    def make_input_id_mask(self, tokens, index):
        """Convert ``tokens`` to ids and bring them to exactly ``max_seq_len``.

        Shorter sequences are right-padded with the pad id (mask 0 on the
        padding). Sequences of ``max_seq_len`` or more are cut to
        ``max_seq_len - 1`` ids and terminated with the EOS id.

        Args:
            tokens: List of token strings.
            index: Row index of the record; currently unused.

        Returns:
            Tuple ``(input_id, attention_mask)`` of two equal-length lists.
        """
        input_id = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_id)
        if len(input_id) < self.max_seq_len:
            pad_len = self.max_seq_len - len(input_id)
            input_id = input_id + [self.tokenizer.pad_token_id] * pad_len
            attention_mask = attention_mask + [0] * pad_len
        else:
            input_id = input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id]
            attention_mask = attention_mask[:self.max_seq_len]
        return input_id, attention_mask

    def __getitem__(self, index):
        """Encode row ``index`` into encoder inputs, decoder inputs and labels."""
        record = self.dataset.iloc[index]
        q, a = record['질문'], record['답변']
        q_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(q) + [self.eos_token]
        a_tokens = [self.bos_token] + \
            self.tokenizer.tokenize(a) + [self.eos_token]
        encoder_input_id, encoder_attention_mask = self.make_input_id_mask(
            q_tokens, index)
        decoder_input_id, decoder_attention_mask = self.make_input_id_mask(
            a_tokens, index)
        # Labels are the answer tokens shifted left by one (BOS dropped).
        labels = self.tokenizer.convert_tokens_to_ids(
            a_tokens[1:(self.max_seq_len + 1)])
        # -100 marks positions to be masked out of the cross-entropy loss.
        labels = labels + [-100] * (self.max_seq_len - len(labels))
        # np.float64 replaces the np.float_ alias, which NumPy 2.0 removed.
        return {'input_ids': np.array(encoder_input_id, dtype=np.int_),
                'attention_mask': np.array(encoder_attention_mask, dtype=np.float64),
                'decoder_input_ids': np.array(decoder_input_id, dtype=np.int_),
                'decoder_attention_mask': np.array(decoder_attention_mask, dtype=np.float64),
                'labels': np.array(labels, dtype=np.int_)}

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [9]:
# Datasets for the PPO stage.
# NOTE(review): these read the *_v2 CSVs while the SFT section reads *_v3 -
# confirm the file version is intentional.
train_dataset=PPODataset(tokenizer, path='../data/train_v2.csv')
eval_dataset=PPODataset(tokenizer,path='../data/eval_v2.csv')

In [10]:
from trl import PPOTrainer
from trl import PPOConfig

# PPO hyper-parameters, gathered in a dict and expanded into PPOConfig.
ppo_settings = dict(
    model_name="gogamza/kobart-base-v2",
    learning_rate=1e-5,
    batch_size=4,
    gradient_accumulation_steps=1,
    ppo_epochs=4,
    seed=random_seed,
)
config = PPOConfig(**ppo_settings)

# NOTE(review): the trainer is fed `eval_dataset`; the `train_dataset` built in
# the previous cell is not used here - confirm this is intended.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    tokenizer=tokenizer,
    dataset=eval_dataset,
)


In [11]:
# Keyword arguments forwarded to `ppo_trainer.generate` for every query:
# sampling is on and up to 512 new tokens are produced. The top_k / top_p
# values presumably leave the sampling distribution unrestricted - TODO confirm.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    max_new_tokens=512,
)

In [12]:
# Tiny sample inputs for trying out get_rewards below; the training loop later
# rebinds both names.
response_texts = ['안녕?', '반가워']
labels = ['저리가.', '반가웡']

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    If either vector has zero norm the result is defined as 0.
    """
    magnitude_a = np.linalg.norm(a)
    magnitude_b = np.linalg.norm(b)
    # Guard against dividing by zero for all-zero vectors.
    if magnitude_a == 0 or magnitude_b == 0:
        return 0
    return np.dot(a, b) / (magnitude_a * magnitude_b)

def get_rewards(embed_model, response_texts, labels):
    """Score each response against its reference by embedding similarity.

    Args:
        embed_model: Object whose ``encode(text)`` returns an embedding vector.
        response_texts: Generated texts.
        labels: Reference texts, paired positionally with ``response_texts``.

    Returns:
        List with one scalar ``torch.Tensor`` per pair: the cosine similarity
        between the reference embedding and the response embedding.
    """
    def _pair_reward(pred, label):
        # The prediction is encoded first, then the reference.
        pred_vec = embed_model.encode(pred)
        label_vec = embed_model.encode(label)
        return torch.tensor(cosine_similarity(label_vec, pred_vec))

    return [_pair_reward(pred, label) for pred, label in zip(response_texts, labels)]

# Sanity check of the reward function on the two sample pairs above.
print(get_rewards(embed_model, response_texts, labels))

[tensor(0.5478), tensor(0.6613)]


In [None]:
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# PPO loop: for every batch, sample a response per query, score it against the
# reference answer by embedding similarity, and take one PPO step.
# NOTE(review): `config.ppo_epochs` is reused here as the number of outer passes
# over the dataloader - confirm that is the intended meaning.
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader):
        query_tensors_for_step = []
        response_tensors_for_step = []

        for padded_query in batch["input_ids"]:
            # The dataset right-pads every question to a fixed length. Drop
            # the padding so pad ids are not fed to the model as if they were
            # real tokens (no attention mask travels with the query).
            query_tensor = padded_query[padded_query != tokenizer.pad_token_id]

            #### Get response from SFTModel
            response_tensor = ppo_trainer.generate(query_tensor, **generation_kwargs)

            query_tensors_for_step.append(query_tensor)
            # generate() returns a (1, length) tensor for a single query. Keep
            # the 1-D response at its own length and do not pad it to the
            # longest in the batch, so padding is not treated as generated text.
            response_tensors_for_step.append(response_tensor[0])

        response_texts = tokenizer.batch_decode(response_tensors_for_step, skip_special_tokens=True)
        labels = tokenizer.batch_decode(batch['decoder_input_ids'], skip_special_tokens=True)

        #### Compute reward score
        rewards = get_rewards(embed_model, response_texts, labels)

        print(response_texts)
        print(labels)
        print(rewards)
        print('-------')

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors_for_step, response_tensors_for_step, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)


In [None]:
#### Save the PPO-tuned model to ../model/kobart-ppo
ppo_trainer.save_model("../model/kobart-ppo")

# PPO-Inference

In [None]:
from trl import AutoModelForSeq2SeqLMWithValueHead

import os

import torch
from transformers import PreTrainedTokenizerFast

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "3"

# `get_response` in this section sends its inputs to `device`, but the section
# never defined it and never moved the model off the CPU. With a single GPU
# made visible above, "cuda" refers to that GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): this loads an SFT checkpoint directory, while the PPO section
# saves its model to "../model/kobart-ppo" - confirm which weights are meant.
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained('../model/kobart/checkpoint-5600/').to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained( "gogamza/kobart-base-v2", bos_token="<s>", eos_token="</s>",unk_token='<unk>',pad_token='<pad>',mask_token='<mask>')

In [None]:
import torch

def get_response(text):
    """Generate a reply for ``text`` with beam search and return it as a string.

    The text is encoded, wrapped in BOS/EOS ids and passed to
    ``model.generate`` (10 beams, at most 512 tokens, the unknown token is
    banned). The first decoded sequence is returned with special tokens
    stripped.

    Args:
        text: The question to answer.

    Returns:
        The decoded reply.
    """
    # Follow the model's own placement so the input always lands on the same
    # device as the weights, without depending on a notebook-level `device`
    # variable.
    target_device = next(model.parameters()).device
    input_ids = [tokenizer.bos_token_id] + tokenizer.encode(text) + [tokenizer.eos_token_id]
    res_ids = model.generate(torch.tensor([input_ids]).to(target_device),
                             max_length=512,
                             num_beams=10,
                             eos_token_id=tokenizer.eos_token_id,
                             bad_words_ids=[[tokenizer.unk_token_id]])
    return tokenizer.batch_decode(res_ids.tolist(), skip_special_tokens=True)[0]

In [None]:
# Smoke test: generate and print the reply to a single sample question.
text = "방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용하는 것에 어떤 단점이 있을까요?"
print(get_response(text))

In [None]:
# Generate one response per test question, in row order.
test = pd.read_csv('../data/test_raw.csv')

generated_sent = []
for question in test['질문']:
    generated_sent.append(get_response(question))

In [None]:
# Show the response generated for the last test question.
print(generated_sent[-1])

In [None]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

# Load the model used to extract embedding vectors (distiluse-base-multilingual-cased-v1)
m = SentenceTransformer('distiluse-base-multilingual-cased-v1')

In [None]:
# Submission template; its 'id' and vec_0..vec_511 columns are filled in below.
sub=pd.read_csv('../data/sample_submission.csv')

In [None]:
# Encode every generated sentence with the sentence-embedding model, one call per sentence.
encode_list = [m.encode(sentence) for sentence in generated_sent]

In [None]:
# Write the i-th embedding into columns vec_0..vec_511 of the row labelled i,
# then make 'id' the index of the submission frame.
for row_label, vector in enumerate(encode_list):
    sub.loc[row_label, 'vec_0':'vec_511'] = vector

sub.set_index('id', inplace=True)

In [None]:
# Preview the first two filled submission rows.
sub.head(2)

In [None]:
# Persist the PPO submission (indexed by 'id').
sub.to_csv('../result/kobart-ppo-v4.csv')

In [None]:
import json

# Pair every test question with the response generated for the same row.
dataset = {'data': [
    {'question': question, 'response': generated_sent[i]}
    for i, question in enumerate(test['질문'])
]}

# The path previously ended in a stray space ('kobart-ppo-v4.json '), which
# created a file whose name has a trailing blank. The encoding is set explicitly
# because ensure_ascii=False writes the Korean text as-is.
with open('../result/kobart-ppo-v4.json', 'w', encoding='utf-8') as file:
    json.dump(dataset, file, indent=4, ensure_ascii=False)