In [None]:
import torch
import transformers
import torch.nn as nn
from torch.utils.data import Dataset
from datasets import load_dataset
from copy import deepcopy
from torch.optim import Adam
from transformers import BartTokenizer
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration, BartConfig
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import pandas as pd

In [None]:
class TranslationDataset(Dataset):
  """Fixed-length seq2seq dataset built from a DataFrame of sentence pairs.

  Every row of ``df`` must provide the source sentence in column ``'원문'`` and
  the target sentence in column ``'번역문'``. Each item is encoded as
  ``<bos> tokens <eos>`` and padded (or truncated) to exactly ``max_len``.

  Args:
    df: pandas DataFrame holding the sentence pairs.
    tokenizer: object exposing ``encode`` plus ``pad_token_id``,
      ``bos_token_id`` and ``eos_token_id``.
    max_len: length of every returned sequence.
    ignore_index: fill value for padded label positions (ignored by the loss).
    verbose: accepted for backward compatibility; not used.
  """

  def __init__(self, df, tokenizer, max_len, ignore_index=-100, verbose=True):
    super().__init__()
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.df = df
    self.len = len(self.df)
    self.pad_index = self.tokenizer.pad_token_id
    self.ignore_index = ignore_index

  def _pad_or_truncate(self, inputs, fill_value):
    """Return ``inputs`` as an int64 array of exactly ``max_len`` entries."""
    ids = np.asarray(inputs, dtype=np.int64)[:self.max_len]
    missing = self.max_len - len(ids)
    if missing > 0:
      ids = np.concatenate([ids, np.full(missing, fill_value, dtype=np.int64)])
    return ids

  def add_padding_data(self, inputs):
    """Pad with the tokenizer's pad id (or truncate) to ``max_len``."""
    return self._pad_or_truncate(inputs, self.pad_index)

  def add_ignored_data(self, inputs):
    """Pad with ``ignore_index`` (or truncate) to ``max_len``."""
    return self._pad_or_truncate(inputs, self.ignore_index)

  def __getitem__(self, idx, verbose=True):
    """Encode row ``idx`` into model inputs.

    Returns:
      dict with ``input_ids`` and ``decoder_input_ids`` (torch int64 tensors)
      and ``labels`` (numpy integer array), each of length ``max_len``.
    """
    instance = self.df.iloc[idx]
    bos = self.tokenizer.bos_token_id
    eos = self.tokenizer.eos_token_id

    # Reserve two slots for <bos>/<eos> *before* wrapping, so that truncation
    # never drops the end-of-sequence marker and the final length is max_len
    # (previously <bos> was inserted after padding, yielding max_len + 1).
    body_len = max(self.max_len - 2, 0)

    src_tokens = list(self.tokenizer.encode(instance['원문']))[:body_len]
    input_ids = self.add_padding_data([bos] + src_tokens + [eos])

    tgt_tokens = list(self.tokenizer.encode(instance['번역문']))[:body_len]
    label_ids = [bos] + tgt_tokens + [eos]

    # Decoder input is the label sequence shifted right, started with <eos>.
    dec_input_ids = self.add_padding_data([eos] + label_ids[:-1])
    label_ids = self.add_ignored_data(label_ids)

    # NOTE(review): no attention_mask is returned (it was commented out in the
    # original); without one the encoder may attend to padding during
    # training - consider returning it.
    return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'decoder_input_ids': torch.tensor(dec_input_ids, dtype=torch.long),
            'labels': np.array(label_ids, dtype = np.int_)}

  def __len__(self):
    return self.len

In [None]:
def compute_metrics(pred):
  """Compute the mean unigram BLEU over an evaluation batch.

  Args:
    pred: (predictions, labels) pair of token-id arrays as supplied by the
      trainer. Uses the module-level ``tokenizer`` for decoding and prints the
      first row of the module-level ``val`` frame as a sample.

  Returns:
    dict with key ``"BLEU score"`` holding the average sentence-level BLEU-1
    across all samples (0.0 for an empty batch).
  """
  preds, labels = pred
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 marks ignored/padded positions and is not a decodable token id, so
  # swap it for the pad id in both arrays before decoding.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens = True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

  if not decoded_preds:
    return {"BLEU score": 0.0}

  # Sample output; assumes the first evaluated example is row 0 of `val`.
  print("원문: ", val['원문'][0])
  print("번역 정답", decoded_labels[0])
  print("번역 결과: ", decoded_preds[0])

  # Score every sample (previously only the first one was scored) and average.
  scores = [
      sentence_bleu(references=[reference.split()],
                    hypothesis=hypothesis.split(),
                    weights=(1, 0, 0, 0))
      for hypothesis, reference in zip(decoded_preds, decoded_labels)
  ]
  return {"BLEU score": float(np.mean(scores))}

In [None]:
# Training hyper-parameters shared by the cells below.
lr = 3e-5      # learning rate handed to the optimizer
stop = 3       # not referenced elsewhere in this notebook
epoch = 10     # number of training epochs
batch = 4      # per-device batch size
seed = 42
# Fall back to CPU so the notebook still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Apply the seed (it was previously defined but never used) so that randomly
# initialised weights created later are reproducible.
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Read the parallel corpus (files are cp949-encoded CSVs).
train = pd.read_csv("english_korean_data/train_small.csv", encoding="cp949")
val = pd.read_csv("english_korean_data/test_open.csv", encoding="cp949")

# Load the tokenizer once and share it between both datasets instead of
# instantiating it from the hub twice.
dataset_tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
max_len = 256  # fixed sequence length produced by TranslationDataset

train_dataset = TranslationDataset(train, dataset_tokenizer, max_len)
val_dataset = TranslationDataset(val, dataset_tokenizer, max_len)

In [None]:
# Baseline setup: stock pretrained KoBART. The next cell reassigns `model`,
# `tokenizer` and `collator`.
model = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v1")
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v1")
# Pad labels with -100 (the ignore index TranslationDataset uses by default)
# rather than the pad token id, so padded positions are excluded from the loss.
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

In [None]:

# Build the model: pretrained KoBART whose encoder token embedding is replaced
# by a freshly initialised table.
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v1")

# `config` is only used in this cell, to size and initialise the new embedding.
config = BartConfig.from_pretrained("gogamza/kobart-base-v1")

# Take the width from the checkpoint instead of hard-coding 768, so the new
# embedding always matches the encoder's hidden size.
config.encoder_embed_dim = config.d_model
config.encoder_embed_path = None

# Mirror how the model's own embeddings are set up: declare the padding row
# and draw weights from N(0, init_std) rather than nn.Embedding's default
# N(0, 1).
# NOTE(review): if the installed transformers version applies an embedding
# scale inside the embedding module, a plain nn.Embedding drops it - confirm.
encoder_embedding = torch.nn.Embedding(config.vocab_size,
                                       config.encoder_embed_dim,
                                       padding_idx=config.pad_token_id)
torch.nn.init.normal_(encoder_embedding.weight, mean=0.0, std=config.init_std)
if config.pad_token_id is not None:
    with torch.no_grad():
        encoder_embedding.weight[config.pad_token_id].zero_()

original_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v1")

# Swap in the new table for the encoder only.
original_model.model.encoder.embed_tokens = encoder_embedding

model = original_model

# Pad labels with -100 (the ignore index TranslationDataset uses by default)
# so padded positions are excluded from the loss.
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# The scheduler is stepped once per optimizer update, so its horizon must be
# the number of updates, not `epoch * len(dataset) * batch` as before.
grad_accum_steps = 16  # keep equal to gradient_accumulation_steps in the training arguments
batches_per_epoch = -(-len(train_dataset) // batch)  # ceil division
# Round up: overshooting the horizon merely ends with a small non-zero LR,
# whereas undershooting would run the cosine curve past its minimum.
updates_per_epoch = max(-(-batches_per_epoch // grad_accum_steps), 1)
num_training_steps = epoch * updates_per_epoch
# NOTE(review): assumes single-device training; with several devices the
# effective batch size grows and the update count shrinks - confirm.

lr_scheduler = transformers.get_cosine_schedule_with_warmup(optimizer = optimizer,
                                                        num_warmup_steps = 100,
                                                        num_training_steps = num_training_steps,
                                                        last_epoch = -1)

In [None]:
# Training configuration: evaluate and checkpoint every 100 steps, keep at
# most two checkpoints and reload the best one when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = Seq2SeqTrainingArguments(run_name = "KoBART_translator",
                                output_dir = "./BART_translator_2",
                                evaluation_strategy="steps",
                                eval_steps = 100,
                                save_steps = 100,
                                save_total_limit=2,

                                per_device_train_batch_size= batch,
                                per_device_eval_batch_size = batch,
                                gradient_accumulation_steps = 16,
                                num_train_epochs = epoch,

                                # Wire in the notebook-level seed so changing it
                                # in the config cell actually takes effect.
                                seed = seed,

                                load_best_model_at_end = True,
                                #fp16=True,
                                do_train=True,
                                do_eval=True,
                                # Evaluate with generate() so compute_metrics
                                # receives generated token ids.
                                predict_with_generate=True,)

# The optimizer and scheduler are supplied explicitly instead of letting the
# trainer create its own.
trainer = Seq2SeqTrainer(model = model,
                        tokenizer = tokenizer,
                        args = args,
                        train_dataset = train_dataset,
                        eval_dataset = val_dataset,
                        compute_metrics = compute_metrics,
                        optimizers = (optimizer, lr_scheduler),
                        data_collator = collator,)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Bundle the training state into one checkpoint dictionary and write it to disk.
checkpoint = {
    'epoch': epoch,  # configured number of training epochs
    'model_state_dict': model.state_dict(),  # model weights
    'optimizer_state_dict': optimizer.state_dict(),  # optimizer state
}
torch.save(checkpoint, 'translator3.pth')

In [None]:
def infer(text, label):
  """Translate ``text`` with the global model and print its BLEU-1 against ``label``.

  Args:
    text: source sentence to translate.
    label: reference translation used to score the output.

  Prints the decoded translation and its unigram BLEU score; returns None.
  Relies on the module-level ``tokenizer`` and ``model``.
  """
  token_ids = [tokenizer.bos_token_id] + tokenizer.encode(text) + [tokenizer.eos_token_id]
  # Send the input to wherever the model currently lives rather than to a
  # fixed global device, so the call also works when the model is on CPU.
  input_batch = torch.tensor(token_ids)[None, :].to(model.device)
  out = model.generate(input_ids = input_batch)
  # Strip special tokens so they neither clutter the printed text nor count
  # as words in the BLEU computation (consistent with compute_metrics).
  result = tokenizer.decode(out[0], skip_special_tokens=True)

  print("번역 결과: ", result)

  hypothesis = result.split()
  references = [label.split()]
  bleu = sentence_bleu(references=references, hypothesis=hypothesis, weights=(1, 0, 0, 0))

  print("BLEU score", bleu)

In [None]:
# Smoke-test the fine-tuned model on a single sentence.
# NOTE(review): the English reference below does not look like a translation of
# the Korean source (which is about one person having to wait), so the BLEU
# printed here is likely not meaningful - confirm the intended pair.
infer("아쉽지만 그러면 한 명은 기다려야 할 것 같네요.", "This room stinks of cigarette smells. I want to change rooms.")