In [1]:
import torch
import transformers
import torch.nn as nn
from torch.utils.data import Dataset
from datasets import load_dataset
from copy import deepcopy
from torch.optim import Adam
from transformers import BartTokenizer
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from transformers import PreTrainedTokenizerFast
from transformers import BartForConditionalGeneration, BartConfig
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TranslationDataset(Dataset):
  def __init__(self, df, tokenizer, max_len, ignore_index=-100, verbose=True):
    super().__init__()
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.df = df
    self.len = len(self.df)
    self.pad_index = self.tokenizer.pad_token_id
    self.ignore_index = ignore_index

  def add_padding_data(self, inputs):
    if len(inputs) < self.max_len:
      pad = np.array([self.pad_index] * (self.max_len - len(inputs)))
      inputs = np.concatenate([inputs, pad])
    else:
      inputs = inputs[:self.max_len]
    return inputs

  def add_ignored_data(self, inputs):
    if len(inputs) < self.max_len:
      pad = np.array([self.ignore_index] * (self.max_len - len(inputs)))
      inputs = np.concatenate([inputs, pad])
    else:
      inputs = inputs[:self.max_len]

    return inputs

  def __getitem__(self, idx, verbose=True):
    instance = self.df.iloc[idx]
    input_ids = self.tokenizer.encode(instance['원문'])
    input_ids = np.append(input_ids, self.tokenizer.eos_token_id)
    input_ids = self.add_padding_data(input_ids)
    input_ids = np.insert(input_ids, 0, self.tokenizer.bos_token_id)

    label_ids = self.tokenizer.encode(instance['번역문'])
    label_ids.append(self.tokenizer.eos_token_id)
    label_ids.insert(0, self.tokenizer.bos_token_id)

    dec_input_ids = [self.tokenizer.eos_token_id]
    dec_input_ids += label_ids[:-1]
    dec_input_ids = self.add_padding_data(dec_input_ids)
    label_ids = self.add_ignored_data(label_ids)

    input_ids = torch.tensor(np.array(input_ids)).long()
    decoder_input_ids = torch.tensor(np.array(dec_input_ids)).long()

    attention_mask = input_ids.ne(self.tokenizer.pad_token_id).float()

    return {'input_ids': input_ids,
            #'attention_mask': input_ids.ne(self.tokenizer.pad_token_id).float(),
            'decoder_input_ids': decoder_input_ids,
            # 'decoder_attention_mask': decoder_input_ids.ne(self.tokenizer.pad_token_id).float(),
            'labels': np.array(label_ids, dtype = np.int_)}

  def __len__(self):
    return self.len

In [3]:
def compute_metrics(pred):
  preds, labels = pred

  preds = tokenizer.batch_decode(preds, skip_special_tokens = True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

  print("원문: ", val['원문'][0])
  print("번역 정답", labels[0])
  print("번역 결과: ", preds[0])

  reference = preds[0].split()
  candidate = []
  candidate.append(labels[0].split())
  bleu = sentence_bleu(references = candidate, hypothesis=reference, weights=(1, 0, 0, 0))
  return {"BLEU score": bleu }

In [4]:
lr = 3e-5
stop = 3
epoch = 10
batch = 4
seed = 42
device = 'cuda'

In [5]:
train = pd.read_csv("english_korean_data/train_small.csv", encoding="cp949")
val = pd.read_csv("english_korean_data/test_open.csv", encoding="cp949")
train_dataset = TranslationDataset(train, PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1'), 256)
val_dataset = TranslationDataset(val, PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1'), 256)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained("gogamza/kobart-base-v1")
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v1")
collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id = tokenizer.pad_token_id)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [7]:

tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v1")


config = BartConfig.from_pretrained("gogamza/kobart-base-v1")


config.encoder_embed_dim = 768  
config.encoder_embed_path = None


encoder_embedding = torch.nn.Embedding(config.vocab_size, config.encoder_embed_dim)


original_model = BartForConditionalGeneration.from_pretrained("gogamza/kobart-base-v1")


original_model.model.encoder.embed_tokens = encoder_embedding


model = original_model

collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = transformers.get_cosine_schedule_with_warmup(optimizer = optimizer,
                                                        num_warmup_steps = 100,
                                                        num_training_steps = epoch * len(train_dataset) * batch,
                                                        last_epoch = -1)

In [9]:
args = Seq2SeqTrainingArguments(run_name = "KoBART_translator",
                                output_dir = "./BART_translator_2",
                                evaluation_strategy="steps",
                                eval_steps = 100,
                                save_steps = 100,
                                save_total_limit=2,

                                per_device_train_batch_size= batch,
                                per_device_eval_batch_size = batch,
                                gradient_accumulation_steps = 16,
                                num_train_epochs = epoch,

                                load_best_model_at_end = True,
                                #fp16=True,
                                do_train=True,
                                do_eval=True,
                                predict_with_generate=True,)

trainer = Seq2SeqTrainer(model = model,
                        tokenizer = tokenizer,
                        args = args,
                        train_dataset = train_dataset,
                        eval_dataset = val_dataset,
                        compute_metrics = compute_metrics,
                        optimizers = (optimizer, lr_scheduler),
                        data_collator = collator,)

In [10]:
trainer.train()

  0%|          | 0/1560 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
                                                  
  6%|▋         | 100/1560 [04:22<25:15,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih there are are are are
{'eval_loss': 2.2680821418762207, 'eval_BLEU score': 0, 'eval_runtime': 158.5952, 'eval_samples_per_second': 63.054, 'eval_steps_per_second': 15.763, 'epoch': 0.64}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
                                                  
 13%|█▎        | 200/1560 [08:47<23:37,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih it is an ordered it is
{'eval_loss': 2.127504348754883, 'eval_BLEU score': 0.06993452279385044, 'eval_runtime': 158.8315, 'eval_samples_per_second': 62.96, 'eval_steps_per_second': 15.74, 'epoch': 1.28}


                                                     
 19%|█▉        | 300/1560 [13:10<21:51,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih there are only only 
{'eval_loss': 2.0682413578033447, 'eval_BLEU score': 0, 'eval_runtime': 158.2308, 'eval_samples_per_second': 63.199, 'eval_steps_per_second': 15.8, 'epoch': 1.92}


                                                     
 26%|██▌       | 400/1560 [17:32<20:05,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih it's a lot of color, s
{'eval_loss': 2.0137991905212402, 'eval_BLEU score': 0, 'eval_runtime': 157.2427, 'eval_samples_per_second': 63.596, 'eval_steps_per_second': 15.899, 'epoch': 2.56}


 32%|███▏      | 500/1560 [19:17<18:26,  1.04s/it]   

{'loss': 2.4676, 'learning_rate': 2.9999925941003032e-05, 'epoch': 3.2}


                                                  
 32%|███▏      | 500/1560 [21:55<18:26,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih it's a refund on the ref
{'eval_loss': 1.9806348085403442, 'eval_BLEU score': 0, 'eval_runtime': 157.4413, 'eval_samples_per_second': 63.516, 'eval_steps_per_second': 15.879, 'epoch': 3.2}


                                                     
 38%|███▊      | 600/1560 [26:18<16:41,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih it's a refund on the ref
{'eval_loss': 2.185455083847046, 'eval_BLEU score': 0, 'eval_runtime': 158.183, 'eval_samples_per_second': 63.218, 'eval_steps_per_second': 15.804, 'epoch': 3.84}


                                                     
 45%|████▍     | 700/1560 [30:42<14:56,  1.04s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we have to get a refund on
{'eval_loss': 1.9353324174880981, 'eval_BLEU score': 0, 'eval_runtime': 157.888, 'eval_samples_per_second': 63.336, 'eval_steps_per_second': 15.834, 'epoch': 4.48}


                                                     
 51%|█████▏    | 800/1560 [35:06<13:38,  1.08s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we get a refund on the stor
{'eval_loss': 1.927201747894287, 'eval_BLEU score': 0, 'eval_runtime': 157.623, 'eval_samples_per_second': 63.443, 'eval_steps_per_second': 15.861, 'epoch': 5.12}


                                                     
 58%|█████▊    | 900/1560 [39:33<11:42,  1.06s/it] 

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih it's a bit because I'm
{'eval_loss': 1.917406678199768, 'eval_BLEU score': 0, 'eval_runtime': 158.6617, 'eval_samples_per_second': 63.027, 'eval_steps_per_second': 15.757, 'epoch': 5.76}


 64%|██████▍   | 1000/1560 [41:21<10:02,  1.08s/it] 

{'loss': 1.867, 'learning_rate': 2.9999625077581203e-05, 'epoch': 6.4}


                                                   
 64%|██████▍   | 1000/1560 [43:59<10:02,  1.08s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we can't be able to get 
{'eval_loss': 1.9005603790283203, 'eval_BLEU score': 0, 'eval_runtime': 158.0035, 'eval_samples_per_second': 63.29, 'eval_steps_per_second': 15.822, 'epoch': 6.4}


                                                     
 71%|███████   | 1100/1560 [48:25<08:16,  1.08s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we get a lot of color, s
{'eval_loss': 1.9036136865615845, 'eval_BLEU score': 0, 'eval_runtime': 157.9129, 'eval_samples_per_second': 63.326, 'eval_steps_per_second': 15.832, 'epoch': 7.04}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
                                                   
 77%|███████▋  | 1200/1560 [52:51<06:17,  1.05s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we go to the company set to
{'eval_loss': 1.886002540588379, 'eval_BLEU score': 0.07581633246407919, 'eval_runtime': 157.1868, 'eval_samples_per_second': 63.619, 'eval_steps_per_second': 15.905, 'epoch': 7.68}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
                                                   
 83%|████████▎ | 1300/1560 [57:13<04:29,  1.04s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Ih we go to the company set 
{'eval_loss': 1.904558777809143, 'eval_BLEU score': 0.06993452279385044, 'eval_runtime': 157.3752, 'eval_samples_per_second': 63.542, 'eval_steps_per_second': 15.886, 'epoch': 8.32}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
                                                   
 90%|████████▉ | 1400/1560 [1:01:36<02:46,  1.04s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Then I will give you a refund 
{'eval_loss': 1.8662301301956177, 'eval_BLEU score': 0.13986904558770089, 'eval_runtime': 157.7864, 'eval_samples_per_second': 63.377, 'eval_steps_per_second': 15.844, 'epoch': 8.96}


 96%|█████████▌| 1500/1560 [1:03:22<01:02,  1.04s/it]  

{'loss': 1.7048, 'learning_rate': 2.9999092785685634e-05, 'epoch': 9.6}


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
                                                     
 96%|█████████▌| 1500/1560 [1:06:00<01:02,  1.04s/it]

원문:  너희 아빠랑 이번 주말에 보러 다녀와야겠네 그럼.
번역 정답 Then you should go and watch it with your dad this weekend.
번역 결과:  Then I will get an actual if
{'eval_loss': 1.866457223892212, 'eval_BLEU score': 0.06993452279385044, 'eval_runtime': 158.0103, 'eval_samples_per_second': 63.287, 'eval_steps_per_second': 15.822, 'epoch': 9.6}


100%|██████████| 1560/1560 [1:07:04<00:00,  2.58s/it]

{'train_runtime': 4024.6553, 'train_samples_per_second': 24.847, 'train_steps_per_second': 0.388, 'train_loss': 1.99858518747183, 'epoch': 9.98}





TrainOutput(global_step=1560, training_loss=1.99858518747183, metrics={'train_runtime': 4024.6553, 'train_samples_per_second': 24.847, 'train_steps_per_second': 0.388, 'train_loss': 1.99858518747183, 'epoch': 9.98})

In [11]:
torch.save({
    'epoch': epoch,  # 현재 학습 epoch
    'model_state_dict': model.state_dict(),  # 모델 저장
    'optimizer_state_dict': optimizer.state_dict(),  # 옵티마이저 저장
}, 'translator3.pth')

In [12]:
def infer(text, label):
  tmp = [tokenizer.bos_token_id] + tokenizer.encode(text) + [tokenizer.eos_token_id]
  out = model.generate(input_ids = torch.tensor(tmp)[None, :].to(device))
  result = tokenizer.decode(out[0])

  print("번역 결과: ", result)

  reference = result.split()
  candidate = []
  candidate.append(label.split())
  bleu = sentence_bleu(references=candidate, hypothesis=reference, weights=(1, 0, 0, 0))

  print("BLEU score", bleu)

In [26]:
infer("아쉽지만 그러면 한 명은 기다려야 할 것 같네요.", "This room stinks of cigarette smells. I want to change rooms.")

번역 결과:  </s><s> here is looks, are on</s>
BLEU score 0
