In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# 필요한 모듈 install

In [3]:
 !pip install accelerate>=0.20.1
 !pip install transformers
 !pip install pytorch-lightning
 !pip install sentencepiece
 !pip install sacremoses

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, safetensors, transformers
Successfully installed safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.2
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.0.9-py3-none-any.whl (727 k

In [4]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    MarianTokenizer,
    MarianMTModel,
    MarianConfig,
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import random

# 데이터 로드 - 전체 데이터 9:1

In [8]:
data_path = '/content/drive/MyDrive/final_models/번역/data/data_100_1000/'
data_size = 1000

train_df = pd.read_csv(f'{data_path}df_{data_size}_train.csv')
valid_df = pd.read_csv(f'{data_path}df_{data_size}_valid.csv')

In [9]:
train_df.shape, valid_df.shape

((1000, 2), (100, 2))

In [10]:
train_df.head(), valid_df.head()

(                                                  원문  \
 0  국립환경과학원 대기질통합예보센터는 5일 수도권·강원 영서·충청권·전북에서 미세먼지 ...   
 1  그는 “RSK처럼 유명 메인넷임에도 한국 거래소에 많이 상장되지 않은 프로젝트들이 ...   
 2  유 작가는 진술서를 쓰다가 군법회의에서 공소기각 선고를 받고 풀려났으나, 영장이 나...   
 3  연구소 측은 “국내외 경제 이슈로 ‘시장 환경 변화에 신속히 대응할 수 있도록 유동...   
 4  버지, 엔가젯 등 IT외신은 이 또한 임시방편에 불과하며, 구글이 장차 크롬에서 파...   
 
                                                  번역문  
 0  The National Institute of Environmental Resear...  
 1  "There are some projects that are not listed o...  
 2  Writer Yoo was released after receiving an ind...  
 3  The research institute said, “As a result of d...  
 4  IT foreign media such as Burgie and Engadget a...  ,
                                                   원문  \
 0  이어 “골목에 있는 휴대전화 매장에서 거의 15분 안에 이 문제로 통신사 바꾸러 온...   
 1                         그 다음 한번 더 벗겨내는 과정이 5분도미이다.   
 2  그 뒤에는 ‘린킨 파크도 찜한 레전드 팀들의 탄생’이라는 자막이 등장해, 3회부터 ...   
 3            커뮤니티에서 할아버지의 무대가 화제되자 KBS는 전체 영상을 공개했다.   
 4  금융위는 “IFRS 관련 국내 기업이 가지는 어려움을 

# 모델, tokenizer 설정

In [11]:
model_name = "Helsinki-NLP/opus-mt-ko-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

# 데이터 전처리
- target 데이터에 bos, eos 붙이기
- max_length : 512

In [12]:
class CustomDataset(Dataset):
    def __init__(self, tokenizer, max_length=512):
        self.max_length = max_length
        self.tokenizer = tokenizer

    def get_inputs(self, df):
        input_pairs = []

        for i in range(len(df)):
            input_text = df.iloc[i, 0]
            target_text = df.iloc[i, 1]

            input_ids = self.tokenizer(input_text, add_special_tokens=True, max_length=self.max_length, padding="max_length", truncation=True)
            target_ids = self.tokenizer(target_text, add_special_tokens=True, max_length=self.max_length, padding="max_length", truncation=True)


            input_pair = {'input_ids' : torch.LongTensor(input_ids['input_ids']),
                    'attention_mask' : torch.LongTensor(input_ids['attention_mask']),
                    'labels' : torch.LongTensor(target_ids['input_ids'])}

            input_pairs.append(input_pair)

        return input_pairs

    def get_input_ids(self, df):
        input_ids_list = []

        for i in range(len(df)):
            input_text = df.iloc[i, 0]
            target_text = df.iloc[i, 1]

            input_ids = self.tokenizer(input_text, padding="max_length", max_length=self.max_length, truncation=True)["input_ids"]
            input_ids_list.append(input_ids)

        return torch.tensor(input_ids_list)

In [13]:
custom_dataset = CustomDataset(tokenizer=tokenizer)

dataset_train = custom_dataset.get_inputs(train_df)
dataset_val = custom_dataset.get_inputs(valid_df)

In [14]:
print(f'dataset_train len : {len(dataset_train)}')
dataset_train[0]

dataset_train len : 1000


{'input_ids': tensor([30976,  2851,  4694,   792,  9096,   742, 37572, 29595, 10696,    55,
           309,   198,  1170,  1881,  4399,  1380,   792,  4272,   212,  4399,
          6714,  3192,  1881,  4399,   503,  6079,   168, 34648, 15569, 35351,
            62,    32,  2112,   140, 57783,    47, 10908,    51,     3,  6865,
         23160,    55,    32,   157, 57783,    47, 10908,    51, 23358,  2775,
             9,  6608,  1397,     2,     0, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65

In [15]:
print(f'dataset_val len : {len(dataset_val)}')
dataset_val[0]

dataset_val len : 100


{'input_ids': tensor([ 6896,    19, 36301,    64,   217,     9, 28493,  2193, 22609,  2150,
           417,   627,  2206,    21, 29388, 12538,   340, 12235,   958,  1281,
           896,   140,  9313,   305,  7281,   875, 19547,    85,     9, 46034,
             2,     0, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000, 65000,
         65000, 65000, 65000, 65000, 65

# Fine-Tuning
- decoder_layers : 디코더 개수
- lm_head : 마지막 linear 레이어

In [16]:
class CustomModel(nn.Module):
  def __init__(self, custom_dataset, model_name, decoder_layers=6, dropout=None):
    super().__init__()

    self.custom_dataset = custom_dataset

    config = MarianConfig.from_pretrained(model_name)

    config.decoder_layers = decoder_layers

    self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config = config)

    if dropout:
      self.model.dropout = dropout

    # encoder freezing
    for param in self.model.get_encoder().parameters():
        param.requires_grad = False

    self.batch_size = -1
    self.batch_cnt = -1

    self.test_result = [[],[]]

  def train(self, training_args, train_df, valid_df):
      train_dataset = self.custom_dataset.get_inputs(train_df)
      valid_dataset = self.custom_dataset.get_inputs(valid_df)

      trainer = Seq2SeqTrainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
      )

      trainer.train()

  def generate(self, df):
      input_ids = self.custom_dataset.get_input_ids(df)
      outputs = self.model.generate(input_ids.to(device), max_length=512)

      output_sentences = []
      for output in outputs:
          output_sentences.append(tokenizer.decode(output, skip_special_tokens=True))

      return output_sentences

  def test_accuracy(self, df, batch_size=100):
      self.batch_size = batch_size
      self.batch_cnt = 0
      self.test_result = [[], []]

      df_len = len(df)
      start,end = 0,self.batch_size

      performance = 0
      while 1:
          if start >= df_len:
              break

          end = min(end, df_len)
          performance += self._test_accuracy(df[start:end])

          start = end
          end += self.batch_size

      performance /= self.batch_cnt
      print(f'final performance : {performance}')

      return performance

  def _test_accuracy(self, df):
      self.batch_cnt += 1

      input_sentences = self.generate(df)
      target_sentences = [sentence for sentence in df['번역문']]

      df_len = len(df)
      cnt_prev, cnt, cntO = ((self.batch_cnt-1) * self.batch_size),0,0

      print(f'### {self.batch_cnt} batch start ###')
      for i in range(df_len) :
          cnt += 1
          cnt_global = cnt_prev + cnt

          input_sentence = input_sentences[i]
          target_sentence = target_sentences[i]

          if input_sentence == target_sentence :
              cntO += 1
          else :
              self.test_result[0].append(input_sentence)
              self.test_result[1].append(target_sentence)

          # if cnt % 100 == 0:
          #     print(f'{cnt_global} generated')
      # if cnt % 100 != 0:
      #     print(f'{cnt_global} generated')

      performance = cntO/cnt
      print(f'{self.batch_cnt} batch performance : {performance}\n')

      return performance


  def return_model(self):
      return self.model

# Model 1
- nothing changed

In [17]:
data_path = '/content/drive/MyDrive/final_models/번역/data/data_100_1000/'
# data_path = '/content/drive/MyDrive/final_models/번역/data/'
data_size = 1000

train_df = pd.read_csv(f'{data_path}df_{data_size}_train.csv')
valid_df = pd.read_csv(f'{data_path}df_{data_size}_valid.csv')

한 번에 에폭 다 돌고 성능 측정

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CustomModel(custom_dataset, model_name)
# model.return_model()

model_path = "/content/drive/MyDrive/model/"

num_train_epochs = 100
batch_size = 10
step = data_size / batch_size * 100

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=num_train_epochs, # number of training epochs
    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    eval_steps=step, # Number of update steps between two evaluations.
    save_steps=step, # after # steps model is saved
    logging_steps=step,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
    )

model.train(training_args, train_df, valid_df)

Step,Training Loss,Validation Loss
10000,0.1142,0.493613


In [None]:
model.test_accuracy(train_df, 10)

In [None]:
model.test_result

In [None]:
# torch.save(model, "/content/drive/MyDrive/model/trans_model_1000_100_10.pt")

에폭 별로 성능 측정

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CustomModel(custom_dataset, model_name)

model.return_model()

model_path = "/content/drive/MyDrive/model/"

num_train_epochs = 100
batch_size = 10
step = 500

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path, #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=num_train_epochs, # number of training epochs
    per_device_train_batch_size=batch_size, # batch size for training
    per_device_eval_batch_size=batch_size,  # batch size for evaluation
    eval_steps=step, # Number of update steps between two evaluations.
    save_steps=step, # after # steps model is saved
    logging_steps=step,
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
    )

performance_max = 0

for i in range(10):
    model.train(training_args, train_df, valid_df)
    performance = model.test_accuracy(train_df)
    print(f'{i+1} epoch performance : {performance}')

    if performance_max < performance :
        performance_max = performance
        print(f'{i+1} epoch is best model\n')

In [None]:
for i in range(10) :
  print(f'gen : {model.test_result[0][i]}')
  print(f'tar : {model.test_result[1][i]}\n')

# 모델 학습 재개

In [None]:
# 모델 초기화
model_name = "Helsinki-NLP/opus-mt-ko-en"
model1 = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 모델 가중치 로드
model1.load_state_dict(torch.load("/content/drive/MyDrive/model/base_model_10000.pt/pytorch_model.bin"))