In [None]:
from google.colab import drive

# Mount Google Drive so the dataset and checkpoint paths under
# /content/drive used by later cells are reachable.
drive.mount(mountpoint='/content/drive')

In [None]:
import locale

# Force UTF-8 as the reported preferred encoding for this session.
# The replacement keeps the standard `do_setlocale` parameter so that callers
# using the documented signature, e.g. `locale.getpreferredencoding(False)`,
# keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# 필요한 모듈 설치

In [None]:
 !pip install accelerate>=0.20.1
 !pip install transformers
 !pip install pytorch-lightning

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    BartConfig
)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import random

# 데이터 로드 - 전체 데이터 9:1

In [None]:
# Location of the TSV splits and the sample-count suffix used in their names.
data_path = '/content/drive/MyDrive/final_models/요약/data/data_100_1000/'
data_size = 100

# Read the train and validation splits (tab-separated) in one pass;
# the file names differ only in the split tag.
train_df, valid_df = (
    pd.read_csv(f'{data_path}summary_{split}_dataset_{data_size}.tsv', sep='\t')
    for split in ('train', 'val')
)

# 모델, tokenizer 설정

In [None]:
# Hub identifier of the pretrained checkpoint; the tokenizer and the seq2seq
# model are both loaded from it.
model_name = 'digit82/kobart-summarization'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)

# 데이터 전처리
- target 데이터에 bos, eos 붙이기
- max_length : 1024

In [None]:
class CustomDataset(Dataset):
    """Tokenises (source text, summary) rows of a DataFrame for seq2seq training.

    Column 0 of the DataFrame is read as the source text and column 1 as the
    target summary (positional access, column names are not used).

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` when called with ``padding``,
            ``max_length`` and ``truncation`` keyword arguments.
        max_length: Length every sequence is padded / truncated to.
        mask_label_padding: When True (default), padded label positions are
            replaced by ``IGNORE_INDEX`` so they do not contribute to the
            training loss. Pass False to keep the raw pad token ids.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, max_length=1024, mask_label_padding=True):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.mask_label_padding = mask_label_padding

    def _encode(self, text):
        """Tokenise ``text`` padded/truncated to ``self.max_length``."""
        return self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
        )

    def get_inputs(self, df):
        """Build the training examples for every row of ``df``.

        Args:
            df: DataFrame whose column 0 holds source texts and column 1 the
                target summaries.

        Returns:
            list[dict]: One dict per row with ``input_ids``,
            ``attention_mask`` and ``labels`` as ``torch.LongTensor``.
        """
        input_pairs = []

        for input_text, target_text in zip(df.iloc[:, 0], df.iloc[:, 1]):
            # Tokenise the source once and reuse both outputs.
            source = self._encode(input_text)
            # The target is wrapped in explicit bos/eos markers before tokenising.
            target = self._encode('<s>' + target_text + '</s>')

            labels = target["input_ids"]
            if self.mask_label_padding:
                # Use the target's own attention mask to find padding, so a
                # real token that happens to share the pad id is never masked.
                labels = [
                    token_id if is_real else self.IGNORE_INDEX
                    for token_id, is_real in zip(labels, target["attention_mask"])
                ]

            input_pairs.append({
                'input_ids': torch.LongTensor(source["input_ids"]),
                'attention_mask': torch.LongTensor(source["attention_mask"]),
                'labels': torch.LongTensor(labels),
            })

        return input_pairs

    def get_input_ids(self, df):
        """Tokenise only the source texts (column 0) of ``df``.

        Only column 0 is read, so frames without a summary column work too.

        Returns:
            torch.Tensor: Integer tensor with one row of ``max_length`` token
            ids per DataFrame row.
        """
        input_ids_list = [self._encode(text)["input_ids"] for text in df.iloc[:, 0]]
        return torch.tensor(input_ids_list, dtype=torch.long)

In [None]:
# One shared tokenising helper (default max_length) for the whole notebook.
custom_dataset = CustomDataset(tokenizer=tokenizer)

# Tokenise the train and validation frames into lists of model-ready examples.
dataset_train, dataset_val = (
    custom_dataset.get_inputs(frame) for frame in (train_df, valid_df)
)

In [None]:
# Sanity check: number of training examples, then the first example.
print('dataset_train len : {}'.format(len(dataset_train)))
dataset_train[0]

In [None]:
# Sanity check: number of validation examples, then the first example.
print('dataset_val len : {}'.format(len(dataset_val)))
dataset_val[0]

# Fine-Tuning
- decoder_layers : 디코더 레이어 개수
- lm_head : 마지막 linear 레이어

In [None]:
class CustomModel(nn.Module):
    """Wraps a pretrained seq2seq model for decoder-only fine-tuning and exact-match scoring.

    The encoder parameters are frozen at construction time; ``train`` runs a
    ``Seq2SeqTrainer`` on the wrapped model, ``generate`` decodes summaries and
    ``test_accuracy`` reports the fraction of generated summaries that are
    string-identical to the reference.

    Args:
        custom_dataset: Object providing ``get_inputs(df)``,
            ``get_input_ids(df)`` and a ``tokenizer`` attribute.
        model_name: Checkpoint name/path passed to ``from_pretrained``.
        decoder_layers: Value written to ``config.decoder_layers`` before the
            model is instantiated.
        dropout: Optional value written to ``config.dropout`` before the model
            is instantiated; ``None`` keeps the checkpoint's setting.
    """

    def __init__(self, custom_dataset, model_name, decoder_layers=6, dropout=None):
        super().__init__()

        self.custom_dataset = custom_dataset

        config = BartConfig.from_pretrained(model_name)
        config.decoder_layers = decoder_layers
        # Dropout has to be set on the config *before* the model is built:
        # assigning an attribute on the finished model would not reach the
        # layers. `is not None` lets an explicit 0.0 through.
        if dropout is not None:
            config.dropout = dropout

        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)

        # Freeze the encoder so only the remaining parameters are updated.
        # NOTE(review): if the checkpoint ties token embeddings between
        # encoder and decoder, this presumably freezes them too - confirm.
        for param in self.model.get_encoder().parameters():
            param.requires_grad = False

        # Bookkeeping for test_accuracy(); -1 means "not run yet".
        self.batch_size = -1
        self.batch_cnt = -1

        # [generated sentences, reference sentences] for every mismatch.
        self.test_result = [[], []]

    def train(self, training_args=True, train_df=None, valid_df=None):
        """Fine-tune the wrapped model, or toggle train/eval mode.

        This method shadows ``nn.Module.train``. To keep ``.eval()`` /
        ``.train(mode)`` usable on the wrapper, a boolean first argument is
        forwarded to ``nn.Module.train``.

        Args:
            training_args: ``Seq2SeqTrainingArguments`` for a fine-tuning run,
                or a bool to behave like ``nn.Module.train(mode)``.
            train_df: DataFrame with the training rows.
            valid_df: DataFrame with the validation rows.

        Returns:
            ``self`` when used as a mode switch, otherwise ``None``.

        Raises:
            TypeError: If a fine-tuning run is requested without both frames.
        """
        if isinstance(training_args, bool):
            return super().train(training_args)

        if train_df is None or valid_df is None:
            raise TypeError("train() needs both train_df and valid_df for fine-tuning")

        train_dataset = self.custom_dataset.get_inputs(train_df)
        valid_dataset = self.custom_dataset.get_inputs(valid_df)

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
        )

        trainer.train()

    def generate(self, df, max_length=68):
        """Generate one decoded sentence per row of ``df``.

        Args:
            df: DataFrame whose column 0 holds the source texts.
            max_length: ``max_length`` forwarded to the model's ``generate``.

        Returns:
            list[str]: Decoded outputs with special tokens removed.
        """
        input_ids = self.custom_dataset.get_input_ids(df)

        # Make sure dropout is disabled while decoding.
        self.model.eval()
        # Put the inputs wherever the model's weights currently are instead of
        # relying on a notebook-level `device` variable.
        model_device = next(self.model.parameters()).device
        outputs = self.model.generate(input_ids.to(model_device), max_length=max_length)

        # Decode with the same tokenizer that produced the inputs.
        decode = self.custom_dataset.tokenizer.decode
        return [decode(output, skip_special_tokens=True) for output in outputs]

    def test_accuracy(self, df, batch_size=100):
        """Exact-match accuracy of generated summaries over ``df``.

        The frame is processed in row slices of ``batch_size``. The returned
        value is ``matches / len(df)`` over all rows, so a smaller final batch
        does not skew the result. Mismatches are collected in
        ``self.test_result``.

        Args:
            df: DataFrame with source texts in column 0 and a ``summary`` column.
            batch_size: Number of rows generated per batch; must be positive.

        Returns:
            float: Overall accuracy in [0, 1]; 0.0 for an empty frame.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        self.batch_size = batch_size
        self.batch_cnt = 0
        self.test_result = [[], []]

        df_len = len(df)
        for start in range(0, df_len, batch_size):
            self._test_accuracy(df[start:start + batch_size])

        # Every mismatch was appended to test_result, so the rest matched.
        matched = df_len - len(self.test_result[0])
        performance = matched / df_len if df_len else 0.0
        print(f'final performance : {performance}')

        return performance

    def _test_accuracy(self, df):
        """Score one batch, record its mismatches and return its accuracy."""
        self.batch_cnt += 1

        generated_sentences = self.generate(df)
        target_sentences = list(df['summary'])

        print(f'### {self.batch_cnt} batch start ###')
        matched = 0
        for generated, target in zip(generated_sentences, target_sentences):
            if generated == target:
                matched += 1
            else:
                self.test_result[0].append(generated)
                self.test_result[1].append(target)

        total = len(target_sentences)
        performance = matched / total if total else 0.0
        print(f'{self.batch_cnt} batch performance : {performance}\n')

        return performance

    def return_model(self):
        """Return the wrapped seq2seq model."""
        return self.model

# Model 1
- nothing changed

In [52]:
# Switch to the 1000-sample splits for the experiments below.
# Alternative location kept for reference:
#   '/content/drive/MyDrive/final_models/요약/data/'
data_path = '/content/drive/MyDrive/final_models/요약/data/data_100_1000/'
data_size = 1000

# Read the train and validation splits (tab-separated); the file names differ
# only in the split tag.
train_df, valid_df = (
    pd.read_csv(f'{data_path}summary_{split}_dataset_{data_size}.tsv', sep='\t')
    for split in ('train', 'val')
)

한 번에 에폭 다 돌고 성능 측정

In [None]:
# Device name read by code that moves tensors for generation.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CustomModel(custom_dataset, model_name)

model_path = "/content/drive/MyDrive/model/"

num_train_epochs = 100
batch_size = 10
# Evaluate / save / log once, after the last optimisation step of the run.
# Integer arithmetic keeps the step count an int, and deriving it from
# num_train_epochs keeps it in sync if the epoch count is changed.
step = data_size // batch_size * num_train_epochs

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,                   # where checkpoints are written
    overwrite_output_dir=True,               # overwrite existing content of output_dir
    num_train_epochs=num_train_epochs,       # number of training epochs
    per_device_train_batch_size=batch_size,  # batch size for training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    eval_steps=step,                         # update steps between two evaluations
    save_steps=step,                         # update steps between two checkpoints
    logging_steps=step,                      # update steps between two log entries
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
    )

model.train(training_args, train_df, valid_df)

In [None]:
# Exact-match accuracy on the training frame, generated 10 rows at a time.
model.test_accuracy(train_df, batch_size=10)

In [None]:
# Show the mismatches collected by the last test_accuracy() call:
# index 0 holds the generated sentences, index 1 the reference sentences.
model.test_result

에폭 별로 성능 측정

In [53]:
# Device name read by code that moves tensors for generation.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = CustomModel(custom_dataset, model_name)

model_path = "/content/drive/MyDrive/model/"

num_train_epochs = 100
batch_size = 10
step = 500

training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,                   # where checkpoints are written
    overwrite_output_dir=True,               # overwrite existing content of output_dir
    num_train_epochs=num_train_epochs,       # number of training epochs per train() call
    per_device_train_batch_size=batch_size,  # batch size for training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    eval_steps=step,                         # update steps between two evaluations
    save_steps=step,                         # update steps between two checkpoints
    logging_steps=step,                      # update steps between two log entries
    prediction_loss_only=True,
    evaluation_strategy="steps",
    save_total_limit=3
    )

performance_max = 0

# NOTE(review): every model.train() call below runs `num_train_epochs` (100)
# epochs with a freshly built trainer, so each round reported as "epoch" is
# really 100 epochs - confirm whether num_train_epochs should be 1 here.
for round_no in range(1, 11):
    model.train(training_args, train_df, valid_df)
    performance = model.test_accuracy(train_df)
    print(f'{round_no} epoch performance : {performance}')

    # Only announces the best round so far; no checkpoint is tagged here.
    if performance > performance_max:
        performance_max = performance
        print(f'{round_no} epoch is best model\n')

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Print up to ten (generated, target) mismatches from the last accuracy run.
# Slicing instead of indexing 0..9 avoids an IndexError when fewer than ten
# mismatches were recorded.
generated_sentences, target_sentences = model.test_result
for generated, target in zip(generated_sentences[:10], target_sentences[:10]):
  print(f'gen : {generated}')
  print(f'tar : {target}\n')

# 모델 학습 재개

In [None]:
# 모델 초기화
model_name = 'digit82/kobart-summarization'
model1 = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 모델 가중치 로드
model1.load_state_dict(torch.load("/content/drive/MyDrive/model/base_model_10000.pt/pytorch_model.bin"))