In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
import pandas as pd
import urllib.request
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import re

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Define the dataset class
class ChatbotDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = pd.read_csv(file_path)
        self.data = self.data.iloc[:len(self.data)//2]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['Q']
        response_text = self.data.iloc[idx]['A']
        
        # Tokenize the input and response text
        input_tokens = self.tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
        response_tokens = self.tokenizer.encode(response_text, truncation=True, padding='max_length', max_length=128)
        
        return {'input_ids': input_tokens, 'attention_mask': [1] * len(input_tokens), 'decoder_input_ids': response_tokens[:-1], 'decoder_attention_mask': [1] * len(response_tokens[:-1]), 'labels': response_tokens[1:]}

In [5]:
# Define the collate function
def collate_fn(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    decoder_input_ids = torch.tensor([item['decoder_input_ids'] for item in batch])
    decoder_attention_mask = torch.tensor([item['decoder_attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'decoder_input_ids': decoder_input_ids, 'decoder_attention_mask': decoder_attention_mask, 'labels': labels}

In [6]:
# Load the dataset and tokenizer
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
file_path = 'ChatBotData.csv'

In [7]:
tokenizer_name = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
dataset = ChatbotDataset(file_path, tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [9]:
# Initialize the model and training arguments
model_name = 'facebook/bart-large'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [10]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir='./results',
#     num_train_epochs=10,
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     evaluation_strategy = "steps",
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     load_best_model_at_end=True,
#     logging_steps=100,
#     eval_steps=500,
#     save_steps=500,
#     learning_rate=1e-4
# )
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy = "steps",
    eval_steps = 100,
    save_steps = 500,
    num_train_epochs = 10,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    warmup_steps = 500,
    learning_rate = 5e-5,
    logging_dir='./logs',
    logging_steps=1000,
    dataloader_num_workers=4,
    run_name = 'run_name',
    load_best_model_at_end=True, # set load_best_model_at_end to True
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [11]:
# Define early stopping callback
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)

In [12]:
# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    callbacks=[early_stopping_callback]
)

In [13]:
# Train the model
trainer.train()



Step,Training Loss,Validation Loss
100,No log,5.025115
200,No log,1.821499
300,No log,0.443696
400,No log,0.420549
500,No log,0.4003
600,No log,0.362727
700,No log,0.348425
800,No log,0.384596
900,No log,0.316419
1000,1.778600,0.310935




TrainOutput(global_step=1800, training_loss=1.1052657063802083, metrics={'train_runtime': 3688.2847, 'train_samples_per_second': 12.819, 'train_steps_per_second': 0.803, 'total_flos': 7788573939990528.0, 'train_loss': 1.1052657063802083, 'epoch': 6.08})

In [14]:
# Test the model
input_text = "안녕하세요~"
input_tokens = tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
input_ids = torch.tensor([input_tokens]).to(device)
attention_mask = torch.tensor([[1] * len(input_tokens)])
generated_ids = model.generate(input_ids)



In [19]:
def test_chatbot(sentence : str):
  
  # Tokenize test sentence(s)
  inputs = tokenizer.encode(sentence, return_tensors="pt").to(device)

  # Generate output
  outputs = model.generate(inputs, max_length=128, num_beams=4, early_stopping=True)
  output_str = tokenizer.decode(outputs[0], skip_special_tokens=True)

  # Print output
  print("Input: ", test_sentence)
  print("Output: ", output_str)

In [20]:
test_chatbot('뭐라고 부를까요?')

Input:  뭐라고 부를까요?
Output:  자신의 말해보세요.


추후 성능 개선을 위해 시도해볼 것.