In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pandas as pd
import urllib.request
import tensorflow_datasets as tfds
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import re

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Define the dataset class
class ChatbotDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = pd.read_csv(file_path)
        self.data = self.data.iloc[:len(self.data)//2]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['Q']
        response_text = self.data.iloc[idx]['A']
        
        # Tokenize the input and response text
        input_tokens = self.tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
        response_tokens = self.tokenizer.encode(response_text, truncation=True, padding='max_length', max_length=128)
        
        return {'input_ids': input_tokens, 'attention_mask': [1] * len(input_tokens), 'decoder_input_ids': response_tokens[:-1], 'decoder_attention_mask': [1] * len(response_tokens[:-1]), 'labels': response_tokens[1:]}

In [5]:
# Define the collate function
def collate_fn(batch):
    input_ids = torch.tensor([item['input_ids'] for item in batch])
    attention_mask = torch.tensor([item['attention_mask'] for item in batch])
    decoder_input_ids = torch.tensor([item['decoder_input_ids'] for item in batch])
    decoder_attention_mask = torch.tensor([item['decoder_attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch])

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'decoder_input_ids': decoder_input_ids, 'decoder_attention_mask': decoder_attention_mask, 'labels': labels}

In [6]:
# Load the dataset and tokenizer
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
file_path = 'ChatBotData.csv'

In [27]:
tokenizer_name = 'facebook/bart-large'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
dataset = ChatbotDataset(file_path, tokenizer)

In [8]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [12]:
# Initialize the model and training arguments
model_name = 'facebook/bart-large'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=500,
    save_steps=500,
    learning_rate=1e-4
)

In [14]:
# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn
)

In [15]:
# Train the model
trainer.train()



Step,Training Loss
100,9.5768
200,2.496
300,0.5221
400,0.6356
500,0.488
600,0.4389
700,0.4097
800,0.461
900,0.4062
1000,0.3823


TrainOutput(global_step=11820, training_loss=0.3224736106980674, metrics={'train_runtime': 7600.7325, 'train_samples_per_second': 6.22, 'train_steps_per_second': 1.555, 'total_flos': 1.280758819848192e+16, 'train_loss': 0.3224736106980674, 'epoch': 10.0})

In [25]:
# Test the model
input_text = "안녕하세요~"
input_tokens = tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
input_ids = torch.tensor([input_tokens]).to(device)
attention_mask = torch.tensor([[1] * len(input_tokens)])
generated_ids = model.generate(input_ids)



In [26]:
tokenizer.decode(generated_ids[0])

'</s><s>��할 수 있을 거�</s>'

#### 제대로 된 ChatBot Fine-tuning을 위해서 추후 수정을 계속 할 예정.