In [1]:
!nvidia-smi

Wed Apr 12 06:54:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import torch

print("Torch version:{}".format(torch.__version__))
print("cuda version: {}".format(torch.version.cuda))
print("cudnn version:{}".format(torch.backends.cudnn.version()))

Torch version:2.0.0+cu118
cuda version: 11.8
cudnn version:8700


In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [5]:
import pandas as pd
import urllib.request

from torch.utils.data import Dataset, DataLoader
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, AdamW

In [6]:
from transformers import PreTrainedTokenizerFast, BartModel, BartForConditionalGeneration

checkpoint_name = 'gogamza/kobart-base-v2'
tokenizer = PreTrainedTokenizerFast.from_pretrained(checkpoint_name)
model = BartForConditionalGeneration.from_pretrained(checkpoint_name)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [7]:
# Load the dataset
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
file_path = 'ChatBotData.csv'

In [8]:
data = pd.read_csv('ChatBotData.csv',encoding='utf-8-sig')

In [9]:
data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [10]:
class ChatbotDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        self.data = pd.read_csv(file_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_text = self.data.iloc[idx]['Q']
        response_text = self.data.iloc[idx]['A']
        
        # Tokenize the input and response text
        input_tokens = self.tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
        response_tokens = self.tokenizer.encode(response_text, truncation=True, padding='max_length', max_length=128)
        
        return {'input_ids': input_tokens, 'attention_mask': [1] * len(input_tokens), 'decoder_input_ids': response_tokens[:-1], 'decoder_attention_mask': [1] * len(response_tokens[:-1]), 'labels': response_tokens[1:]}

In [11]:
train_dataset = ChatbotDataset('ChatBotData.csv', tokenizer)
data_collator = lambda data: {'input_ids': [item['input_ids'] for item in data], 
                              'attention_mask': [item['attention_mask'] for item in data],
                              'decoder_input_ids': [item['decoder_input_ids'] for item in data],
                              'decoder_attention_mask': [item['decoder_attention_mask'] for item in data],
                              'labels': [item['labels'] for item in data]}

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)

In [12]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
optimizer = AdamW(model.parameters(), lr=1e-5)
model.to(device)
model.train()



BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(30000, 768, padding_idx=3)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(30000, 768, padding_idx=3)
      (embed_positions): BartLearnedPositionalEmbedding(1028, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=

In [14]:
from tqdm import tqdm

In [15]:
from torch import nn

In [16]:
criterion = nn.CrossEntropyLoss(ignore_index=model.config.pad_token_id)

In [18]:
epochs = 10

for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(train_dataloader):
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)
        decoder_input_ids = torch.tensor(batch['decoder_input_ids']).to(device)
        decoder_attention_mask = torch.tensor(batch['decoder_attention_mask']).to(device)
        labels = torch.tensor(batch['labels']).to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask)
        logits = outputs.logits
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch {epoch + 1} loss: {epoch_loss / len(train_dataloader):.5f}')

100%|██████████| 1478/1478 [06:31<00:00,  3.77it/s]


Epoch 1 loss: 2.88389


100%|██████████| 1478/1478 [06:31<00:00,  3.78it/s]


Epoch 2 loss: 2.50868


100%|██████████| 1478/1478 [06:30<00:00,  3.78it/s]


Epoch 3 loss: 2.16618


100%|██████████| 1478/1478 [06:30<00:00,  3.78it/s]


Epoch 4 loss: 1.86862


100%|██████████| 1478/1478 [06:30<00:00,  3.78it/s]


Epoch 5 loss: 1.58056


100%|██████████| 1478/1478 [06:30<00:00,  3.78it/s]


Epoch 6 loss: 1.30728


100%|██████████| 1478/1478 [06:31<00:00,  3.78it/s]


Epoch 7 loss: 1.06955


100%|██████████| 1478/1478 [06:30<00:00,  3.78it/s]


Epoch 8 loss: 0.86804


100%|██████████| 1478/1478 [06:31<00:00,  3.78it/s]


Epoch 9 loss: 0.69052


100%|██████████| 1478/1478 [06:31<00:00,  3.78it/s]

Epoch 10 loss: 0.54549





In [None]:
torch.save(model.state_dict(), 'bart_chatbot.pt')

In [26]:
model.eval()

def conversations():
  input_text = input('User: ')
  input_tokens = tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=128)
  input_ids = torch.tensor([input_tokens]).to(device)
  output_tokens = model.generate(input_ids, max_length=32, num_beams=4, early_stopping=True)
  output_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
  print(f'ChatBot: {output_text}')

In [27]:
conversations()

User: 안녕~
ChatBot: 안녕 안녕 안녕 안녕 안녕 안녕 안녕~~~~~~~~~~~~~~~~~~~~~~~


In [28]:
conversations()

User: 너 이름이 뭐야?
ChatBot: 심심 이름이 이름이 뭐 뭐심심심 자기 이름이 이름이 너무에에에에 너 이름이 이름이 너무 상 상에에에에 자기 이름이 이름이


In [29]:
conversations()

User: 저거 보여?
ChatBot: 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여 보여


추후 성능 개선을 위해 시도해볼 것.

질문에 대한 응답의 내용만을 봤을 때 나쁘지 않지만 동일하게 계속 반복해서 말하는 경향을 보임(최대 길이까지). 