In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

In [3]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))
print(tokenizer.decode(3))
print(tokenizer.decode(4))

In [4]:
import pandas as pd
import tqdm
import urllib.request

In [5]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')

In [7]:
len(train_data)

11823

In [None]:
train_data.head()

In [10]:
train_data

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
...,...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!,2
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.,2
11820,흑기사 해주는 짝남.,설렜겠어요.,2
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.,2


In [6]:
batch_size = 32

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    def __init__(self, train_data, tokenizer):
        self.train_data = train_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, idx):
        question = self.train_data.Q.iloc[idx]
        answer = self.train_data.A.iloc[idx]
        bos_token = self.tokenizer.bos_token_id
        eos_token = self.tokenizer.eos_token_id
        sent = self.tokenizer.encode('<usr>' + question + '<sys>' + answer, add_special_tokens=False)
        return torch.tensor([bos_token] + sent + [eos_token], dtype=torch.long)

def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)

batch_size = 32
chat_dataset = ChatDataset(train_data, tokenizer)
data_loader = DataLoader(chat_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [13]:
len(chat_dataset)

11823

In [14]:
chat_dataset[0]

tensor([    1,     2,  9349,  7888,   739,  7318,   376,     4, 12557,  6824,
         9108,  9028,  7098, 25856,     1])

In [None]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

In [8]:
import tqdm

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, eps=1e-08)

steps = len(train_data) // batch_size + 1
print(steps)

EPOCHS = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

370


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [9]:
for epoch in range(EPOCHS):
    epoch_loss = 0

    for batch in tqdm.tqdm(data_loader, total=steps):
        batch = batch.to(device)
        labels = batch.clone()
        optimizer.zero_grad()
        result = model(input_ids=batch, labels=labels)
        loss = result.loss
        batch_loss = loss.mean()

        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item() / steps

    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

100%|██████████| 370/370 [01:41<00:00,  3.63it/s]


[Epoch:    1] cost = 2.12702096


100%|██████████| 370/370 [01:40<00:00,  3.69it/s]


[Epoch:    2] cost = 1.69815452


100%|██████████| 370/370 [01:42<00:00,  3.61it/s]

[Epoch:    3] cost = 1.37507791





In [10]:
text = '오늘도 좋은 하루!'

sent = '<usr>' + text + '<sys>'

input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)

output = model.generate(input_ids, max_length=50, early_stopping=True, eos_token_id=tokenizer.eos_token_id)

In [11]:
output

tensor([[    1,     2, 10070,  7235, 10586, 12557,   376,     4, 10586, 12557,
          7098, 25856,     1]], device='cuda:0')

In [12]:
decoded_sentence = tokenizer.decode(output[0].tolist())

In [13]:
decoded_sentence.split('<sys> ')[1].replace('</s>', '')

'좋은 하루네요.'

In [14]:
def return_answer_by_chatbot(user_text):
    sent = '<usr>' + user_text + '<sys>'
    input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent, add_special_tokens=False)
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    output = model.generate(input_ids, max_length=50, do_sample=True, top_k=2)
    sentence = tokenizer.decode(output[0].tolist())
    chatbot_response = sentence.split('<sys> ')[1].replace('</s>', '')
    return chatbot_response

In [20]:
return_answer_by_chatbot('안녕! 반가워~')

'짝사랑은 영원할거예요.'

In [19]:
return_answer_by_chatbot('너 딥 러닝 잘해?')

'잘할 거예요.'

In [None]:
https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb