<a href="https://colab.research.google.com/github/pmj-chosim/Commit-Project-2023.1.20-2023.2.28-/blob/main/2023.02.13/nlp/transformerchatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Transformer Chatbot

In [1]:
! pip install sentencepiece 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [2]:
import pandas as pd
import numpy as np
import re
import torch 
import sentencepiece as spm
import sys
from tqdm import tqdm
from torch.nn import Transformer
from torch import nn
import math
from torch.utils.data import Dataset, DataLoader

In [3]:
#Hyperparameter
MAX_LENGTH = 40
BATCH_SIZE = 64
lr = 1e-4
embed_size = 256 
n_head=8
n_hid = 512
n_layer = 2
dropout = 0.1
epoch = 30

#seed
random_seed = 9712
torch.manual_seed(random_seed)
torch.backends.cudnn.enabled = False
print(torch.randn(1, 3))
     

tensor([[0.3936, 0.5584, 0.9692]])


In [4]:
#GPU
! nvidia-smi
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Mon Feb 13 05:09:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P0    28W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:

# Data 
train_data = pd.read_csv('https://raw.githubusercontent.com/Doheon/Chatbot-Transformer/main/ChatBotData.csv')

questions = []
for sentence in train_data['Q']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    questions.append(sentence)


answers = []
for sentence in train_data['A']:
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    answers.append(sentence)

with open('all.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(questions))
    f.write('\n'.join(answers))


corpus = "all.txt"
prefix = "chatbot"
vocab_size = 16000
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    "--min_frequency={3}"+
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

vocab_file = "chatbot.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)
line = "안녕하세요 만나서 반갑습니다"
pieces = vocab.encode_as_pieces(line)
ids = vocab.encode_as_ids(line)


print(line)
print(pieces)
print(ids)
print(vocab.GetPieceSize())
vocab_size = vocab.GetPieceSize()

안녕하세요 만나서 반갑습니다
['▁안녕하세요', '▁만나서', '▁반갑습니다']
[4626, 1930, 8499]
16007


In [6]:
START_TOKEN = [2]
END_TOKEN = [3]

#tokenize and padding  

def tokenize_and_filter(inputs, outputs):
  tokenized_inputs, tokenized_outputs = [], []
  for (sentence1, sentence2) in zip(inputs, outputs):
    zeros1 = np.zeros(MAX_LENGTH, dtype=int)
    zeros2 = np.zeros(MAX_LENGTH, dtype=int)
    sentence1 = START_TOKEN + vocab.encode_as_ids(sentence1) + END_TOKEN
    zeros1[:len(sentence1)] = sentence1[:MAX_LENGTH]

    sentence2 = START_TOKEN + vocab.encode_as_ids(sentence2) + END_TOKEN
    zeros2[:len(sentence2)] = sentence2[:MAX_LENGTH]

    tokenized_inputs.append(zeros1)
    tokenized_outputs.append(zeros2)
  return tokenized_inputs, tokenized_outputs

questions_encode, answers_encode = tokenize_and_filter(questions, answers)
print(questions_encode[0])
print(answers_encode[0])

[    2  5566 14968  3210   111     3     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[   2 5192  217 5936    7    3    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [7]:
class MyDataset(Dataset):
    def __init__(self, questions, answers):
        questions = np.array(questions)
        answers = np.array(answers)
        self.inputs = questions
        self.dec_inputs = answers[:,:-1] #(datanum, max_len-1)
        self.outputs = answers[:,1:] #(datanum, max_len-1) 
        self.length = len(questions) #input_length 
    
    def __getitem__(self,idx):
        return (self.inputs[idx], self.dec_inputs[idx], self.outputs[idx])

    def __len__(self):
        return self.length


dataset = MyDataset(questions_encode, answers_encode)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)
print(f"data set num: {len(dataset)}")
print(f"data set num: {len(dataloader)}")

data set num: 11823
data set num: 185


In [8]:

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, n_head, n_hid, n_layer, dropout=0.5):
        super(TransformerModel, self).__init__()
        self.transformer = Transformer(embed_size, n_head, dim_feedforward=n_hid, num_encoder_layers=n_layer, num_decoder_layers=n_layer,dropout=dropout)
        self.e_pos = PositionalEncoding(embed_size, dropout)
        self.e_embedding = nn.Embedding(vocab_size, embed_size)
        self.d_pos = PositionalEncoding(embed_size, dropout)
        self.encoder_d = nn.Embedding(vocab_size, embed_size)
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.linear = nn.Linear(embed_size, vocab_size)
        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.e_embedding.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, tgt, srcmask, tgtmask, srcpadmask, tgtpadmask):
        src = self.e_embedding(src) * math.sqrt(self.embed_size) #(batch_size, max_len, embed_size)
        src = self.e_pos(src) #(batch_size, max_len, embed_size)
        tgt = self.encoder_d(tgt) * math.sqrt(self.embed_size)#(batch_size, max_len-1, embed_size)
        tgt = self.d_pos(tgt)#(batch_size, max_len-1, embed_size)
        output = self.transformer(src.transpose(0,1), tgt.transpose(0,1), srcmask, tgtmask, src_key_padding_mask=srcpadmask, tgt_key_padding_mask=tgtpadmask) #(max_len-1, batch_size,embed_size)
        output = self.linear(output) #???
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

def gen_attention_mask(x):
    mask = torch.eq(x, 0)
    return mask
    
model = TransformerModel(vocab_size, embed_size=embed_size, n_head=n_head, n_hid=n_hid, n_layer=n_layer, dropout=dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [9]:
model.train()
for i in range(epoch):
    batchloss = 0.0
    progress = tqdm(dataloader)
    for (inputs, dec_inputs, outputs) in progress:
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(MAX_LENGTH).to(device) #(max_len, max_len)
        src_padding_mask = gen_attention_mask(inputs).to(device) #(batch_size, max_len)
        tgt_mask = model.generate_square_subsequent_mask(MAX_LENGTH-1).to(device)  #(max_len-1, max_len-1)
        tgt_padding_mask = gen_attention_mask(dec_inputs).to(device) #(batch_size, max_len-1)
        result = model(inputs.to(device), dec_inputs.to(device), src_mask, tgt_mask, src_padding_mask,tgt_padding_mask) #(max_len, batch_size, vocab_size)
        loss = criterion(result.permute(1,2,0), outputs.to(device).long())
        loss.backward()
        optimizer.step()
        batchloss += loss
        progress.set_description("{:0.3f}".format(loss))
    print("epoch:",i+1,"|","loss:",batchloss.cpu().item() / len(dataloader))
     

1.084: 100%|██████████| 185/185 [00:16<00:00, 11.19it/s]


epoch: 1 | loss: 1.8357174435177366


0.909: 100%|██████████| 185/185 [00:12<00:00, 14.30it/s]


epoch: 2 | loss: 0.9696116679423564


0.770: 100%|██████████| 185/185 [00:12<00:00, 14.66it/s]


epoch: 3 | loss: 0.9100667592641469


0.964: 100%|██████████| 185/185 [00:12<00:00, 14.52it/s]


epoch: 4 | loss: 0.8817562618771115


0.874: 100%|██████████| 185/185 [00:12<00:00, 14.39it/s]


epoch: 5 | loss: 0.859440159153294


0.776: 100%|██████████| 185/185 [00:13<00:00, 14.20it/s]


epoch: 6 | loss: 0.8401062836518158


0.927: 100%|██████████| 185/185 [00:13<00:00, 13.78it/s]


epoch: 7 | loss: 0.8214277937605574


0.806: 100%|██████████| 185/185 [00:13<00:00, 14.15it/s]


epoch: 8 | loss: 0.8023002830711571


0.677: 100%|██████████| 185/185 [00:13<00:00, 14.15it/s]


epoch: 9 | loss: 0.7826237034153294


0.660: 100%|██████████| 185/185 [00:12<00:00, 14.31it/s]


epoch: 10 | loss: 0.7623109559755068


0.678: 100%|██████████| 185/185 [00:12<00:00, 14.31it/s]


epoch: 11 | loss: 0.7418268976984798


0.694: 100%|██████████| 185/185 [00:13<00:00, 14.23it/s]


epoch: 12 | loss: 0.7208734460779138


0.677: 100%|██████████| 185/185 [00:13<00:00, 14.21it/s]


epoch: 13 | loss: 0.7003096296980574


0.669: 100%|██████████| 185/185 [00:13<00:00, 13.60it/s]


epoch: 14 | loss: 0.6794185329127956


0.650: 100%|██████████| 185/185 [00:12<00:00, 14.28it/s]


epoch: 15 | loss: 0.6584393475506757


0.670: 100%|██████████| 185/185 [00:12<00:00, 14.25it/s]


epoch: 16 | loss: 0.6366320223421664


0.592: 100%|██████████| 185/185 [00:12<00:00, 14.33it/s]


epoch: 17 | loss: 0.6154388427734375


0.599: 100%|██████████| 185/185 [00:12<00:00, 14.26it/s]


epoch: 18 | loss: 0.592549298260663


0.583: 100%|██████████| 185/185 [00:12<00:00, 14.25it/s]


epoch: 19 | loss: 0.5707731917097761


0.537: 100%|██████████| 185/185 [00:12<00:00, 14.25it/s]


epoch: 20 | loss: 0.5485796954180743


0.520: 100%|██████████| 185/185 [00:12<00:00, 14.26it/s]


epoch: 21 | loss: 0.5265193217509502


0.539: 100%|██████████| 185/185 [00:12<00:00, 14.24it/s]


epoch: 22 | loss: 0.5038204708614865


0.391: 100%|██████████| 185/185 [00:12<00:00, 14.27it/s]


epoch: 23 | loss: 0.48164132607949744


0.481: 100%|██████████| 185/185 [00:13<00:00, 14.22it/s]


epoch: 24 | loss: 0.45919478132918073


0.389: 100%|██████████| 185/185 [00:12<00:00, 14.26it/s]


epoch: 25 | loss: 0.4379471649994721


0.403: 100%|██████████| 185/185 [00:13<00:00, 14.21it/s]


epoch: 26 | loss: 0.4165530642947635


0.427: 100%|██████████| 185/185 [00:12<00:00, 14.24it/s]


epoch: 27 | loss: 0.3945894808382601


0.301: 100%|██████████| 185/185 [00:13<00:00, 14.16it/s]


epoch: 28 | loss: 0.373946153795397


0.416: 100%|██████████| 185/185 [00:12<00:00, 14.25it/s]


epoch: 29 | loss: 0.35336580018739444


0.395: 100%|██████████| 185/185 [00:12<00:00, 14.26it/s]

epoch: 30 | loss: 0.33408702128642315





In [11]:
torch.save(model.state_dict(), "chatbot.pth")

In [12]:
def preprocess_sentence(sentence):
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = sentence.strip()
    return sentence

def evaluate(sentence):
    sentence = preprocess_sentence(sentence)
    input = torch.tensor([START_TOKEN + vocab.encode_as_ids(sentence) + END_TOKEN]).to(device)
    output = torch.tensor([START_TOKEN]).to(device)

    model.eval()
    for i in range(MAX_LENGTH):
        #mask 
        src_mask = model.generate_square_subsequent_mask(input.shape[1]).to(device)
        tgt_mask = model.generate_square_subsequent_mask(output.shape[1]).to(device)
        src_padding_mask = gen_attention_mask(input).to(device)
        tgt_padding_mask = gen_attention_mask(output).to(device)

        predictions = model(input, output, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask).transpose(0,1) #(batch_size, output_size, vocab_size)

        # 현재(마지막) 시점의 예측 단어를 받아온다.
        predictions = predictions[:, -1:, :]
        predicted_id = torch.LongTensor(torch.argmax(predictions.cpu(), axis=-1))


        # 만약 마지막 시점의 예측 단어가 종료 토큰이라면 예측을 중단
        if torch.equal(predicted_id[0][0], torch.tensor(END_TOKEN[0])):
            break

        # 마지막 시점의 예측 단어를 출력에 연결한다.
        # 이는 for문을 통해서 디코더의 입력으로 사용될 예정이다.
        output = torch.cat([output, predicted_id.to(device)], axis=1)

    return torch.squeeze(output, axis=0).cpu().numpy()

def predict(sentence):
    prediction = evaluate(sentence)
    predicted_sentence = vocab.Decode(list(map(int,[i for i in prediction if i < vocab_size])))
    
    print("========================================")
    print('Q: {}'.format(sentence))
    print('A: {}'.format(predicted_sentence))

In [17]:
model.load_state_dict(torch.load("chatbot.pth"))
result = predict("게임하고 싶어")
result = predict("놀고싶다")
result = predict("감기 같애")
result = predict("건강하게 다이어트 하는 방법")
result=predict("안녕")
result = predict("궁금하지?")
     

Q: 게임하고 싶어
A: 새로운 스타일 도전해 보시면 어때요 !
Q: 놀고싶다
A: 직장 스트레스가 심한가봐요 마세요 .
Q: 감기 같애
A: 병원가세요 게 좋겠죠 .
Q: 건강하게 다이어트 하는 방법
A: 너무 자책하지 마세요 .
Q: 안녕
A: 후회해도 늦었어요 하네요 .
Q: 궁금하지?
A: 아직 사귀는 경우도 있지만 정확하게 아직 모르겠어요 .
