In [1]:
import os
import re
import time
from collections import Counter
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
BASE_DIR = r"..\data"
TRAIN_DIR = os.path.join(BASE_DIR, r"train\dialogues_train.txt")
VALID_DIR = os.path.join(BASE_DIR, r"validation\dialogues_validation.txt")
TEST_DIR = os.path.join(BASE_DIR, r"test\dialogues_test.txt")

In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
def load_dialogue_pairs(file_path):
    with open(file_path, "r", encoding="UTF-8") as f:
        raw_lines = f.readlines()

    dialogs = [line.strip().split("__eou__") for line in raw_lines]
    dialogs_cleaned = [[utt.strip() for utt in dialog if utt.strip()] for dialog in dialogs]

    pairs = []
    for dialog in dialogs_cleaned:
        for i in range(len(dialog) - 1):
            context = " ".join(dialog[:i+1]).strip()
            response = dialog[i+1].strip()
            if context and response:
                pairs.append({"context": context, "response": response})
    return pairs

train_pairs = load_dialogue_pairs(TRAIN_DIR)
valid_pairs = load_dialogue_pairs(VALID_DIR)
test_pairs = load_dialogue_pairs(TEST_DIR)


df_train = pd.DataFrame(train_pairs)
df_valid = pd.DataFrame(valid_pairs)
df_test = pd.DataFrame(test_pairs)

df_train.head()

Unnamed: 0,context,response
0,"Say , Jim , how about going for a few beers af...",You know that is tempting but is really not go...
1,"Say , Jim , how about going for a few beers af...",What do you mean ? It will help us to relax .
2,"Say , Jim , how about going for a few beers af...",Do you really think so ? I don't . It will jus...
3,"Say , Jim , how about going for a few beers af...",I guess you are right.But what shall we do ? I...
4,"Say , Jim , how about going for a few beers af...",I suggest a walk over to the gym where we can ...


In [5]:
for df in [df_train, df_valid, df_test]:
    df["context_clean"] = df["context"].apply(preprocess_text)
    df["response_clean"] = df["response"].apply(preprocess_text)
    df["context_tokens"] = df["context_clean"].apply(lambda x: x.split())
    df["response_tokens"] = df["response_clean"].apply(lambda x: x.split())

In [6]:
df_train.head()

Unnamed: 0,context,response,context_clean,response_clean,context_tokens,response_tokens
0,"Say , Jim , how about going for a few beers af...",You know that is tempting but is really not go...,say jim how about going for a few beers after ...,you know that is tempting but is really not go...,"[say, jim, how, about, going, for, a, few, bee...","[you, know, that, is, tempting, but, is, reall..."
1,"Say , Jim , how about going for a few beers af...",What do you mean ? It will help us to relax .,say jim how about going for a few beers after ...,what do you mean it will help us to relax,"[say, jim, how, about, going, for, a, few, bee...","[what, do, you, mean, it, will, help, us, to, ..."
2,"Say , Jim , how about going for a few beers af...",Do you really think so ? I don't . It will jus...,say jim how about going for a few beers after ...,do you really think so i dont it will just mak...,"[say, jim, how, about, going, for, a, few, bee...","[do, you, really, think, so, i, dont, it, will..."
3,"Say , Jim , how about going for a few beers af...",I guess you are right.But what shall we do ? I...,say jim how about going for a few beers after ...,i guess you are rightbut what shall we do i do...,"[say, jim, how, about, going, for, a, few, bee...","[i, guess, you, are, rightbut, what, shall, we..."
4,"Say , Jim , how about going for a few beers af...",I suggest a walk over to the gym where we can ...,say jim how about going for a few beers after ...,i suggest a walk over to the gym where we can ...,"[say, jim, how, about, going, for, a, few, bee...","[i, suggest, a, walk, over, to, the, gym, wher..."


In [7]:
all_tokens = []
for df in [df_train, df_valid, df_test]:
    for tokens_list in df["context_tokens"]:
        all_tokens.extend(tokens_list)
    for tokens_list in df["response_tokens"]:
        all_tokens.extend(tokens_list)

word_counts = Counter(all_tokens)

PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

vocab = {
    word: i + 4 for i, (word, _) in enumerate(word_counts.most_common()) if word not in ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]
}
vocab["<PAD>"] = PAD_IDX
vocab["<UNK>"] = UNK_IDX
vocab["<SOS>"] = SOS_IDX
vocab["<EOS>"] = EOS_IDX

In [8]:
idx2word = {idx: word for word, idx in vocab.items()}

In [9]:
def tokens_to_indices(tokens, vocab):
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens]

In [10]:
def pad(seq, max_len, pad_value=0):
    if len(seq) > max_len:
        seq = seq[:max_len]
    elif len(seq) < max_len:
        seq = seq + [pad_value] * (max_len - len(seq))
    return seq

max_len_context = 40
max_len_response = 42

for df in [df_train, df_valid, df_test]:
    df["context_idx"] = df["context_tokens"].apply(
        lambda x: tokens_to_indices(x, vocab)
    )
    df["response_idx"] = df["response_tokens"].apply(
        lambda x: [vocab["<SOS>"]] + tokens_to_indices(x, vocab) + [vocab["<EOS>"]]
    )
    df["context_idx_padded"] = df["context_idx"].apply(
        lambda x: pad(x, max_len_context, PAD_IDX)
    )
    df["response_idx_padded"] = df["response_idx"].apply(
        lambda x: pad(x, max_len_response, PAD_IDX)
    )

In [11]:
class Seq2SeqDataset(Dataset):
    def __init__(self, df):
        self.contexts = df["context_idx_padded"].tolist()
        self.responses = df["response_idx_padded"].tolist()

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = torch.tensor(self.contexts[idx], dtype=torch.long)
        response = torch.tensor(self.responses[idx], dtype=torch.long)
        return context, response

In [12]:
BATCH_SIZE = 32

train_dataset = Seq2SeqDataset(df_train)
valid_dataset = Seq2SeqDataset(df_valid)
test_dataset = Seq2SeqDataset(df_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
len(train_dataset), train_dataset[0]

(76053,
 (tensor([ 162,  885,   29,   33,   70,   15,    8,  186, 3197,  177,  289,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]),
  tensor([   2,    4,   44,   16,   10, 4335,   31,   10,   56,   43,   41,   15,
            71, 1764,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0])))

In [14]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        embedded = self.embedding(x)
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

In [15]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        predictions = self.fc(outputs.squeeze(1))
        return predictions, hidden, cell

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        encoder_hidden, encoder_cell = self.encoder(source)

        target_len = target.shape[1]
        batch_size = target.shape[0]
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(target_len, batch_size, vocab_size).to(self.device)

        decoder_input = target[:, 0]

        hidden = encoder_hidden
        cell = encoder_cell

        for t in range(1, target_len):
            prediction, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = prediction
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            decoder_input = target[:, t] if teacher_force else top1

        return outputs

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [19]:
LEN_VOCAB = len(vocab)
EMB_DIM = 128
EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 1

In [20]:
encoder = Encoder(LEN_VOCAB, EMB_DIM, HID_DIM, N_LAYERS).to(device)
decoder = Decoder(LEN_VOCAB, EMB_DIM, HID_DIM, N_LAYERS).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

In [21]:
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [22]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [23]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
MODEL_SAVE_PATH = 'dd_chatbot_model.pt'

In [24]:
def train(model, iterator, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0
    
    for i, (src, trg) in enumerate(iterator):

        src = src.to(device)
        trg = trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio=0.5)

        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [25]:
def evaluate(model, iterator, criterion):
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, teacher_forcing_ratio=0) 

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [26]:
for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tVal Loss: {valid_loss:.3f}')

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print(f"Model is saved with Val Loss: {valid_loss:.3f}")

Epoch: 01 | Time: 7m 56s
	Train Loss: 6.302
	Val Loss: 6.247
Model is saved with Val Loss: 6.247
Epoch: 02 | Time: 9m 15s
	Train Loss: 6.231
	Val Loss: 6.258
Epoch: 03 | Time: 8m 57s
	Train Loss: 6.227
	Val Loss: 6.265
Epoch: 04 | Time: 8m 33s
	Train Loss: 6.223
	Val Loss: 6.272
Epoch: 05 | Time: 7m 44s
	Train Loss: 6.220
	Val Loss: 6.277
Epoch: 06 | Time: 7m 45s
	Train Loss: 6.219
	Val Loss: 6.283


KeyboardInterrupt: 