In [1]:
import ast
import datetime
import itertools
import operator
import re
import string
import time
from collections import Counter
from pathlib import Path
from xml.dom import minidom

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from bs4 import BeautifulSoup
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm

In [5]:
# run on the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# for use in google colab: shell out to `unzip` to extract data.zip into the current working directory
!unzip data.zip

In [34]:
class ChatDataset(Dataset):
    """Dataset which contains tuples of questions and answers.

    This dataset extracts sequences from txt files. The txt files need to store one pair
    of question and answer per line in the following form:

    "['question_1','question_2',...]\t['answer_1','answer_2',...]\n"

    Each line contains a list of questions and answers, because a question can consist of
    multiple messages, e.g. in a WhatsApp chat one user might send multiple consecutive messages
    without an answer in between.

    The dataset returns tuples of questions and answers, where questions and answers are given
    as lists of unique word indices. Multiple consecutive questions and answers are linked using
    the <new> token to symbolize the start of a new message.


    Functions:
        __get_data: loads data from a directory with txt files.
        __get_sequences: loads tuples of questions and answers from specific txt file.
        __get_vocab: creates vocabulary and pretrains word2vec model for the loaded sequences.
        get_tokens: splits a single message into word tokens.
        get_embeddings: returns the pretrained embeddings of the word2vec model.

    Attributes:
        max_length: the maximum length of a sequence.
        embedding_size: the size of the pretrained embeddings.
        data: list of (question_group, answer_group) tuples of tokenized messages.
        vocab: maps tokens to unique indices.
        inverse_vocab: maps indices to tokens.
        word2vec: trained gensim word2vec model.
    """
    def __init__(self, directory, max_length = 10, embedding_size = 128):
        """Initialize Dataset

        Get all sequences (questions and answers) from a directory of text files,
        create vocabulary and pretrain word2vec embeddings.

        Args:
            directory: path of the directory that holds the txt files.
            max_length: length every returned question/answer is cut or padded to.
            embedding_size: dimensionality of the word2vec embeddings.
        """
        self.max_length = max_length
        self.embedding_size = embedding_size

        print("READING TXT FILES")
        # keep the parsed pairs on the instance: __len__ and __getitem__ read self.data
        self.data = self.__get_data(directory)
        print("OBTAINING EMBEDDINGS AND VOCABULARY")
        self.vocab, self.word2vec = self.__get_vocab(self.data)
        self.inverse_vocab = {index: token for token, index in self.vocab.items()}
        print("DATA LOADED")

    @staticmethod
    def get_tokens(message):
        """Split a single message into a list of word tokens using nltk.word_tokenize.
        """
        return nltk.word_tokenize(message)

    def __get_data(self, directory):
        """Get tuples of questions and answers from directory of txt files.
        """
        data = []
        # sort the file names so that item indices are reproducible between runs
        for filename in tqdm(sorted(Path(directory).glob("*.txt"))):
            data += self.__get_sequences(filename)

        return data

    def __get_sequences(self, filename):
        """Get tuples of questions and answers from specific txt file.

        Lines that do not follow the "[questions]\t[answers]" format are skipped.
        """
        sequences = []

        with open(filename, "r", encoding = "utf-8") as f:
            for line in f:
                # each line consists of [question_1, ...]\t[answer_1, ...]\n
                # (the line break is optional so the last line of a file is not lost)
                question_part, separator, answer_part = line.rstrip("\r\n").partition("\t")
                if not separator:
                    # skip lines with false formatting
                    continue

                try:
                    # parse the strings to Python lists without executing arbitrary code,
                    # e.g. question_group = ["hey", "how are you?"] and answer_group = ["good", "what about you?"]
                    question_group = ast.literal_eval(question_part.strip())
                    answer_group = ast.literal_eval(answer_part.strip())
                except (ValueError, SyntaxError, TypeError):
                    # skip lines whose groups are not valid Python literals
                    continue

                groups = (question_group, answer_group)
                if not all(isinstance(group, (list, tuple)) and all(isinstance(message, str) for message in group) for group in groups):
                    # skip lines whose groups are not lists of messages
                    continue

                # tokenize each message,
                # e.g. question_group = [["hey"], ["how", "are", "you", "?"]]
                question_group = [self.get_tokens(question) for question in question_group]
                answer_group = [self.get_tokens(answer) for answer in answer_group]

                sequences.append((question_group, answer_group))

        return sequences

    def __get_vocab(self, data):
        """Create the vocabulary and train a word2vec model on all loaded messages.

        Returns:
            Tuple (vocab, word2vec), where vocab maps every token kept by word2vec
            and the 5 special tokens to a unique index.

        Raises:
            ValueError: if data contains no question/answer pairs.
        """
        if not data:
            raise ValueError("no question/answer pairs were loaded, cannot build a vocabulary")

        # unzip to obtain question groups and answer groups
        question_groups, answer_groups = list(zip(*data))

        # gather all sequences from all groups into a single list of sentences
        questions = [question for question_group in question_groups for question in question_group]
        answers = [answer for answer_group in answer_groups for answer in answer_group]
        sentences = questions + answers

        # train word2vec model on sentences and remove infrequent words
        # NOTE(review): `iter`, `size` and `wv.index2word` are gensim 3.x names; gensim 4
        # renamed them (`epochs`, `vector_size`, `wv.index_to_key`) - confirm the installed version
        word2vec = gensim.models.Word2Vec(sentences, iter = 100, window = 8, size = self.embedding_size, min_count = 5)

        # obtain vocabulary including special tokens; indices 0-4 are reserved for them
        vocab = {token: index + 5 for index, token in enumerate(word2vec.wv.index2word)}
        vocab["<pad>"] = 0
        vocab["<unk>"] = 1
        vocab["<start>"] = 2
        vocab["<stop>"] = 3
        vocab["<new>"] = 4

        return vocab, word2vec

    def get_embeddings(self):
        """ Return the pretrained embeddings together with 0-initialized embeddings for the 5 special tokens

        The 5 zero rows come first, so row i belongs to the token with vocabulary index i.
        """
        embeddings = torch.cat((torch.zeros((5, self.embedding_size)), torch.FloatTensor(self.word2vec.wv.vectors)))
        return embeddings

    def __len__(self):
        """Return size of dataset
        """
        return len(self.data)

    def __getitem__(self, index):
        """Obtain tuple at given index

        Returns:
            Tuple (question, answer) of index tensors. The question is cut or padded to
            max_length; the answer is wrapped in <start> ... <stop> and cut or padded to max_length.
        """
        question_group, answer_group = self.data[index]
        pad, unk = self.vocab["<pad>"], self.vocab["<unk>"]

        def encode(seq_group):
            # link together sequences of one group with the <new> token
            # e.g. [["hey"], ["how", "are", "you", "?"]] -> ["hey", "<new>", "how", "are", "you", "?", "<new>"]
            tokens = [token for seq in seq_group for token in list(seq) + ["<new>"]]
            # replace every token with its unique index or with the <unk> index if it is not in the vocabulary
            return [self.vocab.get(token, unk) for token in tokens]

        # either cut off long sequences or pad short sequences so that every sequence has length max_length
        question = encode(question_group)[:self.max_length]
        question += [pad] * (self.max_length - len(question))

        # additionally, add sos and eos tokens to start and end of the answer
        answer = [self.vocab["<start>"]] + encode(answer_group)[:self.max_length - 2] + [self.vocab["<stop>"]]
        answer += [pad] * (self.max_length - len(answer))

        return (torch.tensor(question), torch.tensor(answer))

In [None]:
# build a dataset from every txt file found in the "chats" directory
wa_dataset = ChatDataset("chats")
# show the 9th item (index 8) as readable tokens: first the question, then the answer
print([wa_dataset.inverse_vocab[token.item()] for token in wa_dataset[8][0]])
print([wa_dataset.inverse_vocab[token.item()] for token in wa_dataset[8][1]])

In [2]:
class Encoder(nn.Module):
    """Encoder for a Seq2Seq network.

    Takes a sequence of word indices as input and obtains the embeddings. The embeddings
    are then passed through a bi-LSTM network to produce a sequence of encoder vectors.
    """
    def __init__(self, input_size, hidden_size, num_layers, pretrained_emb = None):
        """Build the embedding layer and the bi-LSTM.

        Args:
            input_size: size of the input vocabulary (only used without pretrained_emb).
            hidden_size: size of the LSTM states per direction.
            num_layers: number of stacked bi-LSTM layers.
            pretrained_emb: optional tensor of shape (vocab_size, embedding_dim) with
                pretrained embeddings; embedding_dim may differ from hidden_size.
        """
        super(Encoder, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # initialize embeddings, if given
        if pretrained_emb is None:
            self.embedding = nn.Embedding(input_size, hidden_size, padding_idx = 0)
        else:
            self.embedding = nn.Embedding.from_pretrained(pretrained_emb)

        # bi-LSTM network with num_layers layers and dropout
        # - the input size follows the embedding width, so pretrained embeddings need not match hidden_size
        # - dropout only acts between stacked layers; with one layer it is disabled to avoid the PyTorch warning
        self.lstm = nn.LSTM(
            self.embedding.embedding_dim,
            hidden_size,
            num_layers,
            batch_first = True,
            dropout = 0.1 if num_layers > 1 else 0.0,
            bidirectional = True,
        )

    def forward(self, x, hidden, cell):
        """Forward pass of the encoder.

        Args:
            x: sequence of word indices of shape (batch, seq_len)
            hidden: last hidden state of the bi-LSTM of shape (2 * num_enc_layers, batch, hidden_size)
            cell: last cell state of the bi-LSTM of shape (2 * num_enc_layers, batch, hidden_size)

        Returns:
            Tuple (output, hidden, cell): output has shape (batch, seq_len, 2 * hidden_size),
            hidden and cell keep the shape of the inputs.
        """
        # obtain embedding of input index
        embedding = self.embedding(x)
        # get the encoder outputs
        output, (hidden, cell) = self.lstm(embedding, (hidden, cell))

        return output, hidden, cell

    def init_hidden(self, batch_size):
        """Return a zero state of shape (2 * num_layers, batch_size, hidden_size).

        The tensor is created on the device the module's parameters live on.
        """
        module_device = next(self.parameters()).device
        return torch.zeros(2 * self.num_layers, batch_size, self.hidden_size, device = module_device)


In [3]:
class Decoder(nn.Module):
    """Decoder for a Seq2Seq network.

    Takes as input the lastly predicted output index and obtains the embedding. The embedding
    then attends over the sequence of encoder vectors and produces a context vector. Finally,
    the embedding and the context vector are concatenated and passed to an LSTM network. Additionally,
    the last encoder state is added to the previous decoder state in every time step.
    """
    def __init__(self, hidden_size, output_size, num_layers, pretrained_emb = None):
        """Build embedding, attention, LSTM and output projection.

        Args:
            hidden_size: size of the decoder LSTM states (encoder vectors have twice this size).
            output_size: size of the output vocabulary.
            num_layers: number of stacked LSTM layers.
            pretrained_emb: optional tensor of shape (vocab_size, embedding_dim) with
                pretrained embeddings; embedding_dim may differ from hidden_size.
        """
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # initialize embeddings, if given
        if pretrained_emb is None:
            self.embedding = nn.Embedding(output_size, hidden_size, padding_idx = 0)
        else:
            self.embedding = nn.Embedding.from_pretrained(pretrained_emb)
        # layers that consume the embedding are sized after it, so its width need not equal hidden_size
        embedding_dim = self.embedding.embedding_dim

        # the weights for the additive (Bahdanau-style) attention mechanism
        # i.e. energy(e_t', h_t) = comb_attn_w * tanh(enc_attn_w * h_t + dec_attn_w * e_t'),
        # where e_t' is the embedding of the decoder input at time t' and h_t is the encoder vector at time t
        self.enc_attn_w = nn.Linear(2 * hidden_size, hidden_size)
        self.dec_attn_w = nn.Linear(embedding_dim, hidden_size)
        self.comb_attn_w = nn.Linear(hidden_size, 1)
        # softmax for normalizing attention energies
        self.attn_softmax = nn.Softmax(dim = -1)

        # linear layer for projecting the last encoder state down in order to add it to the previous decoder state
        # (bc the encoder is bidirectional, its states are double the size of decoder states)
        self.scale_enc_hidden = nn.Linear(2 * hidden_size, hidden_size)

        # LSTM network with num_layers layers; its input is the embedding concatenated with the context vector
        # dropout only acts between stacked layers; with one layer it is disabled to avoid the PyTorch warning
        self.lstm = nn.LSTM(
            embedding_dim + 2 * hidden_size,
            hidden_size,
            num_layers,
            batch_first = True,
            dropout = 0.1 if num_layers > 1 else 0.0,
        )

        # last layer which projects decoder state to the size of the output vocabulary
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, y, enc_outputs, enc_hidden, hidden, cell):
        """Forward pass through the decoder.

        Args:
            y: lastly predicted output with shape (batch, 1)
            enc_outputs: sequence of encoder vectors of shape (batch, seq_len, hidden_size * 2)
            enc_hidden: the last encoder state of shape (1, batch, hidden_size * 2); after the
                projection it is broadcast over the decoder layers when added to hidden
            hidden: the last hidden state of the lstm network of shape (num_dec_layers, batch, hidden_size)
            cell: the last cell state of the lstm network of shape (num_dec_layers, batch, hidden_size)

        Returns:
            Tuple (out, hidden, cell): out holds the unnormalized scores over the output
            vocabulary with shape (batch, output_size), hidden and cell are the new LSTM states.
        """
        # obtain embedding from lastly predicted symbol, shape (batch, 1, embedding_dim)
        embedding = self.embedding(y)

        # obtain unnormalized attention energies by using additive attention
        # the added axes let the single decoder step broadcast against every encoder position:
        # (batch, 1, 1, hidden_size) + (batch, 1, seq_len, hidden_size)
        attn_energies = torch.tanh(self.dec_attn_w(embedding[:, :, None]) + self.enc_attn_w(enc_outputs[:, None]))
        attn_energies = self.comb_attn_w(attn_energies)
        # attn_energies is now of shape (batch, 1, seq_len)
        attn_energies = torch.squeeze(attn_energies, dim = -1)

        # obtain attention scores by normalizing energies over the encoder positions
        attn_weights = self.attn_softmax(attn_energies)
        # weight encoder outputs with attention scores
        context = attn_weights[:, :, :, None] * enc_outputs[:, None]
        # obtain weighted sum over the encoder positions, shape (batch, 1, hidden_size * 2)
        context = torch.sum(context, dim = -2)

        # concatenate context and embedding
        combined = torch.cat((embedding, context), dim = -1)
        # project the last encoder state from 2 * hidden_size down to hidden_size
        enc_hidden = self.scale_enc_hidden(enc_hidden)

        # apply lstm network
        out, (hidden, cell) = self.lstm(combined, (hidden + enc_hidden, cell))
        # project to output vocabulary
        out = self.linear(out)
        # remove seq_len dimension
        out = torch.squeeze(out, dim = 1)

        return out, hidden, cell


    def init_hidden(self, batch_size):
        """Return a zero state of shape (num_layers, batch_size, hidden_size).

        The tensor is created on the device the module's parameters live on.
        """
        module_device = next(self.parameters()).device
        return torch.zeros(self.num_layers, batch_size, self.hidden_size, device = module_device)

In [None]:
def train(encoder, decoder, dataset, epochs = 500, batch_size = 512):
    """Train Seq2Seq network on a given dataset

    The decoder is trained with teacher forcing: at every step it receives the ground
    truth token of the previous position. After the last epoch the weights are written
    to "encoder.pt" and "decoder.pt" in the working directory.

    Args:
        encoder: encoder module offering hidden_size, num_layers and init_hidden.
        decoder: decoder module offering init_hidden.
        dataset: dataset yielding (question, answer) index tensors of fixed length.
        epochs: number of passes over the dataset.
        batch_size: number of pairs per batch; incomplete batches are dropped.

    Raises:
        ValueError: if the dataset does not contain a single full batch.
    """
    # define trainloader
    trainloader = DataLoader(dataset, batch_size = batch_size, shuffle = True, drop_last = True)
    if len(trainloader) == 0:
        # with drop_last there would be nothing to train on and no loss to report
        raise ValueError("dataset has fewer than batch_size ({}) items, no full batch is available".format(batch_size))

    # use cross entropy as loss and Adam as optimizer
    criterion = torch.nn.CrossEntropyLoss()
    encoder_opt = torch.optim.Adam(encoder.parameters())
    decoder_opt = torch.optim.Adam(decoder.parameters())

    hidden_size = encoder.hidden_size
    # move the batches to wherever the model lives
    model_device = next(encoder.parameters()).device

    encoder.train()
    decoder.train()

    for epoch in tqdm(range(epochs)):
        running_loss = 0
        # number of decoder steps that contributed to running_loss in this epoch
        decoding_steps = 0
        for input_tensor, output_tensor in trainloader:
            # get data and move it to the device
            input_tensor = input_tensor.to(model_device)
            output_tensor = output_tensor.to(model_device)
            current_batch = input_tensor.size(0)

            encoder_opt.zero_grad()
            decoder_opt.zero_grad()

            loss = 0

            # init encoder hidden state and cell
            enc_hidden = encoder.init_hidden(current_batch)
            enc_cell = encoder.init_hidden(current_batch)
            # obtain encoder outputs
            enc_outputs, enc_hidden, enc_cell = encoder(input_tensor, enc_hidden, enc_cell)
            # concatenate the last layer's final hidden states from both directions
            enc_hidden = enc_hidden.view(encoder.num_layers, 2, current_batch, hidden_size)
            enc_hidden = torch.cat((enc_hidden[-1, 0], enc_hidden[-1, 1]), dim = 1).view(1, current_batch, hidden_size * 2)

            # init decoder hidden and cell state
            dec_hidden = decoder.init_hidden(current_batch)
            dec_cell = decoder.init_hidden(current_batch)

            # pass the indices from the target sentence into the decoder, one at a time
            num_steps = output_tensor.size(1) - 1
            for step in range(num_steps):
                # use teacher forcing: feed the true token, predict the next one
                dec_in = output_tensor[:, step].view(-1, 1)
                target = output_tensor[:, step + 1].reshape(-1)

                # produce next decoder output
                dec_out, dec_hidden, dec_cell = decoder(dec_in, enc_outputs, enc_hidden, dec_hidden, dec_cell)

                # add to loss
                loss += criterion(dec_out.reshape(-1, dec_out.size(-1)), target)

            # do backpropagation and update weights
            loss.backward()
            encoder_opt.step()
            decoder_opt.step()

            # add to running loss
            running_loss += loss.item()
            decoding_steps += num_steps

        # print current mean loss per decoder step after every epoch
        print("Epoch {} - Loss: {}".format(epoch, running_loss / decoding_steps))

    # save the models
    torch.save(encoder.state_dict(), "encoder.pt")
    torch.save(decoder.state_dict(), "decoder.pt")



In [None]:
def gen_input(message: str, dataset):
    """Generate encoder input from message.

    The message is tokenized, mapped to vocabulary indices (unknown tokens become <unk>),
    terminated with the <new> token and cut or padded to dataset.max_length.

    Args:
        message: the raw text of a single message.
        dataset: object providing vocab and max_length; its get_tokens method is used
            for tokenizing when present, otherwise nltk.word_tokenize.

    Returns:
        1-D tensor with dataset.max_length word indices.
    """
    # fall back to the tokenizer used for the training data if the dataset offers none
    tokenize = getattr(dataset, "get_tokens", None)
    if tokenize is None:
        tokenize = nltk.word_tokenize

    unk = dataset.vocab["<unk>"]
    tokens = [dataset.vocab.get(token, unk) for token in tokenize(message)]

    # leave room for the closing <new> token, then pad up to max_length
    inp = tokens[:dataset.max_length - 1] + [dataset.vocab["<new>"]]
    inp += [dataset.vocab["<pad>"]] * (dataset.max_length - len(inp))

    return torch.tensor(inp)

In [None]:
def decode_beam(inp, encoder, decoder, dataset, beam_width, max_steps = 10):
    """Return the beam_width top predictions of the decoder given an input.

    Runs beam search: every unfinished hypothesis is extended by its beam_width most
    likely next tokens, then only the beam_width best hypotheses are kept. A hypothesis
    is finished once it ends with the <stop> token and is not extended any further.

    Args:
        inp: 1-D tensor of word indices, e.g. produced by gen_input.
        encoder: trained encoder module.
        decoder: trained decoder module.
        dataset: object providing vocab and inverse_vocab.
        beam_width: number of hypotheses kept after every step.
        max_steps: maximum number of decoding steps after the first token.

    Returns:
        List of (probability, tokens) tuples sorted from most to least likely.
        Every entry is printed as well.
    """
    # go into eval mode (disable dropout)
    encoder.eval()
    decoder.eval()

    hidden_size = encoder.hidden_size
    batch_size = 1
    stop_index = dataset.vocab["<stop>"]
    # create the inputs on the device the model lives on
    model_device = next(encoder.parameters()).device

    def is_finished(pick):
        return pick["seq"][-1].item() == stop_index

    def expand(pick, dec_in):
        # run one decoder step and return the best continuations of the given hypothesis
        dec_out, dec_hidden, dec_cell = decoder(dec_in, enc_outputs, enc_hidden, pick["hid"], pick["cell"])
        # log-probabilities are summed along a hypothesis; never ask topk for more than the vocabulary holds
        k = min(beam_width, dec_out.size(1))
        log_probs, tokens = torch.topk(F.log_softmax(dec_out, dim = 1), k, dim = 1)
        return [{
            "seq": pick["seq"] + [token],
            "prob": pick["prob"] + log_prob.item(),
            "hid": dec_hidden,
            "cell": dec_cell,
        } for log_prob, token in zip(log_probs[0], tokens[0])]

    with torch.no_grad():
        # init encoder inputs
        enc_hidden = encoder.init_hidden(batch_size)
        enc_cell = encoder.init_hidden(batch_size)
        enc_in = inp.view(batch_size, -1).to(model_device)

        # obtain encoder outputs
        enc_outputs, enc_hidden, enc_cell = encoder(enc_in, enc_hidden, enc_cell)

        # prepare hidden encoder state for decoder exactly like during training:
        # concatenate the last layer's final hidden states from both directions
        enc_hidden = enc_hidden.view(encoder.num_layers, 2, batch_size, hidden_size)
        enc_hidden = torch.cat((enc_hidden[-1, 0], enc_hidden[-1, 1]), dim = 1).view(1, batch_size, hidden_size * 2)

        # the empty hypothesis: nothing predicted yet, fresh decoder state
        root = {
            "seq": [],
            "prob": 0.0,
            "hid": decoder.init_hidden(batch_size),
            "cell": decoder.init_hidden(batch_size),
        }
        dec_in = torch.tensor(dataset.vocab["<start>"], device = model_device).view(1, 1)

        # get first top predictions
        top_picks = expand(root, dec_in)

        # do at most max_steps further decoding steps
        for _ in range(max_steps):
            if all(is_finished(pick) for pick in top_picks):
                break

            hypotheses = []

            # go through every current top pick
            for pick in top_picks:
                if is_finished(pick):
                    # keep finished hypotheses as they are
                    hypotheses.append(pick)
                    continue

                # the lastly predicted symbol is the next input
                hypotheses += expand(pick, pick["seq"][-1].view(1, 1))

            # sort after probability
            hypotheses.sort(key = operator.itemgetter("prob"), reverse = True)

            # get top k hypotheses
            top_picks = hypotheses[:beam_width]

    results = [
        (float(np.exp(pick["prob"])), [dataset.inverse_vocab[token.item()] for token in pick["seq"]])
        for pick in top_picks
    ]
    for prob, tokens in results:
        print(prob, tokens)

    return results





In [None]:
# example: turn a sentence into encoder input and print the 10 best replies found by beam search
# NOTE(review): assumes `encoder` and `decoder` were created and trained beforehand - no visible cell does this, confirm
inp = gen_input("hallo", wa_dataset)
decode_beam(inp, encoder, decoder, wa_dataset, beam_width = 10)