## Setup


In [1]:
# connect to google drive
import os
# mount google drive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
#
# change root directory such that models are saved in google drive during training
root_dir = "/content/gdrive/My Drive/Chatbot"
os.chdir(root_dir)
# print the contents
!ls

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
Chatbot        Chatbot.optim		 Friends.ipynb	Tutorial
Chatbot.ipynb  chatbot_refactored.ipynb  Friends.pkl	vocab.json


### Imports

In [0]:
# basic packages
import sys
import math
import time
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
pd.set_option('display.max_colwidth', 150)

In [0]:
# export
# basic imports
import requests
from bs4 import BeautifulSoup
import seaborn as sns
from time import sleep
import re
import shutil
import json
from tqdm import tqdm_notebook

import spacy
nlp = spacy.load('en_core_web_sm')

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:

from collections import Counter, namedtuple
from docopt import docopt
from itertools import chain
import json
from typing import List, Tuple, Dict, Set, Union

#pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence


#others
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
from IPython.core.debugger import set_trace

In [0]:
from sklearn.model_selection import train_test_split

### load data

In [8]:
df_friends = pd.read_pickle('Friends.pkl')
df_friends.head()

Unnamed: 0,pre,post,is_valid
2,Monica: There's nothing to tell! He's just some guy\nI work with!,"Joey: C'mon, you're going out with the guy! There's\ngotta be something wrong with him!",True
3,"Joey: C'mon, you're going out with the guy! There's\ngotta be something wrong with him!","Chandler: All right Joey, be\nnice. So does he have a hump? A hump and a hairpiece?",True
4,"Chandler: All right Joey, be\nnice. So does he have a hump? A hump and a hairpiece?","Phoebe: Wait, does he eat chalk?",True
7,"Phoebe: Just, 'cause, I don't want her to go through\nwhat I went through with Carl- oh!","Monica: Okay, everybody relax. This is not even a\ndate. It's just two people going out to dinner and- not having sex.",True
8,"Monica: Okay, everybody relax. This is not even a\ndate. It's just two people going out to dinner and- not having sex.",Chandler: Sounds like a date to me.,True


In [0]:
def check_len(x):
  flag = True
  for sent in x:
    #set_trace()
    if len(sent) == 0:
      flag = False
  return flag

df_friends['flag'] = df_friends[['pre', 'post']].apply(lambda x: check_len(x), axis = 1)

In [10]:
df_friends = df_friends[df_friends['flag']].reset_index(drop = True)
len(df_friends)

36735

In [0]:
# handle contraction
contraction_mapping = {"c'mon": 'come on', "there's": 'there is', "it's": 'it is', "you're": 'you are',
                       "he's": 'he is', "she's": 'she is', "i'm": 'i am', "don't": 'do not', "i've": 'i have', 
                       "doesn't": 'does not', "didn't": 'did not', "you’re": 'you are', "i’m ": 'i am',
                       "he’s": 'he is', "you've": 'you have', "let's": 'let us'}
def correct_contraction(x, dic):
    for word in dic.keys():
        if word in x:
            x = x.replace(word, dic[word])
    return x

# handle punctuation
all_punct = list(string.punctuation)
def spacing_punctuation(text):
    """
    add space before and after punctuation and symbols
    """
    for punc in all_punct:
        if punc in text:
            text = text.replace(punc, f' {punc} ')
    return text

# put together
def clean_text(x):
  x = x.lower()
  x = re.sub(r"\n", " ", x) # remove \n character
  x = correct_contraction(x, contraction_mapping)
  x = spacing_punctuation(x)

  #words = x.split(" ")
  #words = [word.strip() for word in words]
  return x

In [0]:
df_friends['pre'] = df_friends['pre'].apply(lambda x: clean_text(x))
df_friends['post'] = df_friends['post'].apply(lambda x: clean_text(x))

In [13]:
df_friends.sample(5)

Unnamed: 0,pre,post,is_valid,flag
25423,joey : you know you’ve been spitting on me ? !,"richard : that’s what real actors do ! annunciation is the mark of a good actor ! and when you enunciate , you spit ! ( spits on the t )",True,True
18336,monica :,ross :,True,True
23750,"joey : dude , you soooo need this car .","phoebe : ( running up ) okay . okay , here’s what we’re gonna do . okay , i amgonna break into this mini - van and put it in neutral . you...",True,True
28262,rachel :,monica :,True,True
312,monica : what does she mean by ' involved ' ?,"chandler : i mean presumably , the biggest part of your job is done .",True,True


### Data preparation

In [0]:
# read from corpus: vocab building
def sent_tokenizer(sent, source):

    words = sent.split(' ')
    words = [word.lower() for word in words]
    
    if source == 'post':
      words = ['<sos>'] + words + ['<eos>']

    return words

In [0]:
# generate batches for taining
pre_sents, post_sents = [], []

for i in range(len(df_friends)):
  pre_raw = df_friends.iloc[i].pre
  post_raw = df_friends.iloc[i].post
  
  pre_tokens =  sent_tokenizer(pre_raw, 'pre')
  post_tokens =  sent_tokenizer(post_raw, 'post')
  
  pre_sents.append(pre_tokens)
  post_sents.append(post_tokens)

In [0]:
data = list(zip(pre_sents, post_sents))

In [18]:
data[110]

(['monica', ':', '', '', '-', 'leg', '?', ''],
 ['<sos>',
  'paul',
  ':',
  '',
  '',
  '(',
  'laughing',
  ')',
  '',
  'that',
  "'",
  's',
  'one',
  'way',
  '!',
  '',
  'me',
  ',',
  '',
  'i',
  '-',
  '',
  'i',
  'went',
  'for',
  'the',
  'watch',
  '.',
  '',
  '<eos>'])

### padding

In [0]:
# post-padding for source/target sequences
def pad_sents(sents, pad_token):
    
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[str]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (str): padding token
    @returns sents_padded (list[list[str]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentences in the batch now has equal length.
    """
    sents_padded = []

    max_len = min(max([len(sent) for sent in sents]),100)
    for sent in sents:
        sent = sent[:max_len]
        sent_len = len(sent)
        sents_padded.append(sent + (max_len - sent_len) * [pad_token])

    return sents_padded

### Build vocab

In [0]:
class Vocab(object):
    """ Construct vocabulary for chatbot.
    """
    def __init__(self, word2id=None):
        """ Init Vocab Instance.
        @param word2id (dict): dictionary mapping words 2 indices
        """
        if word2id:
            self.word2id = word2id
        else:
            self.word2id = dict()
            self.word2id['<pad>'] = 0   # Pad Token
            self.word2id['<sos>'] = 1 # Start Token
            self.word2id['<eos>'] = 2    # End Token
            self.word2id['<unk>'] = 3   # Unknown Token
        self.unk_id = self.word2id['<unk>']
        self.id2word = {v: k for k, v in self.word2id.items()}

    def __getitem__(self, word):
        """ Retrieve word's index. Return the index for the unk
        token if the word is out of vocabulary.
        @param word (str): word to look up.
        @returns index (int): index of word 
        """
        return self.word2id.get(word, self.unk_id)

    def __contains__(self, word):
        """ Check if word is captured by VocabEntry.
        @param word (str): word to look up
        @returns contains (bool): whether word is contained    
        """
        return word in self.word2id

    def __setitem__(self, key, value):
        """ Raise error, if one tries to edit the VocabEntry.
        """
        raise ValueError('vocabulary is readonly')

    def __len__(self):
        """ Compute number of words in VocabEntry.
        @returns len (int): number of words in VocabEntry
        """
        return len(self.word2id)

    def __repr__(self):
        """ Representation of VocabEntry to be used
        when printing the object.
        """
        return 'Vocabulary[size=%d]' % len(self)

    def id2word(self, wid):
        """ Return mapping of index to word.
        @param wid (int): word index
        @returns word (str): word corresponding to index
        """
        return self.id2word[wid]

    def add(self, word):
        """ Add word to VocabEntry, if it is previously unseen.
        @param word (str): word to add to VocabEntry
        @return index (int): index that the word has been assigned
        """
        if word not in self:
            wid = self.word2id[word] = len(self)
            self.id2word[wid] = word
            return wid
        else:
            return self[word]

    def words2indices(self, sents):
        """ Convert list of words or list of sentences of words
        into list or list of list of indices.
        @param sents (list[str] or list[list[str]]): sentence(s) in words
        @return word_ids (list[int] or list[list[int]]): sentence(s) in indices
        """
        if type(sents[0]) == list:
            return [[self[w] for w in s] for s in sents]
        else:
            return [self[w] for w in sents]

    def indices2words(self, word_ids):
        """ Convert list of indices into words.
        @param word_ids (list[int]): list of word ids
        @return sents (list[str]): list of words
        """
        return [self.id2word[w_id] for w_id in word_ids]

    def to_input_tensor(self, sents, device: torch.device):
        """ Convert list of sentences (words) into tensor with necessary padding for 
        shorter sentences.

        @param sents (List[List[str]]): list of sentences (words)
        @param device: device on which to load the tesnor, i.e. CPU or GPU

        @returns sents_var: tensor of (max_sentence_length, batch_size)
        """
        word_ids = self.words2indices(sents)
        sents_t = pad_sents(word_ids, self['<pad>'])
        sents_var = torch.tensor(sents_t, dtype=torch.long, device=device)
        return torch.t(sents_var)

    def save(self, file_path):
        """ Save Vocab to file as JSON dump.
        @param file_path (str): file path to vocab file
        """
        json.dump(dict(vocab_word2id=self.word2id), open(file_path, 'w'), indent=2)

    @staticmethod
    def load(file_path):
        """ Load vocabulary from JSON dump.
        @param file_path (str): file path to vocab file
        @returns Vocab object loaded from JSON dump
        """
        entry = json.load(open(file_path, 'r'))
        word2id = entry['vocab_word2id']
        vocab = Vocab(word2id)
        print('{} loaded!'.format(vocab))
        return vocab

    @staticmethod
    def from_corpus(corpus, size, freq_cutoff=2):
        """ Given a corpus construct a Vocab Entry.
        @param corpus (list[str]): corpus of text produced by read_corpus function
        @param size (int): # of words in vocabulary
        @param freq_cutoff (int): if word occurs n < freq_cutoff times, drop the word
        @returns vocab_entry (VocabEntry): VocabEntry instance produced from provided corpus
        """
        vocab_entry = Vocab()
        word_freq = Counter(chain(*corpus))
        valid_words = [w for w, v in word_freq.items() if v >= freq_cutoff]
        print('number of word types: {}, number of word types w/ frequency >= {}: {}'
              .format(len(word_freq), freq_cutoff, len(valid_words)))
        top_k_words = sorted(valid_words, key=lambda w: word_freq[w], reverse=True)[:size]
        for word in top_k_words:
            vocab_entry.add(word)
        return vocab_entry

#### unit test

In [21]:
# Check Vocab Object
lang = Vocab()
print('Check if <eos> token is inside vocab:')
print('<eos>' in lang) #__contains__#
print('Length of vocab:{}'.format(len(lang))) # __len__
print('Adding a new word')
lang.add('new') # add new entry 'add' method
print(lang) #__repr__
print('The index of "new" is:{}'.format(lang.word2id['new']))
## 
print('Generate a vocab with the from_corpus static method:')
en_vocab = Vocab.from_corpus(post_sents, 100)
print('The token for the word "the" is:{}'.format(en_vocab.word2id['the']))

Check if <eos> token is inside vocab:
True
Length of vocab:4
Adding a new word
Vocabulary[size=5]
The index of "new" is:4
Generate a vocab with the from_corpus static method:
number of word types: 14179, number of word types w/ frequency >= 2: 7568
The token for the word "the" is:12


In [22]:
# Save and load vocab
vocab_file = 'vocab.json'
size = 7500 # number of word tokens in vocab
freq_cutoff= 2
vocab = Vocab.from_corpus(pre_sents, size, freq_cutoff)
print('generated vocabulary, %d words' % (len(vocab)))

vocab.save(vocab_file)
print('vocabulary saved to %s' % vocab_file)
en_vocab = Vocab.load(vocab_file)

number of word types: 14125, number of word types w/ frequency >= 2: 7620
generated vocabulary, 7504 words
vocabulary saved to vocab.json
Vocabulary[size=7504] loaded!


In [23]:
len(en_vocab)

7504

### train-test split

In [24]:
# load data
train_data, valid_data = train_test_split(data, test_size = 0.15, random_state = 42)
#
print("=="*40)
print("Number of examples in train: {}".format(len(train_data)))
print("Number of examples in valid: {}".format(len(valid_data)))
#
print("=="*40)
print("Chatbot")
query, answer = next(iter(train_data))
print("Q: {}".format(' '.join(query)))
print("A: {}".format(' '.join(answer)))
print("=="*40)

Number of examples in train: 31224
Number of examples in valid: 5511
Chatbot
Q: rachel :  uh - huh . 
A: <sos> ross :  we live together .  you are having our baby .  i amnot gonna see anybody else .  are you - are you sure you don’t want something more ?  <eos>


### Embeddings

In [0]:
class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """
    def __init__(self, vocab, embed_size=256):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        """
        super(ModelEmbeddings, self).__init__()
        self.embed_size = embed_size

        pad_token_idx = vocab['<pad>']
        
        self.LUT = nn.Embedding(len(vocab), embed_size, padding_idx = pad_token_idx)

    def forward(self, x):
      """ Get the embedding for x
      @ param x: tokes (batch, seq_len)
      @ return x_embed (batch, seq_len, x_embed)
      """
      x_embed = self.LUT(x)*math.sqrt(self.embed_size)
      return x_embed

In [0]:
embed_size = 256
vocab = en_vocab
en_embeddings = ModelEmbeddings(vocab, embed_size)

### Encoder

In [0]:
class ChatbotEncoder(nn.Module):
  """ encodes query sentence """
  def __init__(self, en_embeddings, hidden_size, dropout_rate):
    super(ChatbotEncoder, self).__init__()

    # attributes
    self.hidden_size = hidden_size
    embed_size = en_embeddings.embed_size

    # layers
    self.embed = en_embeddings
    self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.dropout = nn.Dropout(dropout_rate)
    self.h_projection= nn.Linear(2*hidden_size, hidden_size, bias=False)
    self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)

  def generate_mask(self, source_padded, source_lengths):
    """ 
    Generate sentence masks for encoder hidden states. Applies mask for padded inputs.
    param:
    source_padded: tensor (max_len, batch)
    source_lengths: List containing lengths of input sentences

    returns:
    enc_masks: Tensor (batch, src_len)
    """
    

    max_len, batch = source_padded.shape[0], source_padded.shape[1]
    enc_masks = torch.zeros(batch, max_len, dtype = torch.float)
    
    for e_id, length in enumerate(source_lengths):
      enc_masks[e_id, length: ] = 1

    return enc_masks.bool().to(self.device)

    
  def forward(self, source_padded, source_lengths):
    """ 
    Apply the encoder to source sentences to obtain encoder hidden states.
    Take the final states of the encoder and project them to obtain initial states for decoder.
    params:
    source_padded: Tensor (src_len, b)
    source_lengths: List containing original sentence lengths 
    """
    
    enc_masks = self.generate_mask(source_padded, source_lengths) # (batch, src_len) # to be used in decoder
    X = self.embed(source_padded) #(src_len, b, embed_size)
    X = pack_padded_sequence(X, source_lengths)
    enc_hiddens, (last_hidden,last_cell) = self.lstm(X) #(h0,c0) defaults to zero
    enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)
    # enc_hiddens: (batch, src_len, num_directions*hidden_size)
    # last_hidden: (num_dim*n_layes, batch, hidden_size)
    last_hidden = torch.cat((last_hidden[0,:],last_hidden[1,:]),1) #(batch, 2*hidden_size)
    last_cell = torch.cat((last_cell[0,:],last_cell[1,:]),1) #(batch, 2*hidden_size)
    #set_trace()
    init_decoder_hidden = self.h_projection(last_hidden) #(batch, hidden_size)
    init_decoder_cell = self.c_projection(last_cell) #(batch, hidden_size)
    dec_init_state = (init_decoder_hidden, init_decoder_cell)

    return enc_hiddens, enc_masks, dec_init_state

  @property
  def device(self) -> torch.device:
      """ Determine which device to place the Tensors upon, CPU or GPU.
      """
      return self.embed.LUT.weight.device

In [0]:
encoder = ChatbotEncoder(en_embeddings, hidden_size = 128, dropout_rate = 0.3)

In [0]:
query = [['i', 'am', 'going', 'there', 'today'], ['it', 'is', 'not', 'possible', 'to', 'do', 'this', 'in', 'such', 'short', 'time'], ['hey', 'amigo']]
query_sorted = sorted(query, key= lambda e: len(e), reverse=True)
query_lengths = [len(sent) for sent in query_sorted]
query_padded = en_vocab.to_input_tensor(query_sorted, device = 'cpu') # (src_len, batch)

In [30]:
query_padded

tensor([[  23,    9,   50],
        [  24,   59,    3],
        [  31,  100,    0],
        [1433,   82,    0],
        [  13,  315,    0],
        [  33,    0,    0],
        [  35,    0,    0],
        [  38,    0,    0],
        [ 401,    0,    0],
        [1315,    0,    0],
        [ 115,    0,    0]])

In [0]:
enc_hiddens, enc_masks, dec_init_state = encoder(query_padded, query_lengths)

In [32]:
print(enc_masks)

tensor([[False, False, False, False, False, False, False, False, False, False,
         False],
        [False, False, False, False, False,  True,  True,  True,  True,  True,
          True],
        [False, False,  True,  True,  True,  True,  True,  True,  True,  True,
          True]])


In [33]:
enc_hiddens.shape

torch.Size([3, 11, 256])

In [34]:
dec_init_state[0].shape

torch.Size([3, 128])

In [35]:
encoder.device

device(type='cpu')

### Attention Mechanism

In [0]:
class GlobalAttention(nn.Module):
  """
  Performs global attention mechanism
  """
  def __init__(self, hidden_size):
    super(GlobalAttention, self).__init__()

    self.hidden_size = hidden_size
    
    # multipolicative attention
    self.mult_atten = nn.Linear(hidden_size, hidden_size)

  
  def forward(self, enc_hiddens_proj, dec_hidden, enc_masks =None):
    """
    Return the softmax normalized probability scores
    
    params:
    enc_hiddens_proj: Tensor (batch, src_len, hidden_size)
    dec_hiddens: Tensor (batch, hidden_size)
    enc_masks: Bool Tensor (batch, src_len)

    Return:
    alpha_t: Tensor (batch, src_len)
    """
    assert self.hidden_size == dec_hidden.shape[1], \
    "Decoder LSTM hidden size and Linear layer output of mult_atten size must match"

    assert self.hidden_size == enc_hiddens_proj.shape[2], \
    "Encoder output projection size and Linear layer output of mult_atten size must match"
    aug_dec_hidden = torch.unsqueeze(dec_hidden, dim=2) #(batch, hidden_size, 1)
    mul_enc_proj = self.mult_atten(enc_hiddens_proj) # (batch, src_len, hidden_size)
    e_t = torch.bmm(mul_enc_proj, aug_dec_hidden) 
    # (batch, src_len, hidden_size) * (batch,hidden_size, 1) --> (batch, src_len, 1)
    e_t = torch.squeeze(e_t, dim=2) #(batch, src_len)

    # masked attention
    if enc_masks is not None:
      e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))
    # compute attention scores
    alpha_t = F.softmax(e_t, dim=1) #(batch, src_len)
    
    return alpha_t
    

In [0]:
attn_mech = GlobalAttention(hidden_size= 5)

In [0]:
enc_hiddens_proj = torch.randn((3, 2, 5))
dec_hidden = torch.randn((3, 5))
alpha = attn_mech(enc_hiddens_proj, dec_hidden)

In [39]:
alpha

tensor([[0.1764, 0.8236],
        [0.4260, 0.5740],
        [0.2992, 0.7008]], grad_fn=<SoftmaxBackward>)

### Decoder

In [0]:
class ChatbotDecoder(nn.Module):
  def __init__(self, en_embeddings, hidden_size, attn_mech, dropout_rate):
    super(ChatbotDecoder, self).__init__()
    
    # attributes
    self.hidden_size = hidden_size
    embed_size = en_embeddings.embed_size
    self.attention = attn_mech
    
    # layers
    self.embed = en_embeddings
    self.lstm_decode = nn.LSTMCell(embed_size + hidden_size, hidden_size)
    self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False)
    self.dropout = nn.Dropout(dropout_rate)
    

  def forward(self, enc_hiddens, enc_masks, dec_init_state, target_padded):
    """
    params:
    target_padded: Tensor (tgt_len, batch)
    """
    # Chop of the <eos> token for max length sentences
    target_padded = target_padded[:-1] # (tgt_len-1, batch)
    # Initialize the decoder state (hidden and cell)
    dec_state = dec_init_state
    # Initialize previous combined output vector o_{t-1} as zero
    batch_size = enc_hiddens.shape[0]
    o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)
    # Initialize a list we will use to collect the combined output o_t on each step
    combined_outputs = []

    # compute enc_hiddens projection
    enc_hiddens_proj = self.att_projection(enc_hiddens) # (batch, src_len, hidden_size)

    # teacher-forcing
    Y = self.embed(target_padded) # (tgt_len, batch, embed_size)
    Y_splited = torch.split(Y, 1, dim=0) # tuple of size tgt_len containing Tensors of shape (1, batch, embed_size)
    tgt_len = target_padded.shape[0]
    #
    for i in range(tgt_len):
      Y_t = Y_splited[i] # Tensor (1, batch, embed_size)
      Y_t = torch.squeeze(Y_t, dim = 0) # Tensor (batch, embed_size)
      Ybar_t = torch.cat((Y_t, o_prev), dim = 1) # Tensor (batch, embed_size + hidden_size)
      dec_state = self.lstm_decode(Ybar_t, dec_state) # tuple of (dec_hidden, dec_cell) Tensors of shape (batch, hidden_size)
      dec_hidden, dec_cell = dec_state # (batch, hidden_size)

      # Global Attention Mechanisms
      alpha_t = self.attention(enc_hiddens_proj, dec_hidden, enc_masks) 
      
      # Compute attention vector
      aug_att = torch.unsqueeze(alpha_t,2) #(batch, src_len, 1)
      tr_hiddens = enc_hiddens.transpose(1,2) #(batch, hidden_size*2, src_len)
      a_t = torch.bmm(tr_hiddens, aug_att) #(batch, 2*hidden_size, 1)
      a_t = torch.squeeze(a_t, dim=2) #(batch, 2*hidden_size) # Attention vector
      
      # Compute combined output
      U_t = torch.cat((a_t, dec_hidden), dim=1) #(batch, 3*hidden_size)
      V_t = self.combined_output_projection(U_t) #(batch, hidden_size)
      O_t = self.dropout(torch.tanh(V_t)) # (batch, hidden_size)
      combined_outputs.append(O_t)

    # compute overall combined outputs
    combined_outputs = torch.stack(combined_outputs)  # (tgt_len, batch, hidden_size)
    return combined_outputs

  @property
  def device(self) -> torch.device:
      """ Determine which device to place the Tensors upon, CPU or GPU.
      """
      return self.embed.LUT.weight.device
   

In [0]:
attn_mech = GlobalAttention(128)
decoder = ChatbotDecoder(en_embeddings, 128, attn_mech, 0.9)

In [0]:
target = [['<sos>', 'i', 'came', 'from', 'there', '<eos>'], ['<sos>', 'it', 'is', 'not', 'possible', '<eos>'], ['<sos>', '<eos>']]
target_padded = en_vocab.to_input_tensor(target, device = 'cpu') # (src_len, batch)

In [0]:
combined_outputs = decoder(enc_hiddens, enc_masks, dec_init_state, target_padded)

In [44]:
combined_outputs.shape

torch.Size([5, 3, 128])

### Chatbot

In [0]:
class Chatbot(nn.Module):
  """build chatbot with encode and decoder """
  def __init__(self, vocab, embed_size = 256, hidden_size = 256, dropout_rate = 0.3):
    super(Chatbot, self).__init__()

    # attributes
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # layers
    self.master_embeddings = ModelEmbeddings(vocab, embed_size)
    self.encoder = ChatbotEncoder(self.master_embeddings, hidden_size, dropout_rate)
    self.attn_mech = GlobalAttention(hidden_size)
    self.decoder = ChatbotDecoder(self.master_embeddings, hidden_size, self.attn_mech, dropout_rate)
    self.vocab_projection = nn.Linear(hidden_size, len(vocab), bias=False)
    self.dropout = nn.Dropout(dropout_rate)


  def forward(self, query, target):
    """
    computes loss during training
    """
    query_lengths = [len(s) for s in query]
    query_lengths = [min(x,100) for x in query_lengths]

    # Convert list of lists into tensors
    query_padded = self.vocab.to_input_tensor(query, device=self.device)   # Tensor: (src_len, batch)
    target_padded = self.vocab.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, batch)

    # encoder
    enc_hiddens, enc_masks, dec_init_state = self.encoder(query_padded, query_lengths)
    # decoder
    combined_outputs = self.decoder(enc_hiddens, enc_masks, dec_init_state, target_padded)
    # predicted target words
    pred_scores_raw = self.vocab_projection(combined_outputs)
    # softmax probabilities
    P = F.log_softmax(pred_scores_raw, dim=-1)
    
    # Zero out, probabilities for which we have nothing in the target text
    target_masks = (target_padded != self.vocab['<pad>']).float()

    # Compute log probability of generating true target words
    target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
    scores = target_gold_words_log_prob.sum(dim=0)
    return scores
  
  @property
  def device(self) -> torch.device:
      """ Determine which device to place the Tensors upon, CPU or GPU.
      """
      return self.master_embeddings.LUT.weight.device

  @staticmethod
  def load(model_path: str):
      """ Load the model from a file.
      @param model_path (str): path to model
      """
      params = torch.load(model_path, map_location=lambda storage, loc: storage)
      args = params['args']
      model = Chatbot(vocab=params['vocab'], **args)
      model.load_state_dict(params['state_dict'])

      return model

  def save(self, path: str):
      """ Save the odel to a file.
      @param path (str): path to the model
      """
      print('save model parameters to [%s]' % path, file=sys.stderr)

      params = {
          'args': dict(embed_size=self.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate),
          'vocab': self.vocab,
          'state_dict': self.state_dict()
      }

      torch.save(params, path)

    

In [0]:
chatbot = Chatbot(en_vocab)

In [0]:
scores = chatbot(query_sorted, target)

In [48]:
scores

tensor([-44.5802, -44.9359,  -9.0270], grad_fn=<SumBackward1>)

### Data Loader


In [0]:
# generate batches for taining

def batch_iter(data, batch_size, shuffle=False):
    """ Yield batches of source and target sentences reverse sorted by length (largest to smallest).
    @param data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    """
    batch_num = math.ceil(len(data) / batch_size)
    index_array = list(range(len(data)))

    if shuffle:
        np.random.shuffle(index_array)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = [data[idx] for idx in indices]

        examples = sorted(examples, key=lambda e: len(e[0]), reverse=True)
        src_sents = [e[0] for e in examples]
        tgt_sents = [e[1] for e in examples]

        yield src_sents, tgt_sents

### Evaluation metric

In [0]:
## Compute Perplexity to keep track of training
def evaluate_ppl(model, dev_data, batch_size=16):
    """ Evaluate perplexity on dev sentences
    @param model (Chatbot): Chatbot Model
    @param dev_data (list of (src_sent, tgt_sent)): list of tuples containing source and target sentence
    @param batch_size (batch size)
    @returns ppl (perplixty on dev sentences)
    """
    was_training = model.training
    model.eval()

    cum_loss = 0.
    cum_tgt_words = 0.

    # no_grad() signals backend to throw away all gradients
    with torch.no_grad():
        for src_sents, tgt_sents in batch_iter(dev_data, batch_size):
            loss = -model(src_sents, tgt_sents).sum()

            cum_loss += loss.item()
            tgt_word_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            cum_tgt_words += tgt_word_num_to_predict

        ppl = np.exp(cum_loss / cum_tgt_words)

    if was_training:
        model.train()

    return ppl

### Training Loop

In [0]:
######## Train Model ########

model_save_path = 'Chatbot'

##
def train_model(model, optimizer, clip_grad =5.0, max_epoch =30, max_patience = 3, 
                max_trial = 3, lr_decay = 0.5, train_batch_size = 128, log_every = 50, valid_niter = 200):
  
  
  print('Training begins...')
  ## Temp variables
  num_trial = 0
  train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
  cum_examples = report_examples  = valid_num = 0
  hist_valid_scores = []
  train_time = begin_time = time.time()
  
  # put the model in training mode
  model.train()
  
  
  # iterate over the epochs
  for epoch in range(max_epoch):
    for src_sents, tgt_sents in batch_iter(train_data, batch_size=train_batch_size, shuffle=True):
        
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(src_sents)
        
        example_losses = -model(src_sents, tgt_sents)
        batch_loss = example_losses.sum()
        loss = batch_loss/batch_size
        loss.backward() # autograd
        
        # Clip gradient
        grad_norn = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step() # update parameters
        
        batch_losses_val = batch_loss.item()
        report_loss += batch_losses_val
        cum_loss += batch_losses_val
        
        tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
        report_tgt_words += tgt_words_num_to_predict
        cum_tgt_words += tgt_words_num_to_predict
        report_examples += batch_size
        cum_examples += batch_size
        
        # print interim report about training
        
        if train_iter % log_every == 0:
            #set_trace()
            print('| Epoch %d, Iter %d| Avg Loss = %.2f| Avg. ppl = %.2f| Speed %.2f words/sec| Time %.2f min|' 
                  % (epoch+1, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words),
                          report_tgt_words / (time.time() - train_time), (time.time() - begin_time)/60.0))

            train_time = time.time()
            report_loss = report_tgt_words = report_examples = 0.
        
        # validation
        if train_iter % valid_niter == 0:
            
            print('| <Train Summary> | Epoch %d, Iter %d| Cum. loss = %.2f| Cum. ppl = %.2f|' 
                  % (epoch+1, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words)))

            cum_loss = cum_examples = cum_tgt_words = 0.
            valid_num += 1

            print('Report on validation set:', file=sys.stderr)

            # compute dev. ppl and bleu
            dev_ppl = evaluate_ppl(model, valid_data, batch_size=128)   # dev batch size can be a bit larger
            valid_metric = -dev_ppl

            print('Validation:  Dev. ppl = %f' % (dev_ppl), file=sys.stderr)

            
            # learning rate scheduling
            
            is_better = (len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores))
            hist_valid_scores.append(valid_metric)

            if is_better:
                patience = 0
                print('Save currently the best model to [%s]' % model_save_path, file=sys.stderr)
                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
                
            elif patience < int(max_patience):
                patience += 1
                print('Hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('Hit #%d trial' % num_trial, file=sys.stderr)
                    
                    if num_trial == int(max_trial):
                        print('early stop!', file=sys.stderr)
                        return

                    # decay lr, and restore from previously best checkpoint
                    lr = optimizer.param_groups[0]['lr'] * float(lr_decay)
                    print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                    # reset patience
                    patience = 0

            if epoch +1 == int(max_epoch):
                print('Training stopped <-> Reached maximum number of epochs!', file=sys.stderr)
                return

### Model initialization

In [0]:
model = Chatbot(en_vocab)
## Model in training mode
model.train();

In [53]:
## Parameter Initialization
uniform_init = 0.1

def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -uniform_init, uniform_init)
        
model.apply(init_weights);
# Count total parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

# Use Adam Optimizaer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# transfer model to cuda if available
device = torch.device("cuda:0" if torch.cuda.device_count()>0 else "cpu")
print('use device: %s' % device)
model = model.to(device)

The model has 6,338,816 trainable parameters
use device: cuda:0


### Training

In [0]:
# train parameters
max_epoch = 10
train_batch_size = 64

# train the model
train_model(model, optimizer, max_epoch = max_epoch, train_batch_size = train_batch_size)

Training begins...
| Epoch 1, Iter 50| Avg Loss = 107.15| Avg. ppl = 118.59| Speed 3543.51 words/sec| Time 0.34 min|
| Epoch 1, Iter 100| Avg Loss = 83.25| Avg. ppl = 36.99| Speed 3635.75 words/sec| Time 0.68 min|
| Epoch 1, Iter 150| Avg Loss = 75.90| Avg. ppl = 27.48| Speed 3589.32 words/sec| Time 1.02 min|
| Epoch 1, Iter 200| Avg Loss = 74.39| Avg. ppl = 21.18| Speed 3713.16 words/sec| Time 1.37 min|
| <Train Summary> | Epoch 1, Iter 200| Cum. loss = 85.17| Cum. ppl = 39.35|


Report on validation set:
Validation:  Dev. ppl = 21.943671
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 1, Iter 250| Avg Loss = 71.73| Avg. ppl = 20.54| Speed 2533.10 words/sec| Time 1.87 min|
| Epoch 1, Iter 300| Avg Loss = 70.35| Avg. ppl = 22.44| Speed 3575.69 words/sec| Time 2.20 min|
| Epoch 1, Iter 350| Avg Loss = 69.66| Avg. ppl = 20.52| Speed 3601.92 words/sec| Time 2.54 min|
| Epoch 1, Iter 400| Avg Loss = 70.13| Avg. ppl = 16.61| Speed 3835.78 words/sec| Time 2.89 min|
| <Train Summary> | Epoch 1, Iter 400| Cum. loss = 70.47| Cum. ppl = 19.83|


Report on validation set:
Validation:  Dev. ppl = 18.744193
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 1, Iter 450| Avg Loss = 67.67| Avg. ppl = 16.63| Speed 2532.65 words/sec| Time 3.40 min|
| Epoch 2, Iter 500| Avg Loss = 68.69| Avg. ppl = 19.11| Speed 3578.52 words/sec| Time 3.74 min|
| Epoch 2, Iter 550| Avg Loss = 64.78| Avg. ppl = 17.00| Speed 3753.37 words/sec| Time 4.07 min|
| Epoch 2, Iter 600| Avg Loss = 64.86| Avg. ppl = 17.20| Speed 3588.61 words/sec| Time 4.41 min|
| <Train Summary> | Epoch 2, Iter 600| Cum. loss = 66.50| Cum. ppl = 17.46|


Report on validation set:
Validation:  Dev. ppl = 17.454501
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 2, Iter 650| Avg Loss = 63.65| Avg. ppl = 16.49| Speed 2360.03 words/sec| Time 4.92 min|
| Epoch 2, Iter 700| Avg Loss = 64.50| Avg. ppl = 16.94| Speed 3464.54 words/sec| Time 5.27 min|
| Epoch 2, Iter 750| Avg Loss = 63.30| Avg. ppl = 16.55| Speed 3598.84 words/sec| Time 5.61 min|
| Epoch 2, Iter 800| Avg Loss = 66.75| Avg. ppl = 14.60| Speed 3715.08 words/sec| Time 5.96 min|
| <Train Summary> | Epoch 2, Iter 800| Cum. loss = 64.55| Cum. ppl = 16.08|


Report on validation set:
Validation:  Dev. ppl = 16.741361
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 2, Iter 850| Avg Loss = 64.42| Avg. ppl = 12.88| Speed 2658.11 words/sec| Time 6.47 min|
| Epoch 2, Iter 900| Avg Loss = 64.19| Avg. ppl = 16.41| Speed 3635.67 words/sec| Time 6.81 min|
| Epoch 2, Iter 950| Avg Loss = 64.73| Avg. ppl = 15.17| Speed 3789.12 words/sec| Time 7.14 min|
| Epoch 3, Iter 1000| Avg Loss = 64.85| Avg. ppl = 14.61| Speed 3776.87 words/sec| Time 7.48 min|
| <Train Summary> | Epoch 3, Iter 1000| Cum. loss = 64.55| Cum. ppl = 14.67|


Report on validation set:
Validation:  Dev. ppl = 16.149976
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 3, Iter 1050| Avg Loss = 60.11| Avg. ppl = 14.09| Speed 2373.04 words/sec| Time 7.99 min|
| Epoch 3, Iter 1100| Avg Loss = 61.48| Avg. ppl = 14.23| Speed 3587.88 words/sec| Time 8.34 min|
| Epoch 3, Iter 1150| Avg Loss = 61.64| Avg. ppl = 13.58| Speed 3763.18 words/sec| Time 8.67 min|
| Epoch 3, Iter 1200| Avg Loss = 61.20| Avg. ppl = 14.35| Speed 3647.94 words/sec| Time 9.01 min|
| <Train Summary> | Epoch 3, Iter 1200| Cum. loss = 61.11| Cum. ppl = 14.06|


Report on validation set:
Validation:  Dev. ppl = 16.007677
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 3, Iter 1250| Avg Loss = 61.04| Avg. ppl = 14.24| Speed 2395.46 words/sec| Time 9.52 min|
| Epoch 3, Iter 1300| Avg Loss = 60.24| Avg. ppl = 11.09| Speed 3995.07 words/sec| Time 9.85 min|
| Epoch 3, Iter 1350| Avg Loss = 61.85| Avg. ppl = 11.75| Speed 3847.78 words/sec| Time 10.20 min|
| Epoch 3, Iter 1400| Avg Loss = 61.87| Avg. ppl = 14.89| Speed 3594.53 words/sec| Time 10.54 min|
| <Train Summary> | Epoch 3, Iter 1400| Cum. loss = 61.25| Cum. ppl = 12.82|


Report on validation set:
Validation:  Dev. ppl = 15.765635
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 3, Iter 1450| Avg Loss = 60.97| Avg. ppl = 14.20| Speed 2403.45 words/sec| Time 11.05 min|
| Epoch 4, Iter 1500| Avg Loss = 58.42| Avg. ppl = 12.58| Speed 3554.15 words/sec| Time 11.40 min|
| Epoch 4, Iter 1550| Avg Loss = 57.72| Avg. ppl = 11.00| Speed 3938.36 words/sec| Time 11.72 min|
| Epoch 4, Iter 1600| Avg Loss = 58.72| Avg. ppl = 11.64| Speed 3715.65 words/sec| Time 12.07 min|
| <Train Summary> | Epoch 4, Iter 1600| Cum. loss = 58.96| Cum. ppl = 12.28|


Report on validation set:
Validation:  Dev. ppl = 15.749348
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 4, Iter 1650| Avg Loss = 57.23| Avg. ppl = 12.49| Speed 2321.55 words/sec| Time 12.59 min|
| Epoch 4, Iter 1700| Avg Loss = 58.16| Avg. ppl = 12.98| Speed 3743.21 words/sec| Time 12.91 min|
| Epoch 4, Iter 1750| Avg Loss = 58.27| Avg. ppl = 11.45| Speed 3652.58 words/sec| Time 13.26 min|
| Epoch 4, Iter 1800| Avg Loss = 59.13| Avg. ppl = 13.12| Speed 3580.76 words/sec| Time 13.60 min|
| <Train Summary> | Epoch 4, Iter 1800| Cum. loss = 58.20| Cum. ppl = 12.48|


Report on validation set:
Validation:  Dev. ppl = 15.621592
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 4, Iter 1850| Avg Loss = 59.42| Avg. ppl = 12.21| Speed 2442.71 words/sec| Time 14.12 min|
| Epoch 4, Iter 1900| Avg Loss = 59.26| Avg. ppl = 11.34| Speed 3946.99 words/sec| Time 14.45 min|
| Epoch 4, Iter 1950| Avg Loss = 59.23| Avg. ppl = 13.27| Speed 3590.88 words/sec| Time 14.79 min|
| Epoch 5, Iter 2000| Avg Loss = 56.19| Avg. ppl = 11.19| Speed 3585.72 words/sec| Time 15.14 min|
| <Train Summary> | Epoch 5, Iter 2000| Cum. loss = 58.53| Cum. ppl = 11.96|


Report on validation set:
Validation:  Dev. ppl = 15.620355
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 5, Iter 2050| Avg Loss = 56.61| Avg. ppl = 11.26| Speed 2424.96 words/sec| Time 15.65 min|
| Epoch 5, Iter 2100| Avg Loss = 56.70| Avg. ppl = 11.41| Speed 3599.87 words/sec| Time 15.99 min|
| Epoch 5, Iter 2150| Avg Loss = 55.11| Avg. ppl = 11.24| Speed 3530.03 words/sec| Time 16.34 min|
| Epoch 5, Iter 2200| Avg Loss = 55.68| Avg. ppl = 11.47| Speed 3532.32 words/sec| Time 16.68 min|
| <Train Summary> | Epoch 5, Iter 2200| Cum. loss = 56.03| Cum. ppl = 11.34|


Report on validation set:
Validation:  Dev. ppl = 15.627443
Hit patience 1


| Epoch 5, Iter 2250| Avg Loss = 56.91| Avg. ppl = 9.94| Speed 2561.98 words/sec| Time 17.20 min|
| Epoch 5, Iter 2300| Avg Loss = 56.14| Avg. ppl = 11.80| Speed 3680.10 words/sec| Time 17.53 min|
| Epoch 5, Iter 2350| Avg Loss = 56.71| Avg. ppl = 9.83| Speed 3867.91 words/sec| Time 17.87 min|
| Epoch 5, Iter 2400| Avg Loss = 56.02| Avg. ppl = 11.99| Speed 3695.13 words/sec| Time 18.20 min|
| <Train Summary> | Epoch 5, Iter 2400| Cum. loss = 56.45| Cum. ppl = 10.80|


Report on validation set:
Validation:  Dev. ppl = 15.540236
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 6, Iter 2450| Avg Loss = 54.63| Avg. ppl = 10.01| Speed 2514.24 words/sec| Time 18.70 min|
| Epoch 6, Iter 2500| Avg Loss = 52.90| Avg. ppl = 8.85| Speed 3862.99 words/sec| Time 19.03 min|
| Epoch 6, Iter 2550| Avg Loss = 52.71| Avg. ppl = 9.22| Speed 3697.61 words/sec| Time 19.38 min|
| Epoch 6, Iter 2600| Avg Loss = 54.45| Avg. ppl = 10.49| Speed 3627.40 words/sec| Time 19.72 min|
| <Train Summary> | Epoch 6, Iter 2600| Cum. loss = 53.67| Cum. ppl = 9.61|


Report on validation set:
Validation:  Dev. ppl = 15.843303
Hit patience 1


| Epoch 6, Iter 2650| Avg Loss = 54.08| Avg. ppl = 10.55| Speed 2437.36 words/sec| Time 20.22 min|
| Epoch 6, Iter 2700| Avg Loss = 55.25| Avg. ppl = 10.53| Speed 3681.11 words/sec| Time 20.56 min|
| Epoch 6, Iter 2750| Avg Loss = 54.23| Avg. ppl = 10.69| Speed 3591.27 words/sec| Time 20.90 min|
| Epoch 6, Iter 2800| Avg Loss = 55.49| Avg. ppl = 10.88| Speed 3554.68 words/sec| Time 21.25 min|
| <Train Summary> | Epoch 6, Iter 2800| Cum. loss = 54.76| Cum. ppl = 10.66|


Report on validation set:
Validation:  Dev. ppl = 15.808691
Hit patience 2


| Epoch 6, Iter 2850| Avg Loss = 53.67| Avg. ppl = 9.40| Speed 2542.07 words/sec| Time 21.75 min|
| Epoch 6, Iter 2900| Avg Loss = 54.46| Avg. ppl = 9.51| Speed 3860.67 words/sec| Time 22.08 min|
| Epoch 7, Iter 2950| Avg Loss = 53.07| Avg. ppl = 9.94| Speed 3561.85 words/sec| Time 22.43 min|
| Epoch 7, Iter 3000| Avg Loss = 50.39| Avg. ppl = 9.19| Speed 3624.63 words/sec| Time 22.76 min|
| <Train Summary> | Epoch 7, Iter 3000| Cum. loss = 52.90| Cum. ppl = 9.51|


Report on validation set:
Validation:  Dev. ppl = 16.186788
Hit patience 3
Hit #1 trial
load previously best model and decay learning rate to 0.000500
restore parameters of the optimizers


| Epoch 7, Iter 3050| Avg Loss = 51.71| Avg. ppl = 7.50| Speed 2707.55 words/sec| Time 23.27 min|
| Epoch 7, Iter 3100| Avg Loss = 51.99| Avg. ppl = 10.00| Speed 3669.62 words/sec| Time 23.60 min|
| Epoch 7, Iter 3150| Avg Loss = 53.83| Avg. ppl = 9.37| Speed 3576.78 words/sec| Time 23.96 min|
| Epoch 7, Iter 3200| Avg Loss = 52.98| Avg. ppl = 10.11| Speed 3587.59 words/sec| Time 24.30 min|
| <Train Summary> | Epoch 7, Iter 3200| Cum. loss = 52.63| Cum. ppl = 9.13|


Report on validation set:
Validation:  Dev. ppl = 15.557750
Hit patience 1


| Epoch 7, Iter 3250| Avg Loss = 51.90| Avg. ppl = 9.94| Speed 2386.95 words/sec| Time 24.80 min|
| Epoch 7, Iter 3300| Avg Loss = 53.00| Avg. ppl = 8.83| Speed 3829.29 words/sec| Time 25.14 min|
| Epoch 7, Iter 3350| Avg Loss = 53.79| Avg. ppl = 10.19| Speed 3742.18 words/sec| Time 25.47 min|
| Epoch 7, Iter 3400| Avg Loss = 53.82| Avg. ppl = 10.23| Speed 3684.72 words/sec| Time 25.81 min|
| <Train Summary> | Epoch 7, Iter 3400| Cum. loss = 53.13| Cum. ppl = 9.76|


Report on validation set:
Validation:  Dev. ppl = 15.548370
Hit patience 2


| Epoch 8, Iter 3450| Avg Loss = 51.85| Avg. ppl = 9.40| Speed 2425.42 words/sec| Time 26.31 min|
| Epoch 8, Iter 3500| Avg Loss = 51.47| Avg. ppl = 9.27| Speed 3534.28 words/sec| Time 26.66 min|
| Epoch 8, Iter 3550| Avg Loss = 50.75| Avg. ppl = 8.10| Speed 3778.06 words/sec| Time 27.00 min|
| Epoch 8, Iter 3600| Avg Loss = 50.76| Avg. ppl = 9.25| Speed 3594.48 words/sec| Time 27.34 min|
| <Train Summary> | Epoch 8, Iter 3600| Cum. loss = 51.21| Cum. ppl = 8.97|


Report on validation set:
Validation:  Dev. ppl = 15.853365
Hit patience 3
Hit #2 trial
load previously best model and decay learning rate to 0.000250
restore parameters of the optimizers


| Epoch 8, Iter 3650| Avg Loss = 53.17| Avg. ppl = 9.00| Speed 2539.04 words/sec| Time 27.85 min|
| Epoch 8, Iter 3700| Avg Loss = 53.04| Avg. ppl = 9.35| Speed 3722.53 words/sec| Time 28.19 min|
| Epoch 8, Iter 3750| Avg Loss = 52.49| Avg. ppl = 9.64| Speed 3603.05 words/sec| Time 28.53 min|
| Epoch 8, Iter 3800| Avg Loss = 51.38| Avg. ppl = 8.56| Speed 3725.49 words/sec| Time 28.88 min|
| <Train Summary> | Epoch 8, Iter 3800| Cum. loss = 52.52| Cum. ppl = 9.13|


Report on validation set:
Validation:  Dev. ppl = 15.398778
Save currently the best model to [Chatbot]
save model parameters to [Chatbot]


| Epoch 8, Iter 3850| Avg Loss = 52.10| Avg. ppl = 9.25| Speed 2506.65 words/sec| Time 29.37 min|
| Epoch 8, Iter 3900| Avg Loss = 52.70| Avg. ppl = 9.90| Speed 3544.02 words/sec| Time 29.72 min|
| Epoch 9, Iter 3950| Avg Loss = 49.89| Avg. ppl = 9.29| Speed 3421.48 words/sec| Time 30.07 min|
| Epoch 9, Iter 4000| Avg Loss = 52.14| Avg. ppl = 9.41| Speed 3554.39 words/sec| Time 30.42 min|
| <Train Summary> | Epoch 9, Iter 4000| Cum. loss = 51.71| Cum. ppl = 9.46|


Report on validation set:
Validation:  Dev. ppl = 15.505146
Hit patience 1


| Epoch 9, Iter 4050| Avg Loss = 51.29| Avg. ppl = 9.32| Speed 2401.64 words/sec| Time 30.93 min|
| Epoch 9, Iter 4100| Avg Loss = 51.84| Avg. ppl = 7.25| Speed 4163.84 words/sec| Time 31.26 min|
| Epoch 9, Iter 4150| Avg Loss = 51.96| Avg. ppl = 9.59| Speed 3650.21 words/sec| Time 31.60 min|
| Epoch 9, Iter 4200| Avg Loss = 50.09| Avg. ppl = 9.27| Speed 3648.19 words/sec| Time 31.93 min|
| <Train Summary> | Epoch 9, Iter 4200| Cum. loss = 51.29| Cum. ppl = 8.75|


Report on validation set:
Validation:  Dev. ppl = 15.556466
Hit patience 2


| Epoch 9, Iter 4250| Avg Loss = 51.83| Avg. ppl = 8.05| Speed 2594.20 words/sec| Time 32.44 min|
| Epoch 9, Iter 4300| Avg Loss = 51.55| Avg. ppl = 9.59| Speed 3745.79 words/sec| Time 32.76 min|
| Epoch 9, Iter 4350| Avg Loss = 53.33| Avg. ppl = 9.60| Speed 3562.13 words/sec| Time 33.12 min|
| Epoch 10, Iter 4400| Avg Loss = 51.87| Avg. ppl = 9.36| Speed 3530.25 words/sec| Time 33.47 min|
| <Train Summary> | Epoch 10, Iter 4400| Cum. loss = 52.15| Cum. ppl = 9.11|


Report on validation set:
Validation:  Dev. ppl = 15.574506
Hit patience 3
Hit #3 trial
early stop!


### Model reload

In [54]:
# load model
params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
model.load_state_dict(params['state_dict'])
model = model.to(device)

print('restore parameters of the optimizers', file=sys.stderr)
optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
model.eval()

restore parameters of the optimizers


Chatbot(
  (master_embeddings): ModelEmbeddings(
    (LUT): Embedding(7504, 256, padding_idx=0)
  )
  (encoder): ChatbotEncoder(
    (embed): ModelEmbeddings(
      (LUT): Embedding(7504, 256, padding_idx=0)
    )
    (lstm): LSTM(256, 256, bidirectional=True)
    (dropout): Dropout(p=0.3, inplace=False)
    (h_projection): Linear(in_features=512, out_features=256, bias=False)
    (c_projection): Linear(in_features=512, out_features=256, bias=False)
  )
  (attn_mech): GlobalAttention(
    (mult_atten): Linear(in_features=256, out_features=256, bias=True)
  )
  (decoder): ChatbotDecoder(
    (attention): GlobalAttention(
      (mult_atten): Linear(in_features=256, out_features=256, bias=True)
    )
    (embed): ModelEmbeddings(
      (LUT): Embedding(7504, 256, padding_idx=0)
    )
    (lstm_decode): LSTMCell(512, 256)
    (att_projection): Linear(in_features=512, out_features=256, bias=False)
    (combined_output_projection): Linear(in_features=768, out_features=256, bias=False)
    (d

In [55]:
# check validation ppl
evaluate_ppl(model, valid_data, batch_size=128)

15.398778353470794

### Beam search

In [0]:
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

In [0]:
class BeamSearch(nn.Module):
  def __init__(self, chatbot, beam_size = 5, max_steps = 70):
    
    super(BeamSearch, self).__init__()
    self.model = chatbot
    self.beam_size = beam_size
    self.max_steps = max_steps
  
  
  def forward(self, src_sent):
    
    src_sent = [word for word in src_sent.split(" ")]
    src_sents_var = self.model.vocab.to_input_tensor([src_sent], self.device)
    src_encodings, enc_masks, dec_init_vec = self.model.encoder(src_sents_var, [len(src_sent)])
    src_encodings_att_linear = self.model.decoder.att_projection(src_encodings)
    h_tm1 = dec_init_vec
    att_tm1 = torch.zeros(1, self.model.hidden_size, device=self.device)
    eos_id = self.model.vocab['<eos>']

    hypotheses = [['<sos>']]
    hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
    completed_hypotheses = []
    t = 0
    while len(completed_hypotheses) < self.beam_size and t < self.max_steps:
      t += 1
      hyp_num = len(hypotheses)
      exp_src_encodings = src_encodings.expand(hyp_num,
                                                src_encodings.size(1),
                                                src_encodings.size(2))

      exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                      src_encodings_att_linear.size(1),
                                                                      src_encodings_att_linear.size(2))

      y_tm1 = torch.tensor([self.model.vocab[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
      y_t_embed = self.model.decoder.embed(y_tm1)

      x = torch.cat([y_t_embed, att_tm1], dim=-1)
      
      #(h_t, cell_t), att_t, _  = self.step(x, h_tm1,
      #                                          exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)
      
      
      (h_t, cell_t) = self.model.decoder.lstm_decode(x, h_tm1) # tuple of (dec_hidden, dec_cell) Tensors of shape (batch, hidden_size)
      #dec_hidden, dec_cell = dec_state # (batch, hidden_size)

      # Global Attention Mechanisms
      alpha_t = self.model.decoder.attention(exp_src_encodings_att_linear, h_tm1[0]) 
      
      
      # Compute attention vector
      aug_att = torch.unsqueeze(alpha_t,2) #(batch, src_len, 1)
      tr_hiddens = exp_src_encodings.transpose(1,2) #(batch, hidden_size*2, src_len)
      a_t = torch.bmm(tr_hiddens, aug_att) #(batch, 2*hidden_size, 1)
      a_t = torch.squeeze(a_t, dim=2) #(batch, 2*hidden_size) # Attention vector
      
      # Compute combined output
      U_t = torch.cat((a_t, h_t), dim=1) #(batch, 3*hidden_size)
      V_t = self.model.decoder.combined_output_projection(U_t) #(batch, hidden_size)
      O_t = self.model.decoder.dropout(torch.tanh(V_t)) # (batch, hidden_size)




      
      # log probabilities over target words
      log_p_t = F.log_softmax(self.model.vocab_projection(O_t), dim=-1)
      
      live_hyp_num = self.beam_size - len(completed_hypotheses)
      contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
      top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)
      prev_hyp_ids = top_cand_hyp_pos / len(self.model.vocab)
      hyp_word_ids = top_cand_hyp_pos % len(self.model.vocab)
      
      new_hypotheses = []
      live_hyp_ids = []
      new_hyp_scores = []

      #set_trace()

      for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
        prev_hyp_id = prev_hyp_id.item()
        hyp_word_id = hyp_word_id.item()
        cand_new_hyp_score = cand_new_hyp_score.item()

        hyp_word = self.model.vocab.id2word[hyp_word_id]
        new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
        if hyp_word == '<eos>':
          completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
        else:
          new_hypotheses.append(new_hyp_sent)
          live_hyp_ids.append(prev_hyp_id)
          new_hyp_scores.append(cand_new_hyp_score)
      if len(completed_hypotheses) == self.beam_size:
        break
      live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
      h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
      att_tm1 = O_t[live_hyp_ids]
      
      hypotheses = new_hypotheses
      hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)
    
    
    if len(completed_hypotheses) == 0:
      completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
    completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
    return completed_hypotheses

  
  @property
  def device(self) -> torch.device:
      """ Determine which device to place the Tensors upon, CPU or GPU.
      """
      return self.model.master_embeddings.LUT.weight.device



In [58]:
beamBot = BeamSearch(model)
src_sent = 'monica: how are you?'
x = beamBot(src_sent)
x

[Hypothesis(value=['joey', ':', '', 'hey', '!', ''], score=-6.241628646850586)]

### Inference

In [79]:
####
Q = 'ross: ( entering room )'
A = beamBot(Q)

print("=="*40)
print(Q)
print("Bot:\n")
print('{}'.format(' '.join(A[0].value)))
print('{}'.format(' '.join(A[1].value)))
print("=="*40)

ross: ( entering room )
Bot:

joey :  hey ! 
joey :  hey ,  hey ! 


In [95]:
valid_data[idx][1]

['<sos>',
 'monica',
 ':',
 '',
 'you',
 'do',
 'not',
 'know',
 'that',
 '.',
 '',
 '<eos>']

In [96]:
####
idx = 4
Q = " ".join(valid_data[idx][0])
A_tr = " ".join(valid_data[idx][1])

# Q = 'rachel :  really ? '
A = beamBot(Q)

print("=="*40)
print(Q)
print("Bot:\n")
print('{}'.format(' '.join(A[0].value)))
print('{}'.format(A_tr))
print("=="*40)

chandler :  you know what just occurred to me ?  this could be our last thanksgiving just the two of us .  i mean ,  we could be getting a baby soon ! 
Bot:

monica :   ( entering )  hey ! 
<sos> monica :  you do not know that .  <eos>


In [61]:
####
Q = 'ross : why are you not ready yet'
A = beamBot(Q)

print("=="*40)
print(Q)
print("Bot:\n")
print('{}'.format(' '.join(A[0].value)))
print('{}'.format(' '.join(A[1].value)))
print("=="*40)

ross : why are you not ready yet
Bot:

phoebe :  i know . 
phoebe :  i don’t know . 


In [63]:
####
Q = "why"
A = beamBot(Q)

print("=="*40)
print(Q)
print("Bot:\n")
print('{}'.format(' '.join(A[0].value)))
print('{}'.format(' '.join(A[1].value)))
print('{}'.format(' '.join(A[2].value)))
print('{}'.format(' '.join(A[3].value)))
print("=="*40)

why
Bot:

rachel
chandler
monica
rachel : 
