# Building a Chatbot using Pytorch

### SAGNIK GHOSAL

In [211]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random 
import re
import io
import os
import unicodedata
import codecs
import itertools

In [212]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")

### Data Processing: Part 1
###### Storing contents in from given ZIP folder to a destination folder, then reading them into a file

In [213]:
corpus_name = "cornell movie-dialogs corpus"
corpus = os.path.join("data", corpus_name)

lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [214]:
with open(conv_filepath, 'r') as file:
    conv = file.readlines()
for i in conv[:8]:
    print(i.strip())

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']
u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']


In [215]:
# Visualize some lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
## Just for checking
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


###### Grouping each line of file into a dictionary of fields

In [216]:
# Splits each line of the file into a dictionary of fields(LineID, CharacterID, MoveiID, character, text)
line_fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}  ## Empty dictionary
with open(lines_filepath, 'r', encoding='iso-8859-1')as f:
    for line in f:
        values = line.split(" +++$+++ ")
        ## Extract values
        lineObj = {}
        for i,field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj      
        

In [217]:
lines

{'L1045': {'lineID': 'L1045',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'They do not!\n'},
 'L1044': {'lineID': 'L1044',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': 'They do to!\n'},
 'L985': {'lineID': 'L985',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'I hope so.\n'},
 'L984': {'lineID': 'L984',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': 'She okay?\n'},
 'L925': {'lineID': 'L925',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': "Let's go.\n"},
 'L924': {'lineID': 'L924',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': 'Wow\n'},
 'L872': {'lineID': 'L872',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': "Okay -- you're gonna need to learn how to lie.\n"},
 'L871': {'lineID': 'L871',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': 'No

In [218]:
list(lines.items())[0]

('L1045',
 {'lineID': 'L1045',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'They do not!\n'})

In [219]:
lines['L194']

{'lineID': 'L194',
 'characterID': 'u0',
 'movieID': 'm0',
 'character': 'BIANCA',
 'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'}

### Data Preprocessing: Part 2

In [220]:
# Group fields of lines from 'LoadLines' into conversations based on "movie_conversations.txt"
conv_fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"] 
## utteranceIDs is nothing but the lineIDs when characters speak with each other in a particular movie
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        # Extract fields
        convObj = {}
        for i,field in enumerate(conv_fields):
            convObj[field] = values[i] 
        # Convert string result from split to list
        # utterance_id_pattern = re.compile('L[0-9]+ ')
        # lineIds = utterance_id_pattern.findall(convObj["utteranceIDs"])
        lineIds = eval(convObj["utteranceIDs"])
        # Reassemble lines
        convObj["lines"] = []
        for lineId in lineIds:
            convObj["lines"].append(lines[lineId])
        conversations.append(convObj)

In [221]:
conversations

[{'character1ID': 'u0',
  'character2ID': 'u2',
  'movieID': 'm0',
  'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
  'lines': [{'lineID': 'L194',
    'characterID': 'u0',
    'movieID': 'm0',
    'character': 'BIANCA',
    'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
   {'lineID': 'L195',
    'characterID': 'u2',
    'movieID': 'm0',
    'character': 'CAMERON',
    'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
   {'lineID': 'L196',
    'characterID': 'u0',
    'movieID': 'm0',
    'character': 'BIANCA',
    'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
   {'lineID': 'L197',
    'characterID': 'u2',
    'movieID': 'm0',
    'character': 'CAMERON',
    'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]},
 {'character1ID': 'u0',
  'character2ID': 'u2',
  'movieID': 'm0',
  'uttera

### Data Processing: Part 3

In [222]:
# Extract pairs of sentences from conversations
qa_pairs = []
for conversation in conversations:
    # Iterate over all the lines of the conversation
    for i in range(len(conversation["lines"])-1):
        inputLine = conversation["lines"][i]["text"].strip()
        targetLine = conversation["lines"][i+1]["text"].strip()
        # Filter wrong samples(if one of the list is empty)
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])

In [223]:
qa_pairs[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

### Data Processing: Part 4

In [224]:
# Define path to new file
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
delimiter = '\t' ## TAB sign
# unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing the file!!")


Writing newly formatted file...
Done writing the file!!


In [225]:
# Visualize some lines
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
with open(datafile, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print (line)

Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.	Well, I thought we'd start with pronunciation, if that's okay with you.



Well, I thought we'd start with pronunciation, if that's okay with you.	Not the hacking and gagging and spitting part.  Please.



Not the hacking and gagging and spitting part.  Please.	Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?



You're asking me out.  That's so cute. What's your name again?	Forget it.





### Processing the words

In [226]:
PAD_token = 0 ## Used for padding short sentences 
SOS_token = 1 ## Start of sentence token
EOS_token = 2 ## End of sentence token

class Vocabulary:
    def __init__(self, name):  ## Double underscore mark is very important for the sake of initialization 
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  ## Count PAD, SOS, EOS 
        
    def addsentence(self, sentence):
        for word in sentence.split(' '):
            self.addword(word)
        
    def addword(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] +=1
    # Remove words below a certain count threshold    
    def trim(self, min_count):
        keep_words = []
        for k,v in self.word2count.items():
            if v>= min_count:
                keep_words.append(k)
                
        print('keep_words {}/{} = {:,4f}', format(len(keep_words), len(self.word2index), len(keep_words)/len(self.word2index)))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  ## Count default tokens
        
        for word in keep_words:
            self.addword(word) 


### Processing the Text: Part 1

In [227]:
# Turn a UNICODE TO plain ASCII
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [228]:
# Test the function
unicodeToAscii("Montréalais....")

'Montrealais....'

In [229]:
# Lowercase, trim white spaces, lines... etc, and remove non-letter characters
def normalizestring(s):
    s = unicodeToAscii(s.lower().strip())
    # Remove the 
    s = re.sub(r"([.?!])", r" \1", s)
    # Remove any character other than the alphabets and the three special characters and + means one or more
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Remove sequence of whitespace characters
    s = re.sub(r"\s+", r" ",s).strip()
    return s

In [230]:
# Test the function
normalizestring("Mo!s12?a'sntréalais....    dd?\n")

'mo !s ?a sntrealais . . . . dd ?'

### Processing the Text: Part 2

In [231]:
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
# Read the file and split into lines
print("\nReading and processing the file... Please wait")
lines = open(datafile, encoding='utf-8').read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[normalizestring(s) for s in pair.split('\t')] for pair in lines]
print("Done reading!")
voc = Vocabulary("cornell movie-dialogs corpus")


Reading and processing the file... Please wait
Done reading!


### Filtering the Text

In [232]:
# Returns true if both the sentences in a pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10 # Max number of words
def filterPair(p):
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

# Filter pairs using filterpair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [233]:
pairs = [pair for pair in pairs if(len(pair)>1)]
pairs = filterPairs(pairs)

### Getting rid of rare words

In [234]:
# 
save_dir = os.path.join("data", "save")
for pair in pairs:
    voc.addsentence(pair[0])
    voc.addsentence(pair[1])
print("Counted words: ", voc.num_words)
# Visualization
for pair in pairs[:10]:
    print(pair)

Counted words:  18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [235]:
MIN_COUNT = 3 # Minimum word count threshold for trimming
def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words using the trim function defined previously
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
        # Keep pairs of sentences with words which appear more than the threshold values
        if keep_input and keep_output:
            keep_pairs.append(pair)
    print("Trimmed from {} pairs to {}, {:,4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))

    # Trim voc and pairs
    pairs = trimRareWords(voc, pairs, MIN_COUNT)

### Preparing the data: Part 1

In [236]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [237]:
# Test the function
indexesFromSentence(voc, pairs[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [238]:
pairs[1][0]

'you have my word . as a gentleman'

In [239]:
# Define some samples for testing
inp = []
out = []
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print (inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes


['there .', 'you have my word . as a gentleman', 'hi .', 'you know chastity ?', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [7, 24, 25, 6, 2],
 [8, 33, 22, 6, 2],
 [35, 36, 4, 4, 4, 2],
 [37, 38, 39, 40, 7, 41, 42, 43, 4, 2],
 [44, 2],
 [49, 7, 50, 42, 47, 51, 6, 2],
 [52, 53, 54, 6, 2]]

### Preparing the data: Part 2

In [240]:
def zeroPadding(l, fillvalue = 0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [241]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [242]:
leng

[3, 9, 3, 5, 5, 6, 10, 2, 8, 5]

In [243]:
# Test the function
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 7, 16, 7, 8, 35, 37, 44, 49, 52),
 (4, 8, 4, 24, 33, 36, 38, 2, 7, 53),
 (2, 9, 2, 25, 22, 4, 39, 0, 50, 54),
 (0, 10, 0, 6, 6, 4, 40, 0, 42, 6),
 (0, 4, 0, 2, 2, 4, 7, 0, 47, 2),
 (0, 11, 0, 0, 0, 2, 41, 0, 51, 0),
 (0, 12, 0, 0, 0, 0, 42, 0, 6, 0),
 (0, 13, 0, 0, 0, 0, 43, 0, 2, 0),
 (0, 2, 0, 0, 0, 0, 4, 0, 0, 0),
 (0, 0, 0, 0, 0, 0, 2, 0, 0, 0)]

### Preparing the data: Part 3

In [244]:
def binaryMatrix(l, value=0):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

In [245]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
 [0, 1, 0, 1, 1, 1, 1, 0, 1, 1],
 [0, 1, 0, 1, 1, 1, 1, 0, 1, 1],
 [0, 1, 0, 0, 0, 1, 1, 0, 1, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 1, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]

In [246]:
def inputVar(l, voc):
        indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
        lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
        padList = zeroPadding(indexes_batch)
        padVar = torch.LongTensor(padList)
        return padVar, lengths

In [247]:
def outputVar(l, voc):
        indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
        max_target_len = max([len(indexes) for indexes in indexes_batch])
        padList = zeroPadding(indexes_batch)
        mask = binaryMatrix(padList)
        mask = torch.ByteTensor(mask)
        padVar = torch.LongTensor(padList)
        return padVar, mask, max_target_len

### Preparing the data: Part 4

In [248]:
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    # Sorting the questions in descending order of length
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


In [249]:
# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc,[random.choice(pairs) for i in range(small_batch_size)])
inp, lengths, output, mask, max_target_len = batches
print(inp,"\n",lengths,"\n",output,"\nMASK",mask,"\n",max_target_len)

tensor([[2938,   99,   52,   99, 2580],
        [ 686,  123,   52,   26,   39],
        [   4,   12,   52,  318,  760],
        [ 165,  597,  120,    7,    4],
        [  86,  424,   79,   12,    2],
        [ 552,   12,    6,  738,    0],
        [  42, 3714,   69,    6,    0],
        [7322,    6,    2,    2,    0],
        [   4,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]]) 
 tensor([10,  9,  8,  8,  5]) 
 tensor([[ 237,   26,  191,  415, 1578],
        [ 431,  214,   55,    4,   39],
        [ 293,  123,  945,    2,  731],
        [  42,   77,   69,    0,    4],
        [7361,   67,  165,    0,    2],
        [   4,    4,  191,    0,    0],
        [ 100,    2,   55,    0,    0],
        [   4,    0,  945,    0,    0],
        [   2,    0,   69,    0,    0],
        [   0,    0,    2,    0,    0]]) 
MASK tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 1, 0, 1],
        [1, 1, 1, 0, 0],
        [

### Defining the models

### Encoder

In [250]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; The input_size and hidden_size are both set to 'hidden_size'
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers==1 else dropout), bidirectional=True)
        ## Dropout reduces overfitting
    
    
    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : , self.hidden_size:]
        return outputs, hidden        

### Decoder Designing: Defining the Attention Mechanism

In [251]:
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
        
    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)
    
    def forward(self, hidden, encoder_outputs):
        attn_energies = self.dot_score(hidden, encoder_outputs)
        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

### Decoder Designing: Using Attention Mechanism to design the Decoder

In [252]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Define Layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        ## Dropout reduces overfitting
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attn(attn_model, hidden_size)
    
    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1) ## This results the (hidden_size * 2)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        return output, hidden ## Hidden is the hidden_state of the current timestep of the GRU
    ## We need to pass the hidden state of this timestep to the next timestep

### Training Code: Creating the Loss Function

In [253]:
def maskNLLLoss(decoder_out, target, mask):  ## NLL - Negative Log Likelikelihood
    nTotal = mask.sum() ## How many elements should we consider
    target = target.view(-1,1)
    gathered_tensor = torch.gather(decoder_out, 1, target)
    crossEntropy = -torch.log(gathered_tensor)
    # Select the non-zero elements
    loss = crossEntropy.masked_select(mask)
    loss = loss.mean()
    loss = loss.to(device)
    return loss, nTotal.item()

### Teacher Training: Visulaize training Part 1

In [254]:
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for i in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches
print("\nInput_variable\t", input_variable.shape,"\nLengths\t",lengths.shape,
      "\nOutput\t",target_variable.shape,"\nMask",mask.shape,"\nMax_target_len",max_target_len)
## Input_variable = questions; Output = reply to the questions; mask = for calculate loss

# Define the parameters
hidden_size = 500 ## How many GRU cells/hidden neurons we have
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

# Define the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout) 
## Attention class is included in the decoder class
encoder = encoder.to(device)
decoder = decoder.to(device)

# Ensure that dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.0001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.0001)
## lr - Learning Rate which should not be very big
## Adam is a way of optimization which is better than the backpropagation method
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
lengths = lengths.to(device) 
target_variable = target_variable.to(device) 
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("\nEncoder Output Shape = \t", encoder_outputs.shape)
print("\nLast Encoder Hidden Shape = \t", encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_token for i in range(small_batch_size)]])
decoder_input = decoder_input.to(device)
print("\nInitial Decoder Input Shape = \t", decoder_input.shape)
print(decoder_input)

# Set decoder's initial hidden state to encoder's final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("\nInitial Decoder Hidden State Shape = \t", decoder_hidden.shape)
print("\n")
print("----------------------------------------------------------------------------------------")
print("Now lets take a look at whatis happening at every time step of the GRU!")
print("----------------------------------------------------------------------------------------")     
print("\n")

# Assume we are using Teacher Forcing
for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    print("\nDecoder Output Shape = \t", decoder_output.shape)
    print("\nDecoder Hidden Shape = \t", decoder_hidden.shape)
    
    # Teacher forcing: next input is the current target
    decoder_input = target_variable[t].view(1, -1) ## 1 - since we are using only 1 timestep
    print("\nThe target variable at the current timestep before reshapping = \t", target_variable[t])
    print("\nThe target variable shape at the current timestep before reshapping = \t", 
          target_variable[t].shape)
    print("\nAfter rashapping the target variable shape = \t",decoder_input.shape)
    
    # Calculate and accumulate loss
    print("\nThe mask at the current timestep = \t", mask[t])
    print("\nThe mask shape at the current timestep = \t", mask[t].shape)
    mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    print("\nMask Loss = \t", mask_loss)
    print("\nTotal = \t", nTotal)
    loss += mask_loss
    print_losses.append(mask_loss.item() * nTotal)
    print(print_losses)
    n_totals += nTotal
    print(n_totals)
    encoder_optimizer.step()
    decoder_optimizer.step()
    returned_loss = sum(print_losses) / n_totals
    print("\nReturned Loss = \t", returned_loss)
    print("\n")
    print("------------------------------------DONE ONE TIMESTEP-------------------------------")
    print("\n")


Input_variable	 torch.Size([8, 5]) 
Lengths	 torch.Size([5]) 
Output	 torch.Size([8, 5]) 
Mask torch.Size([8, 5]) 
Max_target_len 8

Encoder Output Shape = 	 torch.Size([8, 5, 500])

Last Encoder Hidden Shape = 	 torch.Size([4, 5, 500])

Initial Decoder Input Shape = 	 torch.Size([1, 5])
tensor([[1, 1, 1, 1, 1]])

Initial Decoder Hidden State Shape = 	 torch.Size([2, 5, 500])


----------------------------------------------------------------------------------------
Now lets take a look at whatis happening at every time step of the GRU!
----------------------------------------------------------------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([ 52, 925,  79,  67,  52])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask 




Mask Loss = 	 tensor(9.7928, grad_fn=<MeanBackward0>)

Total = 	 5
[48.96378993988037]
5

Returned Loss = 	 9.792757987976074


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([  107, 14562,    39,    52,    49])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([1, 1, 1, 1, 1], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.8189, grad_fn=<MeanBackward0>)

Total = 	 5
[48.96378993988037, 49.094624519348145]
10

Returned Loss = 	 9.805841445922852


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([ 52,  58, 273,  49,   7])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([1, 1, 1, 1, 1], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.7714, grad_fn=<MeanBackward0>)

Total = 	 5
[48.96378993988037, 49.094624519348145, 48.85712146759033]
15

Returned Loss = 	 9.794369061787924


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([    6, 14563,   819,    26,    77])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([1, 1, 1, 1, 1], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.7988, grad_fn=<MeanBackward0>)

Total = 	 5
[48.96378993988037, 49.094624519348145, 48.85712146759033, 48.993964195251465]
20

Returned Loss = 	 9.795475006103516


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([  2, 411, 391,  49,   6])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([1, 1, 1, 1, 1], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.7760, grad_fn=<MeanBackward0>)

Total = 	 5
[48.96378993988037, 49.094624519348145, 48.85712146759033, 48.993964195251465, 48.879828453063965]
25

Returned Loss = 	 9.79157314300537


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([ 0,  4,  4, 37,  2])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([0, 1, 1, 1, 1], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.8132, grad_fn=<MeanBackward0>)

Total = 	 4
[48.96378993988037, 49.094624519348145, 48.85712146759033, 48.993964195251465, 48.879828453063965, 39.25286102294922]
29

Returned Loss = 	 9.794558262002878


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([0, 2, 2, 6, 0])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([0, 1, 1, 1, 0], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.7432, grad_fn=<MeanBackward0>)

Total = 	 3
[48.96378993988037, 49.094624519348145, 48.85712146759033, 48.993964195251465, 48.879828453063965, 39.25286102294922, 29.229618072509766]
32

Returned Loss = 	 9.78974398970604


------------------------------------DONE ONE TIMESTEP-------------------------------



Decoder Output Shape = 	 torch.Size([5, 18008])

Decoder Hidden Shape = 	 torch.Size([2, 5, 500])

The target variable at the current timestep before reshapping = 	 tensor([0, 0, 0, 2, 0])

The target variable shape at the current timestep before reshapping = 	 torch.Size([5])

After rashapping the target variable shape = 	 torch.Size([1, 5])

The mask at the current timestep = 	 tensor([0, 0, 0, 1, 0], dtype=torch.uint8)

The mask shape at the current timestep = 	 torch.Size([5])





Mask Loss = 	 tensor(9.8016, grad_fn=<MeanBackward0>)

Total = 	 1
[48.96378993988037, 49.094624519348145, 48.85712146759033, 48.993964195251465, 48.879828453063965, 39.25286102294922, 29.229618072509766, 9.80158519744873]
33

Returned Loss = 	 9.79010281418309


------------------------------------DONE ONE TIMESTEP-------------------------------




In [255]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding, encoder_optimizer, 
          decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    
    loss = 0
    print_losses = []
    n_totals = 0
    
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
        
    decoder_input = torch.LongTensor([[SOS_token for i in range(small_batch_size)]])
    decoder_input = decoder_input.to(device)
    
    # Set decoder's initial hidden state to encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    
    # Determine if we are using teacher forcing in this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            decoder_input = target_variable[t].view(1, -1) ## 1 - since we are using only 1 timestep
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
            
    #Perform backpropagation
    loss.backward()
    
    # Clip gradients: gradients are modified in place
    ## Clipping is done to prevent gradients from becoming too large
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)  
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    
    #Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    returned_loss = sum(print_losses) / n_totals
    return returned_loss

In [256]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, 
               embedding, encoder_n_layers, decoder_n_layers, 
               save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [257]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [258]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [259]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [261]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...


RuntimeError: Expected hidden size (2, 5, 500), got (2, 64, 500)