In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random 
import re
import os
import unicodedata
import codecs
import itertools
# =

In [2]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")
print(device)

cpu


In [3]:
# data preprocessing 
lines_filepath = os.path.join("Dataset_chatbot","data","cornell movie-dialogs corpus","movie_lines.txt")
conv_filepath = os.path.join("Dataset_chatbot","data","cornell movie-dialogs corpus","movie_conversations.txt")

In [4]:
#visualizatize some lines
with open(lines_filepath,'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [5]:
#split eacg line of the file into a dictionary of fields(lineID,characterID,movieID,character,txt)
line_fields = ["lineID","characterID","movieID","character","text"]
lines = {}
with open(lines_filepath,'r',encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ " )
        lineobj = {}
        for i,field in enumerate (line_fields):
            lineobj[field] = values[i]
        lines[lineobj['lineID']] = lineobj

In [6]:
# list(lines.items())[0]  

In [7]:
#groups fields of lines from 'loadlines into conversation based on "movie_conversation.txt"
conv_fields = ["characterID","character2ID","movieID","utteranceIDs"]
conversations = []
with open(conv_filepath,'r',encoding='iso-8859-1')as f:
    for line in f:
        values = line.split(" +++$+++ ")
        convobj = {}
        for i,field in enumerate(conv_fields):
            convobj[field] = values[i]
        lineIds = eval(convobj["utteranceIDs"])
        #reassemble lines
        convobj["lines"] = []
        for lineId in lineIds:
            convobj["lines"].append(lines[lineId])
        conversations.append(convobj)

In [8]:
#nested dictionary visualization
#conversations[0]

In [9]:
#extracts pairs of sentences from conversations
qa_pairs = []
for conversation in conversations:
    #iterate over all the lines of the conversation 
    for i in range(len(conversation["lines"])-1):
        inputline = conversation["lines"][i]["text"].strip()
        targetline = conversation["lines"][i+1]["text"].strip()
        #filter wrong samples (if one of the list is empty)
        if inputline and targetline:
            qa_pairs.append([inputline,targetline])

In [10]:
#question and answer pairs
#qa_pairs[0]

In [11]:
#define path to new file
datafile = os.path.join("Dataset_chatbot","data","cornell movie-dialogs corpus","formatted_movie_lines.txt")
delimiter = "\t"
#unescape the delimiter 
delimiter = str(codecs.decode(delimiter,"unicode_escape"))

#write new csv file 
print("\nwriting newly formatted data...")
with open(datafile,'w',encoding = "utf-8") as outputfile:
    writer = csv.writer(outputfile,delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
    print("done wirting to file")


writing newly formatted data...
done wirting to file


In [13]:
#visualize some lines 
datafile = os.path.join("Dataset_chatbot","data","cornell movie-dialogs corpus","formatted_movie_lines.txt")
with open(datafile,'rb') as file:
    lines = file.readlines()
for line in lines[:2]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"


In [14]:
PAD_token = 0 #used for padding short sentences 
SOS_token = 1 # start of  sentance taken
EOS_token = 2 #end of sentence taken

class Vocabulary:
    def __init__(self,name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token : "PAD",SOS_token : "SOS",EOS_token : "EOS"}
        self.num_words = 3 #count sos,eos,pad
        
    def addSentence(self,sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    # increment count if the word exists else create a new stack
    def addWord(self,word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] +=1
            
    #remove words below a certain count threshold (remove rare words)
    def trim(self,min_count):
        keep_words = []
        # k - key(key) , v - value(threshold check)
        for k,v in self.word2count.items():
            if v>= min_count:
                keep_words.append(k)
        print('keep_words {} / {} '.format(len(keep_words),len(self.word2index)))
        
        #reinitialize dictionaries 
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #count default tokens
        
        for word in keep_words:
            self.addWord(word)

In [15]:
#turn a unicode to ascii 
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) !='Mn')

#'Mn' - non marking space

In [16]:
#test for function 
unicodeToAscii('montreal,francoise...')

'montreal,francoise...'

In [17]:
#lowercase , trim white spaces, lines...etc and remove non letter characters 
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    #replace any .!? by a white space as well as the character
    s = re.sub(r"([.!?])",r" \1",s)
    #remove any character that is not a sequence of the lower or upper case letters
    s = re.sub(r"[^a-zA-Z.!?]+",r" ",s)
    #remove a sequence of white space characters 
    s = re.sub(r"\s+",r" ",s).strip()
    return s

In [18]:
#test the function
normalizeString("aa123aa!s's  DD?")

'aa aa !s s dd ?'

In [19]:
datafile = os.path.join("Dataset_chatbot","data","cornell movie-dialogs corpus","formatted_movie_lines.txt")
#reading the file and split into lines 
print("reading and processing files...please wait")
lines = open(datafile,encoding='utf-8').read().strip().split('\n')
#split every line into pairs and normalize 
pairs = [[normalizeString(s) for s in pair.split('\t')]for pair in lines]
print("reading ...Done")
voc = Vocabulary("cornell movie-dialogs corpus")

reading and processing files...please wait
reading ...Done


In [20]:
print(lines[0].split('\t'))
print('\n')
print("total number of conversations :",len(pairs))

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you."]


total number of conversations : 442563


In [21]:
#returns true if both sequences in pair 'p' are under the MAX_LENGTH threshold
MAX_LENGTH = 10 #max senntence length to consider
def filterPair(p):
    #input sequences need to preserve the last word for EOS taken
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH

def filterPairs(pairs):
    return[pair for pair in pairs if filterPair(pair)]
#remove the conversation from the database if teh length exceeds

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


In [22]:
pairs = [pair for pair in pairs if len(pair)>1]
print("there are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("after filtering , there are {} pairs/conversations".format(len(pairs)))


there are 221282 pairs/conversations in the dataset
after filtering , there are 64271 pairs/conversations


In [23]:
#loop through each pair and add the questions and reply sentence to the vocabulary
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("counted words:",voc.num_words)
for pair in pairs[:10]:
    print(pair)

counted words: 18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [24]:
MIN_COUNT = 3  #minimum word count for trimming 

def trimRareWords(voc,pairs,MIN_OCUNT):
    #trim words used under the MIN_OCUNT from the voc
    voc.trim(MIN_COUNT)
    #filter out pairs with trimmed words 
    keep_pairs = []
    for pair in pairs : 
        input_sentence = pair[0]
        output_senence = pair[1]
        keep_input = True
        keep_output = True
        #check input sentence 
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
        
        #only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)
    
    print('Trimmed from {} pairs to {},'.format(len(pairs), len(keep_pairs)))
    return keep_pairs

pairs = trimRareWords(voc,pairs,MIN_COUNT)

keep_words 7823 / 18005 
Trimmed from 64271 pairs to 58043,


In [25]:
### prepairing the data for the model

#number of rows will indicate the batch size
#number of coloms will indicate the maximum length of the sentence

In [53]:
#conversion from word to index
def indexesFromSentence(voc,sentence):
    #for word in sentence.split(' '):
        #print ("sentence is :",sentence)
        #print("word is :",word)
        #print("VOC word 2 index is:", voc.word2index[word])
        
    try :
        return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
        
    except KeyError:
        return [3]

In [54]:
#test the function 
print(pairs[1][0])
indexesFromSentence(voc,pairs[1][0])

you have my word . as a gentleman


[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [55]:
#define some samples for testing 
inp = []
out = []
i = 0
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc,sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'the real you .']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [53, 54, 7, 4, 2]]

In [56]:
#in order to keep the colom length consistent , pad the rest with 0
def zeroPadding(l, fillvalue =  0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [57]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [58]:
#test the function
test_result = zeroPadding(indexes)
print(len(test_result))  #the max length is now the number of rows or the batch size
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 53),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 54),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 7),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 4),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 2),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [59]:
def binaryMatrix(l , value = 0):
    m = []
    for i , seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                 m[i].append(1)
    return m                

In [60]:
binary_result = binaryMatrix(test_result)
binary_result

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 1],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 1],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 1],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [61]:
#returns padded input sequence tensor and as well as a tensor of lengths for each of the sequence in the batch 
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [62]:
#returns padded target sequence tensor, padding mask and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [63]:
#returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    #sort the question in decending length
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse = True)
    input_batch, output_batch = [],[]
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len

In [64]:
# example for validation 
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input variable :")
print(input_variable)
print("lengths : ")
print(lengths)
print("target_variable: ")
print(target_variable)
print("mask : ")
print(mask)
print("max_target_len : ",max_target_len)

input variable :
tensor([[ 122,   25,    7,  464,   36],
        [   7, 3513,   24, 5645,   37],
        [  24,    7,    4,    7,  123],
        [ 101,   40,   53,   89,   40],
        [ 659,  400,   52,  534,  863],
        [ 334,   45,   25,   12,   83],
        [  67,  879,  882,  228,    6],
        [   4,    4,    4,    4,    2],
        [   2,    2,    2,    2,    0]])
lengths : 
tensor([9, 9, 9, 9, 8])
target_variable: 
tensor([[ 122,    7,   25, 2762,    3],
        [3867, 3513, 3897,    4,    0],
        [ 743,   83,   76,    4,    0],
        [ 188,   40,  276,    4,    0],
        [ 480,  547,   53,   51,    0],
        [   4,    9, 2542,    4,    0],
        [   2, 1948,    4,    2,    0],
        [   0,    4,    2,    0,    0],
        [   0,    2,    0,    0,    0]])
mask : 
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [0, 1, 1, 0, 0],


## defining models

In [65]:
# - convert word indexes to embeddings
# - pack padded batch of sequences for RNN module 
# forward pass through GRU
# unpack padding
#return the output and the hidden state

In [66]:
#bi directional GRU used over here - to take advantage of the past and future

class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers = 1, dropout = 0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        #initialize GRU ; input_size and hidden_size params are both set to 'hidden_size'
        #because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout),bidirectional=True)
    
    def forward(self, input_seq, input_lengths, hidden =None):
        # input_seq: batch of input sentences; shape = (max_length,batch_size)
        # input_lengths : list of sentence lengths corresponding to each sentence in the batch
        # hidden state, of shape: (n_layers x num_directions, batch_size, hidden_size)
        # convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        #pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        #forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        #unpacked padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        #sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        #return output and final hidden state
        return outputs , hidden
    

In [67]:
# decorder -  attention model (has an attention operation - layer)
# the luong attentio layer has been implemented 

class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        self.hidden_size = hidden_size
    
    def dot_score(self, hidden, encoder_output):
        #element wose miltiply the current target decorder state with the encorder output and sum
        return torch.sum(hidden * encoder_output, dim = 2)
    
    def forward(self, hidden, encoder_outputs):
        #hidden of shape (1, batch_size, hidden_size)
        #encorder_outputs of shape : (max_length, batch_size, hidden_size)
        
        #calculate the attention weights (energies)
        attn_energies = self.dot_score(hidden, encoder_outputs) #maxlength and batchsize
        #transpose max length and batch size dimensions
        attn_energies = attn_energies.t()
        #return the softmax normalized probability scores (with added dimensions)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)


In [68]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()
        self.attn_model = attn_model
        self.hidden_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        #define layers 
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout = (0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        
        self.attn = Attn(attn_model, hidden_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs): 
        #get embedded of the current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        #forward  through unidirectional GRU 
        rnn_output, hidden = self.gru(embedded, last_hidden)
        #calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        #multiply attention weights to encorder outputs to get the new "weighted sum" contect vector
        context = attn_weights.bmm(encoder_outputs.transpose(0,1))
        #conctenate weighted context vector and GRU outputs 
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        #predict next word using luoung eq 6 
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        #return output and the final hidden state
        return output, hidden
        
        

## training the model

In [69]:
def maskNLLLoss(decoder_out, target, mask):
    nTotal = mask.sum()
    target = target.view(-1,1)
    # decorder_out shape: (batch_size, vocab_size) , target_size =(batch_size,1)
    gathered_tensor = torch.gather(decoder_out, 1, target)
    #calculate the Negative log likelihood loss
    crossEntropy = -torch.log(gathered_tensor)
    #select the non zero elements
    loss = crossEntropy.masked_select(mask)
    #calculate the mean of the loss
    loss = loss.mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [70]:
#visualization of the performance of the loss function 

dec_o = torch.rand(5,7)
dec_o = F.softmax(dec_o, dim=1)
tar = torch.tensor([2, 1, 5, 4,0], dtype = torch.long)
tar = tar.view(-1,1)
mask = torch.tensor([1,0,1,1,0], dtype = torch.uint8)
print(dec_o)
print(tar)
gath_ten = torch.gather(dec_o, 1, tar)
print(gath_ten)
print(gath_ten.shape)

tensor([[0.1801, 0.0805, 0.1245, 0.1951, 0.1729, 0.0863, 0.1606],
        [0.1784, 0.1260, 0.1122, 0.1196, 0.1385, 0.1506, 0.1747],
        [0.1593, 0.1629, 0.1656, 0.1712, 0.0829, 0.1087, 0.1494],
        [0.1061, 0.1920, 0.0971, 0.2278, 0.1388, 0.0962, 0.1420],
        [0.1709, 0.1873, 0.1938, 0.1144, 0.0903, 0.1311, 0.1121]])
tensor([[2],
        [1],
        [5],
        [4],
        [0]])
tensor([[0.1245],
        [0.1260],
        [0.1087],
        [0.1388],
        [0.1709]])
torch.Size([5, 1])


## teacher forced training

In [71]:
# we are supplying the inputs regardless the last word generated - dsnt mimic real life situations

In [72]:
#visualize whats happening in one iteration. 
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input variable shape:",input_variable.shape)
print("lengths shape:",lengths.shape)
print("target_variable shape :", target_variable.shape)
print("mask shape :", mask.shape)
print("max_target_len:",max_target_len)

#define parameters
hidden_size = 500
encoder_n_layers = 2 
decoder_n_layers = 2
dropout = 0.1
attn_model = 'dot'
embedding = nn.Embedding(voc.num_words, hidden_size)

#define the encoder and decoder
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
#ensure dropout layers are in train mode
encoder.train()
decoder.train()

#initialize optimizers 
encoder_optimizer = optim.Adam(encoder.parameters(), lr =0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr =0.001)
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()

input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)

loss = 0
print_losses = []
n_totals = 0

encoder_outputs, encoder_hidden = encoder(input_variable, lengths)
print("encoder outputs Shape:", encoder_outputs.shape)
print("last encoder hidden shape:",encoder_hidden.shape)

decoder_input = torch.LongTensor([[SOS_token for _ in range(small_batch_size)]])
decoder_input = decoder_input.to(device)

#set initial decoder hidden state to encoders final hidden state
decoder_hidden = encoder_hidden[:decoder.n_layers]
print("initial decoder hidden state shape:",decoder_hidden.shape)
print("\n")

#assume we are using teacher forcing
for t in range(max_target_len):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
    print("decoder output shape :", decoder_output.shape)
    print("decoder hidden shape :", decoder_hidden.shape)
    
    #teacher forcing 
    decoder_input = target_variable[t].view(1,-1)
    print("the target variable at the current timestep before time shaping :", target_variable[t])
    print("the target variable at the current timestep shape before reshaping :",target_variable[t].shape)
    print("the decoder input shape :",decoder_input.shape)
    
    #calculate and accumlate loss 
    print("the mask at the current timestep :",mask[t])
    print("the mask at the current time step shape :",mask[t].shape)
    mask_loss , nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
    print("mask loss :",mask_loss)
    print("Total :",nTotal)
    n_totals += nTotal
    print(n_totals)
    #update the weights
    encoder_optimizer.step()
    decoder_optimizer.step()
    returned_loss = sum(print_losses) / n_totals
    print("returned loss:",returned_loss)
    print("\n")
    print("-----------DONE ONE TIME---------")
    print("\n")


input variable shape: torch.Size([10, 5])
lengths shape: torch.Size([5])
target_variable shape : torch.Size([10, 5])
mask shape : torch.Size([10, 5])
max_target_len: 10
encoder outputs Shape: torch.Size([10, 5, 500])
last encoder hidden shape: torch.Size([4, 5, 500])
initial decoder hidden state shape: torch.Size([2, 5, 500])


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([  86, 7696,  197,   25,    3])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([1, 1, 1, 1, 1], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9326, grad_fn=<MeanBackward0>)
Total : 5
5
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([ 25,  66, 117, 200,   0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([1, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9209, grad_fn=<MeanBackward0>)
Total : 4
9
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([ 63, 158,   7, 169,   0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([1, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9421, grad_fn=<MeanBackward0>)
Total : 4
13
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([  4,  96, 393,   7,   0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([1, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9646, grad_fn=<MeanBackward0>)
Total : 4
17
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([   2,   53,   50, 1948,    0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([1, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9137, grad_fn=<MeanBackward0>)
Total : 4
21
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([   0,  705,   65, 3392,    0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([0, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9569, grad_fn=<MeanBackward0>)
Total : 3
24
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([   0,   66, 1075,    4,    0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([0, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9452, grad_fn=<MeanBackward0>)
Total : 3
27
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([0, 2, 7, 2, 0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([0, 1, 1, 1, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9336, grad_fn=<MeanBackward0>)
Total : 3
30
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([0, 0, 6, 0, 0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([0, 0, 1, 0, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9394, grad_fn=<MeanBackward0>)
Total : 1
31
returned loss: 0.0


-----------DONE ONE TIME---------


decoder output shape : torch.Size([5, 7826])
decoder hidden shape : torch.Size([2, 5, 500])
the target variable at the current timestep before time shaping : tensor([0, 0, 2, 0, 0])
the target variable at the current timestep shape before reshaping : torch.Size([5])
the decoder input shape : torch.Size([1, 5])
the mask at the current timestep : tensor([0, 0, 1, 0, 0], dtype=torch.uint8)
the mask at the current time step shape : torch.Size([5])




mask loss : tensor(8.9072, grad_fn=<MeanBackward0>)
Total : 1
32
returned loss: 0.0


-----------DONE ONE TIME---------




In [73]:
# turn off the teacher enforcing to train data 
# the same code but target would be the top prediction
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [74]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):
    
    
    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [75]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [76]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [77]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
#loadFilename = os.path.join(save_dir, model_name, corpus_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [78]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using filterPair condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("Dataset_chatbot","data", "save")
corpus_name = "cornell movie-dialogs corpus"

In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have cuda, configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...




Iteration: 1; Percent complete: 0.0%; Average loss: 8.9589




Iteration: 2; Percent complete: 0.1%; Average loss: 8.7529




Iteration: 3; Percent complete: 0.1%; Average loss: 8.4280




Iteration: 4; Percent complete: 0.1%; Average loss: 7.9585




Iteration: 5; Percent complete: 0.1%; Average loss: 7.2537




Iteration: 6; Percent complete: 0.1%; Average loss: 6.4443


In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(encoder, decoder, searcher, voc)