# Building a chatbot: Pytorch

In [1]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

In [2]:
CUDA = torch.cuda.is_available()
device = torch.device("cuda" if CUDA else "cpu")
device

device(type='cpu')

### Part 1. Preprocessing

In [3]:
lines_filepath = os.path.join('cornell movie-dialogs corpus', 'movie_lines.txt')
conv_filepath = os.path.join('cornell movie-dialogs corpus', 'movie_conversations.txt')

In [4]:
# Visualize some lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [5]:
# Split each line of the file into a dictionaty of fields (LineID, characerID, movieID, character, text)
line_fields = ['lineID', 'characterID', 'movieID', 'character', 'text']
lines={}
with open(lines_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        #extract files
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj

### Part 3. Processing the dataset

In [6]:
# Group fields of lines from 'loadlines' into conversations based on *movie_conversations.txt
conv_fields = ['character1ID', 'character2ID','movieID', 'utteranceIDs']
conversations = []
with open(conv_filepath, 'r', encoding='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        # Extract fields
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] = values[i]
        # convert string result from split to list, since convObj['utteranceIDs'] == "['']"
        lineIds = eval(convObj['utteranceIDs'])
        # Reassemble lines
        convObj['lines'] = []
        for lineId in lineIds:
            convObj['lines'].append(lines[lineId])
        conversations.append(convObj)

In [12]:
conversations[0]['lines']

[{'lineID': 'L194',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
 {'lineID': 'L195',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
 {'lineID': 'L196',
  'characterID': 'u0',
  'movieID': 'm0',
  'character': 'BIANCA',
  'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
 {'lineID': 'L197',
  'characterID': 'u2',
  'movieID': 'm0',
  'character': 'CAMERON',
  'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]

In [8]:
# Extracts pairs of sentences from conversations
qa_pairs = []
for conversation in conversations:
    for i in range(len(conversation['lines']) - 1 ):
        inputLine = conversation['lines'][i]['text'].strip()
        targetLine = conversation['lines'][i+1]['text'].strip()
        # Filter wrong samples (if one of the lists is empty)
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])

In [14]:
qa_pairs[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

In [15]:
# Define path to new file
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
delimiter = '\t'
# unescape the delimiter
delimiter = str(codecs.decode(delimiter, 'unicode_escape'))

# Write the new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter)
    for pair in qa_pairs:
        writer.writerow(pair)
print("Done writing to file")


Writing newly formatted file...
Done writing to file


In [17]:
# Visualize some lines
datafile = os.path.join('cornell movie-dialogs corpus', 'formatted_movie_lines.txt')
with open(datafile, 'rb') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

### Processsing the words

In [23]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {} # Count the frequency of the word
        self.index2word = {PAD_token:'PAD', SOS_token: 'SOS', EOS_token: 'EOS'}
        self.num_words = 3 # Count SOS, 
    
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1
            
    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
                
        print("keep_words {} / {} = {:.4f}".format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))
        # reainitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3
        
        for word in keep_words:
            self.addWord(word)

## Chatbot implementation

### Processing the T

In [21]:
# Turn Unicode string to plan ASCII
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [22]:
# Test the function
unicodeToAscii('Montréal,Fraçoise....')

'Montreal,Fracoise....'

In [24]:
# Lowercase, trim white spaces, lines... etc and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    # Replace any.!? by a whitespace + the character --> '!'=' ! '. \1 means the first backeted group --> [,!?]. r is to
    # not consider \1 as a character (r to escape a backslash)
    s = re.sub(r"([.!?])", r" \1", s)
    # Remove any characer that is not a sequence of lower or upper case letters. + means one or more
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Remove a sequence of whitespace characters
    s = re.sub(r"\s+",r" ", s).strip()
    return s

In [25]:
normalizeString("aaa123aa!s's     dd?")

'aaa aa !s s dd ?'

In [28]:
datafile = os.path.join("cornell movie-dialogs corpus", "formatted_movie_lines.txt")
# Read the file and split into lines
print("Reading and processing file... Please wait")
lines = open(datafile, encoding="utf-8").read().strip().split('\n')
#Split every line into pairs and normalize
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print("Done Reading!")
voc = Vocabulary("cornell movie-dialogs corpus")

Reading and processing file... Please wait
Done Reading!


### Filtering the Text

In [47]:
MAX_LENGTH = 10
def filterPair(p):
    ''' Returns True if both sentences in a pair p are under the max_length threshold'''
    # input sequences need to presence the last word for EOS token
    return len(p[0].split()) < MAX_LENGTH and len(p[1].split()) < MAX_LENGTH
    
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [54]:
pairs = [pair for pair in pairs if len(pair)>1]
print("There are {} pairs/conversations in the dataset".format(len(pairs)))
pairs = filterPairs(pairs)
print("After filtering, there are {} pairs/conversations".format(len(pairs)))

There are 221282 pairs/conversations in the dataset
After filtering, there are 64271 pairs/conversations


### Getting rid of weird words

In [62]:
# Loop through each pair of and add the question and reply sentence to the vocabulary
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted Words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted Words: 18008
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', 'the real you .']


In [63]:
MIN_COUNT = 3

def trimRareWords(voc, pairs, MIN_COUNT):
    voc.trim(MIN_COUNT)
    keep_pairs=[]
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        #Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input=False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break
                
        # Only keep pairs that do not contain trimed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)
            
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))
    return keep_pairs

#Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7823 / 18005 = 0.4345
Trimmed from 64271 pairs to 53165,0.8272 of total


## Preparing the data

In [64]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

In [65]:
# Test the function
indexesFromSentence(voc, pairs[1][0])

[7, 8, 9, 10, 4, 11, 12, 13, 2]

In [66]:
# Define some samples for testing
inp = []
out = []
for pair in pairs[:10]:
    inp.append(pair[0])
    out.append(pair[1])
print(inp)
print(len(inp))
indexes = [indexesFromSentence(voc, sentence) for sentence in inp]
indexes

['there .', 'you have my word . as a gentleman', 'hi .', 'have fun tonight ?', 'well no . . .', 'then that s all you had to say .', 'but', 'do you listen to this crap ?', 'what good stuff ?', 'wow']
10


[[3, 4, 2],
 [7, 8, 9, 10, 4, 11, 12, 13, 2],
 [16, 4, 2],
 [8, 31, 22, 6, 2],
 [33, 34, 4, 4, 4, 2],
 [35, 36, 37, 38, 7, 39, 40, 41, 4, 2],
 [42, 2],
 [47, 7, 48, 40, 45, 49, 6, 2],
 [50, 51, 52, 6, 2],
 [58, 2]]

### Understanding the Zip function

In [68]:
def zeroPadding(l, fillvalue = 0):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

In [69]:
leng = [len(ind) for ind in indexes]
max(leng)

10

In [83]:
#test the function
test_result = zeroPadding(indexes)
print(len(test_result))
test_result

10


[(3, 7, 16, 8, 33, 35, 42, 47, 50, 58),
 (4, 8, 4, 31, 34, 36, 2, 7, 51, 2),
 (2, 9, 2, 22, 4, 37, 0, 48, 52, 0),
 (0, 10, 0, 6, 4, 38, 0, 40, 6, 0),
 (0, 4, 0, 2, 4, 7, 0, 45, 2, 0),
 (0, 11, 0, 0, 2, 39, 0, 49, 0, 0),
 (0, 12, 0, 0, 0, 40, 0, 6, 0, 0),
 (0, 13, 0, 0, 0, 41, 0, 2, 0, 0),
 (0, 2, 0, 0, 0, 4, 0, 0, 0, 0),
 (0, 0, 0, 0, 0, 2, 0, 0, 0, 0)]

In [84]:
def binaryMatrix(l, value=0):
    m=[]
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token in seq:
                if token == PAD_token:
                    m[i].append(0)
                else:
                    m[i].append(1)
    return m

In [85]:
binaryMatrix(test_result)

[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
 [0, 1, 0, 0, 1, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
 [0, 1, 0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]

In [86]:
# Returns padded input sequence tensor and as well as a tensor of lengths for each of the sequences in the batch
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

In [90]:
# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

In [91]:
def batch2TrainData(voc, pair_batch):
    # Sort the questions in descending length
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
        inp, lengths = inputVar(input_batch, voc)
    # assert len(inp) == lengths[0]
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len
    

In [92]:
# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable: ")
print(input_variable)
print("lengths: ", lengths)
print("target_variable: ")
print(target_variable)
print("mask:")
print(mask)
print("max_target_len:", max_target_len)

input_variable: 
tensor([[  34, 3652,   27,   45,   50],
        [ 678,    7,  527, 3654,    6],
        [   7,   62,  174,    4,    2],
        [  94, 1110,    4,    2,    0],
        [ 117,    6,    2,    0,    0],
        [   4,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths:  tensor([7, 6, 5, 4, 3])
target_variable: 
tensor([[ 100,   25,    7,   27,   70],
        [  67,  200,  288,   14, 2554],
        [   6,  274,  117,  509, 2468],
        [   2,   76, 5050,  158,  115],
        [   0,   37,   40,  537, 1029],
        [   0,  112, 1172,  112,  111],
        [   0,  665,  174,  274,   96],
        [   0, 3801,    4,    4,  479],
        [   0,    4,    2,    2,    4],
        [   0,    2,    0,    0,    2]])
mask:
tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1],
        [0, 1, 0, 

# PART 3. DEFINING THE MODELS

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers==1 else dropout), bidirectional=True)
        
    def forward(self, input_seq, input_lengths, hidden=None):
        # input_seq: batch of input sentences; shape=(max_lengths, batch_size)
        # input_lengths: list of sentence lengths correesponding to each sentence in the batch
        # hidden state, of shape: (n_layers x num_directions, batch_size, hidden_size)
        # convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden