# **Import Libraries**

In [None]:
import pandas as pd
from __future__ import absolute_import, division, print_function, unicode_literals
import torch
import torch.nn as nn
from torch import optim
from torch.jit import script, trace
import torch.nn.functional as F
import csv
import random

import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

# **Mount Drive & Redirect to local directory**

In [None]:
# Colab-only step: mounts Google Drive at /content/drive inside the runtime.
# NOTE(review): force_remount=True presumably replaces an existing mount — confirm
# against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# **Read Data**

In [None]:
# View dataframe
# NOTE(review): the CSV path is relative to the kernel's working directory, not to
# the /content/drive mount point — confirm the file is reachable from there.
southpark_df = pd.read_csv("All-seasons - Copy.csv")
# Last expression of the cell: shows the first 10 rows.
southpark_df.head(10)

Unnamed: 0,Season,Episode,Character,Line
0,1,1,Boys,"School day, school day, teacher's golden ru...\n"
1,1,1,Kyle,"Ah, damn it! My little brother's trying to fol..."
2,1,1,Ike,Zeeponanner.\n
3,1,1,Kyle,"Ike, you can't come to school with me. \n"
4,1,1,Cartman,"Yeah, go home you little dildo.\n"
5,1,1,Kyle,"Dude, don't call my brother a dildo!\n"
6,1,1,Stan,What's a dildo?\n
7,1,1,Kyle,"Well, I don't know... and I'll bet Cartman do..."
8,1,1,Cartman,I know what it means!\n
9,1,1,Kyle,"Well, what?\n"


In [None]:
# Preview the first five dialogue lines, numbered from 1
print("South Park lines:")
for line_number in range(1, 6):
    print("Line #", line_number)
    print(southpark_df.Line[line_number - 1])

South Park lines:
Line # 1
School day, school day, teacher's golden ru...

Line # 2
Ah, damn it! My little brother's trying to follow me to school again.

Line # 3
Zeeponanner.

Line # 4
Ike, you can't come to school with me. 

Line # 5
Yeah, go home you little dildo.



# **Data Preprocessing**

#### **Find number of seasons**

In [None]:
# Find number of seasons
seasons_list = southpark_df["Season"].tolist()
s_list = []
for season in seasons_list:
  try:
    s_int = int(season)
  except (TypeError, ValueError, OverflowError):
    # Only skip values that cannot be read as an integer season number
    # (presumably stray header rows or missing values); any other error
    # is no longer silently swallowed.
    continue
  s_list.append(s_int)
season_set = set(s_list)
print(season_set)

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}


#### **Find where each episode ends in the DataFrame, per season**

In [None]:
# Find where ep_index end per season
# ep_per_season[k] is the number of episodes in season k + 1.
ep_per_season = [13,18,17,17,14,17,15,14,14,14,14,14,14,14,14,14,10,10]

# seasons_dictionary["season_<n>"][<episode>] = cumulative number of rows counted
# up to and including that episode, i.e. the exclusive end row of the episode.
seasons_dictionary = {"season_" + str(season_no): {}
                      for season_no in range(1, len(ep_per_season) + 1)}

# Convert both columns once instead of doing a pandas lookup plus int() for every
# row on every pass of the inner loop.
season_column = [int(value) for value in southpark_df["Season"]]
episode_column = [int(value) for value in southpark_df["Episode"]]

count_index = 0
prev_season = 0
for a_season in range(1, len(ep_per_season) + 1):
  curr_season = "season_" + str(a_season)
  total_eps_for_season = ep_per_season[a_season-1]

  # range() excludes its upper bound, so add 1: the previous code stopped one
  # episode early and never indexed the final episode of any season.
  for an_ep in range(1, total_eps_for_season + 1):
    for row_season, row_episode in zip(season_column, episode_column):
      if row_season > prev_season:
        if row_season == a_season and row_episode == an_ep:
          count_index += 1
        elif row_season != a_season and row_episode != an_ep:
          # First row of a later season with a different episode number: stop scanning.
          break

    sub_dictionary = seasons_dictionary[curr_season]
    sub_dictionary[an_ep] = count_index
  prev_season = a_season

# NOTE: a later cell overwrites seasons_dictionary with the contents of
# season_ep_index.pickle; regenerate that pickle for this fix to take effect.

In [None]:
# print(seasons_dictionary)

{'season_1': {1: 391, 2: 688, 3: 974, 4: 1338, 5: 1652, 6: 2001, 7: 2338, 8: 2617, 9: 2912, 10: 3222, 11: 3553, 12: 3867}, 'season_2': {1: 4205, 2: 4537, 3: 4832, 4: 5193, 5: 5506, 6: 5835, 7: 6187, 8: 6503, 9: 6839, 10: 7233, 11: 7593, 12: 8037, 13: 8400, 14: 8745, 15: 9096, 16: 9506, 17: 9886}, 'season_3': {1: 10213, 2: 10583, 3: 10971, 4: 11329, 5: 11747, 6: 12055, 7: 12336, 8: 12695, 9: 12984, 10: 13369, 11: 13680, 12: 14055, 13: 14399, 14: 14747, 15: 15061, 16: 15344}, 'season_4': {1: 15703, 2: 16076, 3: 16431, 4: 16777, 5: 17043, 6: 17378, 7: 17703, 8: 18025, 9: 18385, 10: 18735, 11: 19102, 12: 19442, 13: 19752, 14: 20025, 15: 20367, 16: 20690}, 'season_5': {1: 21036, 2: 21315, 3: 21648, 4: 21977, 5: 22312, 6: 22591, 7: 22909, 8: 23270, 9: 23584, 10: 23899, 11: 24225, 12: 24574, 13: 24820}, 'season_6': {1: 25175, 2: 25464, 3: 25796, 4: 26178, 5: 26482, 6: 26698, 7: 26989, 8: 27235, 9: 27536, 10: 27902, 11: 28143, 12: 28416, 13: 28757, 14: 29039, 15: 29368, 16: 29679}, 'season_7':

In [None]:
import pickle

# Store pickle
# pickle_out = open("/content/drive/My Drive/season_ep_index.pickle","wb")
# pickle.dump(seasons_dictionary, pickle_out)
# pickle_out.close()

# Open pickle
# SECURITY NOTE: pickle.load can execute arbitrary code; only load a file that
# you produced yourself with the dump snippet above.
# The context manager closes the file handle, which the previous code left open.
with open("season_ep_index.pickle","rb") as pickle_in:
  seasons_dictionary = pickle.load(pickle_in)

#### **Creating a reformatted data corpus**

In [None]:
def clean_text(text):
    '''Lowercase a dialogue line, detach punctuation and expand contractions.

    Args:
        text: raw line of dialogue (str).

    Returns:
        The cleaned string. Newlines, hyphens and parentheses are removed;
        ".", "!", "?" and "," are each preceded by a space; common contractions
        are expanded; a word-final "n'" becomes "ng"; drawn-out "ohhh"/"ahhh"
        collapse to "oh"/"ah".
    '''

    # Lowercase
    text = text.lower()

    # Substitute text
    text = re.sub(r"\n", "",  text)
    text = re.sub(r"[-()]", "", text)
    # Put a space in front of sentence punctuation so it becomes its own token
    text = re.sub(r"([.!?,])", r" \1", text)

    # Force expansion
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    # Was expanded to "that is", which turned questions into statements
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    # Restore a dropped "g" ("goin'" -> "going") only when the apostrophe ends
    # the word, so possessives such as "stan's" are left untouched
    text = re.sub(r"n'(?!\w)", "ng", text)

    # Root word transformation: collapse any run of two or more "h" in one pass.
    # The previous chain of fixed-length rules left residue such as "ohh".
    text = re.sub(r"oh{2,}", "oh", text)
    text = re.sub(r"ah{2,}", "ah", text)

    return text

def extractSentencePairs(conversations):
  """
  Build [input, target] pairs from consecutive lines of each conversation.

  1 conversation = 1 episode of 1 season
  conversations = ALL eps in that season

  Every line is paired with the line that follows it; the final line of a
  conversation has no follower and therefore never appears as an input.
  Pairs whose input or target is empty after stripping are skipped.

  Args:
    conversations: iterable of lists of strings.

  Returns:
    List of [inputLine, targetLine] lists; each targetLine ends with " \r".
  """
  qa_pairs = []
  for conversation in conversations:
    for i in range(len(conversation) - 1):
      inputLine = conversation[i].strip()
      targetLine = conversation[i + 1].strip()
      # Test emptiness BEFORE appending the " \r" suffix; the old check ran
      # afterwards and was therefore always true for the target.
      if inputLine and targetLine:
        qa_pairs.append([inputLine, targetLine + " \r"])
  return qa_pairs

def printLines(file, n=10):
    """Print the first n lines of `file`, read in binary mode (so bytes reprs are shown)."""
    with open(file, 'rb') as datafile:
        head = datafile.readlines()[:n]
    for raw_line in head:
        print(raw_line)

In [None]:
# Build one conversation (list of cleaned lines) per episode, for every season
conversations = []
start_index = 0
for index_dictionary in seasons_dictionary.values():
  for ep_end_index in index_dictionary.values():
    # Rows [start_index, ep_end_index) are treated as one episode.
    # Level 1 preprocessing - basic data processing via clean_text
    conversations.append(
        [clean_text(southpark_df["Line"][row]) for row in range(start_index, ep_end_index)]
    )
    start_index = ep_end_index

In [None]:
corpus_name = "southpark_corpus"
new_path = os.path.join(corpus_name, "formatted_southpark_lines_v2.txt")
delimiter = '\t'
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Make sure the output directory exists; on a fresh runtime open() would
# otherwise fail with FileNotFoundError.
os.makedirs(corpus_name, exist_ok=True)

print("\n Writing conversation pairs to new file")
with open(new_path, 'w', encoding = 'utf-8') as outfile:
    # One tab-separated "input<TAB>target" pair per line
    writer = csv.writer(outfile, delimiter=delimiter, lineterminator = '\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

print("\nSample lines from file:")
printLines(new_path)


 Writing conversation pairs to new file

Sample lines from file:
b"school day , school day , teacher's golden ru . . .\tah , damn it ! my little brother's trying to follow me to school again . \r\n"
b"ah , damn it ! my little brother's trying to follow me to school again .\tzeeponanner . \r\n"
b'zeeponanner .\tike , you cannot come to school with me . \r\n'
b'ike , you cannot come to school with me .\tyeah , go home you little dildo . \r\n'
b'yeah , go home you little dildo .\tdude , do not call my brother a dildo ! \r\n'
b'dude , do not call my brother a dildo !\tthat is a dildo ? \r\n'
b'that is a dildo ?\twell , i do not know . . .  and i will bet cartman does not know either ! \r\n'
b'well , i do not know . . .  and i will bet cartman does not know either !\ti know what it means ! \r\n'
b'i know what it means !\twell , what ? \r\n'
b'well , what ?\ti am not telling you . \r\n'


#### **Load and Trim data**

In [None]:
PAD_token = 0 # pad short sentences
SOS_token = 1 # sentence start token
EOS_token = 2 # sentence end token

class Voc:
  """Vocabulary mapping words to integer indexes and back, with word counts.

  Indexes 0-2 are reserved for the PAD, SOS and EOS tokens, so the first real
  word receives index 3.
  """

  def __init__(self, name):
    self.name = name
    self.trimmed = False      # set to True once trim() has run
    self.word2index  = {}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.word2count = {}
    self.num_words = 3        # PAD, SOS and EOS are already counted

  def addSentences(self, sentence):
    """Register every space-separated word of `sentence`."""
    for word in sentence.split(' '):
      self.addWord(word)

  def addWord(self, word):
    """Add `word` to the vocabulary, or bump its count if it is already known."""
    if word not in self.word2index:
      self.word2index[word] = self.num_words
      self.word2count[word] = 1
      self.index2word[self.num_words] = word
      self.num_words +=1
    else:
      # Increase word count if word appears before
      self.word2count[word] += 1

  # remove words below count threshold
  def trim(self, min_count):
    """Drop every word seen fewer than `min_count` times and re-index the rest.

    Only the first call has an effect. After trimming, each kept word's count
    is reset to 1, so trimming a second time would discard the whole
    vocabulary; the `trimmed` flag guards against that. (The previous guard
    was inverted and never set the flag.)
    """
    if self.trimmed:
      return
    self.trimmed = True

    keep_words = [word for word, count in self.word2count.items() if count >= min_count]

    # Rebuild all dictionaries from scratch with the surviving words
    self.word2index = {}
    self.word2count = {}
    self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
    self.num_words = 3

    for word in keep_words:
      self.addWord(word)

#### **Data Preprocessing Part 2**

##### **Arrange into Q-R pairs** 
Steps:
1. Convert  Unicode strings to ASCII using unicodeToAscii. 
2. Convert all letters to lowercase and trim all non-letter characters except for basic punctuation (normalizeString)
3. Filter out sentences with length greater than the MAX_LENGTH threshold (filterPairs).

In [None]:
# filterPair keeps a pair only when both sentences have fewer than MAX_LENGTH
# space-separated words; also the default length cap of train() and evaluate().
MAX_LENGTH = 15

def unicode_to_ascii(s):
  """Return `s` NFD-decomposed with all characters of category 'Mn' (combining marks) removed."""
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def normalizeString(s):
  """Lowercase and strip `s`, remove accents, and keep only letters and . ! ?

  Each of . ! ? is preceded by a space, every other run of non-letter
  characters becomes a single space, and whitespace is collapsed and trimmed.
  """
  text = unicode_to_ascii(s.lower().strip())
  substitutions = (
      (r"([.!?])", r" \1"),        # detach sentence punctuation
      (r"[^a-zA-Z.!?]+", r" "),    # everything else non-alphabetic -> space
      (r"\s+", r" "),              # collapse whitespace runs
  )
  for pattern, replacement in substitutions:
    text = re.sub(pattern, replacement, text)
  return text.strip()

def readVocs(datafile, corpus_name):
  """Read the tab-separated pair file and return (empty Voc, normalized pairs).

  Args:
    datafile: path to a UTF-8 text file with one tab-separated record per line.
    corpus_name: name given to the new Voc object.

  Returns:
    (voc, pairs): `voc` is a fresh Voc with no words added yet; `pairs` holds,
    for each line, the list of its tab-separated fields after normalizeString.
  """
  print("Reading Lines ...")

  # Context manager so the file handle is closed (it used to be left open)
  with open(datafile, encoding = 'utf-8') as infile:
    lines = infile.read().strip().split('\n')

  pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
  voc = Voc(corpus_name)
  return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
  """Return True only for a two-element pair whose sentences are both short enough.

  A sentence qualifies when it has fewer than MAX_LENGTH space-separated words
  (the input sequences need to preserve the last word for the EOS token).
  Malformed rows (anything other than exactly two fields) now yield an explicit
  False rather than an implicit None.
  """
  if len(p) != 2:
    return False
  return all(len(sentence.split(' ')) < MAX_LENGTH for sentence in p)

# Filter pairs using filterPair condition
def filterPairs(pairs):
  """Return the pairs for which filterPair is truthy, in their original order."""
  kept = []
  for candidate in pairs:
    if filterPair(candidate):
      kept.append(candidate)
  return kept

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus_name, datafile):
  """Read `datafile`, drop over-long pairs and count the remaining words.

  Returns:
    (voc, pairs): the Voc populated from both sentences of every kept pair,
    and the list of kept pairs.
  """
  print("Start preparing training data ...")
  voc, pairs = readVocs(datafile, corpus_name)
  print("Read {!s} sentence pairs".format(len(pairs)))

  pairs = filterPairs(pairs)
  print("Trimmed to {!s} sentence pairs".format(len(pairs)))

  print("Counting words...")
  for pair in pairs:
    for sentence in (pair[0], pair[1]):
      voc.addSentences(sentence)
  print("Counted words:", voc.num_words)
  return voc, pairs

In [None]:
# Load/Assemble voc and pairs
corpus_name = "southpark_corpus"
datafile = new_path = os.path.join(corpus_name, "formatted_southpark_lines_v2.txt")
voc, pairs = loadPrepareData(corpus_name, datafile)

# Print some pairs to validate
print("\npairs:")
for pair in itertools.islice(pairs, 10):
    print(pair)

#### **Trim sentences**

In [None]:
# Words counted fewer than MIN_COUNT times are dropped by trimRareWords (via voc.trim).
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from `voc` and drop every pair that uses a trimmed word.

    Args:
        voc: vocabulary object providing trim(min_count) and word2index.
        pairs: list of [input_sentence, output_sentence] pairs.
        MIN_COUNT: minimum count a word needs to stay in the vocabulary.

    Returns:
        The pairs whose input and output sentences consist solely of words
        still present in voc.word2index.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)

    def _all_words_known(sentence):
        # True when every space-separated word survived the trim
        return all(word in voc.word2index for word in sentence.split(' '))

    # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
    keep_pairs = [pair for pair in pairs
                  if _all_words_known(pair[0]) and _all_words_known(pair[1])]

    # Guard the ratio: an empty `pairs` used to raise ZeroDivisionError here
    kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))
    return keep_pairs


# Trim voc and pairs
# NOTE(review): this rebinds `pairs` and calls voc.trim again, so re-running the
# cell is not idempotent — re-run the loading cell first if you need to repeat it.
pairs = trimRareWords(voc, pairs, MIN_COUNT)

Trimmed from 29681 pairs to 21382, 0.7204 of total


# **Data Preparation**

In [None]:
def indexesFromSentence(voc, sentence):
    """Map each space-separated word of `sentence` to its index and append EOS_token.

    Raises KeyError for a word that is missing from voc.word2index.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(voc.word2index[word])
    indexes.append(EOS_token)
    return indexes

def zeroPadding(l, fillvalue=PAD_token):
    """Transpose the sequences in `l` into a list of per-position tuples.

    Shorter sequences are padded with `fillvalue` up to the longest one.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return [column for column in columns]

def binaryMatrix(l, value=PAD_token):
  """Return a 0/1 matrix shaped like `l`: 0 where a token equals `value`, else 1.

  `value` defaults to PAD_token. The previous body ignored the parameter and
  always compared against PAD_token, so the default behaviour is unchanged.
  """
  return [[0 if token == value else 1 for token in seq] for seq in l]

# Input Sequence Padding
def inputVar(l, voc):
  """Convert a list of sentences to a padded LongTensor plus their lengths.

  Returns:
    (padVar, lengths): padVar is built from zeroPadding's output, so each row
    holds one time step across the batch; lengths holds the index count of
    every sentence (EOS included).
  """
  index_batch = []
  for sentence in l:
    index_batch.append(indexesFromSentence(voc, sentence))
  lengths = torch.tensor([len(indexes) for indexes in index_batch])
  padVar = torch.LongTensor(zeroPadding(index_batch))
  return padVar, lengths

# Output Sequence Padding
def outputVar(l, voc):
  """
  Convert a list of target sentences to padded tensors.

  This function will output:
  1. Padded target (LongTensor built from zeroPadding's output)
  2. Padding mask (BoolTensor, False at padded positions)
  3. Max target length (EOS included)
  """
  indx_b = [indexesFromSentence(voc, sentence) for sentence in l]
  max_target_len = max(len(indexes) for indexes in indx_b)
  padList = zeroPadding(indx_b)
  # Boolean mask: current PyTorch's masked_select no longer accepts the uint8
  # (ByteTensor) masks that were used here before.
  mask = torch.BoolTensor(binaryMatrix(padList))
  padVar = torch.LongTensor(padList)
  return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
  """Turn a batch of [input, output] pairs into the tensors used by train().

  `pair_batch` is sorted in place by input word count, longest first.

  Returns:
    (inp, lengths, output, mask, max_target_len) as produced by inputVar and
    outputVar.
  """
  pair_batch.sort(key=lambda pair: len(pair[0].split(" ")), reverse=True)
  input_batch = [pair[0] for pair in pair_batch]
  output_batch = [pair[1] for pair in pair_batch]
  inp, lengths = inputVar(input_batch, voc)
  output, mask, max_target_len = outputVar(output_batch, voc)
  return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
sample_pairs = [random.choice(pairs) for _ in range(small_batch_size)]
batches = batch2TrainData(voc, sample_pairs)
input_variable, lengths, target_variable, mask, max_target_len = batches

# Observe output and check to make sure it's correct
# DO NOT MOVE ON if tensor lengths are not matching
for label, value in (
    ("input_variable:", input_variable),
    ("lengths:", lengths),
    ("target_variable:", target_variable),
    ("mask:", mask),
    ("max_target_len:", max_target_len),
):
    print(label, value)

input_variable: tensor([[1063,  155,   28,  119,  114],
        [  24,   26,  222,  297,   93],
        [2448, 4492,   64,   41,    3],
        [   5,  150,   31, 1298,    2],
        [  48,   41,    3,   24,    0],
        [2517,    9,    2,    2,    0],
        [  24,    3,    0,    0,    0],
        [   2,    2,    0,    0,    0]])
lengths: tensor([8, 8, 6, 6, 4])
target_variable: tensor([[  54,   30, 4046,  151,  235],
        [ 534,   27,   26,  222,   28],
        [ 326,    2,   25,   19,  367],
        [ 228,    0,   23,   24,    5],
        [  24,    0,  931,    2,   27],
        [   2,    0,  648,    0,    2],
        [   0,    0,  117,    0,    0],
        [   0,    0,   41,    0,    0],
        [   0,    0, 4568,    0,    0],
        [   0,    0,   27,    0,    0],
        [   0,    0,    2,    0,    0]])
mask: tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 0, 1, 1, 1],
        [1, 0, 1, 1, 1],
        [1, 0, 1, 0, 1],
        [0, 0, 1

# Define Model

#### Encoder

In [None]:
class EncoderRNN(nn.Module):
  """Bidirectional GRU encoder over embedded, padded input sequences.

  The GRU's input size is `hidden_size`, so `embedding` must produce vectors of
  that size.
  """

  def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
    super(EncoderRNN, self).__init__()
    self.n_layers = n_layers
    self.hidden_size = hidden_size
    self.embedding = embedding

    # Initialize GRU
    # Dropout is forced to 0 for a single layer
    self.gru = nn.GRU(hidden_size,
                      hidden_size,
                      n_layers,
                      dropout=(0 if n_layers == 1 else dropout),
                      bidirectional=True)

  def forward(self, input_seq, input_lengths, hidden=None):
    """Encode a padded batch.

    Args:
      input_seq: tensor of word indexes — assumes (max_len, batch) layout, TODO confirm.
      input_lengths: sequence lengths in decreasing order (tensor or list).
      hidden: optional initial GRU hidden state.

    Returns:
      (outputs, hidden): outputs with the two directions summed, and the GRU's
      final hidden state.
    """
    # Convert word indexes to embeddings
    embedded = self.embedding(input_seq)

    # pack_padded_sequence needs the lengths on the CPU, while callers move
    # them to the training device; bring them back here.
    if torch.is_tensor(input_lengths):
      input_lengths = input_lengths.cpu()

    # Pack padded batch of sequences for RNN module
    packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
    # Forward pass through GRU
    outputs, hidden = self.gru(packed, hidden)
    # Unpack padding
    outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
    # Sum bidirectional GRU outputs
    outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
    # Return output and final hidden state
    return outputs, hidden

#### Attention Mechanism

In [None]:
# Luong attention layer
class Attn(torch.nn.Module):
    """Computes attention weights over encoder outputs with one of three scores.

    `method` selects the score: 'dot', 'general' (learned linear map of the
    encoder outputs) or 'concat' (linear map of [hidden; encoder output],
    tanh, then a dot product with the learned vector `v`).
    """

    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        # Fail at construction time; an unknown method used to surface only in
        # forward() as an UnboundLocalError.
        if self.method not in ('dot', 'general', 'concat'):
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            # Explicitly initialised; torch.FloatTensor(n) returns uninitialised
            # memory that may contain NaN/inf values.
            bound = hidden_size ** -0.5
            self.v = torch.nn.Parameter(torch.empty(hidden_size).uniform_(-bound, bound))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        # Repeat the hidden state along dim 0 to match the encoder outputs
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights with an added dim 1."""
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        else:  # 'dot' (validated in __init__)
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    """Unidirectional GRU decoder with attention over the encoder outputs.

    forward() consumes one decoding step (one word per batch element) and
    returns softmax scores over `output_size` entries plus the new hidden state.
    """

    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()

        # Hyper-parameters, kept for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Layers (creation order and attribute names left as they were)
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        gru_dropout = dropout if n_layers != 1 else 0
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=gru_dropout)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_seq, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Embed the current input word and apply dropout
        embedded = self.embedding_dropout(self.embedding(input_seq))
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Weighted sum of encoder outputs -> context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        # Concatenate GRU output and context (Luong eq. 5)
        concat_input = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word (Luong eq. 6)
        output = F.softmax(self.out(concat_output), dim=1)
        # Return output and final hidden state
        return output, hidden

### Masked loss

Since we are dealing with batches of padded sequences, we cannot simply consider all elements of the tensor when calculating loss. We define maskNLLLoss to calculate our loss based on our decoder's output tensor, the target tensor, and a binary mask tensor describing the padding of the target tensor. This loss function calculates the average negative log likelihood of the elements that correspond to a 1 in the mask tensor.

In [None]:
def maskNLLLoss(inp, target, mask):
    """Average negative log likelihood over the unmasked positions of one time step.

    Args:
        inp: decoder scores for one step, indexed [batch, word] (probabilities,
            since the log is taken here).
        target: target word index per batch element.
        mask: per-batch-element mask; non-zero / True marks a real (non-pad) token.

    Returns:
        (loss, nTotal): mean loss over the unmasked elements (moved to the
        global `device`) and the number of unmasked elements as a Python int.
    """
    # masked_select needs a boolean mask in current PyTorch
    mask = mask.bool()
    nTotal = mask.sum()
    # squeeze(1): gather returns shape (batch, 1). Left that way it broadcasts
    # against the (batch,) mask into (batch, batch), and the loss of padded
    # positions leaks into the mean.
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

### Single training iteration
The train function contains the algorithm for a single training iteration (a single batch of inputs).

We will use a couple of clever tricks to aid in convergence:

The first trick is using teacher forcing. This means that at some probability, set by teacher_forcing_ratio, we use the current target word as the decoder's next input rather than using the decoder's current guess. This technique acts as training wheels for the decoder, aiding in more efficient training. However, teacher forcing can lead to model instability during inference, as the decoder may not have a sufficient chance to truly craft its own output sequences during training. Thus, we must be mindful of how we are setting the teacher_forcing_ratio, and not be fooled by fast convergence.
The second trick that we implement is gradient clipping. This is a commonly used technique for countering the "exploding gradient" problem. In essence, by clipping or thresholding gradients to a maximum value, we prevent the gradients from growing exponentially and either overflowing (NaN) or overshooting steep cliffs in the cost function.

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):
    """Run one training iteration on a single batch.

    Args:
        input_variable, lengths, target_variable, mask, max_target_len: the
            batch tensors/values produced by batch2TrainData.
        encoder, decoder: the models to train.
        embedding: unused here; kept for interface compatibility.
        encoder_optimizer, decoder_optimizer: one optimizer per model.
        batch_size: number of sentences in the batch.
        clip: maximum gradient norm used for clipping.
        max_length: unused here; kept for interface compatibility.

    Returns:
        The average loss per unmasked target token for this batch.

    Reads the notebook-level globals `device` and `teacher_forcing_ratio`.
    """
    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths must stay on the CPU: pack_padded_sequence (used by the encoder)
    # rejects length tensors that live on another device.
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Forward batch of sequences through decoder one time step at a time.
    # Both modes share the loop and differ only in how the next input is chosen.
    for t in range(max_target_len):
        decoder_output, decoder_hidden = decoder(
            decoder_input, decoder_hidden, encoder_outputs
        )
        if use_teacher_forcing:
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
        else:
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
        # Calculate and accumulate loss
        mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
        loss += mask_loss
        print_losses.append(mask_loss.item() * nTotal)
        n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

### Training iterations
It is finally time to tie the full training procedure together with the data. The trainIters function is responsible for running n_iterations of training given the passed models, optimizers, data, etc. This function is quite self explanatory, as we have done the heavy lifting with the train function.

One thing to note is that when we save our model, we save a tarball containing the encoder and decoder state_dicts (parameters), the optimizers' state_dicts, the loss, the iteration, etc. Saving the model in this way will give us the ultimate flexibility with the checkpoint. After loading a checkpoint, we will be able to use the model parameters to run inference, or we can continue training right where we left off.

In [None]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, 
               embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, 
               print_every, save_every, clip, corpus_name, loadFilename):
    """Train for up to `n_iteration` iterations, logging and checkpointing periodically.

    Every batch is sampled up front with random.choice. The average loss is
    printed every `print_every` iterations, and a checkpoint is written every
    `save_every` iterations under
    save_dir/model_name/corpus_name/<enc_layers>-<dec_layers>_<hidden_size>/.

    Reads the notebook-level globals `hidden_size` and, when `loadFilename` is
    truthy, `checkpoint` (to resume from its stored iteration).
    """
    # Load batches for each iteration
    training_batches = []
    for _ in range(n_iteration):
        sampled_pairs = [random.choice(pairs) for _ in range(batch_size)]
        training_batches.append(batch2TrainData(voc, sampled_pairs))

    # Initializations
    print('Initializing ...')
    print_loss = 0
    start_iteration = checkpoint['iteration'] + 1 if loadFilename else 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batches[iteration - 1]

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if iteration % save_every != 0:
            continue
        layer_tag = '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size)
        directory = os.path.join(save_dir, model_name, corpus_name, layer_tag)
        if not os.path.exists(directory):
            os.makedirs(directory)
        snapshot = {
            'iteration': iteration,
            'en': encoder.state_dict(),
            'de': decoder.state_dict(),
            'en_opt': encoder_optimizer.state_dict(),
            'de_opt': decoder_optimizer.state_dict(),
            'loss': loss,
            'voc_dict': voc.__dict__,
            'embedding': embedding.state_dict()
        }
        torch.save(snapshot, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

### Define Evaluation
After training a model, we want to be able to talk to the bot ourselves. First, we must define how we want the model to decode the encoded input.

#### Greedy decoding
Greedy decoding is the decoding method that we use during training when we are NOT using teacher forcing. In other words, for each time step, we simply choose the word from decoder_output with the highest softmax value. This decoding method is optimal on a single time-step level.

To facilitate the greedy decoding operation, we define a GreedySearchDecoder class. When run, an object of this class takes an input sequence (input_seq) of shape (input_seq length, 1), a scalar input length (input_length) tensor, and a max_length to bound the response sentence length. The input sentence is evaluated using the following computational graph:

**Computation Graph:**

1. Forward input through encoder model.
2. Prepare encoder's final hidden layer to be first hidden input to the decoder.
3. Initialize decoder's first input as SOS_token.
4. Initialize tensors to append decoded words to.
5. Iteratively decode one word token at a time:
    - a) Forward pass through decoder.
    - b) Obtain most likely word token and its softmax score.
    - c) Record token and score.
    - d) Prepare current token to be next decoder input.
6. Return collections of word tokens and scores.

In [None]:
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at every step keeps the highest-scoring word and feeds it back in."""

    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        """Decode `max_length` tokens for a single input sequence.

        Args:
            input_seq: tensor of word indexes for one sentence.
            input_length: length tensor passed on to the encoder.
            max_length: number of decoding steps to run.

        Returns:
            (all_tokens, all_scores): 1-D tensors with the chosen word index and
            its score for every step.
        """
        # Work on the input's device rather than a notebook global; the
        # previous `self._device` attribute was never defined.
        device = input_seq.device
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder.
        # Uses this module's own decoder (previously the global `decoder`).
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.tensor([[SOS_token]], dtype=torch.long, device=device)
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

### Evaluate my text
Now that we have our decoding method defined, we can write functions for evaluating a string input sentence. The evaluate function manages the low-level process of handling the input sentence. We first format the sentence as an input batch of word indexes with batch_size==1. We do this by converting the words of the sentence to their corresponding indexes, and transposing the dimensions to prepare the tensor for our models. Concurrently, we create our lengths tensor which contains the length of our input sentence. In this case, lengths is scalar because we are only evaluating one sentence at a time (batch_size==1). Next, we obtain the decoded response sentence tensor using our GreedySearchDecoder object (searcher). Finally, we convert the response's indexes to words and return the list of decoded words.

evaluateInput acts as the user interface for our chatbot. When called, an input text field will spawn in which we can enter our query sentence. After typing our input sentence and pressing Enter, our text is normalized in the same way as our training data, and is ultimately fed to the evaluate function to obtain a decoded output sentence. We loop this process, so we can keep chatting with our bot until we enter either "q" or "quit".

Finally, if a sentence is entered that contains a word that is not in the vocabulary, we handle this gracefully by printing an error message and prompting the user to enter another sentence.

In [None]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a response for a single input sentence.

    Args:
        encoder: Not used directly in this function; kept so existing call
            sites keep working.
        decoder: Not used directly in this function; kept for the same reason.
        searcher: Callable invoked as ``searcher(input_batch, lengths, max_length)``
            that returns a ``(tokens, scores)`` pair.
        voc: Vocabulary object. It is handed to ``indexesFromSentence`` and its
            ``index2word`` mapping is used to turn token indexes back into words.
        sentence: The input sentence as a string.
        max_length: Value forwarded to the searcher as its third argument.

    Returns:
        list: The decoded words, one per returned token. Special tokens are
        not filtered out here.

    Raises:
        KeyError: Presumably raised by ``indexesFromSentence`` for a word that
            is missing from the vocabulary (callers catch it) -- TODO confirm.
    """
    ### Format input sentence as a batch
    # words -> indexes (a batch containing exactly one sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # One length entry per sentence in the batch
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.tensor(indexes_batch, dtype=torch.long).transpose(0, 1)
    # Only the token indexes go to the compute device
    input_batch = input_batch.to(device)
    # Keep lengths on the CPU: torch.nn.utils.rnn.pack_padded_sequence requires
    # a CPU lengths tensor. NOTE(review): assumes the encoder packs the input
    # with these lengths -- confirm against the encoder definition.
    lengths = lengths.to("cpu")
    # Decode sentence with searcher; no gradients are needed for inference
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    """Interactive chat loop: read a line, decode a reply, print it.

    The loop ends when the user types exactly ``q`` or ``quit``. Every other
    line is normalized with ``normalizeString`` and passed to ``evaluate``;
    the ``EOS`` and ``PAD`` entries are dropped from the reply before it is
    printed. A ``KeyError`` raised while handling a line is answered with
    "I don't know!" and the loop continues.

    Args:
        encoder: Forwarded unchanged to ``evaluate``.
        decoder: Forwarded unchanged to ``evaluate``.
        searcher: Forwarded unchanged to ``evaluate``.
        voc: Forwarded unchanged to ``evaluate``.
    """
    quit_commands = ('q', 'quit')
    hidden_tokens = ('EOS', 'PAD')

    while True:
        try:
            # Prompt the user for the next sentence
            user_text = input('> ')
            # Leave the loop on an explicit quit command
            if user_text in quit_commands:
                break
            # Clean the text, then ask the model for a reply
            cleaned_text = normalizeString(user_text)
            reply_words = evaluate(encoder, decoder, searcher, voc, cleaned_text)
            # Drop end-of-sentence / padding markers before showing the reply
            visible_words = [word for word in reply_words if word not in hidden_tokens]
            print('Bot:', ' '.join(visible_words))

        except KeyError:
            # A lookup failed somewhere while handling this line
            print("I don't know!")

# Run Model

In [None]:
# Model configuration
model_name = 'cb_model_2'
attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Select the compute device first so that checkpoint loading below can map
# stored tensors onto whatever hardware is actually available.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000
# loadFilename = os.path.join(corpus_name, 
#                             '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size) +'{}_checkpoint.tar'.format(checkpoint_iter))


# Load model if a loadFilename is provided
if loadFilename:
    # map_location=device lets a checkpoint saved on a GPU machine be loaded
    # on a CPU-only machine (and vice versa) without editing this cell.
    checkpoint = torch.load(loadFilename, map_location=device)
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    # Replace the vocabulary's attributes wholesale with the saved ones
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings (one hidden_size-dimensional vector per vocabulary entry)
embedding = nn.Embedding(voc.num_words, hidden_size)

if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models; both receive the same embedding object
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)

# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


### Run Training
Run the following block if you want to train the model.

First we set training parameters, then we initialize our optimizers, and finally we call the trainIters function to run our training iterations.

In [None]:
# --- Optimization settings ---
# clip is handed to trainIters below.
# teacher_forcing_ratio is not passed anywhere in this cell; presumably it is
# read as a global by the training step -- TODO confirm.
clip, teacher_forcing_ratio = 50.0, 1.0
# The decoder optimizer uses learning_rate * decoder_learning_ratio.
learning_rate, decoder_learning_ratio = 0.0001, 5.0

# --- Schedule and output location (all handed to trainIters below) ---
n_iteration, print_every, save_every = 10000, 1, 500
save_dir = "new_training"

# Switch both networks to training mode (so dropout layers are active)
encoder.train()
decoder.train()

# One Adam optimizer per network
print('Building optimizers ...')
encoder_optimizer = optim.Adam(
    encoder.parameters(),
    lr=learning_rate,
)
decoder_optimizer = optim.Adam(
    decoder.parameters(),
    lr=learning_rate * decoder_learning_ratio,
)
# When resuming from a checkpoint, restore the saved optimizer state as well
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Kick off the training loop
print("Starting Training!")
trainIters(
    model_name,
    voc,
    pairs,
    encoder,
    decoder,
    encoder_optimizer,
    decoder_optimizer,
    embedding,
    encoder_n_layers,
    decoder_n_layers,
    save_dir,
    n_iteration,
    batch_size,
    print_every,
    save_every,
    clip,
    corpus_name,
    loadFilename,
)