# HPML Assignment 3

**Author:** Rugved Mhatre (rrm9598)

## Problem 1.1

### Preparations

Downloading Cornell Movie-Dialogs Corpus dataset

In [None]:
!mkdir -p data
!wget -O data/movie-corpus.zip https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
!unzip -o data/movie-corpus.zip -d data

--2024-10-24 04:07:11--  https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
Resolving zissou.infosci.cornell.edu (zissou.infosci.cornell.edu)... 128.253.51.179
Connecting to zissou.infosci.cornell.edu (zissou.infosci.cornell.edu)|128.253.51.179|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40854701 (39M) [application/zip]
Saving to: ‘data/movie-corpus.zip’


2024-10-24 04:07:14 (13.5 MB/s) - ‘data/movie-corpus.zip’ saved [40854701/40854701]

Archive:  data/movie-corpus.zip
   creating: data/movie-corpus/
  inflating: data/movie-corpus/utterances.jsonl  
  inflating: data/movie-corpus/conversations.json  
  inflating: data/movie-corpus/corpus.json  
  inflating: data/movie-corpus/speakers.json  
  inflating: data/movie-corpus/index.json  


Importing libraries

In [None]:
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
import json
import time
import numpy as np

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

### Load & Preprocess Data

In [None]:
corpus_name = "movie-corpus"
corpus = os.path.join("data", corpus_name)

def printLines(file, n=10):
    with open(file, 'rb') as datafile:
        lines = datafile.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus, "utterances.jsonl"))

b'{"id": "L1045", "conversation_id": "L1044", "text": "They do not!", "speaker": "u0", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "not", "tag": "RB", "dep": "neg", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": "L1044", "timestamp": null, "vectors": []}\n'
b'{"id": "L1044", "conversation_id": "L1044", "text": "They do to!", "speaker": "u2", "meta": {"movie_id": "m0", "parsed": [{"rt": 1, "toks": [{"tok": "They", "tag": "PRP", "dep": "nsubj", "up": 1, "dn": []}, {"tok": "do", "tag": "VBP", "dep": "ROOT", "dn": [0, 2, 3]}, {"tok": "to", "tag": "TO", "dep": "dobj", "up": 1, "dn": []}, {"tok": "!", "tag": ".", "dep": "punct", "up": 1, "dn": []}]}]}, "reply-to": null, "timestamp": null, "vectors": []}\n'
b'{"id": "L985", "conversation_id": "L984", "text": "I hope so.", "speaker": "u0", "meta": {

Creating a formatted data file

In [None]:
# Splits each line of the file to create lines and conversations
def loadLinesAndConversations(fileName):
    lines = {}
    conversations = {}
    with open(fileName, 'r', encoding='iso-8859-1') as f:
        for line in f:
            lineJson = json.loads(line)
            # Extract fields for line object
            lineObj = {}
            lineObj["lineID"] = lineJson["id"]
            lineObj["characterID"] = lineJson["speaker"]
            lineObj["text"] = lineJson["text"]
            lines[lineObj['lineID']] = lineObj

            # Extract fields for conversation object
            if lineJson["conversation_id"] not in conversations:
                convObj = {}
                convObj["conversationID"] = lineJson["conversation_id"]
                convObj["movieID"] = lineJson["meta"]["movie_id"]
                convObj["lines"] = [lineObj]
            else:
                convObj = conversations[lineJson["conversation_id"]]
                convObj["lines"].insert(0, lineObj)
            conversations[convObj["conversationID"]] = convObj

    return lines, conversations


# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations.values():
        # Iterate over all the lines of the conversation
        for i in range(len(conversation["lines"]) - 1):  # We ignore the last line (no answer for it)
            inputLine = conversation["lines"][i]["text"].strip()
            targetLine = conversation["lines"][i+1]["text"].strip()
            # Filter wrong samples (if one of the lists is empty)
            if inputLine and targetLine:
                qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [None]:
# Define path to new file
datafile = os.path.join(corpus, "formatted_movie_lines.txt")

delimiter = '\t'
# Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

# Initialize lines dict and conversations dict
lines = {}
conversations = {}
# Load lines and conversations
print("\nProcessing corpus into lines and conversations...")
lines, conversations = loadLinesAndConversations(os.path.join(corpus, "utterances.jsonl"))

# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in extractSentencePairs(conversations):
        writer.writerow(pair)

# Print a sample of lines
print("\nSample lines from file:")
printLines(datafile)


Processing corpus into lines and conversations...

Writing newly formatted file...

Sample lines from file:
b'They do to!\tThey do not!\n'
b'She okay?\tI hope so.\n'
b"Wow\tLet's go.\n"
b'"I\'m kidding.  You know how sometimes you just become this ""persona""?  And you don\'t know how to quit?"\tNo\n'
b"No\tOkay -- you're gonna need to learn how to lie.\n"
b"I figured you'd get to the good stuff eventually.\tWhat good stuff?\n"
b'What good stuff?\t"The ""real you""."\n'
b'"The ""real you""."\tLike my fear of wearing pastels?\n'
b'do you listen to this crap?\tWhat crap?\n'
b"What crap?\tMe.  This endless ...blonde babble. I'm like, boring myself.\n"


Loading and trimming the data

In [None]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [None]:
MAX_LENGTH = 10  # Maximum sentence length to consider

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Read query/response pairs and return a voc object
def readVocs(datafile, corpus_name):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    voc = Voc(corpus_name)
    return voc, pairs

# Returns True if both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

# Filter pairs using the ``filterPair`` condition
def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

# Using the functions defined above, return a populated voc object and pairs list
def loadPrepareData(corpus, corpus_name, datafile, save_dir):
    print("Start preparing training data ...")
    voc, pairs = readVocs(datafile, corpus_name)
    print("Read {!s} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {!s} sentence pairs".format(len(pairs)))
    print("Counting words...")
    for pair in pairs:
        voc.addSentence(pair[0])
        voc.addSentence(pair[1])
    print("Counted words:", voc.num_words)
    return voc, pairs


# Load/Assemble voc and pairs
save_dir = os.path.join("data", "save")
voc, pairs = loadPrepareData(corpus, corpus_name, datafile, save_dir)
# Print some pairs to validate
print("\npairs:")
for pair in pairs[:10]:
    print(pair)

Start preparing training data ...
Reading lines...
Read 221282 sentence pairs
Trimmed to 64313 sentence pairs
Counting words...
Counted words: 18082

pairs:
['they do to !', 'they do not !']
['she okay ?', 'i hope so .']
['wow', 'let s go .']
['what good stuff ?', 'the real you .']
['the real you .', 'like my fear of wearing pastels ?']
['do you listen to this crap ?', 'what crap ?']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['have fun tonight ?', 'tons']


In [None]:
MIN_COUNT = 3    # Minimum word count threshold for trimming

def trimRareWords(voc, pairs, MIN_COUNT):
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    # Filter out pairs with trimmed words
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        # Check input sentence
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        # Check output sentence
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
    return keep_pairs


# Trim voc and pairs
pairs = trimRareWords(voc, pairs, MIN_COUNT)

keep_words 7833 / 18079 = 0.4333
Trimmed from 64313 pairs to 53131, 0.8261 of total


### Prepare Data for Models

In [None]:
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]


def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))

def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == PAD_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)
    return padVar, lengths

# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.BoolTensor(mask)
    padVar = torch.LongTensor(padList)
    return padVar, mask, max_target_len

# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):
    pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len


# Example for validation
small_batch_size = 5
batches = batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len = batches

print("input_variable:", input_variable)
print("lengths:", lengths)
print("target_variable:", target_variable)
print("mask:", mask)
print("max_target_len:", max_target_len)

input_variable: tensor([[  24,   24, 1012,   36,  658],
        [   4,  246,   67,   17,   14],
        [  79,  135,   90,  380,    2],
        [ 606, 1805, 3210,   14,    0],
        [ 284,  160, 1012,    2,    0],
        [ 900,   99,   10,    0,    0],
        [ 307, 1845,    2,    0,    0],
        [ 738,   14,    0,    0,    0],
        [  14,    2,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([10,  9,  7,  5,  3])
target_variable: tensor([[1684,  104, 1250,   11,   20],
        [  14,  246,  449,  208,  658],
        [   2,  135,  113,  135,  681],
        [   0,  136, 1257,  136,   14],
        [   0,   72,  160,    5, 7011],
        [   0,    5,   14,  186,   14],
        [   0,   14,    2,   92,    2],
        [   0,    2,    0,   66,    0],
        [   0,    0,    0,   14,    0],
        [   0,    0,    0,    2,    0]])
mask: tensor([[ True,  True,  True,  True,  True],
        [ True,  True,  True,  True,  True],
        [ True,  True,  True,  

### Models

#### Seq2Seq Model

##### Encoder

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size parameters are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=dropout, bidirectional=True)

    def forward(self, input_seq: torch.Tensor, input_lengths: torch.Tensor):
        hidden = torch.zeros(self.n_layers * 2, input_seq.shape[1], self.hidden_size).to(input_seq.device)
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden

##### Luong Attention Layer

In [None]:
# Luong attention layer
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size, hidden_size)
        self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        attn_energies = torch.randn(1,1,1)

        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

##### Decoder

In [None]:
class LuongAttnDecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1):
        super(LuongAttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step: torch.Tensor, last_hidden: torch.Tensor, encoder_outputs: torch.Tensor):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

### Training Procedure

#### Masked Loss

In [None]:
def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

#### Single Training Iteration

In [None]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio, max_length=MAX_LENGTH, record=False):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to('cpu')

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Recording loss on wandb
    if record:
        wandb.log({"loss": sum(print_losses) / n_totals})

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

#### Training iterations

In [None]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, teacher_forcing_ratio, hidden_size, corpus_name, record=False):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip, teacher_forcing_ratio, record=record)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

### Evaluation Procedure

#### Greedy Search Decoder

In [None]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq: torch.Tensor, input_length: torch.Tensor, max_length: int):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, input_seq.shape[1], device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

#### Evaluate text

In [None]:
def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to('cpu')
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words

# Evaluate inputs from user input (``stdin``)
def evaluateInput(searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

# Normalize input sentence and call ``evaluate()``
def evaluateExample(sentence, searcher, voc):
    print("> " + sentence)
    # Normalize sentence
    input_sentence = normalizeString(sentence)
    # Evaluate sentence
    output_words = evaluate(searcher, voc, input_sentence)
    output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
    print('Bot:', ' '.join(output_words))

### Run Model

In [None]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


#### Run Training

In [None]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 100
save_every = 500

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

# If you have CUDA, configure CUDA to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, teacher_forcing_ratio, hidden_size, corpus_name)

Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 100; Percent complete: 2.5%; Average loss: 5.0211
Iteration: 200; Percent complete: 5.0%; Average loss: 4.2475
Iteration: 300; Percent complete: 7.5%; Average loss: 3.9929
Iteration: 400; Percent complete: 10.0%; Average loss: 3.8505
Iteration: 500; Percent complete: 12.5%; Average loss: 3.7781
Iteration: 600; Percent complete: 15.0%; Average loss: 3.6838
Iteration: 700; Percent complete: 17.5%; Average loss: 3.6264
Iteration: 800; Percent complete: 20.0%; Average loss: 3.5803
Iteration: 900; Percent complete: 22.5%; Average loss: 3.5066
Iteration: 1000; Percent complete: 25.0%; Average loss: 3.4795
Iteration: 1100; Percent complete: 27.5%; Average loss: 3.4608
Iteration: 1200; Percent complete: 30.0%; Average loss: 3.3994
Iteration: 1300; Percent complete: 32.5%; Average loss: 3.3778
Iteration: 1400; Percent complete: 35.0%; Average loss: 3.3276
Iteration: 1500; Percent complete: 37.5%; Average loss: 3.

#### Run Evaluation

In [None]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
evaluateInput(searcher, voc)

> hi
Bot: hi . s van day .
> how are you?
Bot: i m fine . . . .
> what's up?
Bot: you re a good man . it .
> quit


## Problem 1.2-1.4

### Installing `wandb`

In [None]:
!pip install wandb --upgrade



Importing `wandb` and logging in

In [None]:
import wandb

wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Define Sweep Configuration

In [None]:
sweep_config = {
    'method': 'random'
    }

In [None]:
metric = {
    'name': 'loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

In [None]:
parameters_dict = {
      'learning_rate': {
          'values': [0.0001, 0.00025, 0.0005, 0.001]
      },
      'optimizer': {
          'values': ["adam", "sgd"]
      },
      'clip': {
          'values': [0, 25, 50, 100]
      },
      'teacher_forcing_ratio': {
          'values': [0, 0.5, 1.0]
      },
      'decoder_learning_ratio':{
          'values': [1.0, 3.0, 5.0, 10.0]
      }
    }

sweep_config['parameters'] = parameters_dict

In [None]:
import pprint

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'minimize', 'name': 'loss'},
 'parameters': {'clip': {'values': [0, 25, 50, 100]},
                'decoder_learning_ratio': {'values': [1.0, 3.0, 5.0, 10.0]},
                'learning_rate': {'values': [0.0001, 0.00025, 0.0005, 0.001]},
                'optimizer': {'values': ['adam', 'sgd']},
                'teacher_forcing_ratio': {'values': [0, 0.5, 1.0]}}}


Creating a train and record function, that initializes `wandb` recording, and runs training on the model

In [None]:
def train_and_record():
    run = wandb.init(project="W&BProjectName", entity="W&BUserName")
    config = run.config

    # Configure models
    model_name = 'cb_model'
    attn_model = 'dot'
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    batch_size = 64

    print('Building encoder and decoder ...')
    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)

    # Initialize encoder & decoder models
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)

    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('Models built and ready to go!')

    # Configure training/optimization
    clip = config.clip
    teacher_forcing_ratio = config.teacher_forcing_ratio
    learning_rate = config.learning_rate
    decoder_learning_ratio = config.decoder_learning_ratio
    n_iteration = 4000
    # Changed to not populate the entire console with loss outputs
    print_every = 1000
    # Fix to NOT save model while running wandb
    save_every = 5000

    # Ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    # Initialize optimizers
    print('Building optimizers ...')
    if config.optimizer == 'adam':
        encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
    elif config.optimizer == 'sgd':
        encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

    # If you have CUDA, configure CUDA to call
    for state in encoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    for state in decoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    # Run training iterations
    print("Starting Training!")
    trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
            embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
            print_every, save_every, clip, teacher_forcing_ratio, hidden_size, corpus_name, record=True)

### Run Hyperparameter Sweep

In [None]:
sweep_id = wandb.sweep(sweep_config, project="hpml-chatbot")

Create sweep with ID: gv10awn2
Sweep URL: https://wandb.ai/nyu-hpml/hpml-chatbot/sweeps/gv10awn2


In [29]:
wandb.agent(sweep_id, function=train_and_record, count=25)

[34m[1mwandb[0m: Agent Starting Run: ci913wk0 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mrrm9598[0m. Use [1m`wandb login --relogin`[0m to force relogin


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 7.0455
Iteration: 2000; Percent complete: 50.0%; Average loss: 5.6932
Iteration: 3000; Percent complete: 75.0%; Average loss: 5.2878
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.0883


0,1
loss,███▇▅▄▄▄▃▃▃▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▁▂▁▁▁▁▂▂▂▁▂▁▁▂

0,1
loss,5.04522


[34m[1mwandb[0m: Agent Starting Run: sypg6rwf with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.2744
Iteration: 2000; Percent complete: 50.0%; Average loss: 6.4895
Iteration: 3000; Percent complete: 75.0%; Average loss: 6.0242
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.7004


0,1
loss,███▆▅▄▃▃▃▃▃▃▃▃▃▃▂▃▃▃▂▂▃▂▂▂▂▂▂▁▂▂▂▂▂▁▁▁▁▁

0,1
loss,5.68057


[34m[1mwandb[0m: Agent Starting Run: kwns0dtx with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 10
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9637
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9636
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9637
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9637


0,1
loss,▃▄▃▅▄▅▄▁▆▂▃▆▄▂▅▂▅▃▁▄▃▄▂▄▅▃▄█▄▅▄▄▂▄▂▂▃▆▃▄

0,1
loss,8.96294


[34m[1mwandb[0m: Agent Starting Run: gs60l9xa with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9584
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9583
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9583
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9583


0,1
loss,▅▁▆▃▆▅▁▃▂▅▅▅▅▄█▇▇▅▄▄█▄▃▅▇▂▂▆▅▇▆▄▅▄▄▅▄▄▄▅

0,1
loss,8.95523


[34m[1mwandb[0m: Agent Starting Run: 7e0y907q with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 5.0509
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.8957
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.7213
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5114


0,1
loss,██▇▆▇▅▅█▅▆▆▅▄▆▆▅▅▅▄▆▅▆▅▅▅▄▄▆▇▄▅▅▅▃▄▄▄▃▁▃

0,1
loss,4.54506


[34m[1mwandb[0m: Agent Starting Run: 7p62xl7g with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 5.1616
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.7487
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.6950
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.6629


0,1
loss,█▂▃▂▂▂▂▂▁▂▂▂▂▁▁▁▁▁▂▁▂▁▂▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁

0,1
loss,4.54752


[34m[1mwandb[0m: Agent Starting Run: toncuq80 with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 10
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.9804
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.8257
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.7935
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.8363


0,1
loss,▅█▆▂▂▅▂▁▃▇▄▅▆▂▆▆▄▆▄▅▆▆▂▆█▇▂▇▂▃▃▂█▇▅▅▂▃▄▄

0,1
loss,5.68426


[34m[1mwandb[0m: Agent Starting Run: 6jykk190 with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.8235
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.5969
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.5651
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.5123


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss,█▄▂▂▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▂▂▁▁▂▂▂▂▂▂▂▂▁

0,1
loss,4.46142


[34m[1mwandb[0m: Agent Starting Run: mn0jgr6r with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 5.8261
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.9082
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.7576
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.6801


0,1
loss,█▆▅▃▃▃▃▃▂▂▃▂▃▃▂▂▂▂▂▂▃▂▁▂▂▂▁▂▂▂▂▁▂▂▁▁▁▁▂▁

0,1
loss,5.02757


[34m[1mwandb[0m: Agent Starting Run: zdght34v with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 3.7578
Iteration: 2000; Percent complete: 50.0%; Average loss: 3.0330
Iteration: 3000; Percent complete: 75.0%; Average loss: 2.5742
Iteration: 4000; Percent complete: 100.0%; Average loss: 2.1613


0,1
loss,█▇▆▆▆▅▅▆▆▅▅▆▅▅▅▅▄▅▄▄▄▃▃▄▄▃▃▃▃▃▃▂▃▃▂▂▂▂▂▁

0,1
loss,2.01315


[34m[1mwandb[0m: Agent Starting Run: ejee6dyz with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 5.2955
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.8008
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.7284
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.7004


0,1
loss,█▇▃▃▃▃▂▂▂▂▃▃▂▂▂▂▂▃▂▂▂▂▁▂▂▂▂▂▂▂▂▁▁▂▂▁▂▂▃▂

0,1
loss,4.61409


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: q72x4a70 with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.3837
Iteration: 2000; Percent complete: 50.0%; Average loss: 6.5788
Iteration: 3000; Percent complete: 75.0%; Average loss: 6.0874
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.7312


0,1
loss,██████▇▇▇▇▄▄▃▃▄▃▃▂▃▂▃▂▂▃▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁

0,1
loss,5.5288


[34m[1mwandb[0m: Agent Starting Run: pw9ba5af with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 6.8362
Iteration: 2000; Percent complete: 50.0%; Average loss: 5.7641
Iteration: 3000; Percent complete: 75.0%; Average loss: 5.3650
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.1645


0,1
loss,█▆▅▄▄▄▄▃▃▃▃▃▂▂▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
loss,5.00222


[34m[1mwandb[0m: Agent Starting Run: 2rgfnvus with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 5.2995
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.8106
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.7302
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.7020


0,1
loss,█▄▃▃▄▃▃▃▃▂▂▂▃▃▂▂▁▁▃▁▂▂▂▂▂▂▁▁▂▁▁▂▂▂▂▃▂▂▂▂

0,1
loss,4.55736


[34m[1mwandb[0m: Agent Starting Run: saa7wvwi with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9829
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9827
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9827
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9827


0,1
loss,█▅▃▂▁▆▃▄▆▅▂▃▄▄▂▃▄▆▅▄▅▆▅▃█▃▆▂▄▅▅▂▄▅▅▅▄▂▅▆

0,1
loss,8.98237


[34m[1mwandb[0m: Agent Starting Run: lbi9olgj with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.6219
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.0759
Iteration: 3000; Percent complete: 75.0%; Average loss: 3.7695
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.4625


0,1
loss,█▆▆▆▆▄▆▄▄▄▅▅▅▃▅▃▃▃▃▂▅▅▃▂▅▄▄▂▁▂▁▄▁▄▄▄▄▃▁▁

0,1
loss,3.62506


[34m[1mwandb[0m: Agent Starting Run: m32thjkw with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.7667
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.5798
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.4487
Iteration: 4000; Percent complete: 100.0%; Average loss: 4.2706


0,1
loss,█▇▇▇▆▆▆▆▅▆▇▅▅▆▄▆▄▅▅▅▄▄▄▅▄▅▄▃▅▃▄▃▄▂▂▂▃▃▃▁

0,1
loss,4.24082


[34m[1mwandb[0m: Agent Starting Run: srxegavg with config:
[34m[1mwandb[0m: 	clip: 50
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 7.0867
Iteration: 2000; Percent complete: 50.0%; Average loss: 5.6834
Iteration: 3000; Percent complete: 75.0%; Average loss: 5.2767
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.0796


0,1
loss,██▇▄▅▄▄▄▃▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▂▁▂▂▁▁▁▂▁▁

0,1
loss,4.80389


[34m[1mwandb[0m: Agent Starting Run: 5c6ud02d with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.1190
Iteration: 2000; Percent complete: 50.0%; Average loss: 3.5663
Iteration: 3000; Percent complete: 75.0%; Average loss: 3.3127
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.1376


0,1
loss,█▆▄▆▅▅▅▅▅▅▃▄▄▃▄▃▃▅▃▃▂▃▃▃▂▃▂▃▃▂▂▂▃▂▃▂▃▃▃▁

0,1
loss,3.28074


[34m[1mwandb[0m: Agent Starting Run: ytbavdsy with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 4.5678
Iteration: 2000; Percent complete: 50.0%; Average loss: 4.2062
Iteration: 3000; Percent complete: 75.0%; Average loss: 4.0256
Iteration: 4000; Percent complete: 100.0%; Average loss: 3.9041


0,1
loss,█▄▄▄▃▃▃▃▄▃▃▃▂▃▃▄▄▄▂▂▂▂▄▄▄▃▄▃▂▂▄▂▃▁▂▁▁▄▁▁

0,1
loss,3.29276


[34m[1mwandb[0m: Agent Starting Run: 0v2a6d91 with config:
[34m[1mwandb[0m: 	clip: 25
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.00025
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 1


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 3.7541
Iteration: 2000; Percent complete: 50.0%; Average loss: 3.0946
Iteration: 3000; Percent complete: 75.0%; Average loss: 2.6655
Iteration: 4000; Percent complete: 100.0%; Average loss: 2.3014


0,1
loss,█▇▇▇▇▆▇▆▄▄▅▄▄▄▄▄▄▄▄▄▃▃▃▂▃▂▁▃▂▂▂▂▂▂▂▁▂▂▂▁

0,1
loss,2.07931


[34m[1mwandb[0m: Agent Starting Run: gu21tst0 with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 3
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9636
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9637
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9637
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9636


0,1
loss,▃▆▄▄▅▃▅▃▄▃▃▅▂▇▂▅▄▃▅▄▄▄█▅▅▅▃▁▅▄▄▃▆▆▇▇▇▅▄▂

0,1
loss,8.96125


[34m[1mwandb[0m: Agent Starting Run: 30631d1e with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9707
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9708
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9708
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9706


0,1
loss,▅█▇▂▁▇▃▄▃▄▂█▇▄▄▆▃▆▇▅▄▅▅█▄▅▄▅▅▂▄▄▆▃▅▇▅▅▆▃

0,1
loss,8.96898


[34m[1mwandb[0m: Agent Starting Run: hd02ryps with config:
[34m[1mwandb[0m: 	clip: 100
[34m[1mwandb[0m: 	decoder_learning_ratio: 1
[34m[1mwandb[0m: 	learning_rate: 0.0001
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 7.8085
Iteration: 2000; Percent complete: 50.0%; Average loss: 6.4499
Iteration: 3000; Percent complete: 75.0%; Average loss: 6.0513
Iteration: 4000; Percent complete: 100.0%; Average loss: 5.7692


0,1
loss,█████▇▅▃▃▃▃▄▃▃▃▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁

0,1
loss,5.74151


[34m[1mwandb[0m: Agent Starting Run: o74esbax with config:
[34m[1mwandb[0m: 	clip: 0
[34m[1mwandb[0m: 	decoder_learning_ratio: 5
[34m[1mwandb[0m: 	learning_rate: 0.0005
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	teacher_forcing_ratio: 0.5


Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 1000; Percent complete: 25.0%; Average loss: 8.9786
Iteration: 2000; Percent complete: 50.0%; Average loss: 8.9786
Iteration: 3000; Percent complete: 75.0%; Average loss: 8.9785
Iteration: 4000; Percent complete: 100.0%; Average loss: 8.9786


0,1
loss,▆▅▄▅█▅█▅▇▅▇▇▅▄▂▅▄▆▆▆▄▅▃▄▄▇▆▅▄▅▆▄▅▅▅▅█▇▅▁

0,1
loss,8.973


## Problem 1.5

The best configuration is the one with the lowest loss.

Lowest Loss =  2.01315

Configuration:

- clip = 50
- decoder_learning_ratio = 3
- learning_rate = 0.0005
- optimizer = adam
- teacher_forcing_ratio = 1

In [30]:
def run_best_model(clip=50, decoder_learning_ratio=3, learning_rate=0.0005, optimizer='adam', teacher_forcing_ratio=1):
    # Configure models
    model_name = 'cb_model'
    attn_model = 'dot'
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    batch_size = 64

    print('Building encoder and decoder ...')
    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)

    # Initialize encoder & decoder models
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)

    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('Models built and ready to go!')

    # Configure training/optimization
    clip = clip
    teacher_forcing_ratio = teacher_forcing_ratio
    learning_rate = learning_rate
    decoder_learning_ratio = decoder_learning_ratio
    n_iteration = 4000
    # Changed to not populate the entire console with loss outputs
    print_every = 500
    save_every = 1000

    # Ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    # Initialize optimizers
    print('Building optimizers ...')
    if optimizer == 'adam':
        encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
    elif optimizer == 'sgd':
        encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
        decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

    # If you have CUDA, configure CUDA to call
    for state in encoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    for state in decoder_optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.cuda()

    # Run training iterations
    print("Starting Training!")
    trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
            embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size,
            print_every, save_every, clip, teacher_forcing_ratio, hidden_size, corpus_name)

    return encoder, decoder

In [31]:
best_model_enc, best_model_dec = run_best_model()

Building encoder and decoder ...
Models built and ready to go!
Building optimizers ...
Starting Training!
Initializing ...
Training...
Iteration: 500; Percent complete: 12.5%; Average loss: 4.0238
Iteration: 1000; Percent complete: 25.0%; Average loss: 3.4242
Iteration: 1500; Percent complete: 37.5%; Average loss: 3.1569
Iteration: 2000; Percent complete: 50.0%; Average loss: 2.8885
Iteration: 2500; Percent complete: 62.5%; Average loss: 2.6317
Iteration: 3000; Percent complete: 75.0%; Average loss: 2.4166
Iteration: 3500; Percent complete: 87.5%; Average loss: 2.2310
Iteration: 4000; Percent complete: 100.0%; Average loss: 2.0535


## Problem 1.6-1.7

Importing libraries

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

Profiling CPU Time and CUDA Time

In [None]:
best_model_enc.eval()
best_model_dec.eval()

input_sentence = 'hello how are you?'
print('> ' + input_sentence)

input_sentence = normalizeString(input_sentence)
best_model_searcher = GreedySearchDecoder(best_model_enc, best_model_dec)

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True, record_shapes=True) as prof:
    with record_function("model_inference"):
        output_words = evaluate(best_model_searcher, voc, input_sentence)
        output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
        print('Bot:', ' '.join(output_words))

In [34]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        50.86%      20.131ms        99.92%      39.549ms      39.549ms       0.000us         0.00%       4.342ms       4.342ms           0 b      -7.96 Kb           0 b      -1.03 M

In [35]:
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                                                      Input Shapes  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------------------------------------------------------------------

In [36]:
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us     122.752ms      2826.77%     122.752ms      13.639ms           0 b           0 b           0 b           0 

Profiling CPU Memory and CUDA Memory

In [37]:
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         1.53%     605.766us         1.53%     605.766us       6.310us       0.000us         0.00%       0.000us       0.000us       7.96 Kb       7.96 Kb     344.85 Mb     344.85 M

In [38]:
print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         1.53%     605.766us         1.53%     605.766us       6.310us       0.000us         0.00%       0.000us       0.000us       7.96 Kb       7.96 Kb     344.85 Mb     344.85 M

Exporting the chrome trace

In [39]:
prof.export_chrome_trace("trace.json")

## Problem 2.3

### TorchScript Greedy Search Decoder

In [40]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder, decoder_n_layers):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self._device = device
        self._SOS_token = SOS_token
        self._decoder_n_layers = decoder_n_layers

    __constants__ = ['_device', '_SOS_token', '_decoder_n_layers']

    def forward(self, input_seq: torch.Tensor, input_length: torch.Tensor, max_length: int):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:self._decoder_n_layers]
        # Initialize decoder input with SOS_token

        decoder_input = torch.ones(1, input_seq.shape[1], device=self._device, dtype=torch.long) * self._SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=self._device, dtype=torch.long)
        all_scores = torch.zeros([0], device=self._device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

### TorchScript Model

In [57]:
device = torch.device('cpu')

In [58]:
save_dir = os.path.join("data", "save")
corpus_name = "movie-corpus"

# Configure models
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

checkpoint_iter = 4000

loadFilename = os.path.join(save_dir, model_name, corpus_name,
                         '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                         '{}_checkpoint.tar'.format(checkpoint_iter))

# Load model
# Force CPU device options (to match tensors in this tutorial)
checkpoint = torch.load(loadFilename, map_location=device)
encoder_sd = checkpoint['en']
decoder_sd = checkpoint['de']
encoder_optimizer_sd = checkpoint['en_opt']
decoder_optimizer_sd = checkpoint['de_opt']
embedding_sd = checkpoint['embedding']
voc = Voc(corpus_name)
voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# Load trained model parameters
encoder.load_state_dict(encoder_sd)
decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()
print('Models built and ready to go!')

  checkpoint = torch.load(loadFilename, map_location=device)


Building encoder and decoder ...
Models built and ready to go!


In [59]:
### Compile the whole greedy search model to TorchScript model
# Create artificial inputs
test_seq = torch.LongTensor(MAX_LENGTH, 1).random_(0, voc.num_words).to(device)
test_seq_length = torch.LongTensor([test_seq.size()[0]]).to(torch.device('cpu'))
# Trace the model
traced_encoder = torch.jit.trace(encoder, (test_seq, test_seq_length))

### Convert decoder model
# Create and generate artificial inputs
test_encoder_outputs, test_encoder_hidden = traced_encoder(test_seq, test_seq_length)
test_decoder_hidden = test_encoder_hidden[:decoder.n_layers]
test_decoder_input = torch.LongTensor(1, 1).random_(0, voc.num_words)

# Move the test inputs to the same device as the model (GPU)
test_encoder_outputs = test_encoder_outputs.to(device)
test_decoder_hidden = test_decoder_hidden.to(device)
test_decoder_input = test_decoder_input.to(device)

# Trace the model
traced_decoder = torch.jit.trace(decoder, (test_decoder_input, test_decoder_hidden, test_encoder_outputs))

### Initialize searcher module by wrapping ``torch.jit.script`` call
scripted_searcher = torch.jit.script(GreedySearchDecoder(traced_encoder, traced_decoder, decoder.n_layers))

  if a.grad is not None:


## Problem 2.4

In [60]:
print('scripted_searcher graph:\n', scripted_searcher.graph)

scripted_searcher graph:
 graph(%self : __torch__.___torch_mangle_60.GreedySearchDecoder,
      %input_seq.1 : Tensor,
      %input_length.1 : Tensor,
      %max_length.1 : int):
  %56 : bool = prim::Constant[value=0]()
  %45 : bool = prim::Constant[value=1]() # <ipython-input-40-22c2fc12193e>:24:8
  %21 : int = prim::Constant[value=4]() # <ipython-input-40-22c2fc12193e>:19:85
  %20 : Device = prim::Constant[value="cpu"]() # <ipython-input-40-22c2fc12193e>:19:65
  %14 : NoneType = prim::Constant()
  %12 : int = prim::Constant[value=2]() # <ipython-input-40-22c2fc12193e>:16:41
  %16 : int = prim::Constant[value=1]() # <ipython-input-40-22c2fc12193e>:19:35
  %29 : int = prim::Constant[value=0]() # <ipython-input-40-22c2fc12193e>:21:34
  %encoder : __torch__.___torch_mangle_40.EncoderRNN = prim::GetAttr[name="encoder"](%self)
  %7 : (Tensor, Tensor) = prim::CallMethod[name="forward"](%encoder, %input_seq.1, %input_length.1) # <ipython-input-40-22c2fc12193e>:14:42
  %encoder_outputs.1 : Te

## Problem 2.5

In [61]:
# Use appropriate device
scripted_searcher.to(device)
# Set dropout layers to ``eval`` mode
scripted_searcher.eval()

# Evaluate examples
sentences = ["hello", "what's up?", "who are you?", "where am I?", "where are you from?"]
for s in sentences:
    evaluateExample(s, scripted_searcher, voc)

> hello
Bot: hello . s a lovely .
> what's up?
Bot: harry . . . . .
> who are you?
Bot: thomas kent . i m with .
> where am I?
Bot: i can t . the past . .
> where are you from?
Bot: south boston . . hospital .


## Problem 2.6

In [62]:
def evaluateAndTime(searcher, eval_batches, n_iteration, device, max_length=MAX_LENGTH):
    time_diffs = []
    searcher = searcher.to(device)
    for iteration in range(n_iteration):
        input_variable, lengths, target_variable, mask, max_target_len = eval_batches[iteration]
        input_variable = input_variable.to(device)
        lengths = lengths.to('cpu')
        if iteration > 5:
            torch.cuda.synchronize()
            start = time.monotonic_ns()
        with torch.no_grad():
            output = searcher(input_variable, lengths, max_length)
        if iteration > 5:
            torch.cuda.synchronize()
            end = time.monotonic_ns()
        if iteration > 5:
            time_diffs.append(end - start)
    return np.array(time_diffs) / 1_000_000

In [63]:
n_iteration = 100

eval_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(64)])
                    for _ in range(n_iteration)]

In [64]:
device = torch.device('cuda')

regular_gpu = evaluateAndTime(best_model_searcher, eval_batches, n_iteration, device)

print(regular_gpu.mean())

14.70044623404255


In [65]:
device = torch.device('cpu')

regular_cpu = evaluateAndTime(best_model_searcher, eval_batches, n_iteration, device)

print(regular_cpu.mean())

286.13959080851066


In [66]:
device = torch.device('cpu')

torchscript_cpu = evaluateAndTime(scripted_searcher, eval_batches, n_iteration, device)

print(torchscript_cpu.mean())

280.9392284680851


In [67]:
scripted_searcher.save("scripted_chatbot_cpu.pth")

### TorchScript Model for GPU

In [68]:
device = torch.device('cuda')

save_dir = os.path.join("data", "save")
corpus_name = "movie-corpus"

# Configure models
model_name = 'cb_model'
attn_model = 'dot'
hidden_size = 500
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

checkpoint_iter = 4000

loadFilename = os.path.join(save_dir, model_name, corpus_name,
                         '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
                         '{}_checkpoint.tar'.format(checkpoint_iter))

# Load model
# Force CPU device options (to match tensors in this tutorial)
checkpoint = torch.load(loadFilename, map_location=device)
encoder_sd = checkpoint['en']
decoder_sd = checkpoint['de']
encoder_optimizer_sd = checkpoint['en_opt']
decoder_optimizer_sd = checkpoint['de_opt']
embedding_sd = checkpoint['embedding']
voc = Voc(corpus_name)
voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
# Load trained model parameters
encoder.load_state_dict(encoder_sd)
decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()
print('Models built and ready to go!')

### Compile the whole greedy search model to TorchScript model
# Create artificial inputs
test_seq = torch.LongTensor(MAX_LENGTH, 1).random_(0, voc.num_words).to(device)
test_seq_length = torch.LongTensor([test_seq.size()[0]]).to(torch.device('cpu'))
# Trace the model
traced_encoder = torch.jit.trace(encoder, (test_seq, test_seq_length))

### Convert decoder model
# Create and generate artificial inputs
test_encoder_outputs, test_encoder_hidden = traced_encoder(test_seq, test_seq_length)
test_decoder_hidden = test_encoder_hidden[:decoder.n_layers]
test_decoder_input = torch.LongTensor(1, 1).random_(0, voc.num_words)

# Move the test inputs to the same device as the model (GPU)
test_encoder_outputs = test_encoder_outputs.to(device)
test_decoder_hidden = test_decoder_hidden.to(device)
test_decoder_input = test_decoder_input.to(device)

# Trace the model
traced_decoder = torch.jit.trace(decoder, (test_decoder_input, test_decoder_hidden, test_encoder_outputs))

### Initialize searcher module by wrapping ``torch.jit.script`` call
scripted_searcher = torch.jit.script(GreedySearchDecoder(traced_encoder, traced_decoder, decoder.n_layers))

print('scripted_searcher graph:\n', scripted_searcher.graph)

  checkpoint = torch.load(loadFilename, map_location=device)


Building encoder and decoder ...
Models built and ready to go!
scripted_searcher graph:
 graph(%self : __torch__.___torch_mangle_83.GreedySearchDecoder,
      %input_seq.1 : Tensor,
      %input_length.1 : Tensor,
      %max_length.1 : int):
  %56 : bool = prim::Constant[value=0]()
  %45 : bool = prim::Constant[value=1]() # <ipython-input-40-22c2fc12193e>:24:8
  %21 : int = prim::Constant[value=4]() # <ipython-input-40-22c2fc12193e>:19:85
  %20 : Device = prim::Constant[value="cuda"]() # <ipython-input-40-22c2fc12193e>:19:65
  %14 : NoneType = prim::Constant()
  %12 : int = prim::Constant[value=2]() # <ipython-input-40-22c2fc12193e>:16:41
  %16 : int = prim::Constant[value=1]() # <ipython-input-40-22c2fc12193e>:19:35
  %29 : int = prim::Constant[value=0]() # <ipython-input-40-22c2fc12193e>:21:34
  %encoder : __torch__.___torch_mangle_63.EncoderRNN = prim::GetAttr[name="encoder"](%self)
  %7 : (Tensor, Tensor) = prim::CallMethod[name="forward"](%encoder, %input_seq.1, %input_length.1) #

In [69]:
device = torch.device('cuda')

torchscript_gpu = evaluateAndTime(scripted_searcher, eval_batches, n_iteration, device)

print(torchscript_gpu.mean())

9.89043485106383


In [70]:
import pandas as pd

timing_table = pd.DataFrame({
    'Framework': ['PyTorch', 'TorchScript'],
    'Latency on CPU (ms)': [regular_cpu.mean(), torchscript_cpu.mean()],
    'Latency on GPU (ms)': [regular_gpu.mean(), torchscript_gpu.mean()]
})

### Timing Table

In [71]:
timing_table

Unnamed: 0,Framework,Latency on CPU (ms),Latency on GPU (ms)
0,PyTorch,286.139591,14.700446
1,TorchScript,280.939228,9.890435


## Problem 2.7

In [72]:
scripted_searcher.save("scripted_chatbot_gpu.pth")