In [86]:
!pip install spacy
!pip install wget



In [87]:
import torch
import pandas as pd
import json
import tqdm
from spacy.lang.en import English
import os
import random
import wget
import torch.nn as nn
import torch.nn.functional as F
import itertools
import torch.optim as optim

In [88]:
train_path = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json'
test_path = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'

In [89]:
tokenizer = English()

In [90]:
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json

wget.download(train_path)

'train-v2.0 (2).json'

In [91]:
def open_file(file):
  with open(file, 'r') as f:
    lines = f.readlines()
  for line in lines[:10]:
    print(line)

In [92]:
# open_file('train-v2.0.json')

In [93]:
df = pd.read_json('train-v2.0.json')

In [94]:
df.head()

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [95]:
df.dtypes

version    object
data       object
dtype: object

In [96]:
df['data'][0]['paragraphs'][0]['qas'][0]['question'][-2]

'r'

In [97]:
tokenizer(df['data'][0]['paragraphs'][0]['qas'][0]['question'])[-2].text

'popular'

In [98]:
len(df)

442

In [99]:
# tokenize a input sentence into list of words
def tokenize_word(sentence):
  return [token.text for token in tokenizer(sentence)]

In [100]:
# clean the input text for the future processing

def clean_text(text):
  text = text.replace("]", " ] ")
  text = text.replace("[", " [ ")
  text = text.replace("\n", " ")
  text = text.replace("''", '" ').replace("``", '" ')
  return text

In [101]:
import torch.utils.data as data

class SquadDataset(data.Dataset):
  """
    Customizing squad dataset to include the following fields for each instance:
    1. question - string describing the question
    2. answer - string describing the corresponding answer
    3. context - the relevent context of question and answer, not all question answer pairs have the same context
    4. label - a higher level context, multiple question answer pairs might have same label but different context
  """

  def __init__(self, question, answer, context, label):
    self.question = question
    self.answer = answer
    self.context = context
    self.label = label

  def __get_item__(self, index):
    return self.question[index], self.answer[index], self.context[index], self.label[index]

  def __len__(self):
    return len(self.question)

In [102]:
class PreprocessDataset():
  def __init__(self, data_dir, tokenizer, max_len = 20):
    self.data_dir = data_dir
    self.embedding_dim = 10
    self.tokenizer = tokenizer
    self.max_len = max_len

  def load_data(self, file_name):
    print(f'Loading file {file_name}...')
    with open(file_name, 'r') as f:
      self.data = json.load(f)

  def separate_data(self, file_name):
    self.load_data(file_name)
    sub_dir = 'separate_data'

    if not os.path.exists(os.path.join(self.data_dir, sub_dir)):
      os.makedirs(os.path.join(self.data_dir, sub_dir))

    # create a sub directory to store fields into separate files
    with open(os.path.join(self.data_dir, sub_dir, sub_dir + '.context'), 'w', encoding="utf-8") as context_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.question'), 'w', encoding="utf-8") as question_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.answer'), 'w', encoding="utf-8") as answer_file,\
             open(os.path.join(self.data_dir, sub_dir, sub_dir + '.label'), 'w', encoding="utf-8") as label_file:
      for i in tqdm.tqdm(range(len(self.data['data']))):
        label = self.data['data'][i]['title']
        label_token = self.tokenizer(label)
        paragraphs = self.data['data'][i]['paragraphs']

        for paragraph in paragraphs:
          context = paragraph['context']
          # clean context
          context_tokens = self.tokenizer(context)

          qas = paragraph['qas']
          for qa in qas:
            # clean qa
            question = qa['question']
            question_tokens = self.tokenizer(question)
            # skip if no answers are found
            if(qa['is_impossible'] is True):
              continue
            # selecting only one answer for now
            answer = qa['answers'][0]['text']
            answer_tokens = self.tokenizer(answer)

            if(len(question_tokens) <= self.max_len and len(answer_tokens) <= self.max_len):
              context_file.write(' '.join([token.text for token in context_tokens]) + '\n')
              question_file.write(' '.join([token.text for token in question_tokens]) + '\n')
              answer_file.write(' '.join([token.text for token in answer_tokens]) + '\n')
              label_file.write(''.join(label_token.text) + '\n')


In [103]:
preprocessor = PreprocessDataset('./', tokenizer)
preprocessor.separate_data('train-v2.0.json')

Loading file train-v2.0.json...


100%|██████████| 442/442 [00:26<00:00, 16.42it/s]


In [104]:
# vocabulary building

# giving out a unique index for these token tags
PAD_token = 0
BOS_token = 1
EOS_token = 2
UNK_token = 3

PAD_tag = '<PAD>'
BOS_tag = '<BOS>'
EOS_tag = '<EOS>'
UNK_tag = '<UNK>'

class Vocabulary:
    def __init__(self, vocab_name):
        self.vocab_name = vocab_name
        self.trimmed = False
        self.word2idx = {}
        self.word2count = {}
        self.index2word = {
            PAD_token : PAD_tag,
            BOS_token : BOS_tag,
            EOS_token : EOS_tag,
            UNK_token : UNK_tag,
        }
        self.total_word_count = 4

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2idx:
            # adding a new word to vocabulary
            self.word2idx[word] = self.total_word_count
            self.word2count[word] = 1
            self.index2word[self.total_word_count] = word
            self.total_word_count += 1
        else:
            # just increase the word count
            self.word2count[word] += 1

    def trim_words(self, min_frequency):
        if self.trimmed:
            return
        self.trimmed = True

        retain_words = []
        for word, frequency in self.word2count.items():
            if frequency >= min_frequency:
                retain_words.append(word)

        # re initialize data
        self.word2idx = {}
        self.word2count = {}
        self.index2word = {
            PAD_token : PAD_tag,
            BOS_token : BOS_tag,
            EOS_token : EOS_tag,
            UNK_token : UNK_tag,
        }
        self.total_word_count = 4

        for word in retain_words:
            self.addWord(word)     

    def count_high_frequency_words(self, min_frequency):
        count = 0
        for frequency in self.word2count.values():
            if frequency >= min_frequency:
                count += 1
        return count   

In [105]:
def create_vocab(vocab_name, file_paths, min_frequency=-1):
    vocab = Vocabulary(vocab_name)
    for file_path in file_paths:
        sentences = open(file_path, encoding='utf-8').read().strip().split('\n')
        for sentence in sentences:
            vocab.addSentence(sentence)
        if min_frequency != -1:
            vocab.trim_words(min_frequency)
    return vocab

def make_pairs(questions_path, answers_path):
    pairs = []
    with open(questions_path, encoding='utf-8') as f:
        questions = f.read().strip().split('\n')
    with open(answers_path, encoding='utf-8') as f:
        answers = f.read().strip().split('\n')
    for i in range(min(len(questions), len(answers))):
        pairs.append((questions[i], answers[i]))
    return pairs


In [106]:
squad_vocab = create_vocab('squad_vocab', ['./separate_data/separate_data.question', './separate_data/separate_data.answer'])
squad_pairs = make_pairs('./separate_data/separate_data.question', './separate_data/separate_data.answer')

In [107]:
def index_from_sentence(voc:Vocabulary, sentence:str):
    return [voc.word2idx[word] for word in sentence.split(' ')]

def pad_sentence(sentence:str, max_length = 20):
    words = sentence.split(' ')
    num_padding = max_length - len(words)
    if num_padding > 0:
        words += [PAD_tag]*num_padding
    return ' '.join([BOS_tag] + words + [EOS_tag])

def pad_sentence_batch(sentences, pad_token = PAD_token):
    return list(itertools.zip_longest(*sentences, fillvalue = pad_token))

def pad_mask(sentences, mask_token = PAD_token):
    sentences_mask = []
    for i, sentence in enumerate(sentences):
        sentence_mask = []
        for token in sentence:
            if token == mask_token:
                sentence_mask.append(1)
            else:
                sentence_mask.append(0)
        sentences_mask.append(sentence_mask)
    return sentences_mask


def batch_input_sentences(sentences, voc: Vocabulary):
    '''
        1. convert the string sentence into a list of numbers - each number is the index of a word 
           according to the vocabulary voc.
        2. pad these sentences - more description in pad_sentence_batch function
        3. return the padded - indexed - batched sentence and the length of individual sentences
    '''
    indexed_sentences = [index_from_sentence(voc, sentence) for sentence in sentences]
    sentence_lengths = [ len(sentence) for sentence in indexed_sentences]
    padded_batch = pad_sentence_batch(indexed_sentences)   
    return torch.LongTensor(padded_batch), torch.tensor(sentence_lengths)

def batch_output_sentences(sentences, voc: Vocabulary):
    indexed_sentences = [index_from_sentence(voc, sentence) for sentence in sentences]
    max_sentence_length = max([len(sentence) for sentence in indexed_sentences])
    padded_batch = pad_sentence_batch(indexed_sentences)
    batch_mask = pad_mask(padded_batch)
    batch_mask = torch.BoolTensor(batch_mask)
    return torch.LongTensor(padded_batch), batch_mask, max_sentence_length

def batch_2_train_data(voc: Vocabulary, pairs):
    pairs.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch = [], []
    for pair in pairs:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
    return batch_input_sentences(input_batch, voc), batch_output_sentences(output_batch, voc)


In [138]:
# Example for validation
small_batch_size = 5
batches = batch_2_train_data(squad_vocab, [random.choice(squad_pairs) for _ in range(small_batch_size)])
(input_variable, lengths), (target_variable, mask, max_target_len) = batches

# print("input_variable:", input_variable)
# print("lengths:", lengths)
# print("target_variable:", target_variable)
print("mask:", mask)
# print("max_target_len:", max_target_len)

mask: tensor([[False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False,  True],
        [ True, False, False,  True,  True],
        [ True, False,  True,  True,  True],
        [ True, False,  True,  True,  True],
        [ True, False,  True,  True,  True],
        [ True, False,  True,  True,  True]])


In [133]:
index_from_sentence(squad_vocab, 'How much did the second world tour make in dollars ?')

[62, 448, 5, 40, 71, 154, 414, 450, 14, 453, 10]

In [110]:
pad_sentence('How much did the second world tour make in dollars ?')

'<BOS> How much did the second world tour make in dollars ? <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <EOS>'

In [111]:
squad_vocab.total_word_count

61071

In [112]:
squad_vocab.count_high_frequency_words(4)

19048

In [113]:
# since seq-seq model is encoder decoder type 
# we'll be having two model classes - one for encoder and other for decoder

class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, num_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers,
                          dropout=(0 if num_layers == 1 else dropout), bidirectional=True)
        self.init_weights()

    def init_weights(self):
        for name, param in self.gru.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)

    def forward(self, input_seq, input_lengths, hidden=None):
        embedded = self.embedding(input_seq)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        return outputs, hidden

In [114]:
device = torch.device('cpu')

In [115]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.zeros(hidden_size))
            nn.init.xavier_uniform_(self.v.data)

        self.init_weights()

    def init_weights(self):
        if self.method in ['general', 'concat']:
            nn.init.xavier_uniform_(self.attn.weight)
            nn.init.constant_(self.attn.bias, 0)

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        attn_energies = attn_energies.t()
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

In [143]:
class DecoderRNN(nn.Module):
    def __init__(self, attn_model, embedding, hidden_size, output_size, num_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout = dropout

        # Define layers
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, dropout=(0 if num_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attn = Attn(attn_model, hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [144]:
# def maskNLLLoss(inp, target, mask, device='cpu'):
#     nTotal = mask.sum()
#     crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
#     masked_cross_entropy = crossEntropy.masked_select(mask)
#     loss = masked_cross_entropy.mean()
#     loss = loss.to(device)  # Ensure the loss tensor is on the correct device
#     print(f'Loss: {loss}, nTotal: {nTotal.item()}')
#     return loss, nTotal.item()


def maskNLLLoss(inp, target, mask):
    print(f'input: {inp}, target: {target}, mask: {mask}')
    nTotal = mask.sum()
    print(f'ntotal: {nTotal}')
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    print(f'crossEntropy: {crossEntropy}')
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    print(f'loss: {loss}, ntotal item: {nTotal.item()}')
    return loss, nTotal.item()


In [145]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500
checkpoint_path = 'model.pt'

In [146]:
def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=20):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Set device options
    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)
    # Lengths for RNN packing should always be on the CPU
    lengths = lengths.to("cpu")

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[BOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.num_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t])
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

In [147]:
# Configure models
model_name = 'cb_model'
attn_model = 'dot'
#``attn_model = 'general'``
#``attn_model = 'concat'``
hidden_size = 500
encoder_num_layers = 2
decoder_num_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 4000

In [148]:
# Load model if a ``loadFilename`` is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)
    # If loading a model trained on GPU to CPU
    #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    squad_vocab.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize word embeddings
embedding = nn.Embedding(squad_vocab.total_word_count, hidden_size)
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_num_layers, dropout)
decoder = DecoderRNN(attn_model, embedding, hidden_size, squad_vocab.total_word_count, decoder_num_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [149]:
def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_num_layers, decoder_num_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename):

    # Load batches for each iteration
    training_batches = [batch_2_train_data(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]

    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        (input_variable, lengths), (target_variable, mask, max_target_len) = training_batch

        # Run a training iteration with batch
        loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
                     decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size, clip)
        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            directory = os.path.join(save_dir, model_name, corpus_name, '{}-{}_{}'.format(encoder_num_layers, decoder_num_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

In [150]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 4000
print_every = 1
save_every = 500
corpus_name = "squad_corpus"
save_dir = './checkpoints/'

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# If you have CUDA, configure CUDA to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

# Run training iterations
print("Starting Training!")
trainIters(model_name, squad_vocab, squad_pairs, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_num_layers, decoder_num_layers, save_dir, n_iteration, batch_size,
           print_every, save_every, clip, corpus_name, loadFilename)

Building optimizers ...
Starting Training!
Initializing ...
Training...
input: tensor([[1.9277e-05, 1.5425e-05, 1.8336e-05,  ..., 1.6551e-05, 1.6097e-05,
         1.6571e-05],
        [1.7125e-05, 1.5960e-05, 1.6504e-05,  ..., 1.4276e-05, 1.6593e-05,
         1.5605e-05],
        [1.7542e-05, 1.5141e-05, 1.7152e-05,  ..., 1.6833e-05, 1.6844e-05,
         1.6282e-05],
        ...,
        [1.6273e-05, 1.6933e-05, 1.7215e-05,  ..., 1.6803e-05, 1.6515e-05,
         1.5631e-05],
        [1.7613e-05, 1.4947e-05, 1.7267e-05,  ..., 1.7244e-05, 1.7050e-05,
         1.7150e-05],
        [1.6721e-05, 1.7373e-05, 1.8328e-05,  ..., 1.7833e-05, 1.5544e-05,
         1.6691e-05]], grad_fn=<SoftmaxBackward0>), target: tensor([26794,   345, 35257,  4968, 16775,  1520, 50370,  2362,    26,  1476,
           40,   140, 34299,  8418,  4425,  8314, 36284,  1840,  1449,  8030,
         1108,   772, 12639, 13864,   275, 12233,  7803, 12496,  4152, 50559,
        25136,  2343,  9438, 51295,   832, 29870, 1735

KeyboardInterrupt: 

In [151]:
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * BOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores

In [153]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=22):
    ### Format input sentence as a batch
    # words -> indexes
    indexes_batch = [index_from_sentence(voc, sentence)]
    # Create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to("cpu")
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    return decoded_words


def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            # input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")

In [154]:
# Set dropout layers to ``eval`` mode
encoder.eval()
decoder.eval()

# Initialize search module
searcher = GreedySearchDecoder(encoder, decoder)

# Begin chatting (uncomment and run the following line to begin)
# evaluateInput(encoder, decoder, searcher, voc)

In [155]:
evaluateInput(encoder, decoder, searcher, squad_vocab)

AttributeError: 'DecoderRNN' object has no attribute 'n_layers'