In [41]:
%matplotlib inline

Generation of Questions with a seq2seq network 
*************************************************************

[KEY: > input, = target, < output]

   

This is made possible by the simple but powerful idea of the `sequence
to sequence network <http://arxiv.org/abs/1409.3215>`__, in which two
recurrent neural networks work together to transform one sequence to
another. An encoder network condenses an input sequence into a vector,
and a decoder network unfolds that vector into a new sequence.



To improve upon this model we'll use an `attention
mechanism <https://arxiv.org/abs/1409.0473>`__, which lets the decoder
learn to focus over a specific range of the input sequence.



In [182]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Loading data files
===================

The data for this project is a set of many thousands of sentence pairs, broken down into two main groups i.e agreement and no agreement and are present in the dataset folder of the project



Similar to the character encoding used in the character-level RNN
tutorials, we will be representing each word in a language as a one-hot
vector, or giant vector of zeros except for a single one (at the index
of the word). 

We'll need a unique index per word to use as the inputs and targets of
the networks later. To keep track of all this we will use a helper class
called ``Lang`` which has word → index (``word2index``) and index → word
(``index2word``) dictionaries, as well as a count of each word
``word2count`` to use to later replace rare words.




In [72]:
SOS_token = 0
IDENT_TOKEN = -1
QUEST_TOKEN = -1
EOS_TOKEN =1
input_file_path = "./../data/agreement/"
agreement = True

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

We shall convert all the words to lower case.

In [73]:
# Converts all to lowercase
def normalizeString(s):
    s = (s.lower().strip())
    return s

To read the data file we will split the file into lines, and then split
lines into pairs




In [74]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(input_file_path+'train.txt' ,).\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

The maximum length of each sentence has been set to a 100 words.



In [75]:
MAX_LENGTH = 100



def filterPairs(pairs):
    return [pair for pair in pairs]

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [76]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('out', 'inp', True)
IDENT_TOKEN = input_lang.word2index["ident"]
QUEST_TOKEN = input_lang.word2index["quest"]
print(random.choice(pairs))

Reading lines...
Read 116600 sentence pairs
Trimmed to 116600 sentence pairs
Counting words...
Counted words:
inp 55
out 56
['her unicorn who doesnt read doesnt confuse the unicorn by her unicorn . ident', 'her unicorn who doesnt read doesnt confuse the unicorn by her unicorn . ident']


The Seq2Seq Model
=================

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A `Sequence to Sequence network <http://arxiv.org/abs/1409.3215>`__, or
seq2seq network, or `Encoder Decoder
network <https://arxiv.org/pdf/1406.1078v3.pdf>`__, is a model
consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence.


Unlike sequence prediction with a single RNN, where every input
corresponds to an output, the seq2seq model frees us from sequence
length and order, which makes it ideal for translation between two
languages.

With a seq2seq model the encoder creates a single vector which, in the
ideal case, encodes the "meaning" of the input sequence into a single
vector — a single point in some N dimensional space of sentences.

The Encoder
-----------

The encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

In [77]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.randn(1, 1, self.hidden_size, device=device)

The Decoder
-----------

The decoder is another RNN that takes the encoder output vector(s) and
outputs a sequence of words to create the translation.




Simple Decoder
-----------

In the simplest seq2seq decoder we use only last output of the encoder.
This last output is sometimes called the *context vector* as it encodes
context from the entire sequence. This context vector is used as the
initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and
hidden state. The initial input token is the start-of-string ``<SOS>``
token, and the first hidden state is the context vector (the encoder's
last hidden state).

In [78]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.randn(1, 1, self.hidden_size, device=device)

Attention Decoder
-----------


If only the context vector is passed betweeen the encoder and decoder,
that single vector carries the burden of encoding the entire sentence.

Attention allows the decoder network to "focus" on a different part of
the encoder's outputs for every step of the decoder's own outputs. First
we calculate a set of *attention weights*. These will be multiplied by
the encoder output vectors to create a weighted combination. The result
(called ``attn_applied`` in the code) should contain information about
that specific part of the input sequence, and thus help the decoder
choose the right output words.


Calculating the attention weights is done with another feed-forward
layer ``attn``, using the decoder's input and hidden state as inputs.
Because there are sentences of all sizes in the training data, to
actually create and train this layer we have to choose a maximum
sentence length (input length, for encoder outputs) that it can apply
to. Sentences of the maximum length will use all the attention weights,
while shorter sentences will only use the first few.

In [79]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.randn(1, 1, self.hidden_size, device=device)



Training
========

Preparing Training Data
-----------------------

To train, for each pair we will need an input tensor (indexes of the
words in the input sentence) and target tensor (indexes of the words in
the target sentence). While creating these vectors we will append the
EOS token to both sequences.




In [82]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    if(indexes[-1]!=lang.word2index["ident"] and indexes[-1]!=lang.word2index["quest"]):
        indexes.append(EOS_TOKEN)
        
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

Training the Model
------------------

To train we run the input sentence through the encoder, and keep track
of every output and the latest hidden state. Then the decoder is given
the ``<SOS>`` token as its first input, and the last hidden state of the
encoder as its first hidden state.

"Teacher forcing" is the concept of using the real target outputs as
each next input, instead of using the decoder's guess as the next input.
Using teacher forcing causes it to converge faster but `when the trained
network is exploited, it may exhibit
instability <http://minds.jacobs-university.de/sites/default/files/uploads/papers/ESNTutorialRev.pdf>`__.

You can observe outputs of teacher-forced networks that read with
coherent grammar but wander far from the correct translation -
intuitively it has learned to represent the output grammar and can "pick
up" the meaning once the teacher tells it the first few words, but it
has not properly learned how to create the sentence from the translation
in the first place.

Because of the freedom PyTorch's autograd gives us, we can randomly
choose to use teacher forcing or not with a simple if statement. Turn
``teacher_forcing_ratio`` up to use more of it.




In [83]:
print(pairs[0])
tensorsFromPair(pairs[0])

['the cats do sleep . quest', 'do the cats sleep ? quest']


(tensor([[2],
         [3],
         [4],
         [5],
         [6],
         [7]]), tensor([[2],
         [3],
         [4],
         [5],
         [6],
         [7]]))

In [85]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == IDENT_TOKEN or decoder_input.item() == QUEST_TOKEN or decoder_input.item()== EOS_TOKEN:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

This is a helper function to print time elapsed and estimated time
remaining given the current time and progress %.




In [86]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

Then we call ``train`` many times and occasionally print the progress (%
of examples, time so far, estimated time) and average loss.




In [87]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

Plotting results
----------------

Plotting is done with matplotlib, using the array of loss values
``plot_losses`` saved while training.




In [88]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

Evaluation
==========

Evaluation is mostly the same as training, but there are no targets so
we simply feed the decoder's predictions back to itself for each step.
Every time it predicts a word we add it to the output string, and if it
predicts the EOS token we stop there. We also store the decoder's
attention outputs for display later.




In [89]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == IDENT_TOKEN :
                decoded_words.append('<IDENT>')
                break
            elif topi.item() == QUEST_TOKEN :
                decoded_words.append('<QUEST>')
                break
            elif topi.item() == EOS_TOKEN :
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

We can evaluate random sentences from the training set and print out the
input, target, and output to make some subjective quality judgements:




In [90]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

Training and Evaluating
=======================



In [91]:
class TheModelClass(nn.Module):
    def __init__(self):
        super(TheModelClass, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model
model = TheModelClass()

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])


Model's state_dict:
conv1.weight 	 torch.Size([6, 3, 5, 5])
conv1.bias 	 torch.Size([6])
conv2.weight 	 torch.Size([16, 6, 5, 5])
conv2.bias 	 torch.Size([16])
fc1.weight 	 torch.Size([120, 400])
fc1.bias 	 torch.Size([120])
fc2.weight 	 torch.Size([84, 120])
fc2.bias 	 torch.Size([84])
fc3.weight 	 torch.Size([10, 84])
fc3.bias 	 torch.Size([10])
Optimizer's state_dict:
param_groups 	 [{'lr': 0.001, 'weight_decay': 0, 'momentum': 0.9, 'dampening': 0, 'nesterov': False, 'params': [139930092388568, 139930092388640, 139930092388712, 139930092388784, 139930092388856, 139930092388928, 139930092389000, 139930092389072, 139930092389144, 139930092389216]}]
state 	 {}


In [98]:
hidden_size = 256
counter=1
name = "agreement_"+str(counter)+".pt"
print(name)

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)
trainIters(encoder1, attn_decoder1, 75000, print_every=50)

torch.save(model.state_dict(),name)   
counter+=1

agreement_1.pt
0m 3s (- 78m 51s) (50 0%) 0.0449
0m 4s (- 56m 40s) (100 0%) 0.0384
0m 5s (- 48m 28s) (150 0%) 0.0449
0m 7s (- 44m 53s) (200 0%) 0.0372
0m 8s (- 42m 37s) (250 0%) 0.0442
0m 9s (- 40m 56s) (300 0%) 0.0374
0m 11s (- 39m 17s) (350 0%) 0.0248
0m 12s (- 38m 26s) (400 0%) 0.0500
0m 13s (- 38m 0s) (450 0%) 0.0427
0m 15s (- 37m 15s) (500 0%) 0.0250
0m 16s (- 36m 54s) (550 0%) 0.0412
0m 17s (- 36m 37s) (600 0%) 0.0447
0m 19s (- 36m 32s) (650 0%) 0.0575
0m 20s (- 36m 8s) (700 0%) 0.0316
0m 21s (- 35m 46s) (750 1%) 0.0456
0m 22s (- 35m 32s) (800 1%) 0.0480
0m 24s (- 35m 19s) (850 1%) 0.0414
0m 25s (- 35m 4s) (900 1%) 0.0737
0m 26s (- 34m 52s) (950 1%) 0.0631
0m 28s (- 34m 44s) (1000 1%) 0.0301
0m 29s (- 34m 30s) (1050 1%) 0.0318
0m 30s (- 34m 11s) (1100 1%) 0.0401
0m 31s (- 34m 4s) (1150 1%) 0.0339
0m 33s (- 33m 57s) (1200 1%) 0.0727
0m 34s (- 33m 55s) (1250 1%) 0.0444
0m 35s (- 33m 49s) (1300 1%) 0.0552
0m 37s (- 33m 44s) (1350 1%) 0.0386
0m 38s (- 33m 36s) (1400 1%) 0.0354
0m 39s 

5m 26s (- 30m 20s) (11400 15%) 0.0238
5m 27s (- 30m 20s) (11450 15%) 0.0264
5m 29s (- 30m 18s) (11500 15%) 0.0155
5m 30s (- 30m 17s) (11550 15%) 0.0138
5m 32s (- 30m 17s) (11600 15%) 0.0240
5m 33s (- 30m 16s) (11650 15%) 0.0451
5m 35s (- 30m 14s) (11700 15%) 0.0284
5m 36s (- 30m 13s) (11750 15%) 0.0256
5m 38s (- 30m 12s) (11800 15%) 0.0151
5m 39s (- 30m 11s) (11850 15%) 0.0221
5m 41s (- 30m 10s) (11900 15%) 0.0205
5m 42s (- 30m 9s) (11950 15%) 0.0316
5m 44s (- 30m 8s) (12000 16%) 0.0245
5m 45s (- 30m 7s) (12050 16%) 0.0181
5m 47s (- 30m 5s) (12100 16%) 0.0160
5m 48s (- 30m 4s) (12150 16%) 0.0298
5m 50s (- 30m 3s) (12200 16%) 0.1367
5m 51s (- 30m 2s) (12250 16%) 0.0391
5m 53s (- 30m 1s) (12300 16%) 0.0140
5m 54s (- 30m 0s) (12350 16%) 0.0213
5m 56s (- 29m 59s) (12400 16%) 0.0136
5m 57s (- 29m 58s) (12450 16%) 0.0256
5m 59s (- 29m 57s) (12500 16%) 0.0306
6m 1s (- 29m 57s) (12550 16%) 0.0224
6m 2s (- 29m 56s) (12600 16%) 0.0158
6m 4s (- 29m 55s) (12650 16%) 0.0215
6m 5s (- 29m 54s) (12700

10m 54s (- 25m 52s) (22250 29%) 0.0236
10m 56s (- 25m 51s) (22300 29%) 0.0161
10m 57s (- 25m 49s) (22350 29%) 0.0119
10m 59s (- 25m 48s) (22400 29%) 0.0122
11m 0s (- 25m 47s) (22450 29%) 0.0249
11m 2s (- 25m 45s) (22500 30%) 0.0118
11m 3s (- 25m 44s) (22550 30%) 0.0111
11m 5s (- 25m 43s) (22600 30%) 0.0075
11m 7s (- 25m 41s) (22650 30%) 0.0106
11m 8s (- 25m 40s) (22700 30%) 0.0364
11m 10s (- 25m 38s) (22750 30%) 0.0150
11m 11s (- 25m 37s) (22800 30%) 0.0114
11m 13s (- 25m 36s) (22850 30%) 0.0169
11m 14s (- 25m 34s) (22900 30%) 0.0146
11m 16s (- 25m 33s) (22950 30%) 0.0116
11m 17s (- 25m 31s) (23000 30%) 0.0090
11m 19s (- 25m 30s) (23050 30%) 0.0130
11m 20s (- 25m 29s) (23100 30%) 0.0117
11m 22s (- 25m 28s) (23150 30%) 0.0128
11m 23s (- 25m 27s) (23200 30%) 0.0092
11m 25s (- 25m 25s) (23250 31%) 0.0102
11m 26s (- 25m 24s) (23300 31%) 0.0206
11m 28s (- 25m 22s) (23350 31%) 0.0272
11m 29s (- 25m 21s) (23400 31%) 0.0230
11m 31s (- 25m 20s) (23450 31%) 0.0237
11m 33s (- 25m 18s) (23500 31%)

16m 14s (- 20m 49s) (32850 43%) 0.0136
16m 15s (- 20m 48s) (32900 43%) 0.0067
16m 17s (- 20m 47s) (32950 43%) 0.0088
16m 18s (- 20m 45s) (33000 44%) 0.0054
16m 20s (- 20m 44s) (33050 44%) 0.0066
16m 21s (- 20m 42s) (33100 44%) 0.0137
16m 23s (- 20m 40s) (33150 44%) 0.0054
16m 24s (- 20m 39s) (33200 44%) 0.0090
16m 25s (- 20m 37s) (33250 44%) 0.0066
16m 27s (- 20m 36s) (33300 44%) 0.0162
16m 29s (- 20m 35s) (33350 44%) 0.0052
16m 30s (- 20m 33s) (33400 44%) 0.0082
16m 31s (- 20m 32s) (33450 44%) 0.0152
16m 33s (- 20m 30s) (33500 44%) 0.0161
16m 35s (- 20m 29s) (33550 44%) 0.0192
16m 36s (- 20m 27s) (33600 44%) 0.0056
16m 38s (- 20m 26s) (33650 44%) 0.0055
16m 39s (- 20m 24s) (33700 44%) 0.0070
16m 40s (- 20m 23s) (33750 45%) 0.0045
16m 42s (- 20m 21s) (33800 45%) 0.0052
16m 43s (- 20m 20s) (33850 45%) 0.0100
16m 45s (- 20m 18s) (33900 45%) 0.0148
16m 46s (- 20m 17s) (33950 45%) 0.0227
16m 48s (- 20m 15s) (34000 45%) 0.0091
16m 49s (- 20m 14s) (34050 45%) 0.0107
16m 51s (- 20m 12s) (3410

21m 31s (- 15m 37s) (43450 57%) 0.0030
21m 33s (- 15m 36s) (43500 57%) 0.0074
21m 34s (- 15m 34s) (43550 58%) 0.0048
21m 36s (- 15m 33s) (43600 58%) 0.0053
21m 37s (- 15m 32s) (43650 58%) 0.0046
21m 39s (- 15m 30s) (43700 58%) 0.0093
21m 40s (- 15m 28s) (43750 58%) 0.0083
21m 42s (- 15m 27s) (43800 58%) 0.0059
21m 43s (- 15m 26s) (43850 58%) 0.0125
21m 45s (- 15m 24s) (43900 58%) 0.0031
21m 46s (- 15m 23s) (43950 58%) 0.0073
21m 48s (- 15m 21s) (44000 58%) 0.0041
21m 49s (- 15m 20s) (44050 58%) 0.0138
21m 51s (- 15m 18s) (44100 58%) 0.0049
21m 52s (- 15m 17s) (44150 58%) 0.0042
21m 54s (- 15m 15s) (44200 58%) 0.0088
21m 55s (- 15m 14s) (44250 59%) 0.0047
21m 57s (- 15m 12s) (44300 59%) 0.0132
21m 58s (- 15m 11s) (44350 59%) 0.0066
22m 0s (- 15m 10s) (44400 59%) 0.0041
22m 2s (- 15m 8s) (44450 59%) 0.0056
22m 3s (- 15m 7s) (44500 59%) 0.0053
22m 5s (- 15m 5s) (44550 59%) 0.0076
22m 6s (- 15m 4s) (44600 59%) 0.0057
22m 8s (- 15m 2s) (44650 59%) 0.0043
22m 9s (- 15m 1s) (44700 59%) 0.0033

26m 50s (- 10m 24s) (54050 72%) 0.0079
26m 51s (- 10m 22s) (54100 72%) 0.0062
26m 52s (- 10m 21s) (54150 72%) 0.0034
26m 54s (- 10m 19s) (54200 72%) 0.0066
26m 56s (- 10m 18s) (54250 72%) 0.0026
26m 57s (- 10m 16s) (54300 72%) 0.0025
26m 59s (- 10m 15s) (54350 72%) 0.0038
27m 0s (- 10m 13s) (54400 72%) 0.0038
27m 2s (- 10m 12s) (54450 72%) 0.0058
27m 3s (- 10m 10s) (54500 72%) 0.0048
27m 5s (- 10m 9s) (54550 72%) 0.0023
27m 6s (- 10m 7s) (54600 72%) 0.0099
27m 8s (- 10m 6s) (54650 72%) 0.0042
27m 9s (- 10m 4s) (54700 72%) 0.0024
27m 11s (- 10m 3s) (54750 73%) 0.0045
27m 12s (- 10m 1s) (54800 73%) 0.0047
27m 13s (- 10m 0s) (54850 73%) 0.0053
27m 15s (- 9m 58s) (54900 73%) 0.0022
27m 16s (- 9m 57s) (54950 73%) 0.0042
27m 18s (- 9m 55s) (55000 73%) 0.0032
27m 20s (- 9m 54s) (55050 73%) 0.0023
27m 21s (- 9m 52s) (55100 73%) 0.0026
27m 23s (- 9m 51s) (55150 73%) 0.0014
27m 24s (- 9m 49s) (55200 73%) 0.0018
27m 26s (- 9m 48s) (55250 73%) 0.0030
27m 27s (- 9m 46s) (55300 73%) 0.0020
27m 29s (

32m 14s (- 4m 59s) (64950 86%) 0.0020
32m 16s (- 4m 57s) (65000 86%) 0.0019
32m 18s (- 4m 56s) (65050 86%) 0.0019
32m 19s (- 4m 54s) (65100 86%) 0.0024
32m 21s (- 4m 53s) (65150 86%) 0.0024
32m 22s (- 4m 52s) (65200 86%) 0.0025
32m 24s (- 4m 50s) (65250 87%) 0.0015
32m 25s (- 4m 49s) (65300 87%) 0.0037
32m 27s (- 4m 47s) (65350 87%) 0.0049
32m 28s (- 4m 46s) (65400 87%) 0.0043
32m 30s (- 4m 44s) (65450 87%) 0.0013
32m 31s (- 4m 43s) (65500 87%) 0.0027
32m 33s (- 4m 41s) (65550 87%) 0.0070
32m 35s (- 4m 40s) (65600 87%) 0.0020
32m 36s (- 4m 38s) (65650 87%) 0.0014
32m 37s (- 4m 37s) (65700 87%) 0.0019
32m 39s (- 4m 35s) (65750 87%) 0.0047
32m 40s (- 4m 34s) (65800 87%) 0.0034
32m 42s (- 4m 32s) (65850 87%) 0.0020
32m 43s (- 4m 31s) (65900 87%) 0.0026
32m 45s (- 4m 29s) (65950 87%) 0.0020
32m 46s (- 4m 28s) (66000 88%) 0.0119
32m 48s (- 4m 26s) (66050 88%) 0.0028
32m 49s (- 4m 25s) (66100 88%) 0.0016
32m 51s (- 4m 23s) (66150 88%) 0.0020
32m 52s (- 4m 22s) (66200 88%) 0.0022
32m 54s (- 4

In [162]:
evaluateRandomly(encoder1, attn_decoder1)

> our bird does confuse some elephant around our bird . quest
= does our bird confuse some elephant around our bird ? quest
< does our bird confuse some elephant around our bird ? <QUEST>

> our seal doesnt irritate our seal . quest
= doesnt our seal irritate our seal ? quest
< doesnt our seal irritate our seal ? <QUEST>

> your elephant by your yaks does entertain the dog by your yak . quest
= does your elephant by your yaks entertain the dog by your yak ? quest
< does your elephant by your yaks entertain the dog by your yak ? <QUEST>

> her birds that dont sleep dont call her seals . ident
= her birds that dont sleep dont call her seals . ident
< her birds that dont sleep dont call her seals <IDENT>

> her cats dont admire your rabbits . ident
= her cats dont admire your rabbits . ident
< her cats dont admire your rabbits <IDENT>

> some bird upon our bird doesnt confuse some rabbits . quest
= doesnt some bird upon our bird confuse some rabbits ? quest
< doesnt some bird upon our bir

Visualizing Attention
---------------------

A useful property of the attention mechanism is its highly interpretable
outputs. Because it is used to weight specific encoder outputs of the
input sequence, we can imagine looking where the network is focused most
at each time step.

You could simply run ``plt.matshow(attentions)`` to see attention output
displayed as a matrix, with the columns being input steps and rows being
output steps:




In [45]:
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "our cats do read quest")
plt.matshow(attentions.numpy())

<matplotlib.image.AxesImage at 0x7f3010f49630>

For a better viewing experience we will do the extra work of adding axes
and labels:




In [99]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<end>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()
    fig.savefig(input_sentence+'.png')


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)


evaluateAndShowAttention("her dog who does smile doesnt confuse your dog who does confuse your unicorn . quest")



input = her dog who does smile doesnt confuse your dog who does confuse your unicorn . quest
output = does her dog who doesnt smile does confuse your dog who does confuse your unicorn ? <QUEST>


In [189]:
lines = open(input_file_path+'test.txt' ,).\
        read().strip().split('\n')
input_sentences = [] 
output_sentences = []
predicted_sentences = []
full_sentence_word_match_counter = 0
full_sentence_pos_match_counter = 0
total_sentences_counter = 0
from eval import *
for line in lines:
    sentences = line.split("\t")
    input_sent = sentences[0]
    output_sent = sentences[1]
    input_sentences.append(input_sent)
    output_sentences.append(output_sent)
    outwords , out_attn = evaluate(encoder1,attn_decoder1,input_sent)
    predicted_sentence = " ".join(outwords)
    predicted_sentences.append(predicted_sentence)
    final_pred = outwords[:len(outwords)-2]
    expected = output_sent.split(" ")
    final_expected = expected[:len(outwords)-2]
    if full_sentence_word_match(final_pred,final_expected):
        full_sentence_word_match_counter+=1
    if full_sentence_pos_match(final_pred,final_expected,agreement):
        full_sentence_pos_match_counter+=1
    total_sentences_counter+=1
    
predictions = open(input_file_path+"predictions.txt","w")
for index in range(total_sentences_counter):
    predictions.write(str(index)+".\n")
    predictions.write("Input:"+input_sentences[index] + "\n")
    predictions.write("Expected Output:" + output_sentences[index]+"\n")
    predictions.write("Predicted:" + predicted_sentences[index] + "\n")
predictions.close()

report = open(input_file_path+"report.txt" , "w")
report.write("Total number Of sentences:" + str(total_sentences_counter) + "\n")
report.write("Total number of correct full sentences word match:" + str(full_sentence_word_match_counter) + "\n")
report.write("Total number of correct full sentences pos match:" + str(full_sentence_pos_match_counter) + "\n")
report.close()



    
    

['does', 'her', 'dog', 'who', 'doesnt', 'smile', 'does', 'confuse', 'your', 'dog', 'who', 'does', 'confuse', 'your', 'unicorn']
