In [1]:
# This code is a modified version of the seq2seq code found at
# the following tutorial:
# http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

# Imports. 
# Note that you must have PyTorch installed before you can run this.
# You can get PyTorch here: http://pytorch.org/
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

# I ran this without CUDA. CUDA allows you to use a GPU, so
# we'll probably want to use it for more complex experiments.
# use_cuda is True only when PyTorch reports an available CUDA device; the
# code below checks it before moving models and tensors with .cuda().
use_cuda = torch.cuda.is_available()

In [2]:
# Start-of-sentence and end-of-sentence tokens.
# The standard seq2seq version only has one EOS. This version has
# 2 EOS--one signalling that the original sentence should be returned,
# the other signalling it should be reversed.
# Every token is identified by an integer index (conceptually a 1-hot
# encoding); the networks look these indices up in embedding layers.
SOS_token = 0
EOS_tokenA = 1 # For FWD: same index as the word "fwd" in Lang.word2index
EOS_tokenB = 2 # For REV: same index as the word "rev" in Lang.word2index

# Defining the encodings for each token
class Lang:
    """Vocabulary for one side of the data: word <-> index maps plus counts.

    Index 0 is reserved for SOS. The markers "fwd" and "rev" are registered
    up front at indices 1 and 2 (the values of EOS_tokenA / EOS_tokenB in
    this file). Every other word gets the next free index on first sight.
    """

    def __init__(self, name):
        self.name = name
        # SOS only appears in index2word: it has an index but is never
        # looked up or counted as a word.
        self.word2index = {u"fwd": 1, u"rev": 2}
        self.word2count = {u"fwd": 0, u"rev": 0}
        self.index2word = {0: "SOS", 1: "fwd", 2: "rev"}
        self.n_words = 3  # SOS plus the two end markers

    def addSentence(self, sentence):
        """Register every single-space-separated word of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count one occurrence of `word`; assign a fresh index if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [3]:
# String processing stuff

# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` with combining marks (accents) removed.

    The string is NFD-decomposed, then every character whose Unicode
    category is 'Mn' is dropped. All other characters are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    """Lowercase and trim `s`, strip accents, and reduce it to letters and . ! ?

    A space is inserted before each of . ! ? and every run of any other
    non-letter characters is replaced by one space.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    # Detach sentence punctuation from the preceding word
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Anything that is neither a letter nor . ! ? collapses to one space
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

In [4]:
# Reading the training data

# Path of the training file. readLangs() below reads it as UTF-8 text and
# splits each line on tabs to obtain the sentences of one pair.
trainingFile = 'abcd.train'

def readLangs(lang1, lang2, reverse=False):
    """Read `trainingFile` and return empty vocabularies plus the sentence pairs.

    Args:
        lang1: name given to the Lang for the first column of the file.
        lang2: name given to the Lang for the second column of the file.
        reverse: when True, the columns of every pair are swapped and the two
            Lang names are swapped accordingly.

    Returns:
        (input_lang, output_lang, pairs). The Lang objects are freshly
        created (no words added yet) and `pairs` is a list of lists of
        normalized strings, one inner list per line of the file.
    """
    print("Reading lines...")

    # Read the file and split into lines. The context manager makes sure
    # the file handle is closed even if reading fails.
    with open(trainingFile, encoding='utf-8') as training_handle:
        lines = training_handle.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


In [5]:
# Max sentence length
# Not applicable here, since all sentences are at most 8 long
# (presumably not counting the trailing fwd/rev marker -- TODO confirm).
# filterPair() keeps only pairs whose sentences have fewer than MAX_LENGTH
# space-separated tokens. MAX_LENGTH is also the default max_length of the
# attention decoder, train() and evaluate().
MAX_LENGTH = 10



def filterPair(p):
    """Return True when both sentences of pair `p` have fewer than MAX_LENGTH tokens.

    Tokens are counted by splitting on single spaces. The second sentence is
    only inspected when the first one is short enough.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    return len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    """Return a new list with the pairs accepted by filterPair, in order."""
    return list(filter(filterPair, pairs))

In [18]:
# Preparing data--also not really applicable here because we've already
# restricted the lengths of our training data. (This does lowercase the
# input and remove punctuation, but those things also don't matter here)
def prepareData(lang1, lang2, reverse=False):
    """Load the pairs, drop the over-long ones and build both vocabularies.

    Progress and the final vocabulary sizes are printed along the way.
    Returns (input_lang, output_lang, pairs) with the vocabularies filled
    from the pairs that survived filtering.
    """
    source_lang, target_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for example in kept_pairs:
        # First element feeds the input vocabulary, second the output one
        source_lang.addSentence(example[0])
        target_lang.addSentence(example[1])

    print("Counted words:")
    print(source_lang.name, source_lang.n_words)
    print(target_lang.name, target_lang.n_words)
    return source_lang, target_lang, kept_pairs


# Build both vocabularies and load all sentence pairs. 'eng' and 'fra' are
# only the names given to the two Lang objects (printed in the summary).
input_lang, output_lang, pairs = prepareData('eng', 'fra', False)
# Show one randomly chosen pair as a sanity check
print(random.choice(pairs))

Reading lines...
Read 144000 sentence pairs
Trimmed to 144000 sentence pairs
Counting words...
Counted words:
eng 7
fra 7
[u'd b a a rev', u'a a b d rev']


In [7]:
# Class for the encoder RNN
class EncoderRNN(nn.Module):
    """Encoder: embeds one token index at a time and runs it through a GRU."""

    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # Embedding vectors and GRU state share the same width (hidden_size)
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode a single token and return (output, new hidden state).

        The embedded token is reshaped to (1, 1, -1) and then passed through
        the same GRU module n_layers times in a row.
        """
        step_output = self.embedding(input).view(1, 1, -1)
        for _ in range(self.n_layers):
            step_output, hidden = self.gru(step_output, hidden)
        return step_output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The state is moved to the GPU when the module-level use_cuda is set.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

In [8]:
# Class for the basic decoder RNN, without attention
class DecoderRNN(nn.Module):
    """Plain decoder (no attention): embedding -> ReLU -> GRU -> log-softmax."""

    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Projects the GRU output onto the output vocabulary
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        """Decode one step and return (log-probabilities, new hidden state).

        The embedded input token is reshaped to (1, 1, -1). A ReLU followed
        by the same GRU module is applied n_layers times before the result
        is projected to output_size scores and log-softmaxed.
        """
        step = self.embedding(input).view(1, 1, -1)
        for _ in range(self.n_layers):
            step, hidden = self.gru(F.relu(step), hidden)
        log_probs = self.softmax(self.out(step[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The state is moved to the GPU when the module-level use_cuda is set.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

In [9]:
# Class for a decoder RNN using attention
class AttnDecoderRNN(nn.Module):
    """Decoder that attends over the encoder outputs at every step.

    The attention layer produces max_length weights, so the encoder outputs
    passed to forward() are expected to have max_length rows.
    """

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores one weight per encoder position from [embedding; hidden]
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Mixes [embedding; attention context] back down to hidden_size
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_output, encoder_outputs):
        """Decode one step with attention.

        `encoder_output` is accepted but not used by this method.
        Returns (log-probabilities over the output vocabulary, new hidden
        state, attention weights).
        """
        token_emb = self.embedding(input).view(1, 1, -1)
        token_emb = self.dropout(token_emb)

        # Attention weights from the current token and the previous state
        attn_input = torch.cat((token_emb[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(attn_input))

        # Weighted sum of the encoder outputs (the context vector)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = torch.cat((token_emb[0], context[0]), 1)
        step = self.attn_combine(combined).unsqueeze(0)

        # ReLU followed by the same GRU module, n_layers times
        for _ in range(self.n_layers):
            step, hidden = self.gru(F.relu(step), hidden)

        log_probs = F.log_softmax(self.out(step[0]))
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size).

        The state is moved to the GPU when the module-level use_cuda is set.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size))
        return zeros.cuda() if use_cuda else zeros

In [10]:
# Methods for interfacing between words and one-hot encodings
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated word of `sentence` to its index in `lang`.

    Raises KeyError for a word that is missing from lang.word2index.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lang.word2index[word])
    return indexes


def variableFromSentence(lang, sentence):
    """Turn `sentence` into a column Variable of word indices.

    The result wraps a LongTensor reshaped to (-1, 1), i.e. one row per word,
    and is moved to the GPU when use_cuda is set.
    """
    token_ids = indexesFromSentence(lang, sentence)
    # No separate EOS index is appended here; the append that used to follow
    # this line is intentionally disabled.
    column = torch.LongTensor(token_ids).view(-1, 1)
    result = Variable(column)
    return result.cuda() if use_cuda else result


def variablesFromPair(pair):
    """Return (input Variable, target Variable) for one sentence pair.

    pair[0] is indexed with the global input_lang and pair[1] with the
    global output_lang.
    """
    return (variableFromSentence(input_lang, pair[0]),
            variableFromSentence(output_lang, pair[1]))

In [21]:
# This affects how training proceeds. I've run it with the default value
# of 0.5 but am trying it with 0.0 now.
# train() draws random.random() once per example and uses teacher forcing
# for that example when the draw is below this ratio.
# NOTE(review): the comment above mentions 0.0 but the value set here is
# 0.5 -- confirm which setting produced the outputs shown below.
teacher_forcing_ratio = 0.5

# Training the seq2seq network
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token, then the decoder is unrolled over
    the target positions. With probability teacher_forcing_ratio the true
    target token is fed as the next decoder input; otherwise the decoder's
    own top prediction is fed back and decoding stops early once it emits
    one of the two EOS tokens.

    The decoder is called with four arguments and must return three values
    (as AttnDecoderRNN.forward does).

    Returns the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # One row per input position; rows past input_length stay zero
    encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
    if use_cuda:
        encoder_outputs = encoder_outputs.cuda()

    loss = 0

    # Encode the input, keeping the output of every step for the attention
    for pos in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_variable[pos], encoder_hidden)
        encoder_outputs[pos] = encoder_output[0][0]

    # Decoding starts from SOS with the encoder's final hidden state
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    if use_cuda:
        decoder_input = decoder_input.cuda()

    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    for step in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_output, encoder_outputs)
        loss += criterion(decoder_output[0], target_variable[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_variable[step]
            continue

        # Without teacher forcing: feed the decoder's own top prediction
        topv, topi = decoder_output.data.topk(1)
        predicted = topi[0][0]

        decoder_input = Variable(torch.LongTensor([[predicted]]))
        if use_cuda:
            decoder_input = decoder_input.cuda()

        if predicted == EOS_tokenA or predicted == EOS_tokenB:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / target_length

In [22]:
# Functions for tracking time
import time
import math


def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a job started at time `since`.

    `percent` is the fraction of the work already done. The total time is
    extrapolated as elapsed / percent, and the remaining time is that total
    minus the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [23]:
# Training iterations
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for `n_iters` randomly sampled pairs.

    Each model gets its own SGD optimizer with `learning_rate`. The average
    loss is printed every `print_every` iterations and recorded every
    `plot_every` iterations; the recorded averages are plotted at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled (with replacement) up front
    training_pairs = [variablesFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        input_variable, target_variable = training_pairs[step - 1]

        loss = train(input_variable, target_variable, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [24]:
# Function for graphically displaying results
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot `points` as a line chart with y-axis ticks every 0.2.

    Args:
        points: sequence of values to plot against their position.
    """
    # plt.subplots() already creates the figure, so no separate plt.figure()
    # call is made (it would only leave an extra, empty figure behind).
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    # Draw on the axes created above rather than on the implicit current axes
    ax.plot(points)

In [25]:
# Evaluate a single sentence
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode `sentence` and return the predicted words.

    Both modules are switched to evaluation mode for the duration of the
    call so that dropout is inactive and the prediction is deterministic;
    their previous training/evaluation mode is restored before returning.

    Args:
        encoder: encoder module (called once per input token).
        decoder: attention-style decoder, called with four arguments and
            returning (output, hidden, attention weights).
        sentence: space-separated input whose words are all known to the
            global input_lang.
        max_length: maximum number of decoding steps and number of rows
            reserved for the encoder outputs.

    Returns:
        (decoded_words, attentions). decoded_words ends with 'FWD' or 'REV'
        when the decoder emitted the corresponding end token; otherwise
        decoding stops after max_length steps. attentions holds one row of
        attention weights per decoding step taken.
    """
    # Remember the current modes so a later train() call is unaffected
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        input_variable = variableFromSentence(input_lang, sentence)
        input_length = input_variable.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = Variable(torch.zeros(max_length, encoder.hidden_size))
        encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_variable[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

        decoder_input = Variable(torch.LongTensor([[SOS_token]]))  # SOS
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_output, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            # Greedy choice: take the single most likely token
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            if ni == EOS_tokenA:
                decoded_words.append('FWD')
                break
            elif ni == EOS_tokenB:
                decoded_words.append('REV')
                break
            else:
                decoded_words.append(output_lang.index2word[ni])

            # Feed the prediction back in as the next decoder input
            decoder_input = Variable(torch.LongTensor([[ni]]))
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

In [26]:
# Show the output for a few randomly selected sentences
def evaluateRandomly(encoder, decoder, n=10):
    """Print the model output for `n` pairs drawn at random from `pairs`.

    For each pair, '>' marks the input, '=' the expected output and '<' the
    decoded output; a blank line separates the examples.
    """
    for _ in range(n):
        example = random.choice(pairs)
        print('>', example[0])
        print('=', example[1])
        decoded_words, _attentions = evaluate(encoder, decoder, example[0])
        print('<', ' '.join(decoded_words))
        print('')

In [27]:
# Where the actual running of the code happens
# Width shared by the embeddings and the GRU states of both networks
hidden_size = 100
# Vocabulary sizes come from the Lang objects built by prepareData()
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words,
                               1, dropout_p=0.1)

# Move both networks to the GPU when one is available
if use_cuda:
    encoder1 = encoder1.cuda()
    attn_decoder1 = attn_decoder1.cuda()

# Train on 120000 sampled pairs, printing the average loss every 1000
trainIters(encoder1, attn_decoder1, 120000, print_every=1000)

0m 20s (- 41m 28s) (1000 0%) 1.2079
0m 42s (- 42m 5s) (2000 1%) 0.6930
3m 25s (- 133m 22s) (3000 2%) 0.5319
3m 46s (- 109m 20s) (4000 3%) 0.4514
4m 5s (- 94m 14s) (5000 4%) 0.3851
4m 23s (- 83m 29s) (6000 5%) 0.2976
4m 41s (- 75m 46s) (7000 5%) 0.2830
4m 59s (- 69m 50s) (8000 6%) 0.2226
5m 20s (- 65m 50s) (9000 7%) 0.2120
5m 39s (- 62m 16s) (10000 8%) 0.1792
5m 58s (- 59m 9s) (11000 9%) 0.1204
6m 16s (- 56m 28s) (12000 10%) 0.1396
6m 33s (- 54m 2s) (13000 10%) 0.0955
6m 52s (- 52m 4s) (14000 11%) 0.0804
7m 12s (- 50m 27s) (15000 12%) 0.1573
7m 31s (- 48m 52s) (16000 13%) 0.0774
7m 49s (- 47m 23s) (17000 14%) 0.0926
8m 8s (- 46m 6s) (18000 15%) 0.0947
8m 26s (- 44m 50s) (19000 15%) 0.0955
8m 44s (- 43m 43s) (20000 16%) 0.0696
9m 3s (- 42m 40s) (21000 17%) 0.0828
9m 21s (- 41m 41s) (22000 18%) 0.0911
9m 40s (- 40m 48s) (23000 19%) 0.1024
9m 59s (- 39m 58s) (24000 20%) 0.1103
10m 18s (- 39m 10s) (25000 20%) 0.0569
10m 37s (- 38m 25s) (26000 21%) 0.0525
10m 57s (- 37m 43s) (27000 22%) 0.06

In [19]:
# Print 10 randomly chosen pairs with the model's output
# ('>' input, '=' expected output, '<' decoded output)
evaluateRandomly(encoder1, attn_decoder1)

> d d b d a d c b fwd
= d d b d a d c b fwd
< d d b d a d c FWD

> d b d c fwd
= d b d c fwd
< d b d c FWD

> d b fwd
= d b fwd
< d b FWD

> b d b c a rev
= a c b d b rev
< a c b d b REV

> a rev
= a rev
< a REV

> c d rev
= d c rev
< d c REV

> c b c c c a fwd
= c b c c c a fwd
< c b c c c a FWD

> a a a b b rev
= b b a a a rev
< b b a a a REV

> a c b d a fwd
= a c b d a fwd
< a c b d a FWD

> d fwd
= d fwd
< d FWD



In [20]:
# Testing
# Read every line of the test file; the context manager closes the handle.
with open("abcd.test", "r") as testSet:
    testSents = testSet.readlines()

# Keeping track of number correct per input length
# Index 0 corresponds to length 4, index 1 to length 5, ..., index 4 to
# length 8 (the length does not count the trailing fwd/rev marker).
correctfwd = [0,0,0,0,0]
correctrev = [0,0,0,0,0]
totalfwd = [0,0,0,0,0]
totalrev = [0,0,0,0,0]

count = 0

for sentenceInit in testSents:
    # Each line holds the input and the expected output, separated by a tab
    fields = sentenceInit.split("\t")
    sentence = fields[0].lower()
    output = fields[1].lower().strip()
    result, att = (evaluate(encoder1, attn_decoder1, sentence))
    correct = output == " ".join(result).lower()

    length = len(sentence.split()) - 1
    # NOTE(review): a length below 4 gives a negative bucket, which Python
    # resolves from the end of the list, and a length above 8 raises
    # IndexError. This assumes every test input has 4 to 8 symbols -- confirm.
    bucket = length - 4
    if sentence[-3:] == "fwd":
        correctfwd[bucket] += correct
        totalfwd[bucket] += 1
    else:
        correctrev[bucket] += correct
        totalrev[bucket] += 1

    # Report the running tallies every 1000 sentences
    count += 1
    if count % 1000 == 0:
        print(correctfwd, totalfwd, correctrev, totalrev)

[0, 16, 23, 192, 210] [0, 16, 23, 231, 230] [0, 14, 21, 189, 142] [0, 16, 23, 231, 230]
[0, 32, 56, 374, 429] [0, 32, 56, 444, 468] [0, 27, 47, 298, 225] [0, 32, 56, 444, 468]
[0, 47, 98, 549, 650] [0, 47, 98, 648, 707] [0, 41, 81, 456, 372] [0, 47, 98, 648, 707]
[0, 66, 131, 727, 861] [0, 66, 131, 864, 939] [0, 60, 114, 597, 477] [0, 66, 131, 864, 939]
[0, 83, 150, 925, 1090] [0, 83, 150, 1083, 1184] [0, 75, 125, 743, 616] [0, 83, 150, 1083, 1184]
[0, 101, 174, 1097, 1313] [0, 101, 174, 1300, 1425] [0, 88, 136, 855, 726] [0, 101, 174, 1300, 1425]
[0, 116, 198, 1268, 1553] [0, 116, 198, 1505, 1681] [0, 101, 143, 953, 878] [0, 116, 198, 1505, 1681]
[0, 134, 221, 1469, 1761] [0, 134, 221, 1732, 1913] [0, 119, 163, 1120, 1044] [0, 134, 221, 1732, 1913]
[1, 153, 251, 1654, 1966] [1, 153, 251, 1956, 2139] [1, 138, 188, 1233, 1137] [1, 153, 251, 1956, 2139]
[1, 170, 282, 1832, 2188] [1, 170, 282, 2163, 2384] [1, 155, 212, 1400, 1314] [1, 170, 282, 2163, 2384]
[1, 187, 316, 2011, 2410] [1, 18

In [28]:
# You can use this cell to look at some specific examples of the output.
# Each printed line shows the input sentence, the model's output and whether
# that output equals the expected output from the test file.
# The accuracy tallies (correctfwd, totalfwd, ...) are deliberately left
# untouched here, so running this cell does not count any sentence twice.
for sentenceInit in testSents[5000:5020]:
    # Each line holds the input and the expected output, separated by a tab
    fields = sentenceInit.split("\t")
    sentence = fields[0].lower()
    output = fields[1].lower().strip()
    result, att = (evaluate(encoder1, attn_decoder1, sentence))
    prediction = " ".join(result).lower()
    # Compare against the expected output, not the input: for "rev" inputs
    # the expected output is the reversed sequence.
    correct = output == prediction
    print(sentence, prediction, correct)

a c a c d fwd a c a c d c fwd False False
a c a c d rev d c a rev False False
a c a c d a a b fwd a c a d b c a fwd False False
a c a c d a a b rev b a a c d c a rev False False
a c a c d a b fwd a c a d b c a fwd False False
a c a c d a b rev b a d c c a rev False False
a c a c d a b a fwd a c a d b c a fwd False False
a c a c d a b a rev a b a d c c a rev False False
a c a c d a c fwd a c a d c a c fwd False False
a c a c d a c rev c a d c a rev False False
a c a c d a c b fwd a c a d b c fwd False False
a c a c d a c b rev c b a c d c a rev False False
a c a c d a c d fwd a c a d c a d c d c False False
a c a c d a c d rev d c a d c d rev False False
a c a c d a d c fwd a c a d c d c d c d False False
a c a c d a d c rev c d a d c d rev False False
a c a c d b b fwd a c b b d c a c fwd False False
a c a c d b b rev b b d c c a rev False False
a c a c d c a fwd a c a d c a c c fwd False False
a c a c d c a rev a c d c a rev False False
