# Seq2Seq with Attention

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path``.

    Leading/trailing whitespace of the whole file is stripped before the
    content is split on newlines. The file handle is closed as soon as the
    content has been read.
    """
    with open(path, encoding='UTF-8') as handle:
        return handle.read().strip().split('\n')


# Parallel corpora: line i of a *source* file belongs with line i of the
# matching *target* file.
source = _read_lines('data/commands_source_en_70k.txt')
target = _read_lines('data/commands_target_en_70k.txt')
source_num = _read_lines('data/nums_source_en_10k.txt')
target_num = _read_lines('data/nums_target_en_10k.txt')

In [9]:
# Append the number examples to the command examples so both are trained together.
# NOTE: extend() mutates `source` / `target` in place, so re-running this cell
# without re-running the loading cell appends the number lines a second time.
source.extend(source_num)
target.extend(target_num)

In [10]:
# Number of lines in the merged `source` list.
len(source)

80000

In [12]:
# Reserved vocabulary indices for the start-of-sequence and end-of-sequence
# markers; Lang pre-populates index2word with these two entries.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary for one side of the parallel corpus.

    Attributes:
        name: label given at construction time.
        word2index: maps each seen word to its integer index.
        word2count: maps each seen word to the number of times it was added.
        index2word: inverse of ``word2index``; indices 0 and 1 are pre-filled
            with "SOS" and "EOS".
        n_words: next free index (starts at 2 because of SOS and EOS).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = dict(enumerate(("SOS", "EOS")))
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [13]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is 'Mn' (nonspacing mark) is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents, and put a space before each ! and ?


def normalizeString(s):
    """Normalize one corpus line.

    The text is lowercased, stripped of surrounding whitespace, passed through
    unicodeToAscii, and a space is inserted before every '!' or '?'. No other
    characters (digits, punctuation, ...) are filtered out.
    """
    cleaned = s.lower().strip()
    cleaned = unicodeToAscii(cleaned)
    return re.sub(r"([!?])", r" \1", cleaned)

In [14]:
def readLangs(lang1, lang2):
    """Build normalized sentence pairs and two empty vocabularies.

    Reads the module-level ``target`` and ``source`` lists.

    Args:
        lang1: name given to the input-side Lang.
        lang2: name given to the output-side Lang.

    Returns:
        (input_lang, output_lang, pairs) where ``pairs`` is a list of
        two-element lists of normalized strings.
    """
    print("Reading lines...")

    # NOTE: the first element of each pair is drawn from `target` and the
    # second from `source`. zip() stops at the shorter of the two lists.
    pairs = []
    for first, second in zip(target, source):
        pairs.append([normalizeString(first), normalizeString(second)])

    return Lang(lang1), Lang(lang2), pairs

In [18]:
def prepareData(lang1, lang2):
    """Load the sentence pairs and fill both vocabularies from them.

    Args:
        lang1: name for the input-side vocabulary.
        lang2: name for the output-side vocabulary.

    Returns:
        (input_lang, output_lang, pairs) with both Lang objects populated from
        every pair. Progress and vocabulary sizes are printed along the way.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2)
    print("Read %s sentence pairs" % len(pairs))

    print("Counting words...")
    for first, second in pairs:
        input_lang.addSentence(first)
        output_lang.addSentence(second)

    print("Counted words:")
    for lang in (input_lang, output_lang):
        print(lang.name, lang.n_words)

    return input_lang, output_lang, pairs


# Build both vocabularies and the normalized sentence pairs, then print one
# randomly chosen pair as a quick sanity check.
input_lang, output_lang, pairs = prepareData('src', 'tgt')
print(random.choice(pairs))

Reading lines...
Read 80000 sentence pairs
Counting words...
Counted words:
src 6190
tgt 16864
['turn on blue sports twenty two', 'turn on blue sports 22']


In [19]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder that consumes one token index per call.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embedding and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU input width, so embeddings feed
        # the GRU directly.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input``, run one GRU step and return ``(output, hidden)``."""
        # Reshape to (seq_len=1, batch=1, features) as expected by nn.GRU.
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [20]:
class DecoderRNN(nn.Module):
    """Plain (attention-free) single-layer GRU decoder, one token per call.

    Args:
        hidden_size: width of the embedding and of the GRU state.
        output_size: size of the output vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns:
            (log_probs, hidden): log-probabilities over the output vocabulary
            with shape (1, output_size), and the updated GRU state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(embedded, hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [21]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embedding, the GRU state and each encoder
            output row.
        output_size: size of the output vocabulary.
        dropout_p: dropout probability applied to the embedded input token.
        max_length: number of encoder positions the attention layer scores;
            ``encoder_outputs`` passed to ``forward`` must have this many rows
            for the batched matrix product to be valid.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=50):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores every encoder position from [embedded token ; hidden state].
        self.attn = nn.Linear(2 * hidden_size, max_length)
        # Folds [embedded token ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step.

        Returns:
            (log_probs, hidden, attn_weights): log-probabilities of shape
            (1, output_size), the updated GRU state, and the attention
            weights of shape (1, max_length).
        """
        token = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention distribution over encoder positions.
        scores = self.attn(torch.cat((token[0], hidden[0]), 1))
        attn_weights = F.softmax(scores, dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        merged = torch.cat((token[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        gru_out, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(gru_out[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size)."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [22]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of ``sentence`` to its index in ``lang``.

    Raises:
        KeyError: if a word is missing from ``lang.word2index``.
    """
    indexes = []
    for word in sentence.split(' '):
        indexes.append(lang.word2index[word])
    return indexes


def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a column tensor of word indices ending in EOS.

    Returns:
        A long tensor of shape (number_of_words + 1, 1) on ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a sentence pair into an ``(input_tensor, target_tensor)`` tuple.

    ``pair[0]`` is encoded with the module-level ``input_lang`` and ``pair[1]``
    with ``output_lang``.
    """
    first, second = pair[0], pair[1]
    return (tensorFromSentence(input_lang, first),
            tensorFromSentence(output_lang, second))

In [23]:
# Probability, drawn once per training example, of feeding the ground-truth
# token (instead of the decoder's own prediction) as the next decoder input.
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=50):
    """Run one optimization step on a single (input, target) example.

    Args:
        input_tensor: column tensor of input word indices (one row per token).
        target_tensor: column tensor of target word indices.
        encoder, decoder: the two networks being trained; the decoder is
            called as ``decoder(input, hidden, encoder_outputs)``.
        encoder_optimizer, decoder_optimizer: optimizers stepped after backprop.
        criterion: loss applied to each decoder output and target token.
        max_length: number of rows allocated for the encoder outputs; inputs
            longer than this raise an IndexError when the extra row is written.

    Returns:
        The summed loss divided by the target length, as a Python float.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encode the input token by token, keeping every step's output for attention.
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei, token in enumerate(input_tensor):
        encoder_output, encoder_hidden = encoder(token, encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden  # decoder starts from the final encoder state

    # Decide once per example whether the decoder sees the ground truth.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input.
            decoder_input = target_tensor[di]
            continue

        # Free running: feed back the most likely token, detached from history.
        best_index = decoder_output.topk(1)[1]
        decoder_input = best_index.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalized by the full target length even when decoding stopped early.
    return loss.item() / target_length

In [24]:
import time
import math


def asMinutes(s):
    """Format a duration of ``s`` seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a job started at ``since``.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far; must be non-zero
            because the elapsed time is divided by it.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / percent
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [25]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train ``encoder`` and ``decoder`` on ``n_iters`` randomly drawn pairs.

    Each step trains on one pair sampled (with replacement) from the
    module-level ``pairs``. The mean loss is printed every ``print_every``
    steps and recorded every ``plot_every`` steps; the recorded values are
    handed to ``showPlot`` once training finishes.

    Args:
        encoder, decoder: networks to optimize.
        n_iters: number of single-pair training steps.
        print_every: logging interval, in steps.
        plot_every: interval, in steps, between recorded plot points.
        learning_rate: learning rate of both SGD optimizers.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # running sum, reset every print_every steps
    plot_loss_total = 0  # running sum, reset every plot_every steps

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training examples are sampled and tensorized up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [26]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2.

    Args:
        points: sequence of values to plot against their index.
    """
    # plt.subplots() already creates the figure; a separate plt.figure() call
    # would leave an extra, empty figure open on every invocation.
    fig, ax = plt.subplots()
    # This locator puts ticks at regular intervals.
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)

In [27]:
def evaluate(encoder, decoder, sentence, max_length=50):
    """Greedily decode ``sentence`` with the given encoder/decoder pair.

    Args:
        encoder, decoder: trained networks; the decoder is called as
            ``decoder(input, hidden, encoder_outputs)``.
        sentence: space-separated input whose words must all be present in
            the module-level ``input_lang``.
        max_length: maximum number of decoding steps, and the number of rows
            allocated for the encoder outputs.

    Returns:
        (decoded_words, attentions): the decoded words (ending with '<EOS>'
        when the model emitted it) and one row of attention weights per
        decoding step.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_tensor.size(0)):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data

            # Greedy choice: take the single most likely token.
            best = decoder_output.data.topk(1)[1]
            token_id = best.item()
            if token_id == EOS_token:
                decoded_words.append('<EOS>')
                break

            decoded_words.append(output_lang.index2word[token_id])
            decoder_input = best.squeeze().detach()

        # Keep only the attention rows of the steps that actually ran.
        return decoded_words, decoder_attentions[:di + 1]

In [28]:
def evaluateRandomly(encoder, decoder, n=10):
    """Print ``n`` random pairs next to the model's decoding of each.

    For every sampled pair, '>' marks the input, '=' the expected output and
    '<' the model's output; pairs are drawn from the module-level ``pairs``.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        model_input, expected = sample[0], sample[1]
        print('>', model_input)
        print('=', expected)
        output_words, _attentions = evaluate(encoder, decoder, model_input)
        print('<', ' '.join(output_words))
        print('')

In [29]:
# Width shared by the embeddings and GRU states of both networks.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# 72000 single-pair SGD steps, printing the averaged loss every 3000 steps.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# pairs and the resulting losses will likely differ between runs — confirm.
trainIters(encoder1, attn_decoder1, 72000, print_every=3000)

2m 0s (- 46m 15s) (3000 4%) 3.8974
4m 1s (- 44m 18s) (6000 8%) 2.4666
6m 3s (- 42m 22s) (9000 12%) 1.9148
8m 4s (- 40m 20s) (12000 16%) 1.6596
10m 7s (- 38m 29s) (15000 20%) 1.5601
12m 11s (- 36m 34s) (18000 25%) 1.4427
14m 12s (- 34m 30s) (21000 29%) 1.4515
16m 15s (- 32m 30s) (24000 33%) 1.4579
18m 18s (- 30m 30s) (27000 37%) 1.2978
20m 21s (- 28m 30s) (30000 41%) 1.2934
22m 25s (- 26m 29s) (33000 45%) 1.2221
24m 27s (- 24m 27s) (36000 50%) 1.2189
26m 31s (- 22m 27s) (39000 54%) 1.1994
28m 37s (- 20m 26s) (42000 58%) 1.1873
30m 41s (- 18m 25s) (45000 62%) 1.1600
32m 42s (- 16m 21s) (48000 66%) 1.2238
34m 48s (- 14m 19s) (51000 70%) 1.1423
36m 52s (- 12m 17s) (54000 75%) 1.0850
38m 58s (- 10m 15s) (57000 79%) 1.0277
41m 1s (- 8m 12s) (60000 83%) 1.0513
43m 6s (- 6m 9s) (63000 87%) 1.1323
45m 10s (- 4m 6s) (66000 91%) 1.0470
47m 14s (- 2m 3s) (69000 95%) 1.1254
49m 19s (- 0m 0s) (72000 100%) 1.0549


In [30]:
# Spot-check the trained model on 10 randomly drawn pairs (the default n).
evaluateRandomly(encoder1, attn_decoder1)

> switch to go tv
= switch to go tv
< switch to tv tv <EOS>

> i'd like to watch champions hockey league
= i'd like to watch champions hockey league
< i'd like to watch champions hockey league <EOS>

> i want to change the channel
= i want to change the channel
< i want to change the channel <EOS>

> i want to watch a tennis game
= i want to watch a tennis game
< i want to watch a tennis game <EOS>

> the weather report please
= the weather report please
< the weather report please <EOS>

> i'd like to know if it is warm outside
= i'd like to know if it is warm outside
< i'd like to know if it is warm outside <EOS>

> i wanna switch to radio channel ninety nine
= i wanna switch to radio channel 99
< i wanna switch to radio channel 99 <EOS>

> how is the weather in auer on thirtieth december
= how is the weather in auer on 30.12
< how is the weather in auer on 30.12 <EOS>

> play the movie running with the devil
= play the movie running with the devil
< play the movie with me <EOS>

> t

In [31]:
# Second random draw of 10 pairs; each call samples independently from `pairs`.
evaluateRandomly(encoder1, attn_decoder1)

> turn on law and order criminal intent
= turn on law and order: criminal intent
< turn on interested and a <EOS>

> i wanna watch feed
= i wanna watch feed
< i wanna watch serie <EOS>

> weather update for sixteen fifty four in fussen
= weather update for 16:54 in fussen
< weather update for 16:54 in fussen <EOS>

> six thousand two hundred and twenty seven
= 6227
< 3157 <EOS>

> six thousand six hundred and seventy
= 6670
< 3157 <EOS>

> turn on camera in the office
= turn on camera in the office
< turn on camera in the office <EOS>

> change to radio channel eleven please
= change to radio channel 11 please
< change to radio channel 11 please <EOS>

> one hundred and fifty four
= 154
< 4 <EOS>

> i would like to change the radio station
= i would like to change the radio station
< i would like to change the radio station <EOS>

> three thousand three hundred and twenty three
= 3323
< 23 <EOS>



In [56]:
# Another random draw of 10 pairs.
# NOTE(review): this cell's execution count (56) is higher than that of the
# cell defining showAttention further down (33), so the notebook was not run
# strictly top to bottom.
evaluateRandomly(encoder1, attn_decoder1)

> change radio station
= change radio station
< change radio station <EOS>

> turn on the q bees in the bedroom
= turn on the q bees in the bedroom
< turn on the q bees in the bedroom <EOS>

> please search for cold comes the night
= please search for cold comes the night
< please search for the good at the <EOS>

> activate the device outdoor
= activate the device outdoor
< activate the device outdoor <EOS>

> two thousand two hundred
= 2200
< 3157 <EOS>

> show me brighton and hove albion
= show me brighton & hove albion
< show me west & & <EOS>

> turn on my switch
= turn on my switch
< turn on my switch <EOS>

> put on tv channel nine
= put on tv channel 9
< put on tv channel 9 <EOS>

> i want to change the station to radio channel rmc italia
= i want to change the station to radio channel rmc italia
< i want to change the station to radio channel radio channel <EOS>

> is it raining on thirtieth march
= is it raining on 30.3
< is it raining on 30.3 <EOS>



In [57]:
# Yet another random draw of 10 pairs from `pairs`.
evaluateRandomly(encoder1, attn_decoder1)

> six thousand eight hundred and thirteen
= 6813
< would <EOS>

> switch to kuriakos tv
= switch to kuriakos tv
< switch to tv tv <EOS>

> will i need an umbrella on sixteenth october
= will i need an umbrella on 16.10
< will i need an umbrella on 16.10 <EOS>

> nine thousand six hundred and forty eight
= 9648
< would <EOS>

> what's the weather gonna be like in zurich
= what's the weather gonna be like in zurich
< what's the weather gonna be like in zurich <EOS>

> how is the temperature between nine oh three and twenty forty in neumarkt
= how is the temperature between 09:03 and 20:40 in neumarkt
< how is the temperature between 19:20 and 19:20 in neumarkt <EOS>

> i need to know if it is raining
= i need to know if it is raining
< i need to know if it is raining <EOS>

> i want to listen to something else
= i want to listen to something else
< i want to listen to something else <EOS>

> i would like to watch annie
= i would like to watch annie
< i would like to watch handball <EOS>


In [33]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map with word labels on both axes.

    Args:
        input_sentence: space-separated input sentence; its words (plus
            '<EOS>') label the x axis.
        output_words: decoded words; they label the y axis.
        attentions: tensor of attention weights, converted with ``.numpy()``
            (assumes a CPU tensor — TODO confirm for callers other than
            ``evaluate``).
    """
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # The leading '' presumably pads the label list so the first word lines up
    # with the first matrix cell once ticks are placed at every integer — verify.
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # NOTE(review): the notebook switches matplotlib to the 'agg' backend
    # earlier, so this call probably displays nothing — confirm.
    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode ``input_sentence`` with encoder1/attn_decoder1 and print the result.

    Prints the input and the space-joined decoded words. The attention weights
    are computed but not plotted: the showAttention call is currently disabled.
    """
    decoded_words, _attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    decoded_sentence = ' '.join(decoded_words)
    print('input =', input_sentence)
    print('output =', decoded_sentence)
    
# Single-number example: the spoken "sixty" should come out as the digits 60.
evaluateAndShowAttention("change to radio channel sixty please")

input = change to radio channel sixty please
output = change to radio channel 60 please <EOS>


## Issue 1: unknown mapping -> larger dataset

In [55]:
# Compound-number example: "forty two" should become 42, but the recorded
# output below collapses it to 40.
evaluateAndShowAttention("change to radio channel forty two please")

input = change to radio channel forty two please
output = change to radio channel 40 please <EOS>


## Issue 2: unknown words -> GloVe

In [42]:
# Title example: the recorded output below garbles "lord of the ring".
# Every word here must already be in input_lang, because indexesFromSentence
# raises KeyError for a word missing from the vocabulary.
evaluateAndShowAttention("i'd like to watch lord of the ring")

input = i'd like to watch lord of the ring
output = i'd like to watch ice the the <EOS>


In [60]:
import pickle
def save_variable(v,filename):
    """Pickle ``v`` into the file at ``filename``.

    Args:
        v: any picklable object.
        filename: destination path; an existing file is overwritten.

    Returns:
        ``filename``, unchanged.
    """
    # The context manager closes the file even if pickle.dump raises.
    with open(filename, 'wb') as f:
        pickle.dump(v, f)
    return filename
 
def load_variable(filename):
    """Return the object unpickled from the file at ``filename``.

    Args:
        filename: path of a file previously written with pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only call this on files from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(filename, 'rb') as f:
        return pickle.load(f)

# Pickle the complete encoder and decoder module objects to disk.
# NOTE(review): PyTorch's saving guide recommends persisting state_dict()
# with torch.save rather than pickling whole modules — consider switching.
save_variable(encoder1, 'models/s2s_70k_encoder.pkl') 
save_variable(attn_decoder1, 'models/s2s_70k_decoder.pkl') 

'models/s2s_70k_decoder.pkl'

In [61]:
# Reload both pickled modules from disk (a = encoder, b = attention decoder).
# pickle stores classes by reference, so EncoderRNN and AttnDecoderRNN must
# already be defined in the running kernel for these loads to succeed.
a = load_variable('models/s2s_70k_encoder.pkl')
b = load_variable('models/s2s_70k_decoder.pkl')

In [62]:
# Check that the reloaded models still decode, using 10 random pairs.
evaluateRandomly(a, b)

> is it pouring
= is it pouring
< is it pouring <EOS>

> do you have tennis
= do you have tennis
< do you have tennis <EOS>

> will i be cold today
= will i be cold today
< will i be cold today <EOS>

> four thousand nine hundred and forty nine
= 4949
< would <EOS>

> play the movie shaun the sheep movie farmageddon
= play the movie shaun the sheep movie: farmageddon
< play the movie the the day after <EOS>

> which temperatures can we expect in zurich
= which temperatures can we expect in zurich
< which temperatures can we expect in zurich <EOS>

> am i going to be cold
= am i going to be cold
< am i going to be cold <EOS>

> find comedie plus
= find comedie !+
< find planete + <EOS>

> turn on tv channel forty
= turn on tv channel 40
< turn on tv channel 40 <EOS>

> how warm will it get at six twenty two
= how warm will it get at 06:22
< how warm will it get at 06:22 <EOS>



  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,
