# Attention seq2seq - Pytorch

Dataset: http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b

In [402]:
%matplotlib inline

import numpy as np
import torch.nn as nn
import torch.nn.parallel
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
import re
import time, math
from sklearn.model_selection import train_test_split

Path = 'data/'

## Preprocessing

In [3]:
#Get each word that begins with A-Z from each line into a list 
lines = [l.strip().split("  ") for l in open(Path+'cmudict-0.7b', encoding='latin1') 
         if re.match('^[A-Z]', l)]
#Split words and phonemes
lines = [(w, ps.split()) for w, ps in lines]
lines[0]

('A', ['AH0'])

In [4]:
#Get a list of all the unique phonemes from lines and adding _ to position 0 because it corresponds to padding
#when tokenised
phonemes = ["_"]+sorted(set(p for w, ps in lines for p in ps))
len(phonemes)

70

In [5]:
#Map phonemes to indices and letters to indices.
p2i = dict((v, k) for k, v in enumerate(phonemes))
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = dict((v, k) for k, v in enumerate(letters))

In [40]:
#Start of sentence token
SOS_token = 0

In [6]:
maxlen = 15
#Map words to corresponding list of phoneme indices. Constraint
pronounce_dict = {w.lower(): [p2i[p] for p in ps] for w, ps in lines
                    if (5<=len(w)<=maxlen) and re.match("^[A-Z]+$", w)}
len(pronounce_dict)

108006

In [7]:
maxlen_p = max([len(v) for k,v in pronounce_dict.items()]); maxlen_p

16

In [8]:
#words contain the number of words in the filtered dictionary
words = np.random.permutation(list(pronounce_dict.keys()))
n = len(words)

#Initialise the input and labels array with zeros so that everywhere except 
#the position of values is padded
input_ = np.zeros((n, maxlen_p), np.int32)
labels_ = np.zeros((n, maxlen), np.int32)

#Fill in the non zero indices
for i, k in enumerate(words):
    for j, p in enumerate(pronounce_dict[k]): input_[i][j]=p
    for j, p in enumerate(k): labels_[i][j] = l2i[p]
        

In [9]:
#Create train, validation sets
(input_train, input_test, labels_train, labels_test, 
    ) = train_test_split(input_, labels_, test_size=0.1)

In [10]:
input_vocab_size, output_vocab_size = len(phonemes), len(letters);input_vocab_size, output_vocab_size

(70, 28)

In [11]:
dim = 240

In [30]:
def get_batch(x, y, batch_size=32):
    idxs = np.random.permutation(len(x))[:batch_size]
    return x[idxs], y[idxs]

## Model


In [550]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size//2)
        self.grubi = nn.GRU(hidden_size//2, hidden_size, dropout=0.1, batch_first=True, num_layers=1,
                         bidirectional=True)
        self.gru = nn.GRU(hidden_size*2, hidden_size, batch_first=True, dropout=0.1,
                            num_layers=1)
    def forward(self, input, hidden):
        x, hidden = self.grubi(self.embedding(input), hidden)
        output, hidden = self.gru(x, hidden)
        return output, hidden

    # TODO: other inits
    def initHidden(self, batch_size):
        return Variable(torch.zeros(1, batch_size, self.hidden_size))

In [551]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=n_layers)
        # TODO use transpose of embedding
        self.out = nn.Linear(hidden_size, output_size)
        self.sm = nn.LogSoftmax()
        
    def forward(self, input, hidden):
        emb = self.embedding(input).unsqueeze(1)
        # NB: Removed relu
        res, hidden = self.gru(emb, hidden)
        output = self.sm(self.out(res[:,0]))
        return output, hidden

In [552]:
input_vocab_size, output_vocab_size, dim

(70, 28, 240)

In [553]:
def train(input_variable, target_variable, encoder, decoder, 
          encoder_optimizer, decoder_optimizer, criterion):
    batch_size, input_length = input_variable.size()
    target_length = target_variable.size()[1]
    encoder_hidden = encoder.initHidden(batch_size).cuda()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    encoder_output, encoder_hidden = encoder(input_variable, encoder_hidden)
    decoder_input = Variable(torch.LongTensor([SOS_token]*batch_size)).cuda()
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)          
                #, encoder_output, encoder_outputs)
        targ = target_variable[:, di]
        #print(decoder_output.size(), targ.size(), target_variable.size())
        loss += criterion(decoder_output, targ)
        _, indices = torch.max(decoder_output, 1)
        decoder_input = indices
        break
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.data[0] / target_length

In [554]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [555]:
def trainIters(encoder, decoder, n_epochs, print_every=1000, plot_every=100, 
                learning_rate=0.01):
    start = time.time()    
    start = time.time()
    plot_losses = []
    print_loss_total = 0 # Reset every print_every
    plot_loss_total = 0 # Reset every plot_every
    
    encoder_optimizer = optim.Adam(netE.parameters(), lr=learning_rate, betas=(0.5, 0.999))
    decoder_optimizer = optim.Adam(netD.parameters(), lr=learning_rate, betas=(0.5, 0.999))

    criterion = nn.NLLLoss().cuda()
   
    for epoch in range(1, n_epochs + 1):
        training_batch = get_batch(input_train, labels_train)

        input_variable = Variable(torch.LongTensor((training_batch[0].astype('int64')))).cuda()
        target_variable = Variable(torch.LongTensor(training_batch[1].astype('int64'))).cuda()
        
        loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, 
                             decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs), epoch, 
                                         epoch / n_epochs * 100, print_loss_avg))
        
        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    
    showPlot(plot_losses)

In [556]:
encoder = EncoderRNN(input_vocab_size, dim, 1).cuda()
decoder = DecoderRNN(dim, output_vocab_size).cuda()

In [557]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2) # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [558]:
trainIters(encoder, decoder, 1, print_every=500, learning_rate=0.05)

RuntimeError: Expected hidden size (2, 32, 480), got (1, 32, 240)

In [531]:
test_batch = get_batch(input_test, labels_test)

input_variable = Variable(torch.LongTensor((test_batch[0].astype('int64')))).cuda()
target_variable = Variable(torch.LongTensor(test_batch[1].astype('int64'))).cuda()

batch_size, input_length = input_variable.size()
target_length = target_variable.size()[1]
encoder_hidden = encoder.initHidden(batch_size).cuda()
    
encoder_output, encoder_hidden = encoder(input_variable, encoder_hidden)
decoder_input = Variable(torch.LongTensor([SOS_token]*batch_size)).cuda()
decoder_hidden = encoder_hidden
decoded_words = []
for di in range(target_length):
    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)          
    _, indices = torch.max(decoder_output, 1)
    decoded_words.append(indices)
    decoder_input = indices

torch.Size([32, 16, 120])


In [465]:
combine = []
for x in decoded_words:
    combine.append(x.cpu().data.numpy())

In [477]:
combine = np.array(combine)

In [481]:
input_test1=test_batch[0]
labels_test1=test_batch[1]
print ('  Phonemes_________________________________predictions______label')
for index in range(32):
    phoneme = '-'.join([phonemes[p] for p in input_test1[index]])
    prediction = [letters[l] for l in combine[:][index]]
    real = [letters[l] for l in labels_test1[index]]
    print ('  ',phoneme.strip('-_').ljust(40), ''.join(prediction).strip('_').ljust(14), 
           ''.join(real).strip('_'))

  Phonemes_________________________________predictions______label
   B-AH1-N-T-IH0-NG                         eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee bunting
   B-AH1-S-T                                tttttttttttttttttttttttttttttttt bused
   T-W-ER0-D-AW1-S-K-IY0                    qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq twardowski
   AE2-G-R-IH0-K-AH1-L-CH-AH0-R-AH0-L-IY0   yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy agriculturally
   B-AA1-R-OW0-Z                            tttttttttttttttttttttttttttttttt barros
   R-EY1-L-B-AY2-K                          qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq railbike
   HH-AA1-R-D-IH0-JH                        tttttttttttttttttttttttttttttttt hardage
   D-IH0-V-IH1-N-AH0-T-IY0                  qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq divinity
   EH2-K-S-T-R-AE1-V-AH0-G-AH0-N-T-L-IY0    tttttttttttttttttttttttttttttttt extravagantly
   B-EH0-N-AA0-V-EH1-N-T-IY0                qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq benavente
   F-IH0-T-IH0-P-AA1-L-D-IY0                ttttttttttttttttttttttt

IndexError: index 15 is out of bounds for axis 0 with size 15