In [192]:
#fast.ai Lesson 13

# Translating French to English with Pytorch

In [1]:
%matplotlib inline
import re, pickle, collections, bcolz, numpy as np, keras, sklearn, math, operator

Using TensorFlow backend.


In [99]:
from gensim.models import word2vec
import gensim
import torch, torch.nn as nn
from torch.nn.parameter import Parameter
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [3]:
path='./data/'
dpath='./data/translate/'

The French-English parallel corpus can be downloaded from http://www.statmt.org/wmt10/training-giga-fren.tar. 

In [4]:
fname=path+'giga-fren.release2.fixed'
en_fname = fname+'.en'
fr_fname = fname+'.fr'

In [5]:
#review
import re
a = "123abc456"
b = "What the fuck ?"
re_eq = re.compile('^(Wh[^?.!]+\?)')
print (re.search("([0-9]*)([a-z]*)([0-9]*)",a).group())   #123abc456
print (re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(1))   #123
print (re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(2))   #abc
print (re.search("([0-9]*)([a-z]*)([0-9]*)",a).group(3) )  #456
print (re_eq.search(b).group() )  #

123abc456
123
abc
456
What the fuck ?


In [6]:
type(b)

str

 We'll just learn to translate questions that begin with 'Wh' 

In [7]:
re_eq = re.compile('^(Wh[^?.!]+\?)')
re_fq = re.compile('^([^?.!]+\?)')

lines = ((re_eq.search(eq), re_fq.search(fq)) 
         for eq, fq in zip(open(en_fname), open(fr_fname)))

qs = [(e.group(), f.group()) for e,f in lines if e and f]; len(qs)

52331

In [8]:
qs[:6]

[('What is light ?', 'Qu’est-ce que la lumière?'),
 ('Who are we?', 'Où sommes-nous?'),
 ('Where did we come from?', "D'où venons-nous?"),
 ('What would we do without it?', 'Que ferions-nous sans elle ?'),
 ('What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?',
  'Quelle sont les coordonnées (latitude et longitude) de Badger, à Terre-Neuve-etLabrador?'),
 ('What is the major aboriginal group on Vancouver Island?',
  'Quel est le groupe autochtone principal sur l’île de Vancouver?')]

In [9]:
pickle.dump(qs, open(dpath+'fr-en-qs.pkl', 'wb'))

In [10]:
qs = pickle.load(open(dpath+'fr-en-qs.pkl', 'rb'))

In [11]:
en_qs, fr_qs = zip(*qs)

In [12]:
re_apos = re.compile(r"(\w)'s\b")         # make 's a separate word
re_mw_punc = re.compile(r"(\w[’'])(\w)")  # other ' in a word creates 2 words
re_punc = re.compile("([\"().,;:/_?!—])") # add spaces around punctuation
re_mult_space = re.compile(r"  *")        # replace multiple spaces with just one

def simple_toks(sent):
    sent = re_apos.sub(r"\1 's", sent)
    sent = re_mw_punc.sub(r"\1 \2", sent)
    sent = re_punc.sub(r" \1 ", sent).replace('-', ' ')
    sent = re_mult_space.sub(' ', sent)
    return sent.lower().split()

In [13]:
fr_qtoks = list(map(simple_toks, fr_qs)); fr_qtoks[:4]

[['qu’', 'est', 'ce', 'que', 'la', 'lumière', '?'],
 ['où', 'sommes', 'nous', '?'],
 ["d'", 'où', 'venons', 'nous', '?'],
 ['que', 'ferions', 'nous', 'sans', 'elle', '?']]

In [14]:
en_qtoks = list(map(simple_toks, en_qs)); en_qtoks[:4]

[['what', 'is', 'light', '?'],
 ['who', 'are', 'we', '?'],
 ['where', 'did', 'we', 'come', 'from', '?'],
 ['what', 'would', 'we', 'do', 'without', 'it', '?']]

In [15]:
simple_toks("Rachel's baby is cuter than other's.")

['rachel', "'s", 'baby', 'is', 'cuter', 'than', 'other', "'s", '.']

In [16]:
PAD = 0; SOS = 1

In [17]:
def toks2ids(sents):
    voc_cnt = collections.Counter(t for sent in sents for t in sent)
    vocab = sorted(voc_cnt, key=voc_cnt.get, reverse=True)
    vocab.insert(PAD, "<PAD>")
    vocab.insert(SOS, "<SOS>")
    w2id = {w:i for i,w in enumerate(vocab)}
    ids = [[w2id[t] for t in sent] for sent in sents]
    return ids, vocab, w2id, voc_cnt

In [18]:
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)

In [19]:
len(fr_ids)

52331

In [20]:
en_vocab

['<PAD>',
 '<SOS>',
 '?',
 'the',
 'what',
 'of',
 'to',
 'and',
 'is',
 'in',
 'are',
 ',',
 'a',
 'for',
 'do',
 'why',
 'be',
 'you',
 'or',
 'who',
 'on',
 'this',
 'that',
 'have',
 'your',
 'when',
 'can',
 'should',
 'will',
 'which',
 'does',
 'i',
 'with',
 'where',
 'if',
 'it',
 'about',
 'would',
 '"',
 '(',
 ')',
 'canada',
 'we',
 'from',
 'not',
 'an',
 'by',
 'as',
 '/',
 'information',
 'these',
 'health',
 "'s",
 'was',
 'they',
 's',
 'did',
 'other',
 'their',
 'has',
 'were',
 'at',
 'been',
 'new',
 'use',
 'program',
 'most',
 'how',
 'need',
 'my',
 'role',
 'more',
 'between',
 'government',
 'public',
 'could',
 'canadian',
 'people',
 'research',
 'impact',
 'there',
 'happens',
 'our',
 'used',
 'work',
 'mean',
 'so',
 'such',
 'policy',
 'important',
 'its',
 'being',
 'done',
 'services',
 'some',
 'any',
 'think',
 'level',
 'time',
 't',
 'make',
 'service',
 'development',
 'take',
 'process',
 'community',
 'all',
 'kind',
 'national',
 'available',
 

## Word vectors


Stanford's GloVe word vectors can be downloaded from https://nlp.stanford.edu/projects/glove/ 

For French word vectors, we're using those from http://fauconnier.github.io/index.html

In [76]:
model = word2vec

In [168]:
def load_glove(loc):
    return (bcolz.open(loc+'.txt')[:],
        pickle.load(open(loc+'_words.pkl','rb'), encoding='latin1'),
        pickle.load(open(loc+'_idx.pkl','rb'), encoding='latin1'))

In [178]:
for line in f:
    en_vecs = line.split()
    en_wv_word = en_vecs[0]
    en_wv_idx = np.asarray(en_vecs[1:], dtype='float32')
    

In [46]:
lines

<_io.TextIOWrapper name='./data/nlp/glove.6B.100d.txt' mode='r' encoding='utf-8'>

In [30]:
with open('./data/nlp/glove.6B.100d.txt',encoding="utf-8", mode="r") as lines:
    en_w2v = {line.split()[0]: np.array(list(map(float, line.split()[1:])))
           for line in lines}

In [53]:
en_w2v['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [44]:
# en_vecs, en_wv_word, en_wv_idx = load_glove('/data/datasets/nlp/glove/results/glove.6B.100d')
# en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}
# n_en_vec, dim_en_vec = en_vecs.shape

In [47]:
# with open('./data/nlp/glove.6B.100d.txt',encoding="utf-8", mode="r") as lines:
#     en_wv_word=[line.split()[0]for line in lines]

In [49]:
# len(en_wv_word)

400000

In [38]:
dim_en_vec=100

In [33]:
w2v_path='./data/nlp/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin'
fr_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
fr_voc = fr_model.vocab
dim_fr_vec = 200

In [56]:
len(fr_voc)

155562

In [35]:
def create_emb(w2v, targ_vocab, dim_vec):
    vocab_size = len(targ_vocab)
    emb = np.zeros((vocab_size, dim_vec))
    found=0

    for i, word in enumerate(targ_vocab):
        try: emb[i] = w2v[word]; found+=1
        except KeyError: emb[i] = np.random.normal(scale=0.6, size=(dim_vec,))

    return emb, found

In [37]:
fr_model

<gensim.models.keyedvectors.KeyedVectors at 0x7fba885249b0>

In [54]:
en_embs, found = create_emb(en_w2v, en_vocab, dim_en_vec); en_embs.shape, found

((19549, 100), 17251)

In [39]:
fr_embs, found = create_emb(fr_model, fr_vocab, dim_fr_vec); fr_embs.shape, found

((26709, 200), 21878)

## Prep data

Each sentence has to be of equal length. Keras has a convenient function `pad_sequences` to truncate and/or pad each sentence as required - even although we're not using keras for the neural net, we can still use any functions from it we need!

In [59]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 30
en_padded = pad_sequences(en_ids, maxlen, 'int64', "post", "post")
fr_padded = pad_sequences(fr_ids, maxlen, 'int64', "post", "post")
en_padded.shape, fr_padded.shape, en_embs.shape,fr_embs.shape

((52331, 30), (52331, 30), (19549, 100), (26709, 200))

In [60]:
from sklearn import model_selection
fr_train, fr_test, en_train, en_test = model_selection.train_test_split(
    fr_padded, en_padded, test_size=0.1)

[o.shape for o in (fr_train, fr_test, en_train, en_test)]

[(47097, 30), (5234, 30), (47097, 30), (5234, 30)]

In [61]:
fr_train[0], en_train[0]

(array([   9,   46,  203,   11,   12,   83,   51,   21, 1002,   36,   21,
        3379,   15,  183,   14, 5705,  109, 1469,    2,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 array([  4,  11,   9,  24, 449,  11,  27,   3, 682,  18, 759,  14,   6,
        408, 427, 283,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]))

## Model
### Encoder-Decoder

Turning a sequence into a representation can be done using an RNN (called the 'encoder'. This approach is useful because RNN's are able to keep track of state and memory, which is obviously important in forming a complete understanding of a sentence.
* `bidirectional=True` passes the original sequence through an RNN, and the reversed sequence through a different RNN and concatenates the results. This allows us to look forward and backwards.
* We do this because in language things that happen later often influence what came before (i.e. in Spanish, "el chico, la chica" means the boy, the girl; the word for "the" is determined by the gender of the subject, which comes after).

In [62]:
def long_t(arr): return Variable(torch.LongTensor(arr)).cuda()

In [63]:
fr_emb_t = torch.FloatTensor(fr_embs).cuda()
en_emb_t = torch.FloatTensor(en_embs).cuda()

In [64]:
def create_emb(emb_mat, non_trainable=False):
    output_size, emb_size = emb_mat.size()
    emb = nn.Embedding(output_size, emb_size)
    emb.load_state_dict({'weight': emb_mat})
    if non_trainable:
        for param in emb.parameters(): 
            param.requires_grad = False
    return emb, emb_size, output_size

In [65]:
class EncoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2):
        super(EncoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs, True)
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=n_layers)
#                          ,bidirectional=True)
        
    def forward(self, input, hidden):
        return self.gru(self.emb(input), hidden)

    def initHidden(self, batch_size):
        return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [87]:
def encode(inp, encoder):
    batch_size, input_length = inp.size()
    hidden = encoder.initHidden(batch_size).cuda()
    enc_outputs, hidden = encoder(inp, hidden)
    return long_t([SOS]*batch_size), enc_outputs, hidden    

Finally, we arrive at a vector representation of the sequence which captures everything we need to translate it. We feed this vector into more RNN's, which are trying to generate the labels. After this, we make a classification for what each word is in the output sequence.

In [67]:
class DecoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2):
        super(DecoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=n_layers)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, inp, hidden):
        emb = self.emb(inp).unsqueeze(1)
        res, hidden = self.gru(emb, hidden)
        res = F.log_softmax(self.out(res[:,0]))
        return res, hidden

This graph demonstrates the accuracy decay for a neural translation task. With an encoding/decoding technique, larger input sequences result in less accuracy.

<img src="https://smerity.com/media/images/articles/2016/bahdanau_attn.png" width="600">

This can be mitigated using an attentional model.

### Adding broadcasting to Pytorch

Using *broadcasting* makes a lot of numerical programming far simpler. Here's a couple of examples, using numpy:

In [68]:
v=np.array([1,2,3]); v, v.shape

(array([1, 2, 3]), (3,))

In [69]:
m=np.array([v,v*2,v*3]); m, m.shape

(array([[1, 2, 3],
        [2, 4, 6],
        [3, 6, 9]]), (3, 3))

In [70]:
m+v

array([[ 2,  4,  6],
       [ 3,  6,  9],
       [ 4,  8, 12]])

In [71]:
v1=np.expand_dims(v,-1); v1, v1.shape

(array([[1],
        [2],
        [3]]), (3, 1))

In [72]:
m+v1

array([[ 2,  3,  4],
       [ 4,  6,  8],
       [ 6,  9, 12]])

In [73]:
def unit_prefix(x, n=1):
    for i in range(n): x = x.unsqueeze(0)
    return x

def align(x, y, start_dim=2):
    xd, yd = x.dim(), y.dim()
    if xd > yd: y = unit_prefix(y, xd - yd)
    elif yd > xd: x = unit_prefix(x, yd - xd)

    xs, ys = list(x.size()), list(y.size())
    nd = len(ys)
    for i in range(start_dim, nd):
        td = nd-i-1
        if   ys[td]==1: ys[td] = xs[td]
        elif xs[td]==1: xs[td] = ys[td]
    return x.expand(*xs), y.expand(*ys)

In [74]:
def aligned_op(x,y,f): return f(*align(x,y,0))

def add(x, y): return aligned_op(x, y, operator.add)
def sub(x, y): return aligned_op(x, y, operator.sub)
def mul(x, y): return aligned_op(x, y, operator.mul)
def div(x, y): return aligned_op(x, y, operator.truediv)

In [75]:
def dot(x, y):
    assert(1<y.dim()<5)
    x, y = align(x, y)
    
    if y.dim() == 2: return x.mm(y)
    elif y.dim() == 3: return x.bmm(y)
    else:
        xs,ys = x.size(), y.size()
        res = torch.zeros(*(xs[:-1] + (ys[-1],)))
        for i in range(xs[0]): res[i].baddbmm_(x[i], (y[i]))
        return res

In [76]:
def Arr(*sz): return torch.randn(sz)/math.sqrt(sz[0])

In [77]:
m = Arr(3, 2); m2 = Arr(4, 3)
v = Arr(2)
b = Arr(4,3,2); t = Arr(5,4,3,2)

mt,bt,tt = m.transpose(0,1), b.transpose(1,2), t.transpose(2,3)

In [78]:
def check_eq(x,y): assert(torch.equal(x,y))

In [79]:
check_eq(dot(m,mt),m.mm(mt))
check_eq(dot(v,mt), v.unsqueeze(0).mm(mt))
check_eq(dot(b,bt),b.bmm(bt))
check_eq(dot(b,mt),b.bmm(unit_prefix(mt).expand_as(bt)))

In [80]:
exp = t.view(-1,3,2).bmm(tt.contiguous().view(-1,2,3)).view(5,4,3,3)
check_eq(dot(t,tt),exp)

In [81]:
check_eq(add(m,v),m+unit_prefix(v).expand_as(m))
check_eq(add(v,m),m+unit_prefix(v).expand_as(m))
check_eq(add(m,t),t+unit_prefix(m,2).expand_as(t))
check_eq(sub(m,v),m-unit_prefix(v).expand_as(m))
check_eq(mul(m,v),m*unit_prefix(v).expand_as(m))
check_eq(div(m,v),m/unit_prefix(v).expand_as(m))

## Attention Model

In [82]:
def Var(*sz): return Parameter(Arr(*sz)).cuda()

In [175]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2, p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs)
        self.W1 = Var(hidden_size, hidden_size)
        self.W2 = Var(hidden_size, hidden_size)
        self.W3 = Var(emb_size+hidden_size, hidden_size)
        self.b2 = Var(hidden_size)
        self.b3 = Var(hidden_size)
        self.V = Var(hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, inp, hidden, enc_outputs):
        emb_inp = self.emb(inp)
        w1e = dot(enc_outputs, self.W1)
        w2h = add(dot(hidden[-1], self.W2), self.b2).unsqueeze(1)
        u = F.tanh(add(w1e, w2h))
        a = mul(self.V,u).sum(2)
        a = F.softmax(a).unsqueeze(2)
        Xa = mul(a, enc_outputs).sum(1)
        res = dot(torch.cat([emb_inp, Xa.squeeze(1)], 1), self.W3)
        res = add(res, self.b3).unsqueeze(0)
        res, hidden = self.gru(res, hidden)
        res = F.log_softmax(self.out(res.squeeze(0)))
        return res, hidden

### Attention testing

In [84]:
def get_batch(x, y, batch_size=16):
    idxs = np.random.permutation(len(x))[:batch_size]
    return x[idxs], y[idxs]

In [85]:
hidden_size = 128
fra, eng = get_batch(fr_train, en_train, 4)
inp = long_t(fra)
targ = long_t(eng)
emb, emb_size, output_size = create_emb(en_emb_t)
emb.cuda()
inp.size()

torch.Size([4, 30])

In [100]:
W1 = Var(hidden_size, hidden_size)
W2 = Var(hidden_size, hidden_size)
W3 = Var(emb_size+hidden_size, hidden_size)
b2 = Var(1,hidden_size)
b3 = Var(1,hidden_size)
V = Var(1,1,hidden_size)
gru = nn.GRU(hidden_size, hidden_size, num_layers=2).cuda()
out = nn.Linear(hidden_size, output_size).cuda()

In [101]:
encoder = EncoderRNN(fr_emb_t, hidden_size).cuda()
# decoder = AttnDecoderRNN(en_emb_t, hidden_size).cuda()

In [136]:
encoder

<module 'torch.nn.functional' from '/home/cs/anaconda3/lib/python3.6/site-packages/torch/nn/functional.py'>

In [103]:
dec_inputs, enc_outputs, hidden = encode(inp, encoder)
enc_outputs.size(), hidden.size()

(torch.Size([4, 30, 128]), torch.Size([2, 4, 128]))

In [104]:
emb_inp = emb(dec_inputs); emb_inp.size()

torch.Size([4, 100])

In [105]:
w1e = dot(enc_outputs, W1); w1e.size()

torch.Size([4, 30, 128])

In [106]:
w2h = dot(hidden[-1], W2)
w2h = (w2h+b2.expand_as(w2h)).unsqueeze(1); w2h.size()

torch.Size([4, 1, 128])

In [159]:
u = F.tanh(w1e + w2h.expand_as(w1e))
u.size()

torch.Size([4, 30, 128])

In [160]:
a = (V.expand_as(u)*u)
a.size()

torch.Size([4, 30, 128])

In [161]:
a=a.sum(2)

In [162]:
a.size()

torch.Size([4, 30])

In [163]:
a = F.softmax(a).unsqueeze(2);

In [164]:
a.size()

torch.Size([4, 30, 1])

In [165]:
a.sum(1).squeeze(1)

Variable containing:
 1.0000
 1.0000
 1.0000
 1.0000
[torch.cuda.FloatTensor of size 4 (GPU 0)]

In [166]:
Xa = (a.expand_as(enc_outputs) * enc_outputs).sum(1); Xa.size()

torch.Size([4, 128])

In [167]:
res = dot(torch.cat([emb_inp, Xa.squeeze(1)], 1), W3)
res = (res+b3.expand_as(res)).unsqueeze(0); res.size()

torch.Size([1, 4, 128])

In [168]:
res, hidden = gru(res, hidden); res.size(), hidden.size()

(torch.Size([1, 4, 128]), torch.Size([2, 4, 128]))

In [169]:
res = F.log_softmax(out(res.squeeze(0))); res.size()

torch.Size([4, 19549])

## Train

In [170]:
def train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit):
    decoder_input, encoder_outputs, hidden = encode(inp, encoder)
    target_length = targ.size()[1]
    
    enc_opt.zero_grad(); dec_opt.zero_grad()
    loss = 0

    for di in range(target_length):
        decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
        decoder_input = targ[:, di]
        loss += crit(decoder_output, decoder_input)

    loss.backward()
    enc_opt.step(); dec_opt.step()
    return loss.data[0] / target_length

In [171]:
def req_grad_params(o):
    return (p for p in o.parameters() if p.requires_grad)

In [172]:
def trainEpochs(encoder, decoder, n_epochs, print_every=1000, lr=0.01):
    loss_total = 0 # Reset every print_every
    
    enc_opt = optim.RMSprop(req_grad_params(encoder), lr=lr)
    dec_opt = optim.RMSprop(decoder.parameters(), lr=lr)
    crit = nn.NLLLoss().cuda()
    
    for epoch in range(n_epochs):
        fra, eng = get_batch(fr_train, en_train, 64)
        inp = long_t(fra)
        targ = long_t(eng)
        loss = train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit)
        loss_total += loss

        if epoch % print_every == print_every-1:
            print('%d %d%% %.4f' % (epoch, epoch / n_epochs * 100, loss_total / print_every))
            loss_total = 0

In [176]:
hidden_size = 128
encoder = EncoderRNN(fr_emb_t, hidden_size).cuda()
decoder = AttnDecoderRNN(en_emb_t, hidden_size).cuda()

In [177]:
trainEpochs(encoder, decoder, 10000, print_every=500, lr=0.005)

499 4% 2.3886
999 9% 1.8825
1499 14% 1.6757
1999 19% 1.5435
2499 24% 1.4516
2999 29% 1.3766
3499 34% 1.3176
3999 39% 1.2719
4499 44% 1.2305
4999 49% 1.2076
5499 54% 1.1753
5999 59% 1.1456
6499 64% 1.1268
6999 69% 1.1034
7499 74% 1.0838
7999 79% 1.0719
8499 84% 1.0487
8999 89% 1.0319
9499 94% 1.0279
9999 99% 1.0180


## Test

In [178]:
def evaluate(inp):
    decoder_input, encoder_outputs, hidden = encode(inp, encoder)
    target_length = maxlen

    decoded_words = []
    for di in range(target_length):
        decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni==PAD: break
        decoded_words.append(en_vocab[ni])
        decoder_input = long_t([ni])
    
    return decoded_words

In [179]:
def sent2ids(sent):
    ids = [fr_w2id[t] for t in simple_toks(sent)]
    return pad_sequences([ids], maxlen, 'int64', "post", "post")

In [180]:
def fr2en(sent): 
    ids = long_t(sent2ids(sent))
    trans = evaluate(ids)
    return ' '.join(trans)

In [181]:
i=8
print(en_qs[i],fr_qs[i])
fr2en(fr_qs[i])

What is the population of Canada? Quelle est la population du Canada ?


'what is the population of canada ?'

In [182]:
a="Quelle est tu fais"

In [183]:
fr2en(a)

'what is your favorite ?'