## Machine Translation using Seq2Seq LSTM

### Import modules

In [45]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd

import random
import re
import string

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

### Reading data

In [46]:
data = pd.read_table('eng-fra.txt', names=['eng', 'fra'])
data

Unnamed: 0,eng,fra
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !
...,...,...
135837,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
135838,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
135839,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
135840,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [47]:
data.iloc[135841].eng

'It may be impossible to get a completely error-free corpus due to the nature of this kind of collaborative effort. However, if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning, we might be able to minimize errors.'

In [48]:
data.iloc[135841].fra

"Il est peut-être impossible d'obtenir un Corpus complètement dénué de fautes, étant donnée la nature de ce type d'entreprise collaborative. Cependant, si nous encourageons les membres à produire des phrases dans leurs propres langues plutôt que d'expérimenter dans les langues qu'ils apprennent, nous pourrions être en mesure de réduire les erreurs."

**As we have so much data, computation time will be high. We will be taking only 20K instances of data from 135K instances.**

In [49]:
data = data.sample(10000)
data

Unnamed: 0,eng,fra
1180,It matters.,C'est important.
41277,Why don't you remember?,Pourquoi ne t'en souviens-tu pas ?
109561,All I can say is that I'd rather not go.,Tout ce que je peux dire est que je préférerai...
57377,You can't be sure of that.,Vous ne pouvez pas en être sûrs.
96368,I wish I could have spoken Spanish.,J'aimerais avoir pu parler espagnol.
...,...,...
2422,We'll fight.,Nous combattrons.
78795,He writes scripts for TV shows.,Il écrit des scénarios pour des séries télé.
62303,What else is on the agenda?,Qu'y a-t-il également à son programme aujourd'...
81847,Towns are larger than villages.,Les villes sont plus grandes que les villages.


In [50]:
data.reset_index(inplace=True)
data

Unnamed: 0,index,eng,fra
0,1180,It matters.,C'est important.
1,41277,Why don't you remember?,Pourquoi ne t'en souviens-tu pas ?
2,109561,All I can say is that I'd rather not go.,Tout ce que je peux dire est que je préférerai...
3,57377,You can't be sure of that.,Vous ne pouvez pas en être sûrs.
4,96368,I wish I could have spoken Spanish.,J'aimerais avoir pu parler espagnol.
...,...,...,...
9995,2422,We'll fight.,Nous combattrons.
9996,78795,He writes scripts for TV shows.,Il écrit des scénarios pour des séries télé.
9997,62303,What else is on the agenda?,Qu'y a-t-il également à son programme aujourd'...
9998,81847,Towns are larger than villages.,Les villes sont plus grandes que les villages.


In [51]:
data.drop('index', axis=1, inplace=True)
data

Unnamed: 0,eng,fra
0,It matters.,C'est important.
1,Why don't you remember?,Pourquoi ne t'en souviens-tu pas ?
2,All I can say is that I'd rather not go.,Tout ce que je peux dire est que je préférerai...
3,You can't be sure of that.,Vous ne pouvez pas en être sûrs.
4,I wish I could have spoken Spanish.,J'aimerais avoir pu parler espagnol.
...,...,...
9995,We'll fight.,Nous combattrons.
9996,He writes scripts for TV shows.,Il écrit des scénarios pour des séries télé.
9997,What else is on the agenda?,Qu'y a-t-il également à son programme aujourd'...
9998,Towns are larger than villages.,Les villes sont plus grandes que les villages.


### Data Pre-Processing

(we will not perform stop words removal and lemmatization and other tasks)

* lowercasing
* remove punctuations
* remove extra spaces
* appending `<SOS>` and `<EOS>` tokens in french data. 

In [52]:
data.eng = data.eng.apply(lambda x: x.lower())
data.fra = data.fra.apply(lambda x: x.lower())

In [53]:
sets_of_punctuations = set(string.punctuation)
sets_of_punctuations

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~'}

In [54]:
# sets of punctuations doesn't contain " ' " symbol. We will remove it separately.

data["eng"] = data["eng"].apply(lambda x: re.sub("'", '', x))
data["fra"] = data["fra"].apply(lambda x: re.sub("'", '', x))

In [55]:
data.eng = data.eng.apply(lambda x: ''.join(char for char in x if char not in sets_of_punctuations))
data.fra = data.fra.apply(lambda x: ''.join(char for char in x if char not in sets_of_punctuations))

In [56]:
## uneven space removal
data["eng"] = data["eng"].apply(lambda x:re.sub("\s+"," ",x))
data["fra"] = data["fra"].apply(lambda x:re.sub("\s+"," ",x))

In [57]:
data

Unnamed: 0,eng,fra
0,it matters,cest important
1,why dont you remember,pourquoi ne ten souvienstu pas
2,all i can say is that id rather not go,tout ce que je peux dire est que je préférerai...
3,you cant be sure of that,vous ne pouvez pas en être sûrs
4,i wish i could have spoken spanish,jaimerais avoir pu parler espagnol
...,...,...
9995,well fight,nous combattrons
9996,he writes scripts for tv shows,il écrit des scénarios pour des séries télé
9997,what else is on the agenda,quy atil également à son programme aujourdhui
9998,towns are larger than villages,les villes sont plus grandes que les villages


In [58]:
data.fra = data.fra.apply(lambda x: "<START> "+x+" <END>")
data

Unnamed: 0,eng,fra
0,it matters,<START> cest important <END>
1,why dont you remember,<START> pourquoi ne ten souvienstu pas <END>
2,all i can say is that id rather not go,<START> tout ce que je peux dire est que je pr...
3,you cant be sure of that,<START> vous ne pouvez pas en être sûrs <END>
4,i wish i could have spoken spanish,<START> jaimerais avoir pu parler espagnol <END>
...,...,...
9995,well fight,<START> nous combattrons <END>
9996,he writes scripts for tv shows,<START> il écrit des scénarios pour des séries...
9997,what else is on the agenda,<START> quy atil également à son programme auj...
9998,towns are larger than villages,<START> les villes sont plus grandes que les v...


### Tokenization and Vocab creation

In [59]:
all_eng_vocab = set()
for sentence in data.eng:
    words = sentence.split()
    for word in words:
        if word not in all_eng_vocab:
            all_eng_vocab.add(word)

In [60]:
all_fra_vocab = set()
for sentence in data.fra:
    words = sentence.split()
    for word in words:
        if word not in all_fra_vocab:
            all_fra_vocab.add(word)

In [61]:
## find max length of English (Source/Input) sentences as well as French (Target) sentences

sequence_length = []
for sents in data.eng:
    sequence_length.append(len(sents.split(' ')))
max_length_eng = np.max(sequence_length)
print(max_length_eng)

27


In [62]:
sequence_length = []
for sents in data.fra:
    sequence_length.append(len(sents.split(' ')))
max_length_fra = np.max(sequence_length)
print(max_length_fra)

30


In [63]:
## Here we can see max length of sentence in input is 28 and for target is 35. 
# Note that it can change when notebook is rerun as we have sampled 20k instances.

In [64]:
input_words = sorted(all_eng_vocab)
target_words = sorted(all_fra_vocab)

In [65]:
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)

print(num_encoder_tokens, num_decoder_tokens)

4824 7770


In [66]:
# For zero padding we add one extra token
num_decoder_tokens += 1
num_encoder_tokens += 1
num_encoder_tokens, num_decoder_tokens

(4825, 7771)

In [67]:
## Create word to index & index to word dictionaries

# word to idx (stoi)
input_token_index = dict([(word, i + 1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i + 1) for i, word in enumerate(target_words)])

# idx to words (itos)
input_index_token = dict((i, word) for word, i in input_token_index.items())
target_index_token = dict((i, word) for word, i in target_token_index.items())

### Data Split

In [68]:
X = data.eng
y = data.fra

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8500,), (1500,), (8500,), (1500,))

In [86]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    '''Function to generate a batch of data '''
    for j in range(0, len(X), batch_size):
        encoder_input_data = np.zeros((max_length_eng, batch_size),dtype='float32')
        decoder_input_data = np.zeros((max_length_fra, batch_size),dtype='float32')
        decoder_target_data = np.zeros((max_length_fra, batch_size ,num_decoder_tokens),dtype='float32')
        for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
            for t, word in enumerate(input_text.split()):
                encoder_input_data[t, i] = input_token_index[word] # encoder input seq
            for t, word in enumerate(target_text.split()):
                if t<len(target_text.split())-1:
                    decoder_input_data[t, i] = target_token_index[word] # decoder input seq
                if t>0:
                    # decoder target sequence (one hot encoded)
                    # does not include the START_ token
                    # Offset by one timestep
                    decoder_target_data[t-1, i , target_token_index[word]] = 1.
        yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [75]:
# Input to the Encoder
encoder_input_data = np.zeros((len(data.eng), max_length_eng),dtype='float16')

# output from the encoder or input to the decoder
decoder_input_data = np.zeros((len(data.fra), max_length_fra),dtype='float16')

# output by the decoder
decoder_target_data = np.zeros((len(data.fra), max_length_fra, num_decoder_tokens),dtype='float16')

In [76]:
for i, (input_text, target_text) in enumerate(zip(data.eng, data.fra)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder target data is ahead of decoder input by one timestep
            decoder_target_data[i, t - 1, target_token_index[word]] = 1.

### Model Architecture: Encoder-Decoder

In [77]:
latent_dim = 50

In [78]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size

        self.embedding_size = embedding_size

        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

        self.LSTM = nn.LSTM(self.embedding_size, self.hidden_size)

    def forward(self, x):
        embedding = self.embedding(x)
        outputs, (hidden_state, cell_state) = self.LSTM(embedding)
        encoder_states = [hidden_state,cell_state]

        return encoder_states

In [79]:
encoder = Encoder(num_encoder_tokens, latent_dim, latent_dim)
print(encoder)

Encoder(
  (embedding): Embedding(4825, 50)
  (LSTM): LSTM(50, 50)
)


In [80]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Decoder, self).__init__()

        # Size of the one hot vectors that will be the input to the decoder
        self.input_size = input_size

        # Output size of the word embedding NN
        self.embedding_size = embedding_size

        # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
        self.hidden_size = hidden_size

        # Size of the one hot vectors that will be the output of the decoder
        self.output_size = output_size

        self.embedding = nn.Embedding(self.input_size, self.embedding_size)
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size)
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, enc_states):
        x = x.unsqueeze(0)
        embedding = self.embedding(x)

        # (passing encoder's hs, cs - context vectors)
        outputs, (hidden_state, cell_state) = self.LSTM(embedding, enc_states)

        predictions = self.fc(outputs)

        predictions = predictions.squeeze(0)

        decoder_states = (hidden_state, cell_state)

        return predictions, decoder_states

In [81]:
decoder = Decoder(num_decoder_tokens, latent_dim, latent_dim, num_decoder_tokens)
print(decoder)

Decoder(
  (embedding): Embedding(7771, 50)
  (LSTM): LSTM(50, 50)
  (fc): Linear(in_features=50, out_features=7771, bias=True)
)


In [82]:
class Seq2Seq(nn.Module):
    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        super(Seq2Seq, self).__init__()
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, tfr=0.5):
        batch_size = source.shape[1]

        target_len = target.shape[0]
        target_vocab_size = num_decoder_tokens

        outputs = torch.zeros(target_len, batch_size, target_vocab_size)

        hidden_state, cell_state = self.Encoder_LSTM(source)

        x = target[0]

        for i in range(1, target_len):
            output, ( hidden_state, cell_state ) = self.Decoder_LSTM(x, (hidden_state, cell_state))
            outputs[i] = output
            best_guess = output.argmax(1) # 1st dimension is word embedding, 0th dimension is batchsize
            x = target[i] if random.random() < tfr else best_guess # Either pass the next word correctly from the dataset or use the earlier predicted word

        return outputs

In [83]:
# Hyperparameters

learning_rate = 0.001
step = 0

model = Seq2Seq(encoder, decoder)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

criterion = nn.CrossEntropyLoss()

In [84]:
# Some model hyperparameters
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 256
num_epochs = 10

In [87]:
# Training the model
epoch_loss = 0.0
best_loss = 999999
losses = []
best_epoch = -1
ts1  = []

for epoch in range(num_epochs):
    epoch_loss_list = []
    print("Epoch - {} / {}".format(epoch+1, num_epochs))


    model.train(True)
    for batch_idx, ( input_data, target_data ) in enumerate(generate_batch(batch_size=batch_size)):
        input_data_enc = torch.tensor(input_data[0]).long()
        input_data_dec = torch.tensor(input_data[1]).long()
        target = torch.tensor(target_data.argmax(2)).long()

        # Pass the input and target for model's forward method
        output = model(input_data_enc, target)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Clear the accumulating gradients
        optimizer.zero_grad()

        # Calculate the loss value for every epoch
        loss = criterion(output, target)

        # Calculate the gradients for weights & biases using back-propagation
        loss.backward()

        # Update the weights values using the gradients we calculated using bp
        optimizer.step()
        step += 1
        epoch_loss += loss.item()

        epoch_loss_list.append(loss.item())

        if epoch_loss < best_loss:
            best_loss = epoch_loss
            best_epoch = epoch
        if ((epoch - best_epoch) >= 10):
            print("no improvement in 10 epochs, break")
            break
        print("Iterations / loss -  {} / {}".format(batch_idx,loss.item()))
        print()
    losses.append(np.mean(epoch_loss_list))

torch.save({
          'model_state_dict': model.state_dict(),
          'loss': losses
          },"lstm_seq2seq")

Epoch - 1 / 10
Iterations / loss -  0 / 9.00987434387207

Iterations / loss -  1 / 8.932024002075195

Iterations / loss -  2 / 8.873627662658691

Iterations / loss -  3 / 8.81226921081543

Iterations / loss -  4 / 8.771747589111328

Iterations / loss -  5 / 8.741626739501953

Iterations / loss -  6 / 8.514213562011719

Iterations / loss -  7 / 8.362236976623535

Iterations / loss -  8 / 8.293224334716797

Iterations / loss -  9 / 8.1569185256958

Iterations / loss -  10 / 8.068642616271973

Iterations / loss -  11 / 7.958271026611328

Iterations / loss -  12 / 7.97873592376709

Iterations / loss -  13 / 7.722585678100586

Iterations / loss -  14 / 7.601340293884277

Iterations / loss -  15 / 7.468379020690918

Iterations / loss -  16 / 7.36539363861084

Iterations / loss -  17 / 7.312774181365967

Iterations / loss -  18 / 7.182806491851807

Iterations / loss -  19 / 6.935685157775879

Iterations / loss -  20 / 6.814390659332275

Iterations / loss -  21 / 6.725342273712158

Iterations 

Iterations / loss -  10 / 1.5011496543884277

Iterations / loss -  11 / 1.5950796604156494

Iterations / loss -  12 / 1.5128573179244995

Iterations / loss -  13 / 1.5710408687591553

Iterations / loss -  14 / 1.4542452096939087

Iterations / loss -  15 / 1.5206581354141235

Iterations / loss -  16 / 1.430726170539856

Iterations / loss -  17 / 1.5231949090957642

Iterations / loss -  18 / 1.582231044769287

Iterations / loss -  19 / 1.451737880706787

Iterations / loss -  20 / 1.5175186395645142

Iterations / loss -  21 / 1.5095438957214355

Iterations / loss -  22 / 1.521159291267395

Iterations / loss -  23 / 1.5686663389205933

Iterations / loss -  24 / 1.4923734664916992

Iterations / loss -  25 / 1.5365716218948364

Iterations / loss -  26 / 1.5591704845428467

Iterations / loss -  27 / 1.594876766204834

Iterations / loss -  28 / 1.5253019332885742

Iterations / loss -  29 / 1.5667297840118408

Iterations / loss -  30 / 1.5632286071777344

Iterations / loss -  31 / 1.56207776069

In [88]:
loss = losses[-1]
loss

1.433918041341445

In [107]:
model = Seq2Seq(Encoder(num_encoder_tokens, latent_dim, latent_dim), Decoder(num_decoder_tokens, latent_dim, latent_dim, num_decoder_tokens))


checkpoint = torch.load("lstm_seq2seq")
model.load_state_dict(checkpoint['model_state_dict'])

def decode_sequence(sentence, max_length=50):
    model.eval()
    # lower, removing punctuations,
    tokens =  (''.join(char for char in re.sub(" +", " ", re.sub("'", '', sentence).lower()) if char not in sets_of_punctuations)).split()

    text_to_indices = [ input_token_index[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [target_token_index["<START>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]])

        with torch.no_grad():
            output, ( hidden, cell ) = model.Decoder_LSTM(previous_word, (hidden, cell))
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if best_guess == "<END>":
            break
#     print("Outputs", outputs)

    translated_sentence = [target_index_token.get(idx,'<PAD>') for idx in outputs]
    return translated_sentence[1:]

In [110]:
# target_index_token[40]

'<START>'

In [108]:
k=0
decoded_sentence = decode_sequence(X_train[k:k+1].values[0])
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual French Translation:', y_train[k:k+1].values[0])
print('Predicted French Translation:', decoded_sentence)

Outputs [40, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input English sentence: what would we do instead
Actual French Translation: <START> que ferionsnous à la place  <END>
Predicted French Translation: ['<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [94]:
k+=1
decoded_sentence = decode_sequence(X_train[k:k+1].values[0])
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Chinese Translation:', y_train[k:k+1].values[0])
print('Predicted Chinese Translation:', decoded_sentence)

Input English sentence: i cried a lot
Actual Chinese Translation: <START> jai beaucoup pleuré <END>
Predicted Chinese Translation: ['<END>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
