# Project - Neural Machine Translation

1. Build a Neural Machine Translation model.
2. Evaluate your model using BLEU score. <br>
Dataset: http://www.manythings.org/anki/fra-eng.zip


## Load Data

In [2]:
# !rm *.zip* -rf
# !wget  http://www.manythings.org/anki/fra-eng.zip

In [3]:
# !unzip -o fra-eng.zip
# !ls

## Data Cleaning

In [4]:
import pandas as pd
import numpy as np
import string
import re
from unicodedata import normalize

In [5]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [6]:
# split a loaded document into sentences, then shuffle
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t')[:2] for line in  lines]
    return pairs

In [7]:
# max number of pairs to consider
n_sentence =50000

# clean a list of lines
def clean_pairs(lines):
    i_sentence=0
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        if i_sentence>=n_sentence:
            break
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')

            # tokenize on white space
            line = line.split()

            # convert to lowercase
            line = [word.lower() for word in line]
            
            # remove punctuation from each token
            line = [word.translate(table) for word in line]
            
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            
            # remove tokens with numbers in them
            line = [word for word in line if word.isalpha()]
            
            # store as string
            clean_pair.append(' '.join(line))
        i_sentence+=1
        cleaned.append(clean_pair)
    return np.array(cleaned)
cleaned_doc_sentences=clean_pairs(to_pairs(load_doc('fra.txt')))

In [8]:
print(cleaned_doc_sentences.shape)
print(cleaned_doc_sentences[3])

(50000, 2)
['go' 'bouge']


## Splitting Data

In [9]:
np.random.shuffle(cleaned_doc_sentences)

# split into train/test
n_train=int(0.8*n_sentence)
train, test = cleaned_doc_sentences[:n_train], cleaned_doc_sentences[n_train:]
print(train.shape,test.shape)

(40000, 2) (10000, 2)


## Tokenizing and Transforming Input

### *Tokenize*

In [10]:
%load_ext tensorboard

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer



In [12]:
# fit and create tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    max_l=0
    l=''
    for line in lines:
        if(max_l<len(line.split())):
            max_l=len(line.split())
            l=line.split()
    print("max line:",l)
    return max_l

In [13]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(cleaned_doc_sentences[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(cleaned_doc_sentences[:, 0])
print('English Vocabulary Size:',eng_vocab_size)
print('English Max Length:',eng_length)

max line: ['is', 'it', 'a', 'yes', 'or', 'a', 'no']
English Vocabulary Size: 5930
English Max Length: 7


In [14]:
# prepare french tokenizer
fra_tokenizer = create_tokenizer(cleaned_doc_sentences[:, 1])
fra_vocab_size = len(fra_tokenizer.word_index) + 1
fra_length = max_length(cleaned_doc_sentences[:, 1])
print('French Vocabulary Size:',fra_vocab_size)
print('French Max Length:',fra_length)

max line: ['il', 'ny', 'a', 'pas', 'de', 'quoi', 'se', 'faire', 'des', 'nuds', 'au', 'cerveau']
French Vocabulary Size: 11847
French Max Length: 12


### Encoding

In [15]:
from keras.utils import to_categorical, pad_sequences

In [16]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [17]:
# prepare training data
trainX = encode_sequences(fra_tokenizer, fra_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(fra_tokenizer, fra_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

## Define and Train model

In [18]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import EarlyStopping,ModelCheckpoint

In [19]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

In [20]:
# define model
model = define_model(fra_vocab_size, eng_vocab_size, fra_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 256)           3032832   
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVecto  (None, 7, 256)            0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 7, 256)            525312    
                                                                 
 time_distributed (TimeDist  (None, 7, 5930)           1524010   
 ributed)                                                        
                                                                 
Total params: 5607466 (21.39 MB)
Trainable params: 56074

### Train

In [21]:
EPOCHS=15
BATCH_SIZE=16
MIN_DELTA=0.01
PATIENCE=3

In [22]:
es = EarlyStopping(monitor='val_loss',patience=PATIENCE, min_delta=MIN_DELTA)
checkpoint = ModelCheckpoint('chkpoint', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [23]:
model.fit(trainX, trainY,
          epochs=EPOCHS, 
          batch_size=BATCH_SIZE, 
          validation_data=(testX, testY), 
#           callbacks=[es]
          callbacks=[checkpoint]          
         )

Epoch 1/15
Epoch 1: val_loss improved from inf to 2.52995, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 2/15
Epoch 2: val_loss improved from 2.52995 to 2.07580, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 3/15
Epoch 3: val_loss improved from 2.07580 to 1.81356, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 4/15
Epoch 4: val_loss improved from 1.81356 to 1.62766, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 5/15
Epoch 5: val_loss improved from 1.62766 to 1.50120, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 6/15
Epoch 6: val_loss improved from 1.50120 to 1.41297, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 7/15
Epoch 7: val_loss improved from 1.41297 to 1.35751, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 8/15
Epoch 8: val_loss improved from 1.35751 to 1.33338, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 9/15
Epoch 9: val_loss improved from 1.33338 to 1.32046, saving model to chkpoint
INFO:tensorflow:Assets written to: chkpoint\assets


INFO:tensorflow:Assets written to: chkpoint\assets


Epoch 10/15
Epoch 10: val_loss did not improve from 1.32046
Epoch 11/15
Epoch 11: val_loss did not improve from 1.32046
Epoch 12/15
Epoch 12: val_loss did not improve from 1.32046
Epoch 13/15
Epoch 13: val_loss did not improve from 1.32046
Epoch 14/15
Epoch 14: val_loss did not improve from 1.32046
Epoch 15/15
Epoch 15: val_loss did not improve from 1.32046


<keras.src.callbacks.History at 0x205c414e670>

## Evaluate model

In [24]:
from numpy import argmax
from nltk.translate.bleu_score import corpus_bleu
from keras.models import load_model

In [25]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [26]:
# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
    # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 2 or i>len(raw_dataset)-3:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score with different n-grams
    print('BLEU: %f' % corpus_bleu(actual, predicted))
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    print("")

In [27]:
model = load_model('chkpoint')

import time
start = time.time()

# test on some training sequences
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

end = time.time()
print("time taken to evaluate:",end-start)

train
src=[me comprendstu], target=[do you understand me], predicted=[do you understand me]
src=[tu es toujours vivante], target=[youre still alive], predicted=[youre still alive]
src=[restez baisse], target=[stay down], predicted=[stay down]
src=[ne moubliez pas], target=[dont forget me], predicted=[dont say me]
BLEU: 0.547125
BLEU-1: 0.829402
BLEU-2: 0.759050
BLEU-3: 0.706280
BLEU-4: 0.547125

test
src=[ca depend de toi], target=[its up to you], predicted=[it depends on you]
src=[je regrette de tavoir embrasse], target=[i regret kissing you], predicted=[i regret kissing you you]
src=[ca arrive], target=[its getting there], predicted=[it happens happen]
src=[mon sac fut derobe], target=[my bag was stolen], predicted=[my wife was taken]
BLEU: 0.287312
BLEU-1: 0.608362
BLEU-2: 0.490646
BLEU-3: 0.430707
BLEU-4: 0.287312

time taken to evaluate: 2156.7580366134644
