In [None]:
!pip install bidict

In [118]:
import pandas as pd
import numpy as np
import re, os
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model, Sequential
import tensorflow as tf
from nltk import word_tokenize
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from bidict import bidict
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import TensorBoard

In [119]:
# parameters
training=False
ctx_vec_len=256
epochs=1
no_of_tests=10
# if running on colab turn this false, and select GPU runtime
colab=False

In [120]:
from IPython.display import display, Markdown
if not colab:
    display(Markdown('''## Architecture For Neural Machine Trans
![Architecture Neural Machine Trans](image/NeuralMachineTrans.png)'''))

## Architecture For Neural Machine Trans
![Architecture Neural Machine Trans](image/NeuralMachineTrans.png)

In [121]:
if not colab:
    # if on local machine 
    root_dir='.'
    
else:
    # if using google colab use this code
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = "/content/drive/My Drive/Colab Notebooks"

In [122]:
data_path = os.path.join(root_dir, "fra.csv")
doc = pd.read_csv(data_path, skiprows = range(1, 130000), nrows=10000)
# shuffle dataset
doc=doc.sample(frac=1)

In [123]:
# replace contracted forms for english words
contracted_dict={"won't" : "will not", "can\'t" : "can not", "n\'t" : " not", "\'re" : " are", "\'s" : " is", "\'d" : " would", "\'ll" : " will", "\'t" : " not", "\'ve" : " have", "\'m" : " am"}

def replace_contracted(text):

    regex = re.compile("|".join(map(re.escape, contracted_dict.keys(  ))))
    return regex.sub(lambda match: contracted_dict[match.group(0)], text)

In [124]:
# apply decontraction and lowercase
doc=doc.apply(np.vectorize(lambda sent : replace_contracted(str(sent).strip().lower())))

In [125]:
# tokenize sentences and add start_ and _end keyword to target sentences
source_sents=doc.Source.apply(lambda sent: word_tokenize(sent))
target_sents=doc.Target.apply(lambda x : 'START_ '+ x + ' _END').apply(lambda sent: word_tokenize(sent))

In [126]:
# building the vocabulary
source_vocab=set().union(*source_sents)
target_vocab=set().union(*target_sents)

In [127]:
# max sentence length for each language in the dataset
max_source_len=max(source_sents.apply(len))
max_target_len=max(target_sents.apply(len))

In [128]:
# numeric identity for each word in vocab
source_wordint_rel=bidict(enumerate(source_vocab, 1))
target_wordint_rel=bidict(enumerate(target_vocab, 1))

In [129]:
# prepare inputs and outputs
encoder_source_arr=[list(map(lambda word : source_wordint_rel.inv[word], sent)) for sent in source_sents]
decoder_source_arr=[list(map(lambda word : target_wordint_rel.inv[word], sent)) for sent in target_sents]
decoder_output_arr=[list(map(lambda word : target_wordint_rel.inv[word], sent[1:])) for sent in target_sents]

In [130]:
# pad the inputs and outputs to max length
padded_encoder_source_arr=pad_sequences(encoder_source_arr, maxlen=max_source_len, padding='post')
padded_decoder_source_arr=pad_sequences(decoder_source_arr, maxlen=max_target_len, padding='post')
padded_decoder_output_arr=pad_sequences(decoder_output_arr, maxlen=max_target_len, padding='post')
onehotted_decoder_output_arr=tf.one_hot(padded_decoder_output_arr, len(target_vocab)+1).numpy()

# Model Preparation

In [131]:
# context-vector length
latent_dim=ctx_vec_len

# this is the source languge consumtion layer
encoder_inputs = Input(shape=(None,), name='encoder_sources')
# embed the 2-d source into 3-d
enc_emb =  Embedding(len(source_vocab)+1, latent_dim, mask_zero = True, name='enc_emb')(encoder_inputs)

# LSTM layer to encode the source sentence into context-vector representation
encoder_lstm = LSTM(latent_dim, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# encoded-states tensor stores the context-vector
encoder_states = [state_h, state_c]

In [132]:
# this is the target languge consumtion layer
decoder_inputs = Input(shape=(None,), name='decoder_sources')
# embed the 2-d source into 3-d
dec_emb_layer = Embedding(len(target_vocab)+1, latent_dim, mask_zero = True, name='dec_emb_layer')
dec_emb = dec_emb_layer(decoder_inputs)

# decoder LSTM, this takes in the context-vector and starting or so-far decoded part of the target sentence
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# final layer that gives a probabilty distribution of the next possible words
decoder_dense = Dense(len(target_vocab)+1, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [133]:
# model building and summary
model = Model([encoder_inputs, decoder_inputs], decoder_outputs, name='Model_Translation')
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "Model_Translation"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_sources (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
decoder_sources (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
enc_emb (Embedding)             (None, None, 256)    1115392     encoder_sources[0][0]            
__________________________________________________________________________________________________
dec_emb_layer (Embedding)       (None, None, 256)    1818624     decoder_sources[0][0]            
__________________________________________________________________________________

# Training

In [134]:
# TensorBoard Callback 
tbCallBack = TensorBoard(log_dir=os.path.join(root_dir, 'Graph'), histogram_freq=0, write_graph=True, write_images=True)

In [135]:
if training:
    # train the model
    history=model.fit([padded_encoder_source_arr, padded_decoder_source_arr], onehotted_decoder_output_arr, epochs=epochs, validation_split=0.2, callbacks=[tbCallBack])
    model.save_weights(os.path.join(root_dir, 'word-seq2seq.hdf5'))
    with plt.style.context('dark_background'):
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')
        plt.show()

# Decoder Model

In [136]:
# Encode the source sequence to get the "Context vectors"
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_state_input)
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

# Decoding Logic

In [137]:
def decode_sequence(source_seq):
    
    # Encode the source as state vectors.
    states_value = encoder_model.predict(source_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of 
    #target sequence with the start character.
    target_seq[0, 0] = target_wordint_rel.inv['START_']
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
    # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word =target_wordint_rel[sampled_token_index]
        decoded_sentence += [sampled_word]
    # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True
    # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
    # Update states
        states_value = [h, c]
    return decoded_sentence

# Prediction

In [147]:
def calc_strdiff(true, pred):
    return sum([1 for char in list(difflib.ndiff(true, pred)) if '+ ' in char or '- ' in char])/(len(true))
    
if not training:
    
    model.load_weights(os.path.join(root_dir, 'word-seq2seq.hdf5'))
    y_truePred = [(' '.join(target_sents[seq_index][1:-1]), ' '.join(decode_sequence(padded_encoder_source_arr[seq_index:seq_index+1])[:-1])) for seq_index, _ in enumerate(padded_encoder_source_arr[:no_of_tests])]
    error=sum([calc_strdiff(true, pred) for true, pred in y_truePred])/no_of_tests
    print(f'Avg error for {no_of_tests} tests was {error}.')
    print(pd.DataFrame(y_truePred, columns=['Expected', 'Predicted']))

Avg error for 10 tests was 1.0308702786361157.
                                          Expected                   Predicted
0  il était doté d'une désagréable voix perçante .   je ne suis pas de de de .
1            il avait peu d'amis et peu d'argent .   je ne suis pas de de de .
2     il avait beaucoup d'argent pour son voyage .  je ne suis pas pas de de .
3     il eut l'effronterie d'ignorer mon conseil .     il a a de de de de de .
4      il a une profonde affection pour son fils .  je ne suis pas pas de de .
5                il a bien plus d'argent que moi .     il a a de de de de de .
6            il a beaucoup plus d'argent que moi .  il a a de de de de de de .
7        il a une parfaite maîtrise de l'anglais .     il a a de de de de de .
8                il a tendance à être pessimiste .  il a a de de de de de de .
9                il a été occupé depuis ce matin .   je ne ne pas pas pas de .
