**<h1 align='center'>Neural Machine Translation</h1>** <hr/>

In [None]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
from IPython.display import Markdown

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and newer
# releases no longer accept the old name, so use whichever one is installed.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")
# Show every column and the full cell text when a data frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
file='../input/englishfrench-fornmt/english-french.csv'
df = pd.read_csv(file)

In [None]:
display(Markdown(f'- **Total number of rows** : {df.shape[0]}'))
display(Markdown(f'- **Some random data**'))
display(df.sample(n=9))

**<h2 align="center">Data cleaning and Preprocessing</h2>**<hr>

### **Cleaning Data**

In [None]:
import re, string, json5

In [None]:
file = '../input/englishfrench-fornmt/eng-contraction.JSON'
with open(file,'r') as f:
    contractions = dict(json5.load(f))

In [None]:
def replace_contractions_by_words(text):
  """Expand each whitespace-separated token of `text` that is a key of `contractions`.

  Tokens are compared as-is (no case folding or punctuation stripping here) and
  the result is re-joined with single spaces.
  """
  expanded = []
  for token in text.split():
    if token in contractions:
      expanded.append(contractions[token])
    else:
      expanded.append(token)
  return ' '.join(expanded)

def remove_punctuations(text):
  """Replace punctuation with spaces, keeping apostrophes.

  A single word wrapped in (), [] or {} is first unwrapped, so "(word)" becomes
  "word" instead of " word ". Then every character of `string.punctuation`
  except the apostrophe, plus "»", "«" and "…", is replaced by one space.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string. Runs of spaces are NOT collapsed here.
  """
  # Raw strings: `\(`, `\w` and `\g` are regex syntax, not valid str escapes.
  text = re.sub(r'\((\w+)\)', r'\g<1>', text)
  text = re.sub(r'\[(\w+)\]', r'\g<1>', text)
  text = re.sub(r'\{(\w+)\}', r'\g<1>', text)

  # The apostrophe is kept on purpose: the original author notes that leaving
  # some punctuation in place made the model more accurate.
  unwanted = string.punctuation.replace("'", "") + "»«…"
  return text.translate({ord(char): " " for char in unwanted})

def cleaning(text, lang=None):
  """Normalise one sentence before tokenisation.

  Steps, in order: lower-case, expand contractions (only when `lang == 'en'`),
  replace punctuation with spaces, collapse whitespace, strip both ends.

  Args:
    text: the raw sentence.
    lang: pass 'en' to expand English contractions; any other value skips
      that step.

  Returns:
    The cleaned sentence.
  """
  text = text.lower()
  if lang == 'en':
    text = replace_contractions_by_words(text)
  text = remove_punctuations(text)
  # Collapse every run of whitespace into one space.
  # Raw string: `\s` is regex syntax, not a valid str escape.
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

In [None]:
# Remove non necessary characters
df.english = df.english.apply(lambda text:cleaning(text, lang='en'))
df.french = df.french.apply(cleaning)
display(df.sample(n=5))

### **Data preprocessing**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
def train_test_data(dataframe:pd.DataFrame, sample_size=20_000, test_size=0.35, random_state=None):
  """Sample `sample_size` rows and split them into a train and a test frame.

  Args:
    dataframe: the data to split.
    sample_size: number of rows drawn with `DataFrame.sample` before splitting.
    test_size: forwarded to `train_test_split`.
    random_state: optional seed used for both the sampling and the split, which
      makes the result reproducible. When None (the default) two random seeds
      are drawn from NumPy's global generator, as before.

  Returns:
    `(train_set, test_set)`, each with a fresh 0..n-1 index.
  """
  if random_state is None:
    sample_seed = np.random.randint(100)
    split_seed = np.random.randint(500)
  else:
    sample_seed = split_seed = random_state

  dataframe = dataframe.sample(n=sample_size, random_state=sample_seed)
  train_set, test_set = train_test_split(
    dataframe, test_size=test_size, random_state=split_seed,
  )
  train_set = train_set.reset_index(drop=True)
  test_set = test_set.reset_index(drop=True)
  return train_set, test_set

def tokenize(sentences):
  """Return a Keras `Tokenizer` fitted on `sentences`.

  No character filtering is applied (`filters=''`) and 'UNK' is registered as
  the out-of-vocabulary token.
  """
  fitted = Tokenizer(filters='', oov_token='UNK')
  fitted.fit_on_texts(sentences)
  return fitted

def get_maxlen(sentences):
  """Return the largest number of single-space-separated pieces in any sentence."""
  piece_counts = [len(sentence.split(' ')) for sentence in sentences.to_numpy()]
  return max(piece_counts)

def get_vocab_size(tokenizer):
  """Return the number of entries in `tokenizer.word_index` plus one.

  The extra slot is presumably for index 0, used for padding — TODO confirm.
  """
  known_words = tokenizer.word_index
  return 1 + len(known_words)

In [None]:
# Translate from French (source) to English (target)
source, target = 'french', 'english'
# Marker words wrapped around every target sentence
# NOTE(review): 'debut' and 'fin' are also ordinary English words; if either
# occurs inside a target sentence it is indistinguishable from the marker.
begin, end = 'debut', 'fin' 

# Prepare data for teacher forcing: generic column names + begin/end markers
# NOTE(review): this cell is not idempotent; re-running it wraps the target
# sentences in the markers a second time.
df = df.rename(columns={source:'source', target: 'target'})
df = df[['source', 'target']]
df.target = (df.target.apply(lambda x: f'{begin} {x} {end}'))

In [None]:
# Build one word dictionary (tokenizer) per language.
# The tokenizers are fitted on the full data frame, i.e. before it is split
# into train/validation/test sets in the next section.
src_tokenizer = tokenize(df.source)
tar_tokenizer = tokenize(df.target)

# Get the size of the vocabularies
src_vocabsize =  get_vocab_size(src_tokenizer) 
tar_vocabsize =  get_vocab_size(tar_tokenizer) 

# Get the length (in words) of the longest sentence in each language;
# the target length includes the begin/end markers
src_seqlen = get_maxlen(df.source)
tar_seqlen = get_maxlen(df.target)

#### **Dataset splitting**

In [None]:
# Split the dataset into three parts: train, validation and test sets.
# First 8.1% of all rows are held out for testing, then 20% of the remaining
# rows are held out for validation. Each call reshuffles with random seeds.
train_set, test_set = train_test_data(df, sample_size=df.shape[0], test_size=0.081)
train_set, val_set = train_test_data(train_set, sample_size=train_set.shape[0], test_size=0.2)

**<h2 align="center">Data visualisation</h2>**<hr>

In [None]:
def word_count (txt):
  """Count the pieces obtained by splitting `txt` on single spaces.

  An empty string counts as 1 and consecutive spaces produce empty pieces that
  are counted too.
  """
  pieces = txt.split(' ')
  return len(pieces)

def add_stats_data(dataframe):
  """Add `<language>_word_count` columns for the target and source sentences.

  The columns are written onto `dataframe` itself (in place); the same object
  is returned for convenience.
  """
  for column, language in (('target', target), ('source', source)):
    dataframe[f'{language}_word_count'] = dataframe[column].apply(word_count)
  return dataframe

def get_frequency(dataframe, lang, col):
  """Return, for each word count 1..31, how many non-null `lang` values have it.

  Entry `k` of the returned list is the number of rows whose `col` equals
  `k + 1`. Rows with a count above 31 are not represented.
  """
  frequency = []
  for n_words in range(1, 32):
    matching = dataframe.loc[dataframe[col] == n_words, lang]
    frequency.append(matching.count())
  return frequency

def plot_bar_line(rows, title, color="blue"):
  """Draw `rows` as bars with a line on top, on the current axes.

  `rows` must hold 31 values, one per x position 1..31.
  """
  positions = range(1, 32)
  plt.bar(positions, rows, color=color)
  plt.plot(positions, rows, color=color)
  plt.xticks(positions)

  label_style = dict(fontsize='12', fontweight='bold')
  plt.xlabel("Number of words", **label_style)
  plt.ylabel("Number of word's frequencies", **label_style)
  plt.title(title, fontsize='14', fontweight='bold')

def visualize_frequency(dataframe, figsize=(18,5)):
  """Show two side-by-side charts of sentence lengths: source left, target right.

  `dataframe` must already carry the `<language>_word_count` columns.
  """
  target_counts = get_frequency(dataframe, 'target', f'{target}_word_count')
  source_counts = get_frequency(dataframe, 'source', f'{source}_word_count')

  fig = plt.figure(figsize=figsize)
  panels = (
    (121, source_counts, f"{source} (source) words distribution"),
    (122, target_counts, f"{target} (target) words distribution"),
  )
  for position, counts, title in panels:
    plt.subplot(position)
    plot_bar_line(counts, title)
  fig.suptitle('Words count frequencies', fontsize=23, fontweight='bold')
  plt.show()

### **Stats**

In [None]:
#@title **Data frame sizes**
# One-row table with the number of sentence pairs in each split,
# labelled "<source>/<target>".
display(pd.DataFrame(
  {
    "Train_size":[train_set.shape[0]],
    "Validation_size":[val_set.shape[0]],
    "Test_size":[test_set.shape[0]],
  }
  ,index=[f'{source}/{target}']
))

In [None]:
#@title
# One row per language, ordered (source, target) like the index below, so the
# role flags are fixed by position and stay correct whichever language pair
# is configured.
display(pd.DataFrame(
  {
   "Is_source_language?":[True, False],
   "Is_target_language?":[False, True],
   "Vocabulary_size":[src_vocabsize, tar_vocabsize],
   "Sequence_length":[src_seqlen, tar_seqlen],
  },
  index=[source, target]
))

### **Plots**

In [None]:
#@title **Data frame stats**
# Three bar charts side by side: vocabulary sizes, longest sentence lengths
# and the number of rows in each split.
plt.style.use('ggplot')
fig = plt.figure(figsize=(20,7))
# The lists below are parallel: entry i describes subplot i.
labels = [
  [f'Source ({source})', f'Target ({target})'],
  [f'Source ({source})', f'Target ({target})'],
  ['Train set', 'Validation set', 'Test set']
]
data = [
  [src_vocabsize, tar_vocabsize],
  [src_seqlen, tar_seqlen],
  [train_set.shape[0], val_set.shape[0], test_set.shape[0]]
]
colors = [
  ['darkblue', '#02455f'],
  ['darkblue', '#02455f'],
  ['darkblue', '#02455f', '#02655f']
]
# Marker size of the triangle drawn on top of every bar
sizes = [[100, 100], [100, 100], [100, 100, 100]]
# Vertical gap (in data units) between a bar's top and its value label;
# hand-tuned per subplot
offset = [900, 1, 2000]
titles = ['Vocabulary size', 'Sequence lenght', 'Dataframe sizes']

for i in range(3):
  ax = plt.subplot(1, 3, i+1)
  ax.bar(labels[i], data[i], color=colors[i])
  ax.scatter(labels[i], data[i], s=sizes[i], marker='^', c='black')
  # Write each value just above its bar
  for j, txt in enumerate(data[i]):
    ax.text(labels[i][j], data[i][j]+offset[i], txt, fontsize=12)
  # Last subplot: move the y ticks to the right-hand side
  if i == 2: ax.yaxis.tick_right()
  # y=-0.17 places the subplot title below the axes
  plt.title(titles[i], fontsize='14', fontweight='bold',x=0.5, y=-0.17)
  plt.xticks(fontsize=12, fontweight="bold")
  plt.yticks(fontsize=13 )
  plt.ylabel("Size", fontsize=12, fontweight="bold")

fig.suptitle('Dataframe Stats', fontsize=23, fontweight='bold')
plt.show()

# Add the word-count columns for the length charts, then drop them again so
# `df` keeps only the two text columns.
df = add_stats_data(df)
visualize_frequency(df, figsize=(20,7))
df = df.loc[:, ['source', 'target']]

**<h1 align="center">Deep Learning Model</h1>**<hr>

## **Batch of data Generator**

Since the whole dataset (`+150k` rows and a large vocabulary) has to be fed to the model, we do not preprocess all of it at once. Instead, the data is passed to the model batch by batch. For that we use a [Python generator](https://wiki.python.org/moin/Generators 'Go to the documentation of Python generators').

In [None]:
from tensorflow.keras.utils import to_categorical

# Create a generator to avoid machine to crash while training 
def get_batch_of_data(encoder_in_data, decoder_in_data, batch_size): 
  """Endlessly yield `(inputs, outputs)` batches for teacher-forced training.

  Each batch takes `batch_size` consecutive sentences (the last batch of a
  pass may be shorter), tokenises them and right-pads them to
  `src_seqlen` / `tar_seqlen`.

  Yields:
    `([encoder_ids, decoder_ids], decoder_targets)` where `decoder_ids` is the
    padded target without its last position and `decoder_targets` is the
    one-hot encoded padded target without its first position.
  """
  total_rows = encoder_in_data.shape[0]
  while True:  # never exhausted: Keras `fit` keeps pulling batches across epochs
    for start in range(0, total_rows, batch_size):
      stop = start + batch_size

      # Source side: words -> ids, right-padded to `src_seqlen`
      src_ids = src_tokenizer.texts_to_sequences(encoder_in_data[start:stop])
      src_ids = pad_sequences(src_ids, padding='post', maxlen=src_seqlen)

      # Target side: words -> ids, right-padded to `tar_seqlen`
      tar_ids = tar_tokenizer.texts_to_sequences(decoder_in_data[start:stop])
      tar_ids = pad_sequences(tar_ids, padding='post', maxlen=tar_seqlen)

      # The dense output layer classifies over the whole target vocabulary,
      # so the expected outputs are one-hot encoded.
      one_hot = to_categorical(tar_ids, num_classes=tar_vocabsize)

      # Teacher forcing shift: the decoder reads every position but the last
      # and must predict every position but the first (the begin marker).
      yield [src_ids, tar_ids[:, :-1]], one_hot[:, 1:, :]

## **RNN Model using GRU, Embedding and Dense layer in both encoder and decoder**

In [None]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

### **Model definition**

In [None]:
def create_train_model(
  enc_seqlen, dec_seqlen, enc_vocabsize, dec_vocabsize, hsize=512, 
  embsize=512, encoder_dropout=0.0, decoder_dropout=0.0, 
):
  """Build the GRU encoder-decoder used for training (not compiled).

  Args:
    enc_seqlen: length of the padded encoder input.
    dec_seqlen: length of the padded target; the decoder input is one shorter.
    enc_vocabsize / dec_vocabsize: input dimension of each embedding; the
      decoder value is also the size of the softmax output.
    hsize: number of GRU units in both the encoder and the decoder.
    embsize: embedding width in both the encoder and the decoder.
    encoder_dropout / decoder_dropout: `dropout` of the respective GRU.

  Returns:
    A Keras `Model` mapping `[encoder_input, decoder_input]` to per-position
    word probabilities. Layers are named so they can be fetched with
    `get_layer` later.
  """
  # ----- Encoder: embedding -> GRU, only the final state is kept -----
  encoder_inputs = Input(shape=(enc_seqlen,), name="encoder_input")
  encoder_embedding = Embedding(enc_vocabsize, embsize, name="encoder_embedding")
  encoder_gru = GRU(
      hsize, return_state=True, dropout=encoder_dropout, name="encoder_gru"
  )
  _, encoder_state = encoder_gru(encoder_embedding(encoder_inputs))

  # ----- Decoder: embedding -> GRU started from the encoder state -> softmax -----
  decoder_steps = dec_seqlen - 1
  decoder_inputs = Input(shape=(decoder_steps,), name="decoder_input")
  decoder_embedding = Embedding(
      dec_vocabsize, embsize, input_length=decoder_steps, name="decoder_embedding"
  )
  decoder_gru = GRU(
      hsize, return_state=True, return_sequences=True,
      dropout=decoder_dropout, name="decoder_gru"
  )
  decoder_sequence, _ = decoder_gru(
      decoder_embedding(decoder_inputs), initial_state=encoder_state
  )
  word_probabilities = Dense(
      dec_vocabsize, activation="softmax", name="dense_layer"
  )(decoder_sequence)

  return Model(
      inputs=[encoder_inputs, decoder_inputs],
      outputs=word_probabilities,
      name="encoder_decoder_model",
  )

In [None]:
def make_train(model, train_data, val_data, batch_size, epochs, callbacks=None):
  """Fit `model` on batches generated from `train_data`, validating on `val_data`.

  Args:
    model: a compiled Keras model.
    train_data / val_data: data frames with `source` and `target` columns.
    batch_size: number of sentence pairs per batch, for both generators.
    epochs: maximum number of epochs.
    callbacks: optional list of Keras callbacks.

  Returns:
    The `History` object returned by `model.fit`.
  """
  train_generator = get_batch_of_data(
    train_data.source, train_data.target, batch_size
  )
  val_generator = get_batch_of_data(
    val_data.source, val_data.target, batch_size
  )
  # At least one step, even when the frame is smaller than one batch.
  train_steps = max(1, train_data.shape[0] // batch_size)
  # Size the validation run from the validation set itself. Reusing the
  # training step count made the endless generator loop over the validation
  # data several times per epoch. Ceiling division matches the number of
  # batches the generator yields per pass, so each run covers the set once.
  val_steps = max(1, -(-val_data.shape[0] // batch_size))

  history = model.fit(
    x=train_generator,
    validation_data=val_generator,
    epochs=epochs,
    steps_per_epoch=train_steps,
    validation_steps=val_steps,
    callbacks=callbacks,
  )
  return history

### **Some paths**

In [None]:
import os
model_path = 'model/nmt_model'
model_weights_path = 'model/weights/weights.ckpt'
model_history_path = 'model/history.csv'
os.makedirs(name='model/weights/', exist_ok=True)

In [None]:
def save_model_data(model, history):
  """Write the training history to `model_history_path` (CSV) and the complete
  model to `model_path`, overwriting an existing save."""
  history_frame = pd.DataFrame(history.history)
  history_frame.to_csv(model_history_path, index=False)
  # Full save, not only the weights
  model.save(model_path, overwrite=True)
    
def load_saved_model_data():
  """Reload the saved model and its training history.

  The model is loaded from `model_path`, then its weights are replaced by the
  checkpoint stored at `model_weights_path`. The history CSV is wrapped in a
  Keras `History` object so it can be used like the object returned by `fit`.

  Returns:
    `(model, history)`.
  """
  restored_model = tf.keras.models.load_model(model_path)
  restored_model.load_weights(model_weights_path)

  restored_history = tf.keras.callbacks.History()
  curves = pd.read_csv(model_history_path)
  restored_history.history = curves.to_dict('list')
  return restored_model, restored_history

**<h2 align="center">Train the Model with a validation set</h2>**<hr>

### **Model creation** 

In [None]:
gru_hidden_units = 1060  # GRU units (`hsize`) in both the encoder and the decoder
emmbedding_size  = 256   # embedding width (`embsize`) in both the encoder and the decoder
encoder_dropout  = 0.1   # `dropout` of the encoder GRU
decoder_dropout  = 0.25  # `dropout` of the decoder GRU
model = create_train_model(
  src_seqlen, tar_seqlen, src_vocabsize, tar_vocabsize, 
  gru_hidden_units, emmbedding_size, 
  encoder_dropout, decoder_dropout
)

# Categorical cross-entropy matches the one-hot targets produced by the batch
# generator. The metric is registered as 'acc', which is the history key the
# plotting code reads.
model.compile(
  optimizer='adam', loss='categorical_crossentropy', metrics=['acc']
)

In [None]:
# Train model architecture
display(plot_model(model, show_shapes=True, to_file='train_model.png'))
# display(model.summary())

### **Training**

In [None]:
# Write the weights to `model_weights_path` whenever the validation loss improves
save_best_weights = tf.keras.callbacks.ModelCheckpoint(
  model_weights_path, monitor='val_loss', mode='min',
  save_weights_only=True, save_best_only=True, verbose=1, 
)

# Stop once the validation loss has not improved for 2 epochs in a row and
# put the best weights back on the model
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_loss', patience=2, verbose=1,
  mode='min', restore_best_weights=True, 
)

batch_size = 88 
epochs = 100     # upper bound; early stopping can end training sooner

history = make_train(
  model, train_set, val_set, 
  batch_size, epochs,[save_best_weights, early_stopping]
)
# Persist the history and the full model so later cells can reload them
save_model_data(model, history)

### **Load the model with the best weights**

In [None]:
a_model, history = load_saved_model_data()

**<h1 align="center">Model Testing and Generation of translation</h1>**<hr>


## **Model evaluation**

In [None]:
# NOTE: this rebinds the global `batch_size` that the training cell also uses
batch_size = 70
# Floor division: a final partial batch of test rows is not evaluated
number_of_steps = test_set.shape[0]//batch_size
test_generator = get_batch_of_data(
  test_set.source, test_set.target, batch_size
)
# `steps` is required because the generator never stops on its own
evaluation = a_model.evaluate(
  x=test_generator, steps=number_of_steps, verbose=1
)

In [None]:
# `evaluation` is [loss, accuracy]. The cross-entropy loss is an unbounded
# value, not a ratio, so it is shown as-is; only the accuracy is a fraction
# that is meaningful as a percentage.
pd.DataFrame(
    {
        "Loss":[f"{evaluation[0]:.4f}"],
        "Accuracy": [f"{evaluation[1]*100:.3f}%"]
    },
    index=["Loss/Accuracy"]
)

## **Learning curves**

In [None]:
def plot_one_history(x, y1, y2, input='loss', y1_label='Loss', y2_label='Validation loss'):
  """Plot a training curve and its validation curve on the current axes.

  Args:
    x: epoch numbers.
    y1: training values.
    y2: validation values.
    input: 'acc' draws an accuracy panel; any other value draws a loss panel.
    y1_label / y2_label: legend labels, used only for a loss panel. An accuracy
      panel always uses 'Accuracy' / 'Validation Accuracy'.
  """
  title = "Loss's history"
  if input == 'acc':
    y1_label, y2_label = 'Accuracy', 'Validation Accuracy'
    title = "Accuracy's history"

  plt.plot(x, y1, label=y1_label)
  plt.plot(x, y2, label=y2_label)
  plt.xlabel("Epochs", c='darkred')
  # The y axis is named after the training curve
  plt.ylabel(y1_label, c='darkblue')
  plt.title(title, c='darkgreen')
  plt.legend()

def plot_history(hist, figsize=(17, 5)):
  """Show the accuracy (left) and loss (right) learning curves of a training run.

  `hist.history` must provide the keys 'acc', 'val_acc', 'loss' and 'val_loss'.
  """
  plt.figure(figsize=figsize)
  curves = hist.history
  nb_epoch = range(1, len(curves['acc'])+1)

  for position, metric in ((121, 'acc'), (122, 'loss')):
    plt.subplot(position)
    plot_one_history(
      x=nb_epoch, y1=curves[metric], y2=curves[f'val_{metric}'], input=metric
    )
  plt.show()

In [None]:
plot_history(history)
# Huge overfitting, though

## **Inference Model (Model for Translation)**

In [None]:
display(Markdown("<h1 align='center'>Creation of the inference model using the weight of trained model</h1>"))

In [None]:
def create_inference_encoder_from(input_layer, embedding_layer, gru_layer):
  """Build a model mapping the encoder input to the GRU's final state.

  The layers passed in are reused as they are, so the new model shares their
  weights.
  """
  embedded = embedding_layer(input_layer)
  _, final_state = gru_layer(embedded)
  return Model(input_layer, final_state)

def create_inference_decoder_from(input_layer, embedding_layer, gru_layer, dense_layer):
  """Build a decoder model that takes the GRU state as an explicit input.

  Args:
    input_layer: the decoder's input (word ids). Callers in this notebook pass
      the `.input` tensor of the trained model's decoder input layer.
    embedding_layer: decoder embedding layer, reused as-is.
    gru_layer: decoder GRU layer, reused as-is.
    dense_layer: output (softmax) layer, reused as-is.

  Returns:
    A `Model` mapping `[word ids, previous state]` to
    `[word probabilities, new state]`.
  """
  # NOTE(review): the training-time decoder input is reused here rather than a
  # fresh `Input(shape=(1,))`, although decoding feeds one word per call.
  decoder_embedding = embedding_layer(input_layer)
  # Width of the dense layer's input. In the training model that input is the
  # decoder GRU output, so this is used as the size of the state vector.
  input_shape = dense_layer.input.shape[-1]
  decoder_inputs_state = Input(shape=(input_shape,))

  # The state now comes from an input instead of being wired to the encoder
  decoder_output, decoder_output_state = gru_layer(
    decoder_embedding, initial_state=decoder_inputs_state
  )

  decoder_prediction = dense_layer(decoder_output)
  decoder = Model(
    inputs=[input_layer, decoder_inputs_state], 
    outputs=[decoder_prediction, decoder_output_state]
  )
  return decoder

In [None]:
display(Markdown("<h3 align='center'>Here the name of each layer in the train model (for encoder && decoder)</h3>"))
layers_name = [layer.name for layer in a_model.layers]; print(layers_name)

In [None]:
# Assemble the inference encoder and decoder from layers of the reloaded
# model, fetched by the names given to them when the model was defined.
encoder = create_inference_encoder_from(
  a_model.get_layer('encoder_input').input,
  a_model.get_layer('encoder_embedding'),
  a_model.get_layer('encoder_gru')
)

decoder = create_inference_decoder_from(
  a_model.get_layer('decoder_input').input,
  a_model.get_layer('decoder_embedding'),
  a_model.get_layer('decoder_gru'),
  a_model.get_layer('dense_layer'),
)

## **Inference Encoder/decoder architecture**

In [None]:
img_encoder = plot_model(encoder, show_shapes=True, to_file='encoder.png')
img_decoder = plot_model(decoder, show_shapes=True, to_file='decoder.png')

display(img_encoder)
display(img_decoder)

# **Generating Translation** 

In [None]:
def decode_sentence(sentence):
  """Translate one source sentence with the inference encoder and decoder.

  Greedy decoding: the encoder state seeds the decoder, which is fed the
  `begin` marker and then, step by step, its own most probable word. Decoding
  stops at the `end` marker, at an index that maps to no word, or after
  `tar_seqlen` steps.

  Args:
    sentence: raw source sentence; it is cleaned with `cleaning` first.

  Returns:
    The predicted words, each followed by one space (so a non-empty result
    ends with a trailing space).
  """
  sentence = cleaning(sentence)
  sequence = src_tokenizer.texts_to_sequences([sentence])
  sequence = pad_sequences(sequence, padding='post', maxlen=src_seqlen)

  # verbose=0: `predict` runs once per generated word, so a progress bar per
  # call would flood the cell output.
  state = encoder.predict(sequence, verbose=0)
  tar_seq = np.zeros((1, 1))
  tar_seq[0,0] = tar_tokenizer.word_index[begin]
  sequence_translated = ''

  for _ in range(tar_seqlen):
    de_prob, state = decoder.predict([tar_seq, state], verbose=0)
    idx_of_predicted_word = int(np.argmax(de_prob[0,-1,:]))
    # The targets were zero-padded during training, so index 0 can be
    # predicted. It has no entry in `index_word`; treat it as the end of the
    # sentence instead of raising KeyError.
    predicted_word = tar_tokenizer.index_word.get(idx_of_predicted_word)

    if predicted_word is None or predicted_word == end: break
    sequence_translated += f'{predicted_word} '

    # Feed the predicted word back in as the next decoder input
    tar_seq = np.zeros((1, 1))
    tar_seq[0,0] = idx_of_predicted_word
  return sequence_translated 

In [None]:
# cleaning("I'm having dinner, so i'm busy")
def get_random_sentence_on(dataframe):
  """Pick one random row of `dataframe`.

  Returns:
    `(source sentence, target sentence without its begin/end markers,
    positional index of the row)`.
  """
  index = np.random.randint(dataframe.shape[0])
  src = dataframe.source.iloc[index]
  tar = dataframe.target.iloc[index]
  # Escape the markers so they are matched literally, and use a raw string for
  # the replacement: `\g` is not a valid str escape.
  marker_pattern = f'{re.escape(begin)} (.*) {re.escape(end)}'
  tar = re.sub(marker_pattern, r'\g<1>', tar)
  return src, tar, index

In [None]:
# Edit the input sentences below, then rerun the cell
sentences = [
  "le professeur est bon",
  "L'université Ibn Tofail", # author's note: `Ibn Tofail` is out of vocabulary (OOV)
  "Ne soyez pas en colère",
  "Comment avez-vous réussi le projet?",
  "Je suis ton père",
  "En été, la vie est belle",
  "L'Afrique est un continent",
  "Le model n'a pas bien appris",
  "Le Maroc est un pays maghrebin" # author's note: bias detected in this translation
] 

for sentence in sentences:
    translation = decode_sentence(sentence)
    display(Markdown(
      f'''> **{source} source** : {sentence}  
          > **{target} translated** : {translation}'''))


## **Random phrases in the dataset**

In [None]:
#@title **Random phrases in the dataset** { run: "auto" }
# Translate randomly chosen test sentences and show them next to the reference.
sources = []
targets = []
translated = []
number_of_sentences_to_translate = 48 #@param {type:"slider", min:1, max:100, step:1}
# Every iteration draws an independent random row, so a sentence can appear twice
for i in range(number_of_sentences_to_translate):
  src, tar, _ = get_random_sentence_on(test_set)
  translation = decode_sentence(src)
  sources.append(src)
  targets.append(tar)
  translated.append(translation)

# Last expression of the cell: a left-aligned, styled table whose rows are
# highlighted on hover
(
 pd.DataFrame(
  {source: sources,target: targets, 
   f'{target} translated': translated})
  .style.set_properties(**{'text-align': 'left'})
  .set_table_styles(
  [
    {'selector': 'tr:hover',
      'props': [('background-color', '#089'), ('color', '#fff')]
    },
  ])
)