# English-to-French Translator

In this notebook I used the small english/french parallel text dataset prepared by Udacity in its NLP Nanodegree Program

## Step 0 - Import libraries and packages

In [1]:
import os

import numpy as np

import tensorflow

from tensorflow import keras

print(tensorflow.__version__)

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

2.1.0


### Check if GPU is working

Please check this [link](https://medium.com/@kegui/how-do-i-know-i-am-running-keras-model-on-gpu-a9cdcc24f986) for more detail on verifying your gpu

In [6]:
from tensorflow.python.client import device_lib

print(device_lib.list_local_devices())


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 66245773583047
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3135687886
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7853030068589879230
physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"
]


## Step 1 - Load data and preprocess

### Smaller English-French dataset from Udacity

In [9]:
data_dir = 'data_eng_fra'

# Load English data
with open(os.path.join(data_dir, 'small_vocab_en'), 'r') as f:
    eng_raw_data = f.read()
          
english_sentences = eng_raw_data.split('\n')

# Load French data
with open(os.path.join(data_dir, 'small_vocab_fr'), 'r') as f:
    fr_raw_data = f.read()
          
french_sentences = fr_raw_data.split('\n')

In [12]:
## Display some info

print("sentences of the English corpus: {}".format(len(english_sentences)))
print("sentences of the French corpus: {}".format(len(french_sentences)))

print("first English/French sentence:\n")
print(english_sentences[0])
print(french_sentences[0])


sentences of the English corpus: 137861
sentences of the French corpus: 137861
first English/French sentence:

new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


### Exploratory data analysis - size of the vocabulary

In [34]:
from collections import Counter

## Size of Vob of English texts
english_corpus = [w for sentence in english_sentences for w in sentence.split()]
english_vocab_counter = Counter(english_corpus)
english_vocab_size = len(english_vocab_counter)
print("Vocab size of English: {}".format(english_vocab_size))
print("10 most frequent English vocab: {}".format([x[0] for x in english_vob_counter.most_common(10)]))
print("Corpus size of English dataset: {}\n".format(len(english_corpus)))

## Size of Vob of French texts
french_corpus = [w for sentence in french_sentences for w in sentence.split()]
french_vocab_counter = Counter(french_corpus)
french_vocab_size = len(french_vocab_counter)
print("Vocab size of French: {}".format(french_vocab_size))
print("10 most frequent French vocab: {}".format([x[0] for x in french_vob_counter.most_common(10)]))
print("Corpus size of French dataset: {}\n".format(len(french_corpus)))

Vocab size of English: 227
10 most frequent English vocab: ['is', ',', '.', 'in', 'it', 'during', 'the', 'but', 'and', 'sometimes']
Corpus size of English dataset: 1823250

Vocab size of French: 355
10 most frequent French vocab: ['est', '.', ',', 'en', 'il', 'les', 'mais', 'et', 'la', 'parfois']
Corpus size of French dataset: 1961295



### Data Preprocessing

In [27]:
## Tokenizer
def tokenize(sentences):
    """
    Tokenize a list of sentences.
    
    input:
        sentence: a list of sentences. ex: ['hello world', 'this is a small world']
    output:
        (tokenized_sentences, tokenizer): a tuple, where
        
        tokenized_sentences: tokenized sentence. ex: [[1 , 2], [3, 4, 5, 6, 2]]
        tokenizer: the tokenizer
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    tokenized_sentences = tokenizer.texts_to_sequences(sentences)
    
    return (tokenized_sentences, tokenizer)


## Padding

def pad(sentences, pad_length = None):
    """
    Pad every sentence in sentences to the length of pad_length
    If pad_length is None, it is set as the maximum length among the sentences
    
    inputs:
        sentences: a list of sentences. ex: [[1 , 2], [3, 4, 5, 6, 2]]
        pad_length: final length after padding
    
    output:
        padded_sentences: padded sentences. ex: [[1, 2, 0, 0, 0], [3, 4, 5, 6, 2]]
    """
    padded_sentences = pad_sequences(sentences, maxlen = pad_length, padding = 'post')
    return padded_sentences

### Helper function to convert ouput into readable sentences

The output will be transformed back to human-readable sentence by referring to the tokenizer

In [68]:
def output_to_texts(softmax_result, tokenizer):
    """
    Transform output into the text sentence
    
    inputs:
        softmax_result: the output from neural networks. shape = (padded_length, vocab_size)
        tokenizer: the tokenizer used to translate
    """
    # reverse tokenizer.word_index dict
    decoding_dict = {token: vocab for vocab, token in tokenizer.word_index.items()}
    # define 0 to be None
    decoding_dict[0] = ''
    # decode into a sentence
    sentence = ' '.join([decoding_dict[token] for token in np.argmax(softmax_result, 1)])
    
    return sentence

### Translation demo function

### Data Processing

In [35]:
## Process English and French data

eng_tokenized_sentences, eng_tokenizer = tokenize(english_sentences)
eng_padded_sentences = pad(eng_tokenized_sentences, pad_length = None)
print("shape of input (English sentences): {}".format(eng_padded_sentences.shape))
del eng_tokenized_sentences

eng_sentence_maxlen = eng_padded_sentences.shape[1]
print("maximum length of the English sentence: {}".format(eng_sentence_maxlen))



fr_tokenized_sentences, fr_tokenizer = tokenize(french_sentences)
fr_padded_sentences = pad(fr_tokenized_sentences, pad_length = None)
fr_padded_sentences = fr_padded_sentences.reshape(*fr_padded_sentences.shape, 1)
print("shape of output (French sentences): {}".format(fr_padded_sentences.shape))
del fr_tokenized_sentences

fr_sentence_maxlen = fr_padded_sentences.shape[1]
print("maximum length of the French sentence: {}".format(fr_sentence_maxlen))

shape of input (English sentences): (137861, 15)
maximum length of the English sentence: 15
shape of output (French sentences): (137861, 21, 1)
maximum length of the French sentence: 21


### Split into train and test dataset

In [74]:
## Prediction function

def predict_demo(model, test_indeces):
    """
    Print out several sentences and see their translations
    
    inputs
            model: model used for prediction
            test_indeces: a list of indeces of the test set to evaluate the model
    """

    for test_index in test_indeces:
        
        eng_sentence = english_sentences[test_index]
        answer_sentence = french_sentences[test_index]
        prediction_logits = model.predict(eng_padded_sentences[test_index].reshape(-1, eng_sentence_maxlen))[0]
        prediction_sentence = output_to_texts(prediction_logits, fr_tokenizer)
        
        
        print(eng_sentence)
        print(answer_sentence)
        print(prediction_sentence)
        print("\n")
        

## Step 2 - Benchmark model (deep RNN)

### Benchmark model

The benchmark model uses bidirectional GRUs

In [39]:
def build_benchmark_model(input_shape,
                        embedding_size,
                        GRU_units,
                        output_sentence_length,
                        input_vocab_size,
                        output_vocab_size,
                        lr):
    """
    inputs:
            input_shape
            output_sentence_length
            input_vocab_size
            output_vocab_size
    """
    # input
    inputs = Input(shape = input_shape[1:]) # remove the dimension of the batch
    # deep RNN
    x = Embedding(input_dim = input_vocab_size, output_dim = embedding_size)(inputs)
    x = Bidirectional(GRU(GRU_units))(x)
    x = RepeatVector(output_sentence_length)(x)
    x = Bidirectional(GRU(GRU_units, return_sequences = True))(x)
    x = TimeDistributed(Dense(output_vocab_size))(x)
    x = Activation('softmax')(x)
    # build model
    model = Model(inputs = inputs, outputs = x)
    
    # compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(lr),
                  metrics = ['accuracy'])
    
    return model

In [49]:
## build benchmark model

benchmark_model = build_benchmark_model(input_shape = (eng_padded_sentences.shape[0], eng_sentence_maxlen),
                        embedding_size = 256,
                        GRU_units = 128,
                        output_sentence_length = fr_sentence_maxlen,
                        input_vocab_size = english_vocab_size,
                        output_vocab_size = french_vocab_size,
                        lr = 1e-3)

benchmark_model.summary()

## train
benchmark_model.fit(eng_padded_sentences, fr_padded_sentences, batch_size = 1024, epochs = 20, validation_split = 0.2)

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 15)]              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 15, 256)           58112     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)               296448    
_________________________________________________________________
repeat_vector_4 (RepeatVecto (None, 21, 256)           0         
_________________________________________________________________
bidirectional_9 (Bidirection (None, 21, 256)           296448    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 21, 355)           91235     
_________________________________________________________________
activation_4 (Activation)    (None, 21, 355)           0   

<tensorflow.python.keras.callbacks.History at 0x1bbc5e81e08>

In [75]:
## Predictions
predict_demo(benchmark_model, [1000])

they like strawberries , pears , and bananas .
ils aiment les fraises , les poires et les bananes .
ils aiment les fraises les poires et les bananes            




## Step 3 - Attention models