In [59]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout, Attention, Reshape
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint

In [7]:
# Load the English/French parallel corpus: one CSV for training, one for testing.
train_csv_path = "Dataset/nlp_intel_train.csv"
test_csv_path = "Dataset/nlp_intel_test.csv"

eng_fr = pd.read_csv(train_csv_path)
eng_fr_test = pd.read_csv(test_csv_path)

In [8]:
# Display the training frame as the cell output.
eng_fr

Unnamed: 0.1,Unnamed: 0,en,fr
0,1000,"In 1981, he founded the Astronomy Club of Rimo...","En 1981, il fonde le Club d'Astronomie de Rimo..."
1,1001,The club was very active and they twice organi...,Le club est très actif et organise à deux occa...
2,1002,"In 1983, Lemay initiated the first joint meeti...","En 1983, il est l'instigateur à Québec du cong..."
3,1003,"The conference took place in Quebec City, and ...",Le congrès est un franc succès et regroupe pas...
4,1004,"From 1990 to 1992, he was the National Preside...","De 1990 à 1992, il est président national de l..."
...,...,...,...
5219,6219,It is believed that consumers in the region wi...,On croit que les consommateurs de la région ac...
5220,6220,A study puts the global retail market for hala...,"Selon une étude, on estime le marché mondial d..."
5221,6221,A breakdown of the 1.5 billion Muslim consumer...,"Une répartition des 1,5 milliard de consommate..."
5222,6222,September 2006 saw the successful introduction...,"Au mois de septembre 2006, on a lancé avec suc..."


In [9]:
# Discard every row that has a missing value in any column.
# (Row-wise / "any" / all columns / not in place are the pandas defaults.)
eng_fr = eng_fr.dropna()
eng_fr_test = eng_fr_test.dropna()

In [10]:
##Tokenizer and padding

def tokenize(data):
    """Create a Keras ``Tokenizer`` and fit it on ``data``.

    Args:
        data: Texts handed directly to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(data)
    return fitted_tokenizer


def training_sequences(tokenizer, m_length, data):
    """Encode ``data`` with ``tokenizer`` and pad the result to ``m_length``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences``.
        m_length: Forwarded to ``pad_sequences`` as ``maxlen``.
        data: Texts to encode.

    Returns:
        The value returned by ``pad_sequences``; padding is appended at the
        end of each sequence (``padding='post'``).
    """
    encoded_texts = tokenizer.texts_to_sequences(data)
    return pad_sequences(encoded_texts, maxlen=m_length, padding='post')


In [24]:
def preprocess(x, y, max_length=55):
    """Tokenize and pad two parallel lists of texts.

    A separate tokenizer is fitted on ``x`` and on ``y``; both sides are then
    converted to integer sequences and padded to ``max_length``.

    Args:
        x: Texts for the first side of the pair.
        y: Texts for the second side of the pair.
        max_length: Length every sequence is padded to. Defaults to 55, the
            value that was previously hard-coded here.

    Returns:
        A tuple ``(preprocess_x, preprocess_y, x_tk, y_tk)`` where
        ``preprocess_x`` is the padded sequence array for ``x``,
        ``preprocess_y`` is the padded sequence array for ``y`` with one extra
        trailing axis of size 1, and ``x_tk`` / ``y_tk`` are the fitted
        tokenizers.
    """
    x_tk = tokenize(x)
    y_tk = tokenize(y)

    preprocess_x = training_sequences(x_tk, max_length, x)
    preprocess_y = training_sequences(y_tk, max_length, y)

    # Append a trailing axis of size 1 to the y array. The original note here
    # says Keras's sparse_categorical_crossentropy needs 3-dimensional labels.
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [25]:
# Pull the two text columns out as plain Python lists, then tokenize and pad them.
english_texts = eng_fr["en"].tolist()
french_texts = eng_fr["fr"].tolist()

(
    preproc_english_sentences,
    preproc_french_sentences,
    english_tokenizer,
    french_tokenizer,
) = preprocess(english_texts, french_texts)

In [26]:
# Display the padded English id matrix as the cell output.
preproc_english_sentences

array([[   5, 2779,   46, ...,    0,    0,    0],
       [   1, 1926,   30, ...,    0,    0,    0],
       [   5, 1928, 3603, ...,    0,    0,    0],
       ...,
       [   6,  360, 9619, ...,    0,    0,    0],
       [ 139, 5107,  290, ...,    5,  117,  514],
       [1671,  109, 1125, ...,    0,    0,    0]], dtype=int32)

In [27]:
# Sequence lengths are read off the padded arrays; vocabulary sizes are the
# number of entries in each fitted tokenizer's word_index.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

summary_rows = (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
)
for label, value in summary_rows:
    print(label, value)

Max English sentence length: 55
Max French sentence length: 55
English vocabulary size: 9621
French vocabulary size: 12122


In [28]:
# Final output function
def logits_to_text(logits, tokenizer):
    """Turn a 2-D array of per-position scores into a space-joined string.

    Args:
        logits: Array-like with one row per position and one column per
            vocabulary index; the arg-max of each row selects the word.
        tokenizer: Object whose ``word_index`` maps word -> integer index.

    Returns:
        The selected words joined with single spaces. Index 0 is rendered as
        a single space character.
    """
    # Invert the vocabulary, then force index 0 to a blank.
    lookup = dict((index, word) for word, index in tokenizer.word_index.items())
    lookup[0] = ' '

    best_indices = np.argmax(logits, axis=1)
    words = [lookup[token_index] for token_index in best_indices]
    return ' '.join(words)

In [128]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
             learning_rate=0.005):
    """Build and compile an Embedding -> GRU -> Dense -> softmax sequence model.

    The embedding is sized by ``french_vocab_size`` and the final softmax by
    ``english_vocab_size``, so the model reads French ids and scores English
    ids at every timestep.

    NOTE(review): despite the ``bd_`` prefix the recurrent layer is a plain
    (unidirectional) GRU -- confirm whether ``Bidirectional`` was intended.

    Args:
        input_shape: Shape of the input array including the batch axis; only
            ``input_shape[1:]`` is used to declare the model input.
        output_sequence_length: Not used by this function; kept so existing
            callers keep working.
        english_vocab_size: Number of units in the output softmax layer.
        french_vocab_size: ``input_dim`` of the embedding layer.
        learning_rate: Learning rate given to Adam. Defaults to 0.005, the
            value that was previously hard-coded.

    Returns:
        The compiled ``Sequential`` model (sparse categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    model = Sequential()
    # An explicit Input layer replaces the former input_length/input_shape
    # pair on Embedding, which both encoded the same sequence length.
    model.add(Input(shape=input_shape[1:], dtype="int32"))
    model.add(Embedding(french_vocab_size, 256))
    model.add(GRU(512, return_sequences=True))
    # Dense layers act on the last axis, i.e. independently per timestep.
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(english_vocab_size, activation='softmax'))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [129]:
# Display the shape of the padded French array as the cell output.
preproc_french_sentences.shape

(5222, 55, 1)

In [130]:
# preprocess() already padded the French sequences and appended a trailing
# axis of size 1, so re-padding is unnecessary: dropping that axis yields the
# 2-D (samples, timesteps) id matrix the model takes as input.
tmp_x = preproc_french_sentences.squeeze(axis=-1)

# Train (French ids in, English ids out).
# The "+ 1" on each vocabulary size leaves room for index 0, which appears in
# the padded arrays as filler.
model = bd_model(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(english_tokenizer.word_index) + 1,
    len(french_tokenizer.word_index) + 1)

model.summary()

model.fit(tmp_x, preproc_english_sentences, batch_size=32, epochs=15, validation_split=0.1)

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_23 (Embedding)    (None, 55, 256)           3103488   
                                                                 
 gru_13 (GRU)                (None, 55, 512)           1182720   
                                                                 
 dense_30 (Dense)            (None, 55, 1024)          525312    
                                                                 
 dense_31 (Dense)            (None, 55, 9622)          9862550   
                                                                 
Total params: 14674070 (55.98 MB)
Trainable params: 14674070 (55.98 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13

<keras.src.callbacks.History at 0x7d9cb8229150>

In [136]:
# Row of the training set to translate and compare against its reference.
i = 304

print("Prediction:")
# Predict on a batch holding just row i, then take that single result.
sample_logits = model.predict(tmp_x[[i]])[0]
print(logits_to_text(sample_logits, english_tokenizer))

print("\nCorrect Translation:")
print(eng_fr["en"].iloc[i])

print("\nOriginal text:")
print(eng_fr["fr"].iloc[i])

Prediction:
in 1864 donati was the first to clearly observe the spectrum of a comet park 1864b                                                                              

Correct Translation:
In 1864, Donati was the first to clearly observe the spectrum of a comet (Comet 1864b).

Original text:
En 1864, Donati est le premier à observer de façon claire le spectre d’une comète (la comète 1864b).
