## Import necessary libraries

In [47]:
import collections
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import random

from nltk.translate.bleu_score import corpus_bleu

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras import Sequential

## Reading the Data

In [48]:
DF = pd.read_csv("../input/language-translation-englishfrench/eng_-french.csv")

## Viewing the datasest

In [49]:
DF

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


## Renaming the columns of DataFrame

In [50]:
DF = DF.rename(columns={'English words/sentences': 'English', 'French words/sentences': 'French'})
DF

Unnamed: 0,English,French
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
...,...,...
175616,"Top-down economics never works, said Obama. ""T...","« L'économie en partant du haut vers le bas, ç..."
175617,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
175618,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
175619,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


# Pre-Processing of the Data

## Separating the dataset into English and French

In [51]:
english = DF.English
english

0                                                       Hi.
1                                                      Run!
2                                                      Run!
3                                                      Who?
4                                                      Wow!
                                ...                        
175616    Top-down economics never works, said Obama. "T...
175617    A carbon footprint is the amount of carbon dio...
175618    Death is something that we're often discourage...
175619    Since there are usually multiple websites on a...
175620    If someone who doesn't know your background sa...
Name: English, Length: 175621, dtype: object

In [52]:
french = DF.French
french

0                                                    Salut!
1                                                   Cours !
2                                                  Courez !
3                                                     Qui ?
4                                                Ça alors !
                                ...                        
175616    « L'économie en partant du haut vers le bas, ç...
175617    Une empreinte carbone est la somme de pollutio...
175618    La mort est une chose qu'on nous décourage sou...
175619    Puisqu'il y a de multiples sites web sur chaqu...
175620    Si quelqu'un qui ne connaît pas vos antécédent...
Name: French, Length: 175621, dtype: object

## Tokenizing the English DataFrame

In [53]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for i,text in enumerate(english):
    stri = ""
    txt = tokenizer.tokenize(text)
    for j in txt:
        j = j.lower()
        stri = stri + j
        stri = stri + " "
    english[i] = stri

## Printing the first Ten Tokenized words of English DataFrame

In [54]:
print(english[0:10])

0      hi 
1     run 
2     run 
3     who 
4     wow 
5    fire 
6    help 
7    jump 
8    stop 
9    stop 
Name: English, dtype: object


## Tokenizing the French DataFrame

In [55]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
for i,text in enumerate(french):
    stri = ""
    txt = tokenizer.tokenize(text)
    for j in txt:
        j = j.lower()
        stri = stri + j
        stri = stri + " "
    french[i] = stri

## Printing the first Ten Tokenized words of French DataFrame

In [56]:
print(french[0:10])

0        salut 
1        cours 
2       courez 
3          qui 
4     ça alors 
5       au feu 
6     à l aide 
7        saute 
8    ça suffit 
9         stop 
Name: French, dtype: object


## Transforming data into arrays

In [57]:
n1 = 0
n2 = 100
eng = list(english)
fre = list(french)

# for DF in english:eng.append(DF)

# for DF in french:fre.append(DF)

eng = np.asarray(eng)
fre = np.asarray(fre)

eng = eng[0:175000]
fre = fre[0:175000]

for i in range(n1,n2):
  print(eng[i] + "\t->\t" + fre[i] + "\n")

hi 	->	salut 

run 	->	cours 

run 	->	courez 

who 	->	qui 

wow 	->	ça alors 

fire 	->	au feu 

help 	->	à l aide 

jump 	->	saute 

stop 	->	ça suffit 

stop 	->	stop 

stop 	->	arrête toi 

wait 	->	attends 

wait 	->	attendez 

go on 	->	poursuis 

go on 	->	continuez 

go on 	->	poursuivez 

hello 	->	bonjour 

hello 	->	salut 

i see 	->	je comprends 

i try 	->	j essaye 

i won 	->	j ai gagné 

i won 	->	je l ai emporté 

i won 	->	j ai gagné 

oh no 	->	oh non 

attack 	->	attaque 

attack 	->	attaquez 

cheers 	->	santé 

cheers 	->	à votre santé 

cheers 	->	merci 

cheers 	->	tchin tchin 

get up 	->	lève toi 

go now 	->	va maintenant 

go now 	->	allez y maintenant 

go now 	->	vas y maintenant 

got it 	->	j ai pigé 

got it 	->	compris 

got it 	->	pigé 

got it 	->	compris 

got it 	->	t as capté 

hop in 	->	monte 

hop in 	->	montez 

hug me 	->	serre moi dans tes bras 

hug me 	->	serrez moi dans vos bras 

i fell 	->	je suis tombée 

i fell 	->	je suis tombé 

i k

## Counting English and French Words

In [58]:
e = [word for sentence in eng for word in sentence.split(" ")]
f = [word for sentence in fre for word in sentence.split(" ")]
english_word_counter = collections.Counter(e)
french_word_counter = collections.Counter(f)

In [59]:
print('{} English words.'.format(len(e)))
print('{} French words.'.format(len(f)))
print("\n")
print('{} unique English words.'.format(len(english_word_counter)))
print('{} unique French words.'.format(len(french_word_counter)))
print("\n")
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_word_counter.most_common(10)))[0]) + '"')
print("\n")
print('10 Most common words in the French dataset:')
print('"' + '" "'.join(list(zip(*french_word_counter.most_common(10)))[0]) + '"')

1308720 English words.
1425733 French words.


13917 unique English words.
23918 unique French words.


10 Most common words in the English dataset:
"" "i" "you" "to" "the" "a" "t" "is" "that" "tom"


10 Most common words in the French dataset:
"" "je" "de" "pas" "est" "vous" "que" "il" "à" "ne"


## Sorting the above data into table form

In [60]:
dict1 = {1: ["English ", 1133720, 13917 ], 
     2: ["French", 1250733, 23918] 
     } 
# Print the names of the columns. 
print ("{:<15} {:<15} {:<15}".format('LANGUAGE', 'TOTAL WORDS', 'UNIQUE WORDS')) 
  
# print each data item. 
for key, value in dict1.items(): 
    language, total_words, unique_words = value 
    print ("{:<15} {:<15} {:<15}".format(language, total_words, unique_words)) 

LANGUAGE        TOTAL WORDS     UNIQUE WORDS   
English         1133720         13917          
French          1250733         23918          


## Indexing of a smaple text with help of tokenization

In [61]:
def tokenize(x):
    tokenizer = Tokenizer(char_level=False,oov_token=" ")
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

text_sentences = [
    'An apple a day keeps a doctor away .',
    'well, hope this letter of mine finds u in pink of your health .',
    'This is a short sentence .']
  
text , tokenizer = tokenize(text_sentences)
print(text)
print(tokenizer.word_index)

[[5, 6, 2, 7, 8, 2, 9, 10], [11, 12, 3, 13, 4, 14, 15, 16, 17, 18, 4, 19, 20], [3, 21, 2, 22, 23]]
{' ': 1, 'a': 2, 'this': 3, 'of': 4, 'an': 5, 'apple': 6, 'day': 7, 'keeps': 8, 'doctor': 9, 'away': 10, 'well': 11, 'hope': 12, 'letter': 13, 'mine': 14, 'finds': 15, 'u': 16, 'in': 17, 'pink': 18, 'your': 19, 'health': 20, 'is': 21, 'short': 22, 'sentence': 23}


In [62]:
def pad(x,length=None):
    if (length==None):
        length = max([len(sentence) for sentence in x])
    a = pad_sequences(x,maxlen=length,padding="post")
    return a



## Indexing of complete Dataset

In [63]:
def preprocess(x,y):
    preprocess_x,x_tk = tokenize(x)
    preprocess_y,y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    print(*preprocess_y.shape)
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

def preprocessing(x):
    preprocess_x,x_tk = tokenize(x)
    preprocess_x = pad(preprocess_x)
    return preprocess_x, x_tk

## Preprocessed Information about the data

In [64]:
pre_eng,pre_fre,eng_tk,fre_tk = preprocess(eng,fre)
max_eng_seq_len = pre_eng.shape[1]
max_fr_seq_len = pre_fre.shape[1]
english_vocab_size = len(eng_tk.word_index)
french_vocab_size = len(fre_tk.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_eng_seq_len)
print("Max French sentence length:", max_fr_seq_len)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

175000 26
Data Preprocessed
Max English sentence length: 21
Max French sentence length: 26
English vocabulary size: 13917
French vocabulary size: 23918


# Model Implementation 

In [65]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):

    learning_rate = 0.001
    model = keras.Sequential([
                                Embedding(english_vocab_size+1, 
                                          128, 
                                          input_length = input_shape[1]),
        
                                Bidirectional(GRU(128, 
                                                  return_sequences=True)),
        
                                tf.keras.layers.Dropout(0.25),
        
                                TimeDistributed(Dense(french_vocab_size, 
                                                      activation='softmax'))
                                ])
    model.summary()
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

tmp_x = pad(pre_eng, 
            max_fr_seq_len)

rnn_model = embed_model(tmp_x.shape,
                        max_fr_seq_len,
                        english_vocab_size,
                        french_vocab_size)

rnn_model.fit(tmp_x, pre_fre, batch_size=1024, epochs=20, validation_split=0.2)

rnn_model.save_weights("rnn_model_weights.h5")

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 26, 128)           1781504   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 26, 256)           198144    
_________________________________________________________________
dropout_2 (Dropout)          (None, 26, 256)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 26, 23918)         6146926   
Total params: 8,126,574
Trainable params: 8,126,574
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [66]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

## Checking for translation 

In [67]:
for k in range(10):
    predicted=[]
    a = random.randint(0,100000)
    print('Random Index: ', a)

    print("PREDICTED:\t", end=' ')
    for i in range(5):
        x = logits_to_text(rnn_model.predict(tmp_x[a])[i], fre_tk)
        print(x, end =' ')
        if x!='<PAD>':
            predicted.append(x)

    
    english = eng[a].split()
    french = fre[a].split()

    print("\n\nENGLISH:\t", eng[a] + "\nFRENCH:\t\t " + fre[a] + "\n")
    print('\nIndexing: ',tmp_x[a])
    
    print('\n|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n')

Random Index:  86482
PREDICTED:	 tom adhésif père fox très 

ENGLISH:	 tom s father is very strict 
FRENCH:		 le père de tom est très sévère 


Indexing:  [  10   13  169    8   55 2266    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

Random Index:  90972
PREDICTED:	 j honnête non sûre j 

ENGLISH:	 i m not sure i can trust you 
FRENCH:		 je ne suis pas sûre que je puisse me fier à vous 


Indexing:  [  2  28  34 118   2  25 421   3   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]

|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

Random Index:  74070
PREDICTED:	 saviez tu dis trente aida 

ENGLISH:	 did you say thirty euros 
FRENCH:		 est ce que tu as dit trente euros 


Indexing:  [  42    3  114  628 3857    0    0    0    0    0    0    0    0    0
    0    0    0    0  

## CALCULATING BLEU SCORE

In [68]:
n = []
lst=[]
k=0
while k<1000:
    
    predicted=[]
    a = random.randint(0,175000)
    
    for i in range(5):
        x = logits_to_text(rnn_model.predict(tmp_x[a])[i], fre_tk)
    
        if x!='<PAD>':
            predicted.append(x)

    english = eng[a].split()
    french = fre[a].split()

    references = [[french]]
    candidates = [predicted]
    score = corpus_bleu(references, candidates, weights=(0.05, 0.25, 0.35, 0.35))
    lst.append(score)
    if score>.8:
        n+=[score]
    k+=1
    
    if k%100==0:
        print(k)        
def average(lst):
    print(sum(lst)/len(lst))   
average(n)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


100
200
300
400
500
600
700
800
900
1000
0.9409652607509906


In [69]:
print("AVERAGE BLEU SCORE: ", end ='\t') 
average(n)


AVERAGE BLEU SCORE: 	0.9409652607509906


---