## Import necessary libraries

In [None]:
import collections
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import random

from nltk.translate.bleu_score import corpus_bleu

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras import Sequential

## Reading the Data

In [None]:
# Load the English/French sentence pairs from the CSV file; the path is
# relative to the current working directory.
DF = pd.read_csv("../input/language-translation-englishfrench/eng_-french.csv")

## Viewing the dataset

In [None]:
# Bare expression: when run as the last line of a notebook cell this
# displays the DataFrame.
DF

## Renaming the columns of DataFrame

In [None]:
# Replace the verbose CSV headers with short names so the columns can be
# accessed as DF.English and DF.French.
DF = DF.rename(columns={'English words/sentences': 'English', 'French words/sentences': 'French'})
# Bare expression: displays the renamed DataFrame when run as a notebook cell.
DF

# Pre-Processing of the Data

## Separating the dataset into English and French

In [None]:
# Select the English column (attribute access on the renamed DataFrame).
english = DF.English
# Bare expression: displays the selection when run as a notebook cell.
english

In [None]:
# Select the French column (attribute access on the renamed DataFrame).
french = DF.French
# Bare expression: displays the selection when run as a notebook cell.
french

## Tokenizing the English DataFrame

In [None]:
from nltk.tokenize import RegexpTokenizer

# Keep only runs of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Normalise every sentence in one pass: lower-case its word tokens and join
# them with single spaces (no trailing space). Building a new Series avoids
# writing element by element into a column taken from DF, which is slow and
# is flagged by pandas as chained assignment.
english = english.map(
    lambda text: " ".join(token.lower() for token in tokenizer.tokenize(text))
)

## Printing the first ten tokenized sentences of the English data

In [None]:
# Show the first ten entries of the normalised English sentences.
print(english[0:10])

## Tokenizing the French DataFrame

In [None]:
from nltk.tokenize import RegexpTokenizer

# Keep only runs of word characters; punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# Normalise every sentence in one pass: lower-case its word tokens and join
# them with single spaces (no trailing space). Building a new Series avoids
# writing element by element into a column taken from DF, which is slow and
# is flagged by pandas as chained assignment.
french = french.map(
    lambda text: " ".join(token.lower() for token in tokenizer.tokenize(text))
)

## Printing the first ten tokenized sentences of the French data

In [None]:
# Show the first ten entries of the normalised French sentences.
print(french[0:10])

## Transforming data into arrays

In [None]:
# Bounds of the sample range printed at the end of this cell.
n1 = 0
n2 = 100

# Materialise both columns as NumPy string arrays, keeping at most the
# first 175000 sentence pairs.
eng = np.asarray(list(english))[0:175000]
fre = np.asarray(list(french))[0:175000]

# Print the selected sentence pairs side by side, one pair per paragraph.
for i in range(n1, n2):
    pair = eng[i] + "\t->\t" + fre[i]
    print(pair + "\n")

## Counting English and French Words

In [None]:
# Flatten every sentence into its words. str.split() without an argument
# splits on runs of whitespace and never yields empty strings, so a leading
# or trailing space in a sentence is not counted as a word.
e = [word for sentence in eng for word in sentence.split()]
f = [word for sentence in fre for word in sentence.split()]

# Frequency tables used for the vocabulary statistics.
english_word_counter = collections.Counter(e)
french_word_counter = collections.Counter(f)

In [None]:
def _quoted_top_words(counter, count=10):
    """Return the `count` most frequent words of `counter`, each wrapped in double quotes."""
    words = list(zip(*counter.most_common(count)))[0]
    return '"' + '" "'.join(words) + '"'


# Token totals, vocabulary sizes and the most frequent words per language;
# each group is followed by a blank separator.
print('{} English words.'.format(len(e)),
      '{} French words.'.format(len(f)),
      sep="\n")
print("\n")
print('{} unique English words.'.format(len(english_word_counter)),
      '{} unique French words.'.format(len(french_word_counter)),
      sep="\n")
print("\n")
print('10 Most common words in the English dataset:',
      _quoted_top_words(english_word_counter),
      sep="\n")
print("\n")
print('10 Most common words in the French dataset:',
      _quoted_top_words(french_word_counter),
      sep="\n")

## Presenting the above data in table form

In [None]:
# Summary rows: language label, total word count, unique word count.
# The figures are read from the token lists and counters rather than typed
# in by hand, so the table always matches the data that was actually loaded.
dict1 = {
    1: ["English ", len(e), len(english_word_counter)],
    2: ["French", len(f), len(french_word_counter)],
}

# Print the names of the columns.
print("{:<15} {:<15} {:<15}".format('LANGUAGE', 'TOTAL WORDS', 'UNIQUE WORDS'))

# Print each data item, left-aligned in 15-character columns.
for key, value in dict1.items():
    language, total_words, unique_words = value
    print("{:<15} {:<15} {:<15}".format(language, total_words, unique_words))

## Indexing of a sample text with the help of tokenization

In [None]:
def tokenize(x):
    """Fit a word-level Keras Tokenizer on `x` and encode `x` with it.

    Args:
        x: the texts to fit on and to encode.

    Returns:
        A tuple ``(sequences, tokenizer)``: the result of
        ``texts_to_sequences(x)`` and the fitted Tokenizer itself.
    """
    # A single space is registered as the out-of-vocabulary token.
    word_tokenizer = Tokenizer(oov_token=" ", char_level=False)
    word_tokenizer.fit_on_texts(x)
    sequences = word_tokenizer.texts_to_sequences(x)
    return sequences, word_tokenizer

# Three sample sentences used to demonstrate the word-to-index mapping.
text_sentences = [
    'An apple a day keeps a doctor away .',
    'well, hope this letter of mine finds u in pink of your health .',
    'This is a short sentence .']
  
# NOTE: this rebinds the module-level name `tokenizer` to the fitted Keras
# Tokenizer returned by tokenize().
text , tokenizer = tokenize(text_sentences)
# Encoded sentences, followed by the word -> index dictionary they were built from.
print(text)
print(tokenizer.word_index)

In [None]:
def pad(x, length=None):
    """Pad the sequences in `x` at the end to a common length.

    Args:
        x: list of integer sequences (or an already padded array).
        length: target length, passed to ``pad_sequences`` as ``maxlen``.
            When None, the length of the longest sequence in `x` is used.

    Returns:
        The result of ``pad_sequences(x, maxlen=length, padding="post")``.
    """
    # Identity check: `== None` is unidiomatic and can be overridden by
    # objects that implement their own __eq__.
    if length is None:
        length = max(len(sentence) for sentence in x)
    return pad_sequences(x, maxlen=length, padding="post")



## Indexing of complete Dataset

In [None]:
def preprocess(x,y):
    """Tokenize and pad a pair of parallel corpora.

    Args:
        x: source-language sentences.
        y: target-language sentences.

    Returns:
        ``(padded_x, padded_y, x_tk, y_tk)`` where ``padded_y`` carries an
        extra trailing axis of size 1 and ``x_tk`` / ``y_tk`` are the fitted
        tokenizers returned by ``tokenize``.
    """
    x_ids, x_tk = tokenize(x)
    padded_x = pad(x_ids)

    y_ids, y_tk = tokenize(y)
    padded_y = pad(y_ids)

    # Report the padded target shape before the extra axis is appended.
    print(*padded_y.shape)
    padded_y = padded_y.reshape(padded_y.shape + (1,))

    return padded_x, padded_y, x_tk, y_tk

def preprocessing(x):
    """Tokenize and pad a single corpus.

    Args:
        x: sentences to encode.

    Returns:
        ``(padded_sequences, tokenizer)`` as produced by ``tokenize`` followed
        by ``pad`` with its default length.
    """
    sequences, x_tk = tokenize(x)
    return pad(sequences), x_tk

## Preprocessed Information about the data

In [None]:
# Encode and pad both corpora; also keeps the fitted tokenizers.
pre_eng,pre_fre,eng_tk,fre_tk = preprocess(eng,fre)
# Axis 1 of each padded array is the padded sentence length.
max_eng_seq_len = pre_eng.shape[1]
max_fr_seq_len = pre_fre.shape[1]
# Vocabulary sizes are the number of entries in each tokenizer's word_index.
english_vocab_size = len(eng_tk.word_index)
french_vocab_size = len(fre_tk.word_index)

# Summary of the preprocessed data.
print('Data Preprocessed')
print("Max English sentence length:", max_eng_seq_len)
print("Max French sentence length:", max_fr_seq_len)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

# Model Implementation 

In [None]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + bidirectional GRU translation model.

    Args:
        input_shape: shape of the padded input array; only ``input_shape[1]``
            (the sequence length) is used.
        output_sequence_length: accepted for interface compatibility but not
            used; the model emits one prediction per input timestep.
        english_vocab_size: number of entries in the source tokenizer's
            ``word_index``.
        french_vocab_size: number of entries in the target tokenizer's
            ``word_index``.

    Returns:
        The compiled Keras Sequential model. Its summary is printed as a
        side effect.
    """
    learning_rate = 0.001

    model = keras.Sequential([
        # Token ids run from 1 to the vocabulary size and 0 is the padding
        # value, so the lookup table needs vocabulary size + 1 rows.
        Embedding(english_vocab_size + 1,
                  128,
                  input_length=input_shape[1]),

        Bidirectional(GRU(128,
                          return_sequences=True)),

        tf.keras.layers.Dropout(0.25),

        # One class per target id *including* 0: with only
        # french_vocab_size units the highest token id would be an
        # out-of-range label for the sparse categorical cross-entropy.
        TimeDistributed(Dense(french_vocab_size + 1,
                              activation='softmax')),
    ])
    model.summary()

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])

    return model

# Re-pad the English sequences to the French maximum length, since the
# model produces one output per input timestep.
tmp_x = pad(pre_eng, 
            max_fr_seq_len)

# Build the model for the re-padded input shape and both vocabulary sizes.
rnn_model = embed_model(tmp_x.shape,
                        max_fr_seq_len,
                        english_vocab_size,
                        french_vocab_size)

# Train for 20 epochs in batches of 1024, with validation_split=0.2.
rnn_model.fit(tmp_x, pre_fre, batch_size=1024, epochs=20, validation_split=0.2)

# Persist the trained weights to the working directory.
rnn_model.save_weights("rnn_model_weights.h5")

In [None]:
def logits_to_text(logits, tokenizer):
    """Decode per-timestep scores into a space-separated string of words.

    Args:
        logits: 2-D array-like of scores, one row per timestep and one
            column per token id.
        tokenizer: object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for the highest-scoring id of every row, joined by single
        spaces; id 0 is rendered as ``<PAD>``.
    """
    # Invert the word -> id mapping and add the padding entry.
    words_by_index = {0: '<PAD>'}
    for word, index in tokenizer.word_index.items():
        words_by_index[index] = word
    words_by_index[0] = '<PAD>'

    best_ids = np.argmax(logits, axis=1)
    return ' '.join(words_by_index[best] for best in best_ids)

## Checking for translation 

In [None]:
# Translate ten randomly chosen sentences and show each next to its
# reference pair.
for k in range(10):
    # randrange excludes its upper bound and the bound is capped by the data
    # size, so the index is valid even when fewer than 100000 pairs exist.
    a = random.randrange(min(100000, len(tmp_x)))
    print('Random Index: ', a)

    # Predict on a batch holding the one full padded sentence
    # (tmp_x[a:a + 1]) so the bidirectional GRU sees the whole sequence.
    # Passing tmp_x[a] alone would feed every token to the model as a
    # separate length-1 sample.
    decoded = logits_to_text(rnn_model.predict(tmp_x[a:a + 1])[0], fre_tk)
    # split() also drops out-of-vocabulary predictions, which decode to a
    # bare space.
    predicted = [word for word in decoded.split() if word != '<PAD>']
    print("PREDICTED:\t", ' '.join(predicted), end=' ')

    print("\n\nENGLISH:\t", eng[a] + "\nFRENCH:\t\t " + fre[a] + "\n")
    print('\nIndexing: ',tmp_x[a])

    print('\n|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||\n')

## CALCULATING BLEU SCORE

In [None]:
n = []    # scores above 0.8 only
lst = []  # every score

# Score 1000 randomly sampled sentences against their reference translation.
for k in range(1, 1001):
    # randrange excludes its upper bound; randint(0, 175000) could return
    # 175000, one past the last valid index.
    a = random.randrange(len(tmp_x))

    # Predict on a batch holding the one full padded sentence so the model
    # sees the whole sequence; tmp_x[a] alone would be treated as a batch of
    # single-token samples.
    decoded = logits_to_text(rnn_model.predict(tmp_x[a:a + 1])[0], fre_tk)
    # split() also drops out-of-vocabulary predictions, which decode to a
    # bare space.
    predicted = [word for word in decoded.split() if word != '<PAD>']

    reference_tokens = fre[a].split()

    # corpus_bleu expects, per hypothesis, a list of reference token lists.
    references = [[reference_tokens]]
    candidates = [predicted]
    score = corpus_bleu(references, candidates, weights=(0.05, 0.25, 0.35, 0.35))
    lst.append(score)
    if score > .8:
        n.append(score)

    # Progress marker every 100 samples.
    if k % 100 == 0:
        print(k)
def average(lst):
    """Print the arithmetic mean of `lst`.

    Args:
        lst: sized collection of numbers.

    Prints ``nan`` for an empty collection instead of raising
    ZeroDivisionError. Nothing is returned.
    """
    if len(lst) == 0:
        print(float("nan"))
        return
    print(sum(lst)/len(lst))
# Average over every sampled score; `n` only holds the scores above 0.8, so
# averaging it would overstate the result.
average(lst)

In [None]:
print("AVERAGE BLEU SCORE: ", end ='\t')
# `lst` holds every sampled score; `n` only holds the scores above 0.8, so
# averaging it would overstate the result.
average(lst)


---