<a href="https://colab.research.google.com/github/saurav188/Apple-home-page-duplucate/blob/master/English_to_French_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **English to French Translator**

In [None]:
# Mount Google Drive (Colab only); later cells read the CSV and the model checkpoint from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
import collections

# Load the English/French sentence-pair CSV from the mounted Drive folder.
df=pd.read_csv("/content/drive/MyDrive/en-fr.csv/eng_-french.csv")

# Data Preprocessing

In [None]:
# (rows, columns) of the loaded frame.
df.shape

(175621, 2)

In [None]:
# Preview the first rows to check the column names and contents.
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [None]:
# Count missing values per column.
df.isna().sum()

English words/sentences    0
French words/sentences     0
dtype: int64

In [None]:
# Characters handed to the Keras Tokenizer as its `filters` argument in tokenize().
# '.' is not part of this set, so a trailing period stays attached to its word
# (e.g. the decoded output "vaisselle." further down).
special_characters = '"!@#$%^&*()-+?_|=,<>/"'

In [None]:
def tokenize(x):
    """Fit a fresh Keras Tokenizer on `x` and integer-encode the same texts.

    Args:
        x: collection of text strings; it is consumed twice (fit, then encode).

    Returns:
        A tuple `(sequences, tokenizer)`: the encoded texts as returned by
        `texts_to_sequences`, and the fitted tokenizer.
    """
    # Lower-case the text, drop the characters in `special_characters`, split on spaces.
    tokenizer = Tokenizer(filters=special_characters, lower=True, split=" ")
    tokenizer.fit_on_texts(x)
    sequences = tokenizer.texts_to_sequences(x)
    return sequences, tokenizer

In [None]:
def pad(x, length=None):
    """Pad integer sequences with trailing zeros to a common length.

    Args:
        x: list of integer sequences.
        length: target sequence length. When None, 55 is used, which is the
            sequence length shown in the loaded model's summary.

    Returns:
        The array produced by `pad_sequences` with `padding='post'`.
    """
    # Previously `length` was accepted but ignored; honour it while keeping 55 as the default.
    maxlen = 55 if length is None else length
    return pad_sequences(x, maxlen = maxlen, padding = 'post')

In [None]:
def preproc(x,length=None):
  """Tokenize and pad one text column.

  Args:
    x: pandas Series of sentences; every value is converted with `str()` first.
    length: forwarded unchanged to `pad()`.

  Returns:
    A tuple `(padded_sequences, tokenizer)`.
  """
  as_text=x.apply(lambda value:str(value))
  sequences,fitted_tokenizer=tokenize(as_text)
  return pad(sequences,length),fitted_tokenizer

In [None]:
# Encode and pad each language column; keep the fitted tokenizers for decoding predictions later.
preproc_english,english_tokenizer=preproc(df["English words/sentences"])
preproc_french,french_tokenizer=preproc(df["French words/sentences"])

In [None]:
# Number of distinct tokens each tokenizer learned.
# NOTE(review): Keras word indices presumably start at 1 (0 appears as padding in the encoded
# arrays), so layers sized from these values may need +1 — confirm against how the checkpoint was trained.
eng_vocab_size=len(english_tokenizer.word_index)
frn_vocab_size=len(french_tokenizer.word_index)

## Integer-encoded representation of sentences

In [None]:
# Show one sentence pair next to its padded integer encoding.
i=98
for column,encoded in ((0,preproc_english),(1,preproc_french)):
  print(df.iloc[i,column],"\nis represented as \n",encoded[i])

Come in. 
is represented as 
 [ 91 449   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0]
Entre. 
is represented as 
 [8126    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0]


## Analysing Data


In [None]:
def _count_words(sentences):
    """Count whitespace-separated words across an iterable of sentence strings."""
    return collections.Counter(word for sentence in sentences for word in sentence.split())


def _print_word_stats(language, counter):
    """Print the total, the unique count and the 10 most common words for one language."""
    # The sum of the counts equals the total number of words, so the corpus
    # does not need to be re-tokenized just to take its length.
    print('{} {} words.'.format(sum(counter.values()), language))
    print('{} unique {} words.'.format(len(counter), language))
    print('\n10 Most common words in the {} dataset:'.format(language))
    print('"' + '" "'.join(word for word, _ in counter.most_common(10)) + '"')


english_words_counter = _count_words(df["English words/sentences"])
french_words_counter = _count_words(df["French words/sentences"])

_print_word_stats("English", english_words_counter)
print("\n-------------------------------------------------------------------\n")
_print_word_stats("French", french_words_counter)

1082098 English words.
27393 unique English words.

10 Most common words in the English dataset:
"I" "to" "you" "the" "a" "is" "Tom" "of" "in" "have"

-------------------------------------------------------------------

1177832 French words.
44918 unique French words.

10 Most common words in the French dataset:
"de" "Je" "?" "pas" "que" "à" "ne" "la" "le" "Il"


# Building the model


In [None]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the bidirectional-GRU translation model.

    Layers, in order: a 256-dimensional embedding, a bidirectional GRU with
    256 units that returns the full sequence, a time-distributed 1024-unit
    ReLU dense layer, and a time-distributed softmax over `french_vocab_size`.

    Args:
        input_shape: shape of the input array; index 1 is used as the
            embedding's input length and `input_shape[1:]` as its input shape.
        output_sequence_length: accepted but not used by this function.
        english_vocab_size: input dimension of the embedding layer.
        french_vocab_size: number of output classes per time step.

    Returns:
        The model, compiled with sparse categorical cross-entropy,
        Adam (learning rate 0.003) and an accuracy metric.
    """
    learning_rate = 0.003

    # Layers are created in the same order as before so their auto-generated names are unchanged.
    embedding = layers.Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:])
    encoder = layers.Bidirectional(layers.GRU(256, return_sequences=True))
    hidden = layers.TimeDistributed(layers.Dense(1024, activation='relu'))
    output = layers.TimeDistributed(layers.Dense(french_vocab_size, activation='softmax'))
    model = keras.Sequential([embedding, encoder, hidden, output])

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Load the saved model from the Drive checkpoint and print its layer summary.
# NOTE(review): bd_model() is not called in the visible cells; the model used below comes from this file.
model=keras.models.load_model("/content/drive/MyDrive/en-fr.csv/checkpoint.h5")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 55, 256)           5261568   
                                                                 
 bidirectional (Bidirectiona  (None, 55, 512)          789504    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 55, 1024)         525312    
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 55, 40159)        41162975  
 tributed)                                                       
                                                                 
Total params: 47,739,359
Trainable params: 47,739,359
Non-trainable params: 0
____________________________________________

# Testing The Model

In [None]:
# Evaluate the loaded model on rows 10000..19999 of the padded corpus.
# NOTE(review): whether these rows were excluded from training is not visible in this notebook —
# if they were not, the figures printed below are training-set metrics, not held-out test metrics.
i=10000
j=20000

score = model.evaluate(preproc_english[i:j], preproc_french[i:j], verbose=0)
print('Test loss    :', score[0])
print('Test accuracy:', score[1])

Test loss    : 0.13109400868415833
Test accuracy: 0.9681817889213562


In [None]:
def sequence_to_text(sequence, tokenizer):
  """Decode per-position class scores into a space-joined string.

  Args:
    sequence: 2D array of scores, one row per position; the argmax along
      axis 1 selects the word id for that position.
    tokenizer: object exposing a `word_index` mapping of word -> integer id.

  Returns:
    The decoded words joined by single spaces. Id 0 decodes to an empty
    string, so padded positions show up as extra spaces.
  """
  vocabulary = {}
  for token, token_id in tokenizer.word_index.items():
    vocabulary[token_id] = token
  # Assigned last so that id 0 always decodes to the empty string.
  vocabulary[0] = ''

  best_ids = np.argmax(sequence, 1)
  decoded = [vocabulary[token_id] for token_id in best_ids]
  return ' '.join(decoded)

In [None]:
def process_input(str):
  """Normalise a user sentence before vocabulary lookup.

  Splits on single spaces, removes at most one trailing '.', '?' or '!' from
  each word, drops words that end up empty, and re-joins with single spaces.

  Args:
    str: raw input sentence. The name shadows the builtin but is kept so
      existing keyword callers keep working.

  Returns:
    The cleaned sentence; an empty string when nothing remains.
  """
  cleaned=[]
  for word in str.split(" "):
    # Consecutive, leading or trailing spaces produce empty tokens; skip them
    # before indexing word[-1], which used to raise IndexError.
    if not word:
      continue
    if word[-1] in ".?!":
      word=word[:-1]
    if word:
      cleaned.append(word)
  return " ".join(cleaned)

In [None]:
# Translate one corpus sentence and print it next to the reference translation.
i=12724
test_input=preproc_english[i:i+1]  # slice rather than index so a leading batch dimension of 1 is kept
predicted_sequence=model.predict(test_input,verbose=0)[0]  # take the single item of the batch
translated_text=sequence_to_text(predicted_sequence,french_tokenizer)

print("Original text :")
print(df["English words/sentences"][i])
print("\nOriginal translated text is :")
print(df["French words/sentences"][i])
print("\ntranslated text is :")
print(translated_text)

Original text :
Wash the dishes.

Original translated text is :
Lave la vaisselle !

translated text is :
lave la vaisselle.                                                    


# Translating an input English sentence

In [None]:
# Read a sentence from the user, map its words to English token ids and decode the prediction.
inputed_text=input("Enter text to translater: ")
inputed_text=process_input(inputed_text)

word_index_english=english_tokenizer.word_index
word_index_french=french_tokenizer.word_index

# Lower-case the words because the tokenizer was fitted with lower=True.
input_words=[word.lower() for word in inputed_text.split(" ")]
unknown_words=[word for word in input_words if word not in word_index_english]

# Check the vocabulary explicitly instead of wrapping everything in a bare `except:`,
# so genuine failures (model errors, KeyboardInterrupt) are no longer reported as unknown words.
if unknown_words:
  print("Inputed sentence contains words that are unrecognisable by the model try other sentence.")
else:
  preproced_input=pad([[word_index_english[word] for word in input_words]])
  predicted_sequence=model.predict(preproced_input,verbose=0)[0]
  translated_text=sequence_to_text(predicted_sequence,french_tokenizer)

  print("\nTranslated text is :")
  print(translated_text)

Enter text to translater: are you fine?

Translated text is :
êtes tu bien                                                    
