# English to French Translator using TensorFlow

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras. models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

In [5]:
# Loading Data
# Locations of the two corpus files: one English text file and one French text file.
# NOTE(review): the '/content/...' prefix presumably targets a Colab runtime — adjust when running elsewhere.
english_data = '/content/Language Translation Files/small_vocab_en.txt'
french_data = '/content/Language Translation Files/small_vocab_fr.txt'

In [6]:
import os
def load_data(path):
  '''
  Read a text file and return its contents split into lines
  :param path: Path of the text file to read
  :return: List of strings produced by splitting the file contents on newline
           characters; a file that ends with a newline yields a trailing empty string
  '''
  # Decode explicitly as UTF-8: the French corpus contains accented characters and
  # the platform default encoding is not UTF-8 everywhere
  with open(path, 'r', encoding='utf-8') as f:
    data = f.read()
  return data.split('\n')

In [7]:
# Read both corpus files with the same loader: English first, French second
english_sentences, french_sentences = map(load_data, (english_data, french_data))

In [8]:
# Preview the first five English/French pairs side by side
for i in range(5):
  print('Sample:', i)
  # A single print with a newline separator emits the same three lines
  print(english_sentences[i], french_sentences[i], '-'*50, sep='\n')

Sample: 0
new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
--------------------------------------------------
Sample: 1
the united states is usually chilly during july , and it is usually freezing in november .
les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .
--------------------------------------------------
Sample: 2
california is usually quiet during march , and it is usually hot in june .
california est généralement calme en mars , et il est généralement chaud en juin .
--------------------------------------------------
Sample: 3
the united states is sometimes mild during june , and it is cold in september .
les états-unis est parfois légère en juin , et il fait froid en septembre .
--------------------------------------------------
Sample: 4
your least liked fruit is the grape , but my least liked is the apple .
votre moins aimé fruit est

## Convert to Vocabulary

In [9]:
import collections

In [10]:
# Count every whitespace-separated token in each corpus; the number of distinct
# keys in a counter is that corpus's vocabulary size
english_words_counter = collections.Counter(
    token for line in english_sentences for token in line.split())
print('English Vocab : ', len(english_words_counter))
french_words_counter = collections.Counter(
    token for line in french_sentences for token in line.split())
print('French Vocab : ', len(french_words_counter))

English Vocab :  227
French Vocab :  355


## Tokenize

In [11]:
def tokenize(x):
  '''
  Fit a fresh Tokenizer on x and convert x to sequences of word ids
  :param x: List of sentences/strings to be tokenized
  :return: Tuple of (sequences of ids for x, the fitted tokenizer)
  '''
  fitted = Tokenizer()
  fitted.fit_on_texts(x)
  id_sequences = fitted.texts_to_sequences(x)
  return id_sequences, fitted

In [12]:
# Small demonstration of tokenize() on three hand-written sentences
text_sentences = [
  'The quick brown fox jumps over the lazy dog.',
  'By Jove, my quick study of lexicography won a prize.',
  'This is a short sentence.']

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()

# Show each raw sentence next to the id sequence produced for it
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
  # The template has a single placeholder, so format() receives exactly one value
  print('Sequence {} in x'.format(sample_i))
  print('Input: {}'.format(sent))
  print('Output: {}'.format(token_sent))

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 0 in x
Input: The quick brown fox jumps over the lazy dog.
Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 1 in x
Input: By Jove, my quick study of lexicography won a prize.
Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 2 in x
Input: This is a short sentence.
Output: [18, 19, 3, 20, 21]


## Padding

In [13]:
def pad(x, length=None):
  '''
  Pad sequences at the end ('post') via pad_sequences
  :param x: List of sequences
  :param length: Value forwarded as maxlen; None leaves the length choice to pad_sequences
  :return: Result of pad_sequences
  '''
  padded = pad_sequences(x, padding='post', maxlen=length)
  return padded

In [14]:
def preprocess(x, y):
  '''
  Tokenize and pad two collections of sentences
  :param x: Feature sentences
  :param y: Label sentences
  :return: Tuple of (padded x, padded y with an extra trailing axis of size 1,
           tokenizer fitted on x, tokenizer fitted on y)
  '''
  # Each side gets its own tokenizer, fitted only on its own sentences
  x_ids, x_tk = tokenize(x)
  y_ids, y_tk = tokenize(y)

  x_padded = pad(x_ids)
  y_padded = pad(y_ids)

  # Append a size-1 axis to the labels: keras sparse_categorical_crossentropy requires 3 dimensions
  y_padded = y_padded.reshape(y_padded.shape + (1,))
  return x_padded, y_padded, x_tk, y_tk

# Run the full preprocessing once: English sentences are the features, French the labels
preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

# Max sentence length
# (axis 1 of each padded array)
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
# Len of vocabulary
# (number of entries in word_index; the model-building cell adds 1 to these counts)
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

# Summary of the preprocessed data
print('Data Processed')
print('Max english sentence length: ', max_english_sequence_length)
print('Max french sentence length: ', max_french_sequence_length)
print('English vocabulary size: ', english_vocab_size)
print('French vocabulary size: ', french_vocab_size)

Data Processed
Max english sentence length:  15
Max french sentence length:  21
English vocabulary size:  199
French vocabulary size:  344


## Model

### IDs back to text

In [15]:
def logits_to_text(logits, tokenizer):
  '''
  Turn per-row prediction scores into a string of words
  :param logits: 2-D scores; the argmax along axis 1 gives one word id per row
  :param tokenizer: Object whose word_index maps each word to its id
  :return: The selected words joined by single spaces, with id 0 rendered as '<PAD>'
  '''
  # Invert word -> id into id -> word, then give id 0 a printable name
  vocabulary = tokenizer.word_index
  id_lookup = dict(zip(vocabulary.values(), vocabulary.keys()))
  id_lookup[0] = '<PAD>'

  # Pick the highest-scoring id in every row and map it back to its word
  best_ids = np.argmax(logits, axis=1)
  return ' '.join(id_lookup[best] for best in best_ids)

## Building Model

In [16]:
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
  '''
  Build and compile (without fitting) a GRU model with a word-embedding input layer
  :param input_shape: Tuple of input shape; element 1 is used as the input sequence length
  :param output_sequence_length: Length of output sequence (not referenced in the body)
  :param english_vocab_size: Input vocabulary size given to the Embedding layer
  :param french_vocab_size: Number of units in the final softmax layer
  :return: The compiled Sequential model
  '''

  learning_rate = 0.005

  # Same layers, in the same order, handed to Sequential as one list
  stack = [
    Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
    GRU(256, return_sequences=True),
    TimeDistributed(Dense(1024, activation='relu')),
    Dropout(0.5),
    TimeDistributed(Dense(french_vocab_size, activation='softmax')),
  ]
  model = Sequential(stack)

  model.compile(
    loss=sparse_categorical_crossentropy,
    optimizer=Adam(learning_rate),
    metrics=['accuracy'],
  )

  return model

In [17]:
# Pad the English ids out to the French sequence length (axis 1 of the French
# array), then reshape to two dimensions with that same width
tmp_x = pad(
    preproc_english_sentences,
    preproc_french_sentences.shape[1],
).reshape((-1, preproc_french_sentences.shape[-2]))

In [18]:
# Arguments, in order: input shape, output sequence length, English vocab size, French vocab size.
# The output sequence length is axis 1 of the padded French array; its last axis is
# the size-1 label axis added in preprocess(), not a sequence length.
# Vocabulary sizes are len(word_index)+1 because word ids start at 1 and id 0 is used for padding.
simple_rnn_model = embed_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1
)

In [19]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model built above
simple_rnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 256)           51200     
                                                                 
 gru (GRU)                   (None, 21, 256)           394752    
                                                                 
 time_distributed (TimeDist  (None, 21, 1024)          263168    
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 21, 1024)          0         
                                                                 
 time_distributed_1 (TimeDi  (None, 21, 345)           353625    
 stributed)                                                      
                                                                 
Total params: 1062745 (4.05 MB)
Trainable params: 106274

## Training

In [20]:
# Fit on the padded English ids against the French label ids, holding out 20% for validation
history = simple_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [21]:
# Saving Model
# Writes the trained model to 'model.h5', a path relative to the current working directory.
# NOTE(review): the '.h5' suffix presumably selects the legacy HDF5 format — confirm whether the native '.keras' format is preferred.
simple_rnn_model.save('model.h5')

  saving_api.save_model(


## Arbitrary Predictions
### Performing predictions with the trained model on user input.

In [22]:
# Display the English word -> id mapping held by the fitted tokenizer
english_tokenizer.word_index

{'is': 1,
 'in': 2,
 'it': 3,
 'during': 4,
 'the': 5,
 'but': 6,
 'and': 7,
 'sometimes': 8,
 'usually': 9,
 'never': 10,
 'favorite': 11,
 'least': 12,
 'fruit': 13,
 'most': 14,
 'loved': 15,
 'liked': 16,
 'new': 17,
 'paris': 18,
 'india': 19,
 'united': 20,
 'states': 21,
 'california': 22,
 'jersey': 23,
 'france': 24,
 'china': 25,
 'he': 26,
 'she': 27,
 'grapefruit': 28,
 'your': 29,
 'my': 30,
 'his': 31,
 'her': 32,
 'fall': 33,
 'june': 34,
 'spring': 35,
 'january': 36,
 'winter': 37,
 'march': 38,
 'autumn': 39,
 'may': 40,
 'nice': 41,
 'september': 42,
 'july': 43,
 'april': 44,
 'november': 45,
 'summer': 46,
 'december': 47,
 'february': 48,
 'our': 49,
 'their': 50,
 'freezing': 51,
 'pleasant': 52,
 'beautiful': 53,
 'october': 54,
 'snowy': 55,
 'warm': 56,
 'cold': 57,
 'wonderful': 58,
 'dry': 59,
 'busy': 60,
 'august': 61,
 'chilly': 62,
 'rainy': 63,
 'mild': 64,
 'wet': 65,
 'relaxing': 66,
 'quiet': 67,
 'hot': 68,
 'dislikes': 69,
 'likes': 70,
 'limes': 7

In [35]:
def final_predictions(text):
  '''
  Translate an English sentence to French with the trained model
  :param text: English sentence typed by the user
  :return: Predicted French words up to (not including) the first '<PAD>',
           joined by single spaces with no leading or trailing space
  '''
  # Encode through the fitted tokenizer rather than indexing word_index directly:
  # the input gets the same normalisation the training text received, and words
  # missing from the vocabulary are skipped instead of raising KeyError
  sentence = english_tokenizer.texts_to_sequences([text])
  # Pad to the sequence length the model was built with (axis -2 of the French labels)
  sentence = pad_sequences(sentence, maxlen=preproc_french_sentences.shape[-2], padding='post')
  predicted = logits_to_text(simple_rnn_model.predict(sentence[:1])[0], french_tokenizer)

  # Keep everything before the first padding marker
  words = []
  for word in predicted.split():
    if word == '<PAD>':
      break
    words.append(word)
  return ' '.join(words)

In [47]:
# Read one line typed by the user and show its translation as the cell result
final_predictions(input())

most loved fruit


' fruit le plus aimé'

In [48]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.36.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.1 (from gradio)
  Downloading gradio_client-1.0.1-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [49]:
import gradio as gr

In [53]:
# Web UI around final_predictions: a two-line input textbox and a textbox for the result
interface = gr.Interface(
    fn=final_predictions,
    inputs=gr.Textbox(lines=2, placeholder='Text to translate'),
    outputs=gr.Textbox(),
)

# debug=True keeps this cell running so errors and logs stay visible
interface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://9c521a7f6635970ec3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://9c521a7f6635970ec3.gradio.live


