<a href="https://colab.research.google.com/github/rmtu14/ATM-/blob/main/English%20to%20french%20convert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf

In [None]:
# Colab-only: mount Google Drive so files stored there are readable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Paths of the English and French corpus text files on the mounted Drive.
english_data = "/content/drive/MyDrive/small_vocab_en.txt"
french_data = "/content/drive/MyDrive/small_vocab_fr.txt"

In [None]:
import os
def load_data(path):
  """Read a text file and return its contents as a list of lines.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of strings obtained by splitting the file contents on '\n'.
    If the file ends with a newline, the last element is an empty string.
  """
  # Decode explicitly as UTF-8: the corpus contains accented characters and
  # the platform default encoding is not UTF-8 everywhere.
  with open(path, "r", encoding="utf-8") as f:
    data = f.read()
  return data.split('\n')


In [None]:
# Load both corpora as lists of lines; entry i of each list is treated as one sentence pair below.
english_sentences = load_data(english_data)
french_sentences = load_data(french_data)

In [None]:
# Preview the first five sentence pairs: the English line followed by the French line.
for i in range(5):
  print('Sample:', i)
  print(english_sentences[i], french_sentences[i], sep='\n')

Sample: 0
new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
Sample: 1
the united states is usually chilly during july , and it is usually freezing in november .
les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .
Sample: 2
california is usually quiet during march , and it is usually hot in june .
california est généralement calme en mars , et il est généralement chaud en juin .
Sample: 3
the united states is sometimes mild during june , and it is cold in september .
les états-unis est parfois légère en juin , et il fait froid en septembre .
Sample: 4
your least liked fruit is the grape , but my least liked is the apple .
votre moins aimé fruit est le raisin , mais mon moins aimé est la pomme .


In [None]:
import collections

In [None]:
# Word-frequency tables for each corpus. Sentences are split on whitespace only,
# so standalone punctuation tokens such as ',' and '.' are counted as words.
english_words_counter = collections.Counter(
    word for sentence in english_sentences for word in sentence.split())
french_words_counter = collections.Counter(
    word for sentence in french_sentences for word in sentence.split())
print(english_words_counter)
print(french_words_counter)

Counter({'is': 205858, ',': 140897, '.': 129039, 'in': 75525, 'it': 75137, 'during': 74933, 'the': 67628, 'but': 63987, 'and': 59850, 'sometimes': 37746, 'usually': 37507, 'never': 37500, 'least': 27564, 'favorite': 27371, 'fruit': 27105, 'most': 14934, 'loved': 13666, 'liked': 13546, 'new': 12197, 'paris': 11334, 'india': 11277, 'united': 11270, 'states': 11270, 'california': 11250, 'jersey': 11225, 'france': 11170, 'china': 10953, 'he': 10786, 'she': 10786, 'grapefruit': 10118, 'your': 9734, 'my': 9700, 'his': 9700, 'her': 9700, 'fall': 9134, 'june': 9133, 'spring': 9102, 'january': 9090, 'winter': 9038, 'march': 9023, 'autumn': 9004, 'may': 8995, 'nice': 8984, 'september': 8958, 'july': 8956, 'april': 8954, 'november': 8951, 'summer': 8948, 'december': 8945, 'february': 8942, 'our': 8932, 'their': 8932, 'freezing': 8928, 'pleasant': 8916, 'beautiful': 8915, 'october': 8910, 'snowy': 8898, 'warm': 8890, 'cold': 8878, 'wonderful': 8808, 'dry': 8794, 'busy': 8791, 'august': 8789, 'chil

In [None]:
def tokenize(x):
  """Fit a fresh Keras Tokenizer on `x` and encode `x` with it.

  Args:
    x: Iterable of sentences (strings) to fit on and convert.

  Returns:
    A tuple `(sequences, tokenizer)`: the list of integer id sequences for
    `x`, and the fitted Tokenizer that produced them.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(x)
  sequences = fitted.texts_to_sequences(x)
  return sequences, fitted

In [None]:
# Small smoke test of tokenize(): show the learned vocabulary and, for every
# sample sentence, the raw text next to its integer encoding.
text_sentences = [
    'the quick brown fox jumps obver the lazy dog',
    'by love, my quck study of prize',
    'This is a short sentence'
]
text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i, (sent, token_sent) in enumerate(zip(text_sentences,text_tokenized)):
  print(
      'Sequence {} in x'.format(sample_i+1),
      'Input {}'.format(sent),
      'Output{}'.format(token_sent),
      sep='\n')


{'the': 1, 'quick': 2, 'brown': 3, 'fox': 4, 'jumps': 5, 'obver': 6, 'lazy': 7, 'dog': 8, 'by': 9, 'love': 10, 'my': 11, 'quck': 12, 'study': 13, 'of': 14, 'prize': 15, 'this': 16, 'is': 17, 'a': 18, 'short': 19, 'sentence': 20}

Sequence 1 in x
Input the quick brown fox jumps obver the lazy dog
Output[1, 2, 3, 4, 5, 6, 1, 7, 8]
Sequence 2 in x
Input by love, my quck study of prize
Output[9, 10, 11, 12, 13, 14, 15]
Sequence 3 in x
Input This is a short sentence
Output[16, 17, 18, 19, 20]


In [None]:
def pad(x, length=None):
  """Pad id sequences at the end so they all share one length.

  Args:
    x: List of integer id sequences.
    length: Target length passed to `pad_sequences` as `maxlen`; with None,
      `pad_sequences` chooses the length itself.

  Returns:
    The array returned by `pad_sequences` with `padding='post'`.
  """
  padded = pad_sequences(x, padding='post', maxlen=length)
  return padded


In [None]:
def preprocess(x,y):
  """Tokenize and pad the source and target sentences.

  Args:
    x: Source sentences (strings).
    y: Target sentences (strings).

  Returns:
    A tuple `(padded_x, padded_y, x_tokenizer, y_tokenizer)`. `padded_y` has
    one extra trailing axis of size 1 compared with the output of `pad`.
  """
  x_ids, x_tk = tokenize(x)
  y_ids, y_tk = tokenize(y)
  x_padded = pad(x_ids)
  y_padded = pad(y_ids)
  # Append a unit axis to the labels: (samples, steps) -> (samples, steps, 1).
  y_padded = y_padded.reshape(y_padded.shape + (1,))
  return x_padded, y_padded, x_tk, y_tk

# Encode both corpora and keep the fitted tokenizers for later decoding.
(preproc_english_sentences,
 preproc_french_sentences,
 english_tokenizer,
 french_tokenizer) = preprocess(english_sentences, french_sentences)

# Padded sequence lengths (axis 1 of each array) and vocabulary sizes
# (number of entries in each tokenizer's word_index).
max_english_sequence_legth = preproc_english_sentences.shape[1]
max_french_sequence_legth = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)
print(
    max_english_sequence_legth,
    max_french_sequence_legth,
    english_vocab_size,
    french_vocab_size,
    sep='\n')



15
21
199
344


In [None]:
def logits_to_text(logits,tokenizer):
  """Decode per-timestep scores into a space-separated string of words.

  Args:
    logits: 2-D array indexed as [timestep, vocabulary id]; the highest
      scoring id is taken at each timestep.
    tokenizer: Object exposing `word_index`, a mapping from word to integer id.

  Returns:
    The predicted words joined by single spaces. Id 0 is rendered as '<PAD>'.
  """
  index_to_words = {index: word for word, index in tokenizer.word_index.items()}
  index_to_words[0] = '<PAD>'
  # Join with a space so the words stay separable; callers split the result on whitespace.
  return ' '.join(index_to_words[prediction] for prediction in np.argmax(logits, 1))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, TimeDistributed, Dense, Dropout

def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """Build and compile the embedding + GRU translation model.

    Args:
        input_shape: Shape of the input array; element 1 is used as the
            sequence length and `input_shape[1:]` as the per-sample shape.
        output_sequence_length: Accepted for the caller's convenience; not
            used inside this function.
        english_vocab_size: Number of rows of the embedding table.
        french_vocab_size: Width of the final softmax layer.

    Returns:
        A compiled `Sequential` model (sparse categorical cross-entropy loss,
        Adam with learning rate 0.005, accuracy metric).
    """
    layer_stack = [
        Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]),
        GRU(256, return_sequences=True),
        TimeDistributed(Dense(1024, activation='relu')),
        Dropout(0.5),
        TimeDistributed(Dense(french_vocab_size, activation='softmax')),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer=Adam(0.005),
        loss=sparse_categorical_crossentropy,
        metrics=['accuracy'],
    )
    return network


In [None]:
# Re-pad the English id sequences out to the French sequence length (axis 1 of the
# French array) so inputs and targets have the same number of timesteps.
tmp_x = pad(preproc_english_sentences, preproc_french_sentences.shape[1])
# The French array is 3-D (see preprocess), so shape[-2] is that same length;
# this keeps tmp_x as a 2-D (samples, length) array.
tmp_x = tmp_x.reshape(-1, preproc_french_sentences.shape[-2])

In [None]:
# Build the model. Vocabulary sizes are len(word_index) + 1 because word ids
# start at 1 and id 0 is used for padding.
simple_rnn_model = embed_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
simple_rnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 21, 256)           51200     
                                                                 
 gru (GRU)                   (None, 21, 256)           394752    
                                                                 
 time_distributed (TimeDist  (None, 21, 1024)          263168    
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 21, 1024)          0         
                                                                 
 time_distributed_1 (TimeDi  (None, 21, 345)           353625    
 stributed)                                                      
                                                                 
Total params: 1062745 (4.05 MB)
Trainable params: 106274

In [None]:
# Train for 20 epochs with batches of 1024, holding out 20% of the samples for validation.
history=simple_rnn_model.fit(tmp_x,preproc_french_sentences,batch_size=1024,epochs=20,validation_split=0.2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Persist the trained model to 'model.h5' in the current working directory.
simple_rnn_model.save('model.h5')

In [None]:
# Display the English word -> id mapping learned by the tokenizer.
english_tokenizer.word_index

In [None]:
def final_predictions(text):
  """Translate one English sentence to French with the trained model.

  Args:
    text: English sentence as a single string.

  Returns:
    The predicted French words joined by single spaces, cut off at the first
    '<PAD>' prediction. Returns an empty string when the first prediction is
    already '<PAD>'.
  """
  y_id_to_word = {value: key for key, value in french_tokenizer.word_index.items()}
  y_id_to_word[0] = '<PAD>'
  # Encode through the tokenizer itself so the input gets the same
  # normalisation as the training data, instead of raising KeyError on
  # capitalised, punctuated or unknown words.
  sentence = english_tokenizer.texts_to_sequences([text])
  sentence = pad_sequences(sentence, maxlen=preproc_french_sentences.shape[-2], padding='post')
  # predict() expects a batch: keep the leading axis on the input and take
  # the first (only) item of the output.
  logits = simple_rnn_model.predict(sentence[:1])[0]
  words = []
  for word_id in np.argmax(logits, 1):
    word = y_id_to_word[word_id]
    if word == '<PAD>':
      break
    words.append(word)
  return ' '.join(words)



In [None]:
# Read an English sentence interactively and display its predicted French translation.
final_predictions(input())