## Import libraries

In [105]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import GRU,Input,Dense,TimeDistributed,Activation,RepeatVector,Bidirectional,Dropout,LSTM,Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
import os

## Load dataset

In [106]:
english_data = "small_vocab_en.txt"
french_data = "small_vocab_fr.txt"

In [107]:
def load_data(path):
    input_file = os.path.join(path)
    with open (input_file,"r") as f:
        data = f.read()
    return data.split('\n')

In [108]:
english_sentence = load_data(english_data)
french_sentence = load_data(french_data)

In [109]:
for i in range(5):
    print(english_sentence[i])
    print(french_sentence[i])
    print('-'*70)

Mummy,Papa I miss you .
Maman, Papa tu me manques .
----------------------------------------------------------------------
Sakshi I Love You The Most .
Sakshi je t'aime le plus .
----------------------------------------------------------------------
Saurabh I am Proud of You .
Saurabh, je suis fier de toi.
----------------------------------------------------------------------
Shubham I Love You .
Shubham je t'aime .
----------------------------------------------------------------------
new jersey is sometimes quiet during autumn , and it is snowy in april .
new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
----------------------------------------------------------------------


## Convert to Vocabulary

In [110]:
import collections

In [111]:
english_words_counter = collections.Counter([word for sentence in english_sentence for word in sentence.split()]) #list Comprehension
print("English Vocabulary: ",len(english_words_counter))
french_words_counter = collections.Counter([word for sentence in french_sentence for word in sentence.split()]) #list Comprehension
print("French Vocabulary: ",len(french_words_counter))

English Vocabulary:  243
French Vocabulary:  372


## Tokenize (implementation)

In [112]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x),tokenizer

In [113]:
text_sentences = [
    'This is a short sentence .'
    'Nitin,I am lucky to have friend like you .'
]
text_tokenized,text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)

for sample_i,(sent,token_sent) in enumerate (zip(text_sentences,text_tokenized)):
    print("Sequence {} in x".format(sample_i+1))
    print("Input: {} ".format(sent))
    print("Input: {} ".format(token_sent))

{'this': 1, 'is': 2, 'a': 3, 'short': 4, 'sentence': 5, 'nitin': 6, 'i': 7, 'am': 8, 'lucky': 9, 'to': 10, 'have': 11, 'friend': 12, 'like': 13, 'you': 14}
Sequence 1 in x
Input: This is a short sentence .Nitin,I am lucky to have friend like you . 
Input: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] 


## Padding (implementation)

In [114]:
def pad(x, length=None):
    return pad_sequences(x,maxlen=length,padding='post')

In [115]:
#define process function with x and y
def preprocess(x,y):
    preprocess_x,x_tk = tokenize(x)
    preprocess_y,y_tk = tokenize(y)
    
    #padding the data
    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)
    
    #keras's sparese_categorical_crossentropy function requires the labels to be in 3 dimension
    #Expanding dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape,1)
    return preprocess_x,preprocess_y,x_tk,y_tk

#preproc_english_sentence,preproc_french_sentence,english_tokenizer,french_tokenizer 
preproc_english_sentence,preproc_french_sentence,english_tokenizer,french_tokenizer  =\
preprocess(english_sentence,french_sentence)

#print max english sentence length
max_english_sentence_length = preproc_english_sentence.shape[1]
#print max french sentence length
max_french_sentence_length = preproc_french_sentence.shape[1]
#print len of englsih vocabulary
english_vocab_size = len(english_tokenizer.word_index)
#print len of englsih vocabulary
french_vocab_size = len(french_tokenizer.word_index)

In [116]:
print("Data Preprocessed")
print("Max English Sentence Length:",max_english_sentence_length)
print("Max French Sentence Length:",max_french_sentence_length)
print("English Voacbulary Size:",english_vocab_size)
print("French Voacbulary Size:",french_vocab_size)

Data Preprocessed
Max English Sentence Length: 15
Max French Sentence Length: 21
English Voacbulary Size: 211
French Voacbulary Size: 361


## Create Model

In [117]:
def logits_to_text(logits,tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
                      
    index_to_words[0] = '<PAD>'
    
    return ''.join([index_to_words[prediction] for prediction in np.argmax(logits,1)])

# Building Model

In [118]:
def embed_model(input_shape,output_sequence_length,english_vocab_size,french_vocab_size):
    #hyperparameter
    learning_rate = 0.005
    
    #build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size,256,input_length = input_shape[1],input_shape = input_shape[1:]))
    
    #add GRU layer of 256
    model.add(GRU(256,return_sequences=True))
    
    #add Time distribute layer dense
    model.add(TimeDistributed(Dense(1024,activation='relu')))
    
    #adding dropout layer
    model.add(Dropout(0.5))
    
    #model Time distributed dense layer
    model.add(TimeDistributed(Dense(french_vocab_size,activation='softmax')))
    
    #compile model
    model.compile(loss=sparse_categorical_crossentropy,optimizer=Adam(learning_rate),metrics=['accuracy'])
    return model


In [119]:
#Reshaping the input to work with a basic RNN
tmp_x = pad(preproc_english_sentence,preproc_french_sentence.shape[1])
tmp_x = tmp_x.reshape(-1,preproc_french_sentence.shape[-2])

In [120]:
simple_rnn_model = embed_model(
    tmp_x.shape,
    preproc_french_sentence.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1
)

In [121]:
#Print model summary
simple_rnn_model.summary()   

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 21, 256)           54272     
                                                                 
 gru_3 (GRU)                 (None, 21, 256)           394752    
                                                                 
 time_distributed_6 (TimeDi  (None, 21, 1024)          263168    
 stributed)                                                      
                                                                 
 dropout_3 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_7 (TimeDi  (None, 21, 362)           371050    
 stributed)                                                      
                                                                 
Total params: 1083242 (4.13 MB)
Trainable params: 1083

## Training the model  

In [122]:
len(tmp_x)

137865

In [123]:
len(preproc_french_sentence)

137865

In [124]:
history = simple_rnn_model.fit(tmp_x,preproc_french_sentence,batch_size=1024,
                              epochs=20,validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Saving our model

In [125]:
simple_rnn_model.save("Minimodel.h5")

  saving_api.save_model(


## Arbitary Predictions

In [126]:
english_tokenizer.word_index

{'is': 1,
 'in': 2,
 'it': 3,
 'during': 4,
 'the': 5,
 'but': 6,
 'and': 7,
 'sometimes': 8,
 'usually': 9,
 'never': 10,
 'favorite': 11,
 'least': 12,
 'fruit': 13,
 'most': 14,
 'loved': 15,
 'liked': 16,
 'new': 17,
 'paris': 18,
 'india': 19,
 'united': 20,
 'states': 21,
 'california': 22,
 'jersey': 23,
 'france': 24,
 'china': 25,
 'he': 26,
 'she': 27,
 'grapefruit': 28,
 'your': 29,
 'my': 30,
 'his': 31,
 'her': 32,
 'fall': 33,
 'june': 34,
 'spring': 35,
 'january': 36,
 'winter': 37,
 'march': 38,
 'autumn': 39,
 'may': 40,
 'nice': 41,
 'september': 42,
 'july': 43,
 'april': 44,
 'november': 45,
 'summer': 46,
 'december': 47,
 'february': 48,
 'our': 49,
 'their': 50,
 'freezing': 51,
 'pleasant': 52,
 'beautiful': 53,
 'october': 54,
 'snowy': 55,
 'warm': 56,
 'cold': 57,
 'wonderful': 58,
 'dry': 59,
 'busy': 60,
 'august': 61,
 'chilly': 62,
 'rainy': 63,
 'mild': 64,
 'wet': 65,
 'relaxing': 66,
 'quiet': 67,
 'hot': 68,
 'dislikes': 69,
 'likes': 70,
 'limes': 7

In [182]:
# def final_predictions(text):
#     y_id_to_word = {value: key for key,value in french_tokenizer.word_index.items()}
#     y_id_to_word[0] = '<PAD>'
#     sentence = [english_tokenizer.word_index[word] for word in text.split()]
#     sentence = pad_sequences([sentence],maxlen=preproc_french_sentence.shape[-2],padding='post')
# #     print(sentence)
#     text1 = logits_to_text(simple_rnn_model.predict(sentence[:1])[0],french_tokenizer)
#     text2 = ""
#     #iterate over text
#     for i in text1.split():
#         if i == '<PAD>':
#             break
#         else:
#             print(i)
#             text2 = text2+" "+i
#     return text2

In [183]:
def final_predictions(text):
    sentence = [english_tokenizer.word_index.get(word, 0) for word in text.split()]  # Replace unknown words with 0
    sentence = pad_sequences([sentence], maxlen=preproc_french_sentence.shape[-2], padding='post')
    
    # Make predictions using the model
    predictions = simple_rnn_model.predict(sentence[:1])[0]
    
    # Convert predictions to text
    text2 = logits_to_text(predictions, french_tokenizer)
    
    # Remove padding from the generated text
    text2 = text2.replace('<PAD>', '').strip()
    
    return text2


In [186]:
final_predictions(input())

it is during the spring


'ilestenlesprintemps'