## Import libraries

In [34]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import GRU,Input,Dense,TimeDistributed,Activation,RepeatVector,Bidirectional,Dropout,LSTM,Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
import os

## Load dataset

In [35]:
# Corpus files, resolved relative to the working directory; the cells below pair
# line i of the English file with line i of the French file.
english_data, french_data = "small_vocab_en.txt", "small_vocab_fr.txt"

In [36]:
def load_data(path, encoding="utf-8"):
    """Read a text file and return its content split into lines.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file to read.
    encoding : str, default "utf-8"
        Text encoding used to decode the file. It is passed explicitly because
        the platform default (e.g. cp1252 on Windows) turns accented
        characters of a UTF-8 corpus into mojibake such as "dÃ©teste".

    Returns
    -------
    list of str
        The file content split on '\n'. A file that ends with a newline
        therefore yields a trailing empty string, exactly as before.
    """
    with open(path, "r", encoding=encoding) as f:
        data = f.read()
    return data.split('\n')

In [37]:
# Load both sides of the corpus (English first, then French) as lists of lines
english_sentences, french_sentences = map(load_data, (english_data, french_data))

In [38]:
# Eyeball the first five sentence pairs, each followed by a 70-dash rule
divider = '-' * 70
for pair_index in range(5):
    print(english_sentences[pair_index])
    print(french_sentences[pair_index])
    print(divider)

good morning .
bonjour .
----------------------------------------------------------------------
good night .
bonne nuit .
----------------------------------------------------------------------
hello srushti .
Bonjour Srishti .
----------------------------------------------------------------------
i hate you .
je te déteste .
----------------------------------------------------------------------
my name is sanjana bafana .
Je m'appelle Sanjana Bafana .
----------------------------------------------------------------------


## Convert to Vocabulary

In [39]:
import collections

In [40]:
# Count whitespace-separated tokens per language; the number of distinct keys
# is the raw vocabulary size (case-sensitive, punctuation counted as tokens).
english_words_counter = collections.Counter(
    token for line in english_sentences for token in line.split()
)
print('English Vocab:', len(english_words_counter))

french_words_counter = collections.Counter(
    token for line in french_sentences for token in line.split()
)
print('French Vocab:', len(french_words_counter))

English Vocab: 282
French Vocab: 408


## Tokenize (implementation)

In [41]:
def tokenize(x):
    """Fit a fresh Keras Tokenizer on ``x`` and encode ``x`` with it.

    Returns a pair ``(sequences, tokenizer)``: the texts converted to lists of
    integer word ids, and the fitted tokenizer that holds the word -> id map.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(x)
    encoded_texts = fitted_tokenizer.texts_to_sequences(x)
    return encoded_texts, fitted_tokenizer

In [42]:
# Small worked example showing what tokenize() produces
text_sentences = [
    'This is a short sentence .',
    'Nitin,I am lucky to have friend like you .'
]
text_tokenized,text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()
for sample_i,(sent,token_sent) in enumerate (zip(text_sentences,text_tokenized)):
    print("Sequence {} in x".format(sample_i+1))
    print("Input: {} ".format(sent))
    # The second line shows the tokenized ids, so label it as the output
    # instead of repeating "Input:".
    print("Output: {} ".format(token_sent))

{'this': 1, 'is': 2, 'a': 3, 'short': 4, 'sentence': 5, 'nitin': 6, 'i': 7, 'am': 8, 'lucky': 9, 'to': 10, 'have': 11, 'friend': 12, 'like': 13, 'you': 14}

Sequence 1 in x
Input: This is a short sentence . 
Input: [1, 2, 3, 4, 5] 
Sequence 2 in x
Input: Nitin,I am lucky to have friend like you . 
Input: [6, 7, 8, 9, 10, 11, 12, 13, 14] 


## Padding (implementation)

In [43]:
def pad(x, length=None):
    """Pad integer sequences at the end ('post') so they share one length.

    ``length`` is forwarded to Keras ``pad_sequences`` as ``maxlen``; with the
    default ``None`` Keras picks the target length itself.
    """
    padded = pad_sequences(x, maxlen=length, padding='post')
    return padded

In [44]:
def preprocess(x,y):
    """Tokenize and pad the source (``x``) and target (``y``) sentence lists.

    Returns the 4-tuple ``(padded_x, padded_y, x_tokenizer, y_tokenizer)``.
    ``padded_y`` carries an extra trailing axis of size 1 because Keras'
    sparse_categorical_crossentropy needs the labels in 3 dimensions.
    """
    x_ids, x_tk = tokenize(x)
    y_ids, y_tk = tokenize(y)

    # Pad each side independently (no explicit target length is given)
    x_padded = pad(x_ids)
    y_padded = pad(y_ids)

    # Append a trailing axis of size 1 to the label array
    y_labels = y_padded.reshape(y_padded.shape + (1,))
    return x_padded, y_labels, x_tk, y_tk

# Tokenize and pad the whole corpus once; both tokenizers are kept for later decoding
(
    preproc_english_sentences,
    preproc_french_sentences,
    english_tokenizer,
    french_tokenizer,
) = preprocess(english_sentences, french_sentences)

# Padded sequence length of each side (axis 1 of the arrays)
max_english_sentence_length, max_french_sentence_length = (
    preproc_english_sentences.shape[1],
    preproc_french_sentences.shape[1],
)

# Number of distinct words known to each tokenizer
english_vocab_size, french_vocab_size = (
    len(english_tokenizer.word_index),
    len(french_tokenizer.word_index),
)

In [45]:
# Report the corpus statistics computed in the previous cell
print("Data Preprocessed")
for label, value in (
    ("Max English Sentence Length:", max_english_sentence_length),
    ("Max French Sentence Length:", max_french_sentence_length),
    ("English Voacbulary Size:", english_vocab_size),
    ("French Voacbulary Size:", french_vocab_size),
):
    print(label, value)

Data Preprocessed
Max English Sentence Length: 15
Max French Sentence Length: 21
English Voacbulary Size: 249
French Voacbulary Size: 396


## Create Model

In [46]:
def logits_to_text(logits,tokenizer):
    """Decode a matrix of per-timestep scores into a space-separated string.

    For every row of ``logits`` the index of the largest value is looked up in
    the inverse of ``tokenizer.word_index``; index 0 is rendered as '<PAD>'.
    """
    id_to_token = dict(
        (token_id, token) for token, token_id in tokenizer.word_index.items()
    )
    # Index 0 is never assigned to a word by the tokenizer output shown above;
    # it marks padding positions.
    id_to_token[0] = '<PAD>'

    best_ids = np.argmax(logits, 1)
    return ' '.join(id_to_token[best_id] for best_id in best_ids)

# Building Model

In [47]:
def embed_model(input_shape,output_sequence_length,english_vocab_size,french_vocab_size,learning_rate=0.005):
    """Build and compile an Embedding -> GRU -> Dense translation model.

    Parameters
    ----------
    input_shape : tuple
        Shape of the training input array including the batch axis;
        ``input_shape[1]`` is used as the input sequence length.
    output_sequence_length : int
        Not used by this architecture (it emits one prediction per input
        timestep); kept in the signature so existing callers keep working.
    english_vocab_size : int
        Input dimension of the embedding layer.
    french_vocab_size : int
        Number of units of the final softmax layer.
    learning_rate : float, default 0.005
        Learning rate handed to the Adam optimizer. The default reproduces
        the value that used to be hard-coded here.

    Returns
    -------
    A compiled ``Sequential`` model trained with sparse categorical
    cross-entropy and reporting accuracy.
    """
    model = Sequential([
        # Map word ids to 256-dimensional vectors
        Embedding(english_vocab_size,256,input_length = input_shape[1],input_shape = input_shape[1:]),
        # return_sequences=True keeps one GRU output per timestep
        GRU(256,return_sequences=True),
        # Per-timestep hidden projection
        TimeDistributed(Dense(1024,activation='relu')),
        Dropout(0.5),
        # Per-timestep distribution over the French vocabulary
        TimeDistributed(Dense(french_vocab_size,activation='softmax')),
    ])

    model.compile(loss=sparse_categorical_crossentropy,optimizer=Adam(learning_rate=learning_rate),metrics=['accuracy'])
    return model


In [48]:
# Pad the English ids out to the French sequence length, then reshape to
# (n_sentences, french_sequence_length) for the model input.
tmp_x = pad(
    preproc_english_sentences,
    preproc_french_sentences.shape[1],
).reshape((-1, preproc_french_sentences.shape[-2]))

In [49]:
# The +1 on both vocabulary sizes leaves room for id 0, which marks padding
simple_rnn_model = embed_model(
    input_shape=tmp_x.shape,
    output_sequence_length=preproc_french_sentences.shape[1],
    english_vocab_size=len(english_tokenizer.word_index) + 1,
    french_vocab_size=len(french_tokenizer.word_index) + 1,
)

In [50]:
# Print the layer-by-layer summary (output shapes and parameter counts) to sanity-check the architecture
simple_rnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 21, 256)           64000     
                                                                 
 gru_1 (GRU)                 (None, 21, 256)           394752    
                                                                 
 time_distributed_2 (TimeDi  (None, 21, 1024)          263168    
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDi  (None, 21, 397)           406925    
 stributed)                                                      
                                                                 
Total params: 1128845 (4.31 MB)
Trainable params: 1128

## Training the model  

In [51]:
# Train for 20 epochs in batches of 1024, holding out 20% of the samples for validation
history = simple_rnn_model.fit(
    x=tmp_x,
    y=preproc_french_sentences,
    batch_size=1024,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20

KeyboardInterrupt: 

## Saving our model

In [None]:
# Write the model to "Minimodel.h5" in the working directory.
# NOTE(review): the recorded training run above ended in KeyboardInterrupt, so make
# sure training actually completed before relying on the saved file.
simple_rnn_model.save("Minimodel.h5")

## Arbitrary Predictions

In [None]:
# Display the English word -> integer id vocabulary learned by the tokenizer;
# only these words can be encoded at prediction time.
english_tokenizer.word_index

In [None]:
def final_predictions(text):
    """Translate an English sentence into French with the trained model.

    The sentence is encoded with ``english_tokenizer.texts_to_sequences`` so it
    gets the same normalisation the tokenizer applied to the training data
    (the tokenizer output earlier in this notebook shows lower-casing and
    punctuation removal). Looking words up directly in ``word_index`` raised
    KeyError for inputs such as "Good morning ." because neither "Good" nor "."
    is a key. Words the tokenizer does not know are skipped instead of raising.

    Parameters
    ----------
    text : str
        English sentence to translate.

    Returns
    -------
    str
        Predicted French words joined by single spaces, cut at the first
        '<PAD>' token. Unlike earlier versions, the result has no leading space.
    """
    token_ids = english_tokenizer.texts_to_sequences([text])
    # The model was trained on inputs padded to the French sequence length
    model_input = pad_sequences(token_ids,maxlen=preproc_french_sentences.shape[-2],padding='post')
    decoded = logits_to_text(simple_rnn_model.predict(model_input)[0],french_tokenizer)

    # Keep only the words that come before the first padding marker
    words = []
    for token in decoded.split():
        if token == '<PAD>':
            break
        words.append(token)
    return ' '.join(words)

In [None]:
# Read an English sentence from the user and display the predicted French translation.
# NOTE: input() blocks until text is entered, so this cell stalls an unattended "Run All".
final_predictions(input())