In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, GRU, TimeDistributed, Input, Embedding, Dropout, Bidirectional, RepeatVector
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.regularizers import l2

In [0]:
def helper(file_path):
  with open(file_path) as f:
    data = f.read()
    return data.split('\n')

In [0]:
english_data = helper('/content/small_vocab_en')
french_data = helper('/content/small_vocab_fr')

In [5]:
print('english :{} \n french translation: {}'.format(english_data[0],french_data[0]))

english :new jersey is sometimes quiet during autumn , and it is snowy in april . 
 french translation: new jersey est parfois calme pendant l' automne , et il est neigeux en avril .


In [0]:
counter_english = Counter([w for sentences in english_data for w in sentences.split()])
counter_french = Counter([w for sentences in french_data for w in sentences.split()])


In [7]:
print('unique english words: {}'.format(len(counter_english)))
print('english words: {}'.format(len([w for sentences in english_data for w in sentences.split()])))
print('unique french words: {}'.format(len(counter_french)))
print('french words: {}'.format(len([w for sentences in french_data for w in sentences.split()])))


unique english words: 227
english words: 1823250
unique french words: 355
french words: 1961295


In [0]:
def token(x):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x)
  return tokenizer.texts_to_sequences(x), tokenizer


def pad(x,length=None):
  if length==None:
    length = np.max([len(i) for i in x])
  return pad_sequences(x, maxlen=length, padding='post')

def preprocess(x,y):
  x_tokenized, x_tok = token(x)
  y_tokenized, y_tok = token(y)

  x_padded = pad(x_tokenized)
  y_padded = pad(y_tokenized)

  y_padded = y_padded.reshape(*y_padded.shape,1)
  return x_padded, x_tok, y_padded, y_tok

In [0]:
prepoc_x_sent, x_tokenizer, prepoc_y_sent, y_tokenizer = preprocess(english_data, french_data)

In [0]:
def logits_to_words(output,tokenizer):
  id2word = {i:w for (w,i) in tokenizer.word_index.items()}
  id2word[0] = '<PAD>'
  value = [np.argmax(i) for i in output]
  seq = [id2word[i] for i in value]
  return ' '.join(seq)

In [0]:
english_vocab_size = len(x_tokenizer.word_index)
french_vocab_size = len(y_tokenizer.word_index)

In [0]:
#### BASIC RNN MODEL ####
def simple_model(input_shape,french_vocab_size):
  lr = 1e-3
  input_seq = Input(input_shape[1:])
  rnn = GRU(256,return_sequences=True)(input_seq)
  rnn2 = GRU(128,return_sequences=True)(rnn)
  logits = TimeDistributed(Dense(french_vocab_size, activation='softmax'))(rnn2)
  model = Model(input_seq, logits)
  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(lr),
                metrics=['acc'])
  return model

In [0]:
prepoc_x_sent = pad(prepoc_x_sent,length=prepoc_y_sent.shape[1])
prepoc_x_sent = prepoc_x_sent.reshape(-1,prepoc_x_sent.shape[1],1)

In [16]:
prepoc_x_sent.shape

(137861, 21, 1)

In [17]:
prepoc_y_sent.shape

(137861, 21, 1)

In [24]:
simple_rnn_model = simple_model(input_shape=prepoc_x_sent.shape,french_vocab_size=len(y_tokenizer.word_index))
simple_rnn_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 21, 1)]           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 21, 256)           198912    
_________________________________________________________________
gru_2 (GRU)                  (None, 21, 128)           148224    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 21, 344)           44376     
Total params: 391,512
Trainable params: 391,512
Non-trainable params: 0
_________________________________________________________________


In [116]:
simple_rnn_model.fit(prepoc_x_sent,prepoc_y_sent, batch_size=1024, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f211a13d0f0>

In [0]:
#### EMBEDDING MODEL #####
def embed_model(input_shape, english_vocab_size, french_vocab_size, output_length):
  # input_seq = Input(input_shape[1])
  # embedding= Embedding(english_vocab_size,64,input_length=output_length)(input_seq)
  # rnn = GRU(64,return_sequences=True)(embedding)
  # logits = TimeDistributed(Dense(french_vocab_size,activation='softmax'))(rnn)
  # model = Model(embedding,logits)

  model = Sequential()
  model.add(Embedding(english_vocab_size + 1,64,input_length=output_length,input_shape=input_shape[1:]))
  model.add(GRU(512,return_sequences=True))
  model.add(Dropout(0.4))
  model.add(GRU(512,return_sequences=True,kernel_regularizer=l2(0.01)))
  model.add(TimeDistributed(Dense(512,activation='relu')))
  model.add(TimeDistributed(Dense(french_vocab_size+1,activation='softmax')))



  lr = 1e-3

  model.compile(loss=sparse_categorical_crossentropy,optimizer=Adam(lr),metrics=['acc'])
  model.summary()
  return model

In [0]:
new_prepoc_x_sent = prepoc_x_sent.reshape(-1,prepoc_x_sent.shape[1])

In [119]:
embedding_model = embed_model(new_prepoc_x_sent.shape,english_vocab_size,french_vocab_size,new_prepoc_x_sent.shape[1])

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 21, 64)            12800     
_________________________________________________________________
gru_26 (GRU)                 (None, 21, 512)           887808    
_________________________________________________________________
dropout_9 (Dropout)          (None, 21, 512)           0         
_________________________________________________________________
gru_27 (GRU)                 (None, 21, 512)           1575936   
_________________________________________________________________
time_distributed_19 (TimeDis (None, 21, 512)           262656    
_________________________________________________________________
time_distributed_20 (TimeDis (None, 21, 345)           176985    
Total params: 2,916,185
Trainable params: 2,916,185
Non-trainable params: 0
___________________________________________

In [120]:
embedding_model.fit(new_prepoc_x_sent,prepoc_y_sent,epochs=10,batch_size=1024,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f211970e390>

In [0]:
### BIDIRECTIONAL RNN ####
def bidirect_model(input_shape, english_vocab_size, french_vocab_size, output_length):
  model = Sequential()
  model.add(Input(input_shape[1:]))
  model.add(Embedding(english_vocab_size+1, 200, input_length = output_length))
  model.add(Bidirectional(GRU(512,return_sequences=True)))
  model.add(Dropout(0.5))
  model.add(Bidirectional(GRU(64,return_sequences=True)))
  model.add(TimeDistributed(Dense(1024,activation='sigmoid')))
  model.add(Dropout(0.5))
  model.add(TimeDistributed(Dense(french_vocab_size+1,activation='softmax')))

  model.compile(loss=sparse_categorical_crossentropy,
                optimizer= Adam(0.003),
                metrics=['acc'])
  model.summary()
  return model

In [122]:
bi_model = bidirect_model(new_prepoc_x_sent.shape, english_vocab_size, french_vocab_size, new_prepoc_x_sent.shape[1])

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 21, 200)           40000     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 21, 1024)          2193408   
_________________________________________________________________
dropout_10 (Dropout)         (None, 21, 1024)          0         
_________________________________________________________________
bidirectional_9 (Bidirection (None, 21, 128)           418560    
_________________________________________________________________
time_distributed_21 (TimeDis (None, 21, 1024)          132096    
_________________________________________________________________
dropout_11 (Dropout)         (None, 21, 1024)          0         
_________________________________________________________________
time_distributed_22 (TimeDis (None, 21, 345)         

In [123]:
bi_model.fit(new_prepoc_x_sent,prepoc_y_sent,epochs=10,batch_size=1024,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2117a94198>

In [0]:
def encoder_decoder(input_shape,english_vocab_size,french_vocab_size,output_length):
  learning_rate = 0.01
  latent_dim = 128
  
  #Config Model
  inputs = Input(shape=input_shape[1:])
  embedding_layer = Embedding(input_dim=english_vocab_size+1,
                              output_dim=output_length,
                              mask_zero=False)(inputs)
  bd_layer = Bidirectional(GRU(output_length))(embedding_layer)
  encoding_layer = Dense(latent_dim, activation='relu')(bd_layer)
  decoding_layer = RepeatVector(output_length)(encoding_layer)
  output_layer = Bidirectional(GRU(latent_dim, return_sequences=True))(decoding_layer)
  outputs = TimeDistributed(Dense(french_vocab_size+1, activation='softmax'))(output_layer)
  
  #Create Model from parameters defined above
  model = Model(inputs=inputs, outputs=outputs)
  model.compile(loss=sparse_categorical_crossentropy,
                optimizer=Adam(learning_rate),
                metrics=['accuracy'])
  model.summary()
  return model

In [208]:
enc_dec_model = encoder_decoder(new_prepoc_x_sent.shape,english_vocab_size,french_vocab_size,21)

Model: "model_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_29 (InputLayer)        [(None, 21)]              0         
_________________________________________________________________
embedding_36 (Embedding)     (None, 21, 21)            4200      
_________________________________________________________________
bidirectional_32 (Bidirectio (None, 42)                5544      
_________________________________________________________________
dense_52 (Dense)             (None, 128)               5504      
_________________________________________________________________
repeat_vector_14 (RepeatVect (None, 21, 128)           0         
_________________________________________________________________
bidirectional_33 (Bidirectio (None, 21, 256)           198144    
_________________________________________________________________
time_distributed_37 (TimeDis (None, 21, 345)           886

In [209]:
enc_dec_model.fit(new_prepoc_x_sent,prepoc_y_sent,epochs=10,batch_size=1024,validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f20f8cc4240>

In [210]:
### enc dec model prediction ####
print(logits_to_words(enc_dec_model.predict(new_prepoc_x_sent[:1])[0], y_tokenizer))

new jersey est parfois calme pendant l'automne automne il il neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [124]:
#### BIDIRECTIONAL EMBEDDING MODEL ####
print(logits_to_words(bi_model.predict(new_prepoc_x_sent[:1])[0], y_tokenizer))

new jersey est parfois calme pendant l' automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [125]:
###### SIMPLE RNN MODEL PREDICTION #######
print(logits_to_words(simple_rnn_model.predict(prepoc_x_sent[:1])[0], y_tokenizer))

new jersey est parfois calme en l' et il est est neigeux en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [126]:
#### EMBEDDING MODEL PREDICTION ######
print(logits_to_words(embedding_model.predict(new_prepoc_x_sent[:1])[0],y_tokenizer))

new jersey est parfois calme en l' et et il est en en froid <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [127]:
french_data[0]

"new jersey est parfois calme pendant l' automne , et il est neigeux en avril ."

In [128]:
english_data[0]

'new jersey is sometimes quiet during autumn , and it is snowy in april .'