In [84]:
import os
import numpy as np
import pandas as pd

In [26]:
en = '/content/small_vocab_en.txt'
fn = '/content/small_vocab_fr.txt'

In [27]:
def join_path(path):
  file = os.path.join(path)

  with open(file,'r') as f:
    doc = f.read()
  return doc.split('\n')

In [28]:
en = join_path(en)
fn = join_path(fn)

In [29]:
for i in range(5):
  print('english sen: ', en[i])
  print('french sen: ',fn[i])
  print('*'*20,'\n')

english sen:  new jersey is sometimes quiet during autumn , and it is snowy in april .
french sen:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
******************** 

english sen:  the united states is usually chilly during july , and it is usually freezing in november .
french sen:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .
******************** 

english sen:  california is usually quiet during march , and it is usually hot in june .
french sen:  california est généralement calme en mars , et il est généralement chaud en juin .
******************** 

english sen:  the united states is sometimes mild during june , and it is cold in september .
french sen:  les états-unis est parfois légère en juin , et il fait froid en septembre .
******************** 

english sen:  your least liked fruit is the grape , but my least liked is the apple .
french sen:  votre moins aimé fruit est le raisin , mais mon moins

In [30]:
en[:2]

['new jersey is sometimes quiet during autumn , and it is snowy in april .',
 'the united states is usually chilly during july , and it is usually freezing in november .']

In [40]:
import collections

In [44]:
en_count = collections.Counter([word for sen in en for word in sen.split()])
len(en_count)

228

In [45]:
fn_count = collections.Counter([word for sen in fn for word in sen.split()])
len(fn_count)

345

In [51]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

In [70]:
def tokenize(text):
  tokenize = Tokenizer()
  tokenize.fit_on_texts(text)
  return tokenize.texts_to_sequences(text), tokenize

In [66]:
from tensorflow.keras.utils import pad_sequences

In [67]:
def pad(text, length=None):
  return pad_sequences(text, maxlen=length, padding='post')

In [71]:
def preprocess(x, y):
  pre_x, tk_x = tokenize(x)
  pre_y, tk_y = tokenize(y)

  pad_x = pad(pre_x)
  pad_y = pad(pre_y)

  pad_y = pad_y.reshape(*pad_y.shape, 1)
  return pad_x, tk_x, pad_y, tk_y

In [72]:
pre_en, en_tokenizer, pre_fn, fn_tokenizer = preprocess(en, fn)

In [73]:
pre_en.shape, pre_fn.shape

((47679, 15), (42734, 20, 1))

In [76]:
len(en_tokenizer.word_index)

200

In [77]:
len(fn_tokenizer.word_index)

333

In [78]:
pre_en[:2]

array([[17, 23,  1,  8, 66,  4, 50,  7,  3,  1, 51,  2, 47,  0,  0],
       [ 5, 19, 20,  1,  9, 60,  4, 58,  7,  3,  1,  9, 62,  2, 33]],
      dtype=int32)

In [81]:
pre_fn[:2]

array([[[ 35],
        [ 34],
        [  1],
        [  8],
        [ 67],
        [ 37],
        [ 11],
        [ 25],
        [  6],
        [  3],
        [  1],
        [112],
        [  2],
        [ 50],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]],

       [[  4],
        [ 32],
        [ 31],
        [  1],
        [ 12],
        [ 19],
        [  2],
        [ 53],
        [  6],
        [  3],
        [ 95],
        [ 69],
        [  2],
        [ 41],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]]], dtype=int32)

In [82]:
def logits_to_text(logits, tokenizer):
  index_to_word = {id: word for word, id in tokenizer.word_index.items()}

  index_to_word[0] = '<PAD>'

  return ' '.join([index_to_word[prediction] for prediction in np.argmax(logits, 1)])

In [86]:
temp_x = pad(pre_en, 21)
temp_x.shape

(47679, 21)

In [88]:
temp_x = temp_x.reshape((-1, 21))
temp_x.shape

(47679, 21)

In [89]:
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding, TimeDistributed, Dropout, GRU
from keras.models import Sequential

In [115]:
model = Sequential()

model.add(Embedding(199, 256, input_shape=(21,),input_length=21))
model.add(GRU(256, return_sequences=True))
model.add(TimeDistributed(Dense(1024, activation='relu')))
model.add(Dropout(.2))
model.add(TimeDistributed(Dense(344, activation='softmax')))

In [120]:

learning_rate = 0.005

model = Sequential()

model.add(Embedding(199, 256, input_length=21, input_shape=(21,)))

model.add(GRU(256, return_sequences=True))

model.add(TimeDistributed(Dense(1024, activation='relu')))

model.add(Dropout(0.5))

model.add(TimeDistributed(Dense(344, activation='softmax')))


In [121]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=.005),loss=keras.losses.sparse_categorical_crossentropy,metrics=['accuracy'])

In [122]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 21, 256)           50944     
                                                                 
 gru_1 (GRU)                 (None, 21, 256)           394752    
                                                                 
 time_distributed_2 (TimeDi  (None, 21, 1024)          263168    
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 21, 1024)          0         
                                                                 
 time_distributed_3 (TimeDi  (None, 21, 344)           352600    
 stributed)                                                      
                                                                 
Total params: 1061464 (4.05 MB)
Trainable params: 1061

In [123]:
history = model.fit(temp_x, pre_fn, epochs=20, batch_size=1024, validation_split=.2)

Epoch 1/20


ValueError: ignored