In [100]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, RepeatVector
from tensorflow.keras.utils import to_categorical
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [101]:
# Path of the tab-separated parallel corpus, relative to the working directory.
data_path = 'fra.txt'

# Read the whole file as UTF-8 text in one go; the context manager closes the handle.
with open(data_path, mode='r', encoding='utf-8') as f:
    lines = f.read()

# Drop leading/trailing whitespace, then cut the text into one record per line.
sents = lines.strip().split('\n')

# Each record becomes one row whose entries are its tab-separated fields.
fra_eng = np.array([record.split('\t') for record in sents])

In [102]:
# Before: every row still holds all tab-separated fields of its line. Column 0 is
# later fed to the English tokenizer and column 1 to the French one; any further
# columns are not used.
# Keep the first 100000 rows and only the first two columns.
fra_eng = fra_eng[:100000, :2]
# After: each row is a two-element [english, french] pair.


In [103]:
# Translation table that deletes every ASCII punctuation character. It is built once
# and reused instead of being rebuilt for each sentence.
punct_table = str.maketrans('', '', string.punctuation)


def _clean_sentence(text):
    """Return `text` without ASCII punctuation, whitespace-normalised and lower-cased.

    Steps, in order:
      1. delete every character listed in `string.punctuation`;
      2. split on any Unicode whitespace and re-join with single plain spaces, which
         also strips leading/trailing whitespace and collapses repeated spaces;
      3. lower-case the result.

    Step 2 matters because French text commonly puts a non-breaking or narrow
    no-break space before '!', '?' and ';' (assumed to occur in this corpus -- TODO
    confirm). Such a space is not a plain ' ', so a tokenizer that splits on ' '
    only would keep it attached to the neighbouring word as a distinct token.

    Args:
        text: A single sentence (any `str`, including `numpy.str_`).

    Returns:
        The cleaned sentence as a `str`.
    """
    return ' '.join(text.translate(punct_table).split()).lower()


# Clean both language columns in place (column 0 and column 1 of `fra_eng`).
fra_eng[:, 0] = [_clean_sentence(s) for s in fra_eng[:, 0]]
fra_eng[:, 1] = [_clean_sentence(s) for s in fra_eng[:, 1]]

In [104]:
def create_tokenizer(lines):
    """Build a Keras `Tokenizer` with default settings and fit it on `lines`.

    Args:
        lines: The texts handed to `Tokenizer.fit_on_texts`.

    Returns:
        The fitted `Tokenizer` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [105]:
# One tokenizer per language: column 0 feeds the English one, column 1 the French one.
en_tokenizer = create_tokenizer(fra_eng[:, 0])
fr_tokenizer = create_tokenizer(fra_eng[:, 1])

# Vocabulary sizes: one more than the number of entries in each word index. The
# extra slot leaves room for index 0, which is the value used for padding
# (Keras word indices start at 1).
eng_vocab_size = 1 + len(en_tokenizer.word_index)
fr_vocab_size = 1 + len(fr_tokenizer.word_index)

# Fixed sequence length, in tokens, for each language. This is the length the
# encoded sequences are padded to, not the actual length of the sentences.
eng_sents_length = 20
fr_sents_length = 20


In [106]:
def encode_seq(tokenizer, length, lines):
    """Turn text lines into fixed-length integer sequences with a fitted tokenizer.

    Illustrative example with length=8 (the ids depend on the tokenizer):
        "I love deep learning" -> [2, 54, 32, 97, 0, 0, 0, 0]

    Args:
        tokenizer: A fitted tokenizer providing `texts_to_sequences`.
        length: Target length, passed to `pad_sequences` as `maxlen`.
        lines: The texts to encode.

    Returns:
        The result of `pad_sequences`: one row per input line, padded at the end
        ('post') up to `length`. Lines longer than `length` are truncated
        according to the `pad_sequences` default.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [107]:
# Hold out 20% of the sentence pairs; the fixed random_state keeps the split repeatable.
train, test = train_test_split(fra_eng, test_size=0.2, random_state=12)

# Model inputs (X) are encoded from the French column (index 1).
trainX = encode_seq(tokenizer=fr_tokenizer, length=fr_sents_length, lines=train[:, 1])
testX = encode_seq(tokenizer=fr_tokenizer, length=fr_sents_length, lines=test[:, 1])

# Targets (Y) are encoded from the English column (index 0).
trainY = encode_seq(tokenizer=en_tokenizer, length=eng_sents_length, lines=train[:, 0])
testY = encode_seq(tokenizer=en_tokenizer, length=eng_sents_length, lines=test[:, 0])

In [108]:
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Build an uncompiled LSTM encoder-decoder as a Keras `Sequential` model.

    Layer stack, in order:
      1. `Input` declaring sequences of `in_timesteps` token ids;
      2. `Embedding` mapping ids to `units`-dimensional vectors, with id 0 masked;
      3. `LSTM` reducing the embedded sequence to a single vector;
      4. `RepeatVector` repeating that vector `out_timesteps` times;
      5. `LSTM` returning one output per repeated step;
      6. `Dense` with softmax over `out_vocab` classes.

    The input shape is declared with an explicit `Input` layer rather than the
    Embedding layer's `input_length` argument, which Keras 3 reports as deprecated.

    Args:
        in_vocab: Size of the input vocabulary (Embedding `input_dim`).
        out_vocab: Number of output classes of the final softmax layer.
        in_timesteps: Length of each input sequence.
        out_timesteps: Number of output steps produced per input sequence.
        units: Embedding dimension and number of units in both LSTM layers.

    Returns:
        The assembled, not yet compiled, `Sequential` model.
    """
    model = Sequential()
    model.add(Input(shape=(in_timesteps,)))
    # mask_zero=True marks id 0 as padding so it is masked for the following layer.
    model.add(Embedding(in_vocab, units, mask_zero=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model
    


In [109]:
# French -> English model: French vocabulary/length on the input side, English on
# the output side, 512 units for the embedding and both LSTM layers.
model = define_model(
    in_vocab=fr_vocab_size,
    out_vocab=eng_vocab_size,
    in_timesteps=fr_sents_length,
    out_timesteps=eng_sents_length,
    units=512,
)

# Targets are integer word ids, hence the sparse variant of categorical cross-entropy.
loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(loss=loss, optimizer='adam')



In [None]:
# Train for 10 epochs in mini-batches of 512 pairs. The held-out split is passed as
# validation data, and the returned History object is kept in `history`.
history = model.fit(
    x=trainX,
    y=trainY,
    batch_size=512,
    epochs=10,
    validation_data=(testX, testY),
)

Epoch 1/10
  5/157 ━━━━━━━━━━━━━━━━━━━━ 2:03:19 49s/step - loss: 9.0317