In [50]:
import os
import pickle
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from numpy import array, argmax, random, take

# Let pandas show up to 200 characters per column before truncating the display.
pd.set_option("display.max_colwidth", 200)

In [51]:
# function to read raw text file
def read_text(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, mode="rt", encoding="utf-8") as file:
        return file.read()

In [52]:
# split a text into sentences
def to_lines(text):
    """Turn raw text into a list of rows, each row a list of tab-separated fields.

    Surrounding whitespace of the whole text is stripped first; the text is
    then cut on newlines and every resulting line is cut on tab characters.
    """
    rows = []
    for line in text.strip().split("\n"):
        rows.append(line.split("\t"))
    return rows

In [53]:
# Language codes to process; each code selects the corpus file data/<code>.txt
# and is passed to main() by the driver loop.
langs = ['fra','ita','spa','por']

In [54]:
def get_data(lang, max_pairs=50000):
    """Load the sentence pairs for one language as a 2-D string array.

    Reads ``data/<lang>.txt``, splits it into tab-separated rows and keeps
    only the first ``max_pairs`` rows.

    Args:
        lang: Language code used to build the file name ``data/<lang>.txt``.
        max_pairs: Maximum number of rows to keep (default 50000, the value
            that used to be hard-coded).

    Returns:
        A 2-D numpy array with one row per line of the file and one column
        per tab-separated field.

    Raises:
        ValueError: If the kept rows do not form a rectangular table
            (lines with differing numbers of tab-separated fields).
    """
    data = read_text(f"data/{lang}.txt")

    # Truncate the Python list *before* building the array: converting the
    # whole file and slicing afterwards returns a view that keeps the full
    # array alive in memory.
    lang_eng = array(to_lines(data)[:max_pairs])

    if lang_eng.ndim != 2:
        raise ValueError(
            f"data/{lang}.txt does not contain a consistent number of "
            "tab-separated fields per line"
        )

    return lang_eng

def remove_puntuation(lang_eng, extra_punctuation=""):
    """Strip punctuation characters from the first two columns, in place.

    Args:
        lang_eng: 2-D numpy array of strings; columns 0 and 1 are rewritten.
        extra_punctuation: Additional characters to delete on top of
            ``string.punctuation``. ``string.punctuation`` only contains
            ASCII symbols, so characters such as "¿", "¡", "«" or "»" are
            kept unless they are listed here. Defaults to "" (ASCII only).

    Returns:
        The same array object that was passed in, after modification.
    """
    # Build the deletion table once instead of once per sentence.
    table = str.maketrans('', '', string.punctuation + extra_punctuation)
    for column in (0, 1):
        lang_eng[:, column] = [s.translate(table) for s in lang_eng[:, column]]
    return lang_eng

def to_lowerCase(lang_eng):
    """Lower-case every entry of columns 0 and 1 of ``lang_eng`` in place.

    Any further columns are left untouched. Returns the same array object.
    """
    n_rows = len(lang_eng)
    for column in (0, 1):
        for row in range(n_rows):
            lang_eng[row, column] = lang_eng[row, column].lower()
    return lang_eng

In [55]:
def preprocessing(language: str):
    """Load the corpus for ``language`` and normalise its text.

    Pipeline: get_data -> remove_puntuation -> to_lowerCase.
    Returns the cleaned array of sentence pairs.
    """
    raw_pairs = get_data(language)
    return to_lowerCase(remove_puntuation(raw_pairs))

In [56]:
# function to build a tokenizer
def tokenization(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``; return the fitted tokenizer."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def build_lang_tokenizer(lang_eng,language:str):
    """Fit, persist and return the tokenizer for the foreign-language column.

    Args:
        lang_eng: 2-D array of sentence pairs; column 1 holds the
            foreign-language sentences.
        language: Language code; selects the output folder and file name.

    Returns:
        ``(lang_tokenizer, lang_vocab_size)`` where the vocabulary size is
        the number of entries in ``word_index`` plus one.

    Side effects:
        Writes ``tokenizers/<language>/<language>_tokenizer.pkl``, creating
        the folder if it does not exist yet.
    """
    # prepare data
    lang_tokenizer = tokenization(lang_eng[:, 1])
    # One extra slot on top of the indexed words; this notebook pads
    # sequences with 0, so that id must be representable as well.
    lang_vocab_size = len(lang_tokenizer.word_index) + 1

    path = f"tokenizers/{language}/{language}_tokenizer.pkl"
    # Make sure the target folder exists so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(path, "wb") as handle:
        pickle.dump(lang_tokenizer, handle)

    return lang_tokenizer, lang_vocab_size

def build_english_tokenizer(lang_eng,language:str):
    """Fit, persist and return the tokenizer for the English column.

    Args:
        lang_eng: 2-D array of sentence pairs; column 0 holds the English
            sentences.
        language: Language code of the paired corpus; selects the output
            folder.

    Returns:
        ``(eng_tokenizer, eng_vocab_size)`` where the vocabulary size is the
        number of entries in ``word_index`` plus one.

    Side effects:
        Writes ``tokenizers/<language>/eng_tokenizer.pkl``, creating the
        folder if it does not exist yet.
    """
    # prepare data
    eng_tokenizer = tokenization(lang_eng[:, 0])
    # One extra slot on top of the indexed words; this notebook pads
    # sequences with 0, so that id must be representable as well.
    eng_vocab_size = len(eng_tokenizer.word_index) + 1

    path = f"tokenizers/{language}/eng_tokenizer.pkl"
    # Make sure the target folder exists so a fresh checkout does not fail.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(path, "wb") as handle:
        pickle.dump(eng_tokenizer, handle)

    return eng_tokenizer, eng_vocab_size


In [57]:
from sklearn.model_selection import train_test_split

def __train_test_split__(lang_eng):
    """Split ``lang_eng`` 80/20 into train and test parts with a fixed seed (12).

    Returns the pair ``(train, test)``.
    """
    parts = train_test_split(lang_eng, train_size=0.8, random_state=12)
    train_part, test_part = parts
    return train_part, test_part


def encode_sequences(tokenizer, length, lines):
    """Encode ``lines`` with ``tokenizer`` and pad the result to ``length``.

    The sequences produced by ``tokenizer.texts_to_sequences`` are padded
    with zeros at the end (``padding="post"``) up to ``maxlen=length``.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding="post")


def encode_sequences_data(lang_tokenizer,eng_tokenizer,train,test,length=12):
    """Encode the train and test splits into padded id sequences.

    Column 1 of each split (foreign language) is encoded with
    ``lang_tokenizer`` and becomes the model input X; column 0 (English) is
    encoded with ``eng_tokenizer`` and becomes the target Y.

    Args:
        lang_tokenizer: Fitted tokenizer for the foreign-language sentences.
        eng_tokenizer: Fitted tokenizer for the English sentences.
        train: 2-D array of training sentence pairs.
        test: 2-D array of held-out sentence pairs.
        length: Sequence length every sample is padded to. Defaults to 12,
            the value that was previously repeated inline four times.

    Returns:
        ``(trainX, trainY, testX, testY)``.
    """
    def _encode_split(split):
        # Same call order as before: source column first, then English.
        encoded_x = encode_sequences(lang_tokenizer, length, split[:, 1])
        encoded_y = encode_sequences(eng_tokenizer, length, split[:, 0])
        return encoded_x, encoded_y

    # prepare training data
    trainX, trainY = _encode_split(train)

    # prepare validation data
    testX, testY = _encode_split(test)

    return trainX, trainY, testX, testY

In [58]:
def pre_training(language: str):
    """Run every step that precedes model training for one language.

    Cleans the corpus, fits and saves both tokenizers, splits the pairs into
    train/test and encodes them.

    Returns the 8-tuple
    ``(lang_tokenizer, lang_vocab_size, eng_tokenizer, eng_vocab_size,
    trainX, trainY, testX, testY)``.
    """
    pairs = preprocessing(language=language)

    source_tok, source_vocab = build_lang_tokenizer(pairs, language)
    english_tok, english_vocab = build_english_tokenizer(pairs, language)

    train_rows, test_rows = __train_test_split__(pairs)
    encoded = encode_sequences_data(source_tok, english_tok, train_rows, test_rows)

    return (source_tok, source_vocab, english_tok, english_vocab, *encoded)
    

In [59]:
def model_creation_and_training(lang_vocab_size, eng_vocab_size,language:str,trainX,trainY):
    """Build, compile and train the encoder-decoder model for one language.

    Args:
        lang_vocab_size: Vocabulary size of the source language (embedding
            input dimension).
        eng_vocab_size: Vocabulary size of English (width of the softmax
            output layer).
        language: Language code; the best model is saved to
            ``models/<language>.keras``.
        trainX: 2-D array of padded source sequences (samples x timesteps).
        trainY: 2-D array of padded English sequences (samples x timesteps).

    Returns:
        The object returned by ``model.fit`` so callers can inspect or plot
        the training curves. Previously it was computed and then discarded.
    """
    def define_model(in_vocab,out_vocab, in_timesteps,out_timesteps,units):
        # Encoder: embedding + LSTM that compresses the input to one vector.
        # Decoder: that vector repeated out_timesteps times, an LSTM emitting
        # one state per step, and a softmax over the output vocabulary.
        model = Sequential()
        model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
        model.add(LSTM(units))
        model.add(RepeatVector(out_timesteps))
        model.add(LSTM(units, return_sequences=True))
        model.add(Dense(out_vocab, activation='softmax'))
        return model

    # Take the timestep counts from the encoded data instead of repeating a
    # literal, so the model always matches the padding length actually used.
    in_timesteps = trainX.shape[1]
    out_timesteps = trainY.shape[1]
    model = define_model(lang_vocab_size, eng_vocab_size, in_timesteps, out_timesteps, 512)

    adam = optimizers.Adam()
    model.compile(optimizer=adam, loss="sparse_categorical_crossentropy")

    filename = f"models/{language}.keras"
    # Only overwrite the saved model when the validation loss improves.
    checkpoint = ModelCheckpoint(
        filename, monitor="val_loss", verbose=1, save_best_only=True, mode="min"
    )

    # train model
    history = model.fit(
        trainX,
        # Targets get a trailing axis of size 1 (samples, timesteps, 1).
        trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
        epochs=30,
        batch_size=512,
        validation_split=0.2,
        callbacks=[checkpoint],
        verbose=1,
    )

    return history

In [60]:
def main(language:str):
    """Prepare the data for ``language`` and train its translation model.

    The tokenizers and the test split returned by pre_training are not used
    here; only the vocabulary sizes and the training arrays are passed on.
    """
    (
        _lang_tokenizer,
        lang_vocab_size,
        _eng_tokenizer,
        eng_vocab_size,
        trainX,
        trainY,
        _testX,
        _testY,
    ) = pre_training(language)
    model_creation_and_training(lang_vocab_size, eng_vocab_size, language, trainX, trainY)

In [61]:
# Run the full pipeline (preprocess, tokenize, train, checkpoint) once per
# language code listed in `langs`, printing the code before each run.
for lang in langs:
    print(f"language: {lang}")
    main(lang)

language: fra
Epoch 1/30
Epoch 1: val_loss improved from inf to 2.09508, saving model to models\fra.keras
Epoch 2/30
Epoch 2: val_loss improved from 2.09508 to 1.94904, saving model to models\fra.keras
Epoch 3/30
Epoch 3: val_loss improved from 1.94904 to 1.90145, saving model to models\fra.keras
Epoch 4/30
Epoch 4: val_loss improved from 1.90145 to 1.86850, saving model to models\fra.keras
Epoch 5/30
Epoch 5: val_loss improved from 1.86850 to 1.84878, saving model to models\fra.keras
Epoch 6/30
Epoch 6: val_loss improved from 1.84878 to 1.82537, saving model to models\fra.keras
Epoch 7/30
Epoch 7: val_loss improved from 1.82537 to 1.80268, saving model to models\fra.keras
Epoch 8/30
Epoch 8: val_loss improved from 1.80268 to 1.77779, saving model to models\fra.keras
Epoch 9/30
Epoch 9: val_loss improved from 1.77779 to 1.75277, saving model to models\fra.keras
Epoch 10/30
Epoch 10: val_loss improved from 1.75277 to 1.72380, saving model to models\fra.keras
Epoch 11/30
Epoch 11: val_lo