In [None]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, LSTM, RepeatVector, Input
#from google.colab import drive
import gdown

In [None]:
# Fetch the sentence-pair corpus from Google Drive and store it as ./fra.txt.
# fuzzy=True lets gdown pull the file id out of a "view" style share link.
corpus_url = "https://drive.google.com/file/d/16h_8WHOp2mFCyIqbjNIZ5omayG-P42wU/view?usp=sharing"
gdown.download(url=corpus_url, output="fra.txt", fuzzy=True)

In [None]:
# Read the whole corpus; the context manager guarantees the file handle is
# closed even if reading fails (the bare open(...).read() left it open).
with open("./fra.txt", mode='rt', encoding='utf-8') as corpus_file:
    raw_data = corpus_file.read()

# One entry per line, each line split into its tab-separated fields.
raw_data = [line.split('\t') for line in raw_data.strip().split("\n")]

# 2-D string array: one row per line, one column per field
# (requires every line to have the same number of fields).
data = np.array(raw_data)
print(data)
print("overall pairs", len(data))

In [None]:
# Show the dimensions of the parsed corpus array; expected to be
# (number of lines, fields per line) — assumes every line has the same field count.
data.shape

In [None]:
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Build the punctuation-deleting translation table once; the original rebuilt
# it for every single sentence in both columns.
punctuation_table = str.maketrans('', '', string.punctuation)

# Remove ASCII punctuation from both language columns (written back in place).
data[:,0] = [sentence.translate(punctuation_table) for sentence in data[:,0]]
data[:,1] = [sentence.translate(punctuation_table) for sentence in data[:,1]]

In [None]:
# Lower-case every sentence in both language columns, writing the result back
# into the array column by column.
data[:, 0] = [sentence.lower() for sentence in data[:, 0]]
data[:, 1] = [sentence.lower() for sentence in data[:, 1]]

In [None]:
# Word-level vocabulary for language 1 (column 0 of the corpus).
l1_tokens = tokenizer = Tokenizer()
l1_tokens.fit_on_texts(data[:, 0])

# Word ids start at 1, so one extra slot is added for id 0 (used for padding).
l1_vocab_size = 1 + len(l1_tokens.word_index)
print("lang 1 vocab size", l1_vocab_size)

In [None]:
# Word-level vocabulary for language 2 (column 1 of the corpus).
l2_tokens = tokenizer = Tokenizer()
l2_tokens.fit_on_texts(data[:, 1])

# Word ids start at 1, so one extra slot is added for id 0 (used for padding).
l2_vocab_size = 1 + len(l2_tokens.word_index)
print("lang 2 vocab size", l2_vocab_size)

In [None]:
# Fixed number of token ids per sentence fed to / produced by the model.
MAX_SEQ_LEN = 15


def encode_and_pad(tokens, texts, maxlen=MAX_SEQ_LEN):
    """Turn sentences into integer id sequences and pad them to a fixed length.

    Args:
        tokens: Fitted Keras ``Tokenizer`` for the language of ``texts``.
        texts: Iterable of sentences.
        maxlen: Target sequence length; shorter sequences get zeros appended
            (``padding='post'``), longer ones are cut by ``pad_sequences``
            according to its default truncation rule.

    Returns:
        ``(sequences, padded)``: the raw list of id lists and the padded
        2-D integer array of shape ``(len(texts), maxlen)``.
    """
    sequences = tokens.texts_to_sequences(texts)
    padded = keras.utils.pad_sequences(sequences, maxlen=maxlen, padding='post')
    return sequences, padded


# Hold out 10% of the sentence pairs; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.1, random_state=43)

# Column 0 is encoded with the language-1 tokenizer, column 1 with language 2.
X_train_seq, X_train = encode_and_pad(l1_tokens, train[:, 0])
Y_train_seq, Y_train = encode_and_pad(l2_tokens, train[:, 1])

X_test_seq, X_test = encode_and_pad(l1_tokens, test[:, 0])
Y_test_seq, Y_test = encode_and_pad(l2_tokens, test[:, 1])

print("X_train.shape", X_train.shape)
print("Y_train.shape", Y_train.shape)
print("X_test.shape", X_test.shape)
print("Y_test.shape", Y_test.shape)

In [None]:
# Sanity check on one training pair: the cleaned target sentence, its token
# ids, and the same ids after zero-padding.
sample_idx = 15
print("text data -->", train[sample_idx, 1])
print("numbers sequence -->", Y_train_seq[sample_idx])
print("padded sequence --->", Y_train[sample_idx])

In [None]:
# Encoder-decoder translation model:
#   token ids (15,) -> 256-d embeddings (id 0 masked as padding)
#   -> encoder LSTM, which keeps only its final 128-d output
#   -> that vector repeated 15 times, one copy per output position
#   -> decoder LSTM emitting a 128-d vector at every position
#   -> softmax over the language-2 vocabulary at every position.
# The deprecated `input_length` argument of Embedding was dropped: the sequence
# length is already fixed by the explicit Input layer.
model = Sequential([
    Input(shape=(15,), name='input'),
    Embedding(l1_vocab_size, 256, mask_zero=True, name='embedding'),
    LSTM(128, name='encoder'),
    RepeatVector(15),
    LSTM(128, return_sequences=True, name='decoder'),
    Dense(l2_vocab_size, activation='softmax'),
])

# The sparse loss takes integer word ids as targets (no one-hot encoding needed).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

In [None]:

# Train for 10 epochs. The targets get a trailing axis of size 1, giving them
# the shape (samples, timesteps, 1).
history = model.fit(
    X_train,
    Y_train[..., np.newaxis],
    batch_size=256,
    epochs=10,
    verbose=1,
)

# Persist the trained weights only (not the architecture).
model.save_weights('my_eng_fra_model_e10.weights.h5')

In [None]:
def one_line_prediction(text1, m, max_len=15, verbose=True):
    """Translate a single sentence with a trained encoder-decoder model.

    Only the first line of ``text1`` is translated and, if that line contains
    tabs, only its first tab-separated field. The text is cleaned the same way
    as the training data: ASCII punctuation is removed, then it is lower-cased.

    Relies on the notebook-level tokenizers ``l1_tokens`` (input language) and
    ``l2_tokens`` (output language).

    Args:
        text1: Sentence in the input language.
        m: Model whose ``predict`` takes a ``(1, max_len)`` array of input
            token ids and returns per-position scores over the output
            vocabulary.
        max_len: Length the token sequence is padded to; must match the
            model's input length (15 for the models built in this notebook).
        verbose: When true, print the shape of the raw prediction and the
            predicted token ids (debug output, on by default as before).

    Returns:
        The predicted words joined by single spaces. Ids without a vocabulary
        entry (such as the padding id 0) are skipped.
    """
    # Keep the first line, first tab-separated field, then clean it.
    sentence = text1.strip().split('\n')[0].split('\t')[0]
    sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

    # Encode exactly one sentence and pad it to the model's input length.
    # (The original passed a whole row of the parsed array to the tokenizer,
    # which only worked because tab-free input yields a one-element row.)
    encoded_seq = l1_tokens.texts_to_sequences([sentence])
    encoded = keras.utils.pad_sequences(encoded_seq, maxlen=max_len, padding='post')

    pred_seq = m.predict(encoded, verbose=0)
    if verbose:
        print(pred_seq.shape)

    # Most likely word id at each output position, as plain Python ints.
    pred1 = np.argmax(pred_seq[0], axis=-1).tolist()
    if verbose:
        print(pred1)

    # The tokenizer's id -> word mapping replaces a full scan of word_index
    # for every predicted id.
    index_word = l2_tokens.index_word
    Lang2_text = [index_word[wid] for wid in pred1 if wid in index_word]

    return ' '.join(Lang2_text)

In [None]:
# Try the model trained above on a sample sentence; the returned translation
# string is shown as the cell output.
one_line_prediction("what is the breakfast today?", model)

In [None]:
# Second, untrained copy of the encoder-decoder architecture, built to receive
# pretrained weights:
#   token ids (15,) -> 256-d embeddings (id 0 masked as padding)
#   -> encoder LSTM (final 128-d output only) -> repeated 15 times
#   -> decoder LSTM (128-d per position) -> softmax over the language-2 vocabulary.
# The deprecated `input_length` argument of Embedding was dropped: the sequence
# length is already fixed by the explicit Input layer.
m2 = Sequential([
    Input(shape=(15,)),
    Embedding(l1_vocab_size, 256, mask_zero=True),
    LSTM(128),
    RepeatVector(15),
    LSTM(128, return_sequences=True),
    Dense(l2_vocab_size, activation='softmax'),
])
m2.summary()

In [None]:
# Fetch the pretrained weights file from Google Drive and store it as
# ./Eng_fra_model.hdf5. fuzzy=True lets gdown pull the file id out of a
# "view" style share link.
weights_url = 'https://drive.google.com/file/d/1_cO3qEeI2GMkToD7DEVXmjirDVwiPDSU/view?usp=sharing'
gdown.download(url=weights_url, output='Eng_fra_model.hdf5', fuzzy=True)

In [None]:
# Load the downloaded pretrained weights into m2.
# NOTE(review): this presumably requires m2's layers to match the architecture
# and vocabulary sizes the weights were saved from — confirm against the
# source of Eng_fra_model.hdf5.
m2.load_weights('Eng_fra_model.hdf5')

In [None]:
# Translate a sample sentence using m2, the model carrying the pretrained weights.
one_line_prediction("i am beautiful", m2)