In [30]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, LSTM, RepeatVector

In [2]:
# Read the tab-separated sentence-pair file. The context manager guarantees the
# file handle is closed once the text has been read.
with open(r"/home/sudarsun/projects/ML_DL_py_TF/Chapter12_RNN_LSTM_V3/Datasets/fra-eng/fra.txt", mode='rt', encoding='utf-8') as fh:
    raw_text = fh.read()

# One row per line, one column per tab-separated field.
raw_data = [line.split('\t') for line in raw_text.strip().split("\n")]
data = np.array(raw_data)
print(data)
print("overall pairs", len(data))

[['Go.' 'Va !'
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)']
 ['Hi.' 'Salut !'
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)']
 ['Hi.' 'Salut.'
  'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)']
 ...
 ["Death is something that we're often discouraged to talk about or even think about, but I've realized that preparing for death is one of the most empowering things you can do. Thinking about death clarifies your life."
  "La mort est une chose qu'on nous décourage souvent de discuter ou même de penser mais j'ai pris conscience que se préparer à la mort est l'une des choses que nous puissions faire qui nous investit le plus de responsabilité. Réfléchir à la mort clarifie notre vie."
  'CC-BY 2.0 (France) Attribution: tatoeba.org #1969892 (davearms) & #1969962 (sacredceltic)']
 ['Since there are usually multiple websites on any given topic, I usually just click the back button when I arrive

In [3]:
# Dimensions of the sentence-pair array: (number of rows, number of columns).
data.shape

(175623, 3)

In [21]:
import string
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [12]:
# Build the deletion table once instead of once per sentence; it maps every
# character in string.punctuation (ASCII punctuation only) to "delete".
punct_table = str.maketrans('', '', string.punctuation)

# Strip punctuation from both language columns in place.
data[:, 0] = [sentence.translate(punct_table) for sentence in data[:, 0]]
data[:, 1] = [sentence.translate(punct_table) for sentence in data[:, 1]]

In [13]:
# Lower-case every sentence in column 0 and column 1, writing the result back in place.
for col in (0, 1):
    data[:, col] = [sentence.lower() for sentence in data[:, col]]

In [19]:
# Fit a tokenizer on the first-language column (column 0 of `data`).
l1_tokens = tokenizer = Tokenizer()
l1_tokens.fit_on_texts(data[:, 0])

# Vocabulary size is the number of indexed words plus one extra slot
# (index 0 is the value used for padding in the padded sequences).
l1_vocab_size = 1 + len(l1_tokens.word_index)
print("lang 1 vocab size", l1_vocab_size)

lang 1 vocab size 14671


In [20]:
# Fit a separate tokenizer on the second-language column (column 1 of `data`).
l2_tokens = tokenizer = Tokenizer()
l2_tokens.fit_on_texts(data[:, 1])

# Vocabulary size is the number of indexed words plus one extra slot
# (index 0 is the value used for padding in the padded sequences).
l2_vocab_size = 1 + len(l2_tokens.word_index)
print("lang 2 vocab size", l2_vocab_size)

lang 2 vocab size 33321


In [23]:
# Every sentence is padded to this many token ids.
MAX_SEQ_LEN = 15


def encode_and_pad(tok, sentences, max_len=MAX_SEQ_LEN):
    """Convert sentences to integer sequences and pad them to a fixed length.

    Parameters
    ----------
    tok : fitted tokenizer exposing ``texts_to_sequences``.
    sentences : iterable of str to encode.
    max_len : length of every padded row (defaults to ``MAX_SEQ_LEN``).

    Returns
    -------
    (sequences, padded) : the raw list of integer sequences and the 2-D array
        produced by ``keras.utils.pad_sequences`` with zeros appended at the
        end (``padding='post'``). Sequences longer than ``max_len`` are
        truncated according to ``pad_sequences``' default ``truncating`` setting.
    """
    sequences = tok.texts_to_sequences(sentences)
    padded = keras.utils.pad_sequences(sequences, max_len, padding='post')
    return sequences, padded


# Hold out 10% of the sentence pairs; fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.1, random_state=43)

# Column 0 is encoded with the language-1 tokenizer, column 1 with the language-2 tokenizer.
X_train_seq, X_train = encode_and_pad(l1_tokens, train[:, 0])
Y_train_seq, Y_train = encode_and_pad(l2_tokens, train[:, 1])

X_test_seq, X_test = encode_and_pad(l1_tokens, test[:, 0])
Y_test_seq, Y_test = encode_and_pad(l2_tokens, test[:, 1])

print("X_train.shape", X_train.shape)
print("Y_train.shape", Y_train.shape)
print("X_test.shape", X_test.shape)
print("Y_test.shape", Y_test.shape)

X_train.shape (158060, 15)
Y_train.shape (158060, 15)
X_test.shape (17563, 15)
Y_test.shape (17563, 15)


In [28]:
# Show one training example (row 15, second-language side) at each stage:
# cleaned text, token-id sequence, and zero-padded sequence.
for label, value in (
    ("text data -->", train[15, 1]),
    ("numbers sequence -->", Y_train_seq[15]),
    ("padded sequence --->", Y_train[15]),
):
    print(label, value)

text data --> je ne pense pas que tom écoute
numbers sequence --> [1, 6, 58, 3, 4, 11, 1747]
padded sequence ---> [   1    6   58    3    4   11 1747    0    0    0    0    0    0    0
    0]


In [31]:
def build_translation_model(src_vocab_size, tgt_vocab_size, seq_len=15,
                            embed_dim=256, hidden_units=128):
    """Build the encoder/decoder LSTM network used for translation.

    Layers, in order:
      1. Embedding over the source vocabulary (``mask_zero=True``).
      2. LSTM that returns only its final output (the encoder).
      3. RepeatVector that repeats that output ``seq_len`` times.
      4. LSTM returning one output per timestep (the decoder).
      5. Dense softmax over the target vocabulary, applied to every timestep.

    Parameters
    ----------
    src_vocab_size : input dimension of the embedding layer.
    tgt_vocab_size : number of units in the softmax output layer.
    seq_len : input sequence length and number of output timesteps.
    embed_dim : embedding vector size.
    hidden_units : number of units in each LSTM layer.

    Returns
    -------
    An uncompiled ``Sequential`` model.
    """
    net = Sequential()
    net.add(Embedding(src_vocab_size, embed_dim, input_length=seq_len, mask_zero=True))
    net.add(LSTM(hidden_units))
    net.add(RepeatVector(seq_len))
    net.add(LSTM(hidden_units, return_sequences=True))
    net.add(Dense(tgt_vocab_size, activation='softmax'))
    return net


model = build_translation_model(l1_vocab_size, l2_vocab_size)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 256)           3755776   
                                                                 
 lstm (LSTM)                 (None, 128)               197120    
                                                                 
 repeat_vector (RepeatVector  (None, 15, 128)          0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 15, 128)           131584    
                                                                 
 dense (Dense)               (None, 15, 33321)         4298409   
                                                                 
Total params: 8,382,889
Trainable params: 8,382,889
Non-trainable params: 0
______________________________________________

In [32]:
# Targets are integer token ids, hence the sparse categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Append a trailing axis of size 1 to the targets:
# (samples, timesteps) -> (samples, timesteps, 1).
targets = np.expand_dims(Y_train, axis=-1)

history = model.fit(X_train, targets, batch_size=1024, epochs=30, verbose=1)

# Persist the trained weights to the current working directory.
model.save_weights('eng_fra_model.hdf5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [85]:
def one_line_prediction(text1, m, max_len=15):
    """Translate one sentence with model ``m`` and return the predicted text.

    Only the first line of ``text1`` is used, and of that line only the text
    before the first tab character. The sentence is cleaned the same way the
    training columns were (ASCII punctuation removed, lower-cased), encoded
    with the module-level ``l1_tokens`` tokenizer, zero-padded to ``max_len``
    and passed to ``m.predict``. The most probable token id at every timestep
    is mapped back to a word through the module-level ``l2_tokens`` tokenizer;
    ids with no word (such as the padding id 0) are skipped.

    Parameters
    ----------
    text1 : str
        Source-language sentence.
    m : model exposing ``predict``
        Its output for one sentence must have shape (1, timesteps, vocabulary).
    max_len : int, optional
        Length the encoded sentence is padded to (default 15).

    Returns
    -------
    str
        Predicted words joined by single spaces.

    Notes
    -----
    Prints the shape of the raw prediction and the list of predicted token
    ids as a side effect.
    """
    # Keep the first line and, within it, the first tab-separated field.
    sentence = text1.strip().split('\n')[0].split('\t')[0]

    # Remove punctuation, then lower-case.
    sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

    # Encode and pad; the tokenizer expects a list of texts, so wrap the sentence.
    encoded = l1_tokens.texts_to_sequences([sentence])
    padded = keras.utils.pad_sequences(encoded, max_len, padding='post')

    pred_seq = m.predict(padded, verbose=0)
    print(pred_seq.shape)

    # Most probable token id per timestep, as plain Python ints.
    pred1 = np.argmax(pred_seq[0], axis=-1).tolist()
    print(pred1)

    # index_word is the tokenizer's id -> word dictionary; a direct lookup
    # replaces a scan over the whole vocabulary for every predicted id.
    index_to_word = l2_tokens.index_word
    Lang2_text = [index_to_word[wid] for wid in pred1 if wid in index_to_word]

    return ' '.join(Lang2_text)

In [91]:
# Translate a sample sentence with `model` (the network fitted in this notebook).
one_line_prediction("are you ok baby", model)

(1, 15, 33321)
[9, 9, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


'vous vous avec'

In [83]:
# Second network with the same layer stack as `model`; it receives weights
# loaded from a file rather than being trained here.
m2 = Sequential([
    Embedding(l1_vocab_size, 256, input_length=15, mask_zero=True),
    LSTM(128),                          # encoder: final output only
    RepeatVector(15),                   # repeat the encoder output for 15 timesteps
    LSTM(128, return_sequences=True),   # decoder: one output per timestep
    Dense(l2_vocab_size, activation='softmax'),
])
m2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 15, 256)           3755776   
                                                                 
 lstm_2 (LSTM)               (None, 128)               197120    
                                                                 
 repeat_vector_1 (RepeatVect  (None, 15, 128)          0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 15, 128)           131584    
                                                                 
 dense_1 (Dense)             (None, 15, 33321)         4298409   
                                                                 
Total params: 8,382,889
Trainable params: 8,382,889
Non-trainable params: 0
____________________________________________

In [84]:
# Load weights into m2 from an HDF5 file. NOTE(review): the path is absolute and
# machine-specific, so this cell only runs where that file exists.
m2.load_weights(r'/home/sudarsun/projects/ML_DL_py_TF/Chapter12_RNN_LSTM_V3/Datasets/Pre_trained_models/Eng_fra_model.hdf5')

In [90]:
# Translate the same sample sentence with `m2` (the network using the loaded weights).
one_line_prediction("are you ok baby", m2)

(1, 15, 33321)
[78, 4, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


'estce que tu'