In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
#import tensorflow_text as text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.models import load_model, Sequential
from keras.layers import Embedding, Dense, SimpleRNN
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import re

In [4]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, \
    Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

In [5]:
# Load data
data = pd.read_csv('../data/interim/Data_preprocess.csv', sep='\t')
reference_tensor_train = pd.read_csv("../data/interim/Reference_train.csv")
reference_tensor_val = pd.read_csv("../data/interim/Reference_val.csv")
translation_tensor_train = pd.read_csv("../data/interim/Translation_train.csv")
translation_tensor_val = pd.read_csv("../data/interim/Translation_val.csv")

## Hypothesis 1

In [41]:
class Model1():

    def __init__(self):
        pass

    def create_model(self, ref_lang, tran_lang, max_length_inp, latent_dim=300, embedding_dim=200):
        
        self.latent_dim = latent_dim
        self.embedding_dim = embedding_dim

        # Encoder
        self.encoder_inputs = Input(shape=(max_length_inp, ))

        # Embedding layer
        self.enc_emb = Embedding(len(ref_lang.word_index) + 1, embedding_dim,
                          trainable=True)(self.encoder_inputs)

        # Encoder LSTM 1
        self.encoder_lstm1 = LSTM(latent_dim, return_sequences=True,
                          return_state=True, dropout=0.4,
                          recurrent_dropout=0.4)
        (self.encoder_output, self.state_h, self.state_c) = self.encoder_lstm1(self.enc_emb)

        # Set up the decoder, using encoder_states as the initial state
        self.decoder_inputs = Input(shape=(None, ))

        # Embedding layer
        self.dec_emb_layer = Embedding(len(tran_lang.word_index) + 1, embedding_dim, trainable=True)
        dec_emb = self.dec_emb_layer(self.decoder_inputs)

        # Decoder LSTM
        self.decoder_lstm = LSTM(latent_dim, return_sequences=True,
                              return_state=True, dropout=0.4,
                              recurrent_dropout=0.2)
        (decoder_outputs, decoder_fwd_state, decoder_back_state) = \
        self.decoder_lstm(dec_emb, initial_state=[self.state_h, self.state_c])

        # Dense layer
        self.decoder_dense = TimeDistributed(Dense(len(tran_lang.word_index) + 1, activation='softmax'))
        decoder_outputs = self.decoder_dense(decoder_outputs)

        # Define the model
        return Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)

    def inference_model(self):

        # Encode the input sequence to get the feature vector
        encoder_model = Model(inputs=self.encoder_inputs, outputs=[self.encoder_output,
                            self.state_h, self.state_c])

        # Decoder setup

        # Below tensors will hold the states of the previous time step
        decoder_state_input_h = Input(shape=(self.latent_dim, ))
        decoder_state_input_c = Input(shape=(self.latent_dim, ))
        decoder_hidden_state_input = Input(shape=(max_length_inp, self.latent_dim))

        # Get the embeddings of the decoder sequence
        dec_emb2 = self.dec_emb_layer(self.decoder_inputs)

        # To predict the next word in the sequence, set the initial states to the states from the previous time step
        (decoder_outputs2, state_h2, state_c2) = self.decoder_lstm(dec_emb2,
                  initial_state=[decoder_state_input_h, decoder_state_input_c])

        # A dense softmax layer to generate prob dist. over the target vocabulary
        decoder_outputs2 = self.decoder_dense(decoder_outputs2)

        # Final decoder model
        decoder_model = Model([self.decoder_inputs] + [decoder_hidden_state_input,
                                decoder_state_input_h, decoder_state_input_c],
                                [decoder_outputs2] + [state_h2, state_c2])

        return encoder_model, decoder_model

In [42]:
def predict_sequence(input_seq, encoder_model, decoder_model):
    reverse_target_word_index = tran_lang.index_word
    reverse_source_word_index = ref_lang.index_word
    target_word_index = tran_lang.word_index

    # Encode the input as state vectors.
    (e_out, e_h, e_c) = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1
    target_seq = np.zeros((1, 1))

    # Populate the first word of target sequence with the start word.
    target_seq[0, 0] = target_word_index['sos']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        (output_tokens, h, c) = decoder_model.predict([target_seq]
                + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index+1]

        if sampled_token != 'eos':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop word.
        if sampled_token == 'eos' or len(decoded_sentence.split()) >= max_length_targ - 1:
            stop_condition = True

        # Update the target sequence (of length 1)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        (e_h, e_c) = (h, c)

    return decoded_sentence

In [43]:
model1 = Model1()
m1 = model1.create_model(ref_lang, tran_lang, max_length_inp)
m1.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 180)]        0           []                               
                                                                                                  
 input_10 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 180, 300)     3468300     ['input_9[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 300)    2836200     ['input_10[0][0]']               
                                                                                            

In [44]:
m1.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [45]:
history = m1.fit(
    [reference_tensor_train, translation_tensor_train[:, :-1]],
    translation_tensor_train.reshape(translation_tensor_train.shape[0], translation_tensor_train.shape[1], 1)[:, 1:],
    epochs=10,
    callbacks=[es],
    batch_size=64,
    validation_data=([reference_tensor_val, translation_tensor_val[:, :-1]],
                     translation_tensor_val.reshape(translation_tensor.shape[0], translation_tensor_val.shape[1], 1)[:, 1:]),
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [46]:
m1.save('model1.h5')

In [47]:
encoder_model1, decoder_model1 = model1.inference_model()

## Prediction

In [48]:
for i in range(0, 5):
    print(predict_sequence(reference_tensor_train[i].reshape(1,max_length_inp), encoder_model1, decoder_model1))

 of if it's and here don't the sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos


 of if it's and here don't the sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos
 of if it's and here don't the sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos so

 of if it's and here don't the sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos
 of if it's and here don't the sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos sos so