# Baseline Model: Char Level

## Loading Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import datetime
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import nltk.translate.bleu_score as bleu
import tensorflow as tf
from tensorflow.keras.layers import Input, Softmax, RNN, Dense, Embedding, LSTM, Flatten, Activation, GRU, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
import nltk.translate.bleu_score as bleu
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used by the
# following cells live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Loading Dataset

In [None]:
# Directory prefix for dataset files; filenames are appended to it directly,
# so the trailing slash is required.
data_path = '/content/drive/MyDrive/CS2/2.Datasets/'
# Presumably the output directory for this baseline's model artifacts;
# it is not referenced by any of the visible cells -- TODO confirm usage.
model_path = '/content/drive/MyDrive/CS2/3.Models/3_1_Baseline-Char/'

In [None]:
# Load the sentence pairs (columns `input_text` and `output_text`), then show
# the frame's shape and its first rows as a sanity check.
data = pd.read_csv(data_path + 'final_data.csv')
print(data.shape)
data.head()

(101717, 2)


Unnamed: 0,input_text,output_text
0,"Ofcouse , I love cheap fashion , fast fashion ...","Of course , I love cheap , fast fashion like f..."
1,If he want to listen to music that I do n't li...,If he wants to listen to music that I do n't l...
2,This happened because of the cultural differen...,This happened because of the cultural differen...
3,I 'm gon na earn much money to study abroad .,I 'm gon na earn enough money to study abroad .
4,It is not difficult for me but answer phones a...,It is not difficult for me but answering the p...


## Preprocess Data

In [None]:
def clean(text):
    """Strip bracketed spans, stray symbols and digits from a sentence.

    Steps, applied in this order:
      1. Every ``<...>``, ``(...)``, ``[...]`` and ``{...}`` span is removed
         together with its contents. Spans are matched non-greedily, so
         ordinary text lying between two separate spans is preserved.
      2. Remaining symbol characters (``-+@#^/|*(){}$~`<>=_``), backslashes
         and unpaired square brackets are removed.
      3. ASCII digits are removed.

    Whitespace is never collapsed, so removed spans can leave double spaces.

    Args:
        text: The sentence to clean (``str``).

    Returns:
        The cleaned string.
    """
    # Raw strings keep the regex escapes from being read as (invalid) Python
    # string escapes; `.*?` stops at the nearest closing delimiter.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\{.*?\}', '', text)
    # One pass over all single characters to drop: the symbol set plus
    # backslash and leftover square brackets.
    text = re.sub(r"[-+@#^/|*(){}$~`<>=_\\\[\]]", "", text)
    text = re.sub(r"[0-9]", "", text)
    return text

# Run the same cleaning routine over both sides of every sentence pair.
data['input_text'] = data['input_text'].apply(clean)
data['output_text'] = data['output_text'].apply(clean)

In [None]:
def preprocessing(data, max_len=110):
    """Filter out long sentence pairs and build the decoder text columns.

    A row is kept only when both ``input_text`` and ``output_text`` are
    shorter than ``max_len`` characters. For the kept rows two columns are
    derived from ``output_text``:

    * ``output_text_in``  -- the text prefixed with a tab and a space
      (start marker);
    * ``output_text_out`` -- the text followed by a space and a newline
      (end marker).

    The caller's frame is left untouched: no helper columns are added to it
    and all new columns are written to an explicit copy.

    Args:
        data: DataFrame with string columns ``input_text`` and ``output_text``.
        max_len: Exclusive upper bound on the character length of both texts.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        preserved) with ``output_text`` replaced by ``output_text_in`` and
        ``output_text_out``.
    """
    short_enough = (
        (data['input_text'].str.len() < max_len)
        & (data['output_text'].str.len() < max_len)
    )

    # Copy before adding columns so the assignment never targets a slice of
    # the caller's frame.
    kept = data.loc[short_enough].copy()

    target_text = kept['output_text'].astype(str)
    kept['output_text_in'] = '\t ' + target_text
    kept['output_text_out'] = target_text + ' \n'

    return kept.drop(columns=['output_text'])

In [None]:
data = preprocessing(data)

# Append one extra ' \n' to the first pair's decoder strings.
# A single .loc assignment is used because chained indexing
# (`data.iloc[0][col] = ...`) assigns into a temporary row object and may
# leave the frame itself unchanged.
# NOTE(review): presumably this exists so the output tokenizer (fitted on
# `output_text_in`) sees the '\n' end marker; that only helps if this row ends
# up in the training split -- confirm.
first_label = data.index[0]
data.loc[first_label, 'output_text_in'] = str(data.loc[first_label, 'output_text_in']) + ' \n'
data.loc[first_label, 'output_text_out'] = str(data.loc[first_label, 'output_text_out']) + ' \n'

print(data.shape)
data.head()

(91188, 3)


Unnamed: 0,input_text,output_text_in,output_text_out
0,"Ofcouse , I love cheap fashion , fast fashion ...","\t Of course , I love cheap , fast fashion lik...","Of course , I love cheap , fast fashion like f..."
1,If he want to listen to music that I do n't li...,\t If he wants to listen to music that I do n'...,If he wants to listen to music that I do n't l...
3,I 'm gon na earn much money to study abroad .,\t I 'm gon na earn enough money to study abro...,I 'm gon na earn enough money to study abroad ...
4,It is not difficult for me but answer phones a...,\t It is not difficult for me but answering th...,It is not difficult for me but answering the p...
5,The moment came when the world 's would have i...,\t The moment came when the world would have i...,The moment came when the world would have its ...


## Train Test Split

In [None]:
# Hold out 10% of the sentence pairs; the fixed random_state keeps the split
# the same from run to run.
train, test = train_test_split(data, test_size=0.1, random_state=42)
print('Shape of Train Data:', train.shape)
print('Shape of Test Data:', test.shape)

Shape of Train Data: (82069, 3)
Shape of Test Data: (9119, 3)


## Tokenization

In [None]:
# Character-level tokenizers: nothing is filtered out and case is preserved.
tokenizer_i = Tokenizer(filters="",char_level=True,lower=False)
tokenizer_o = Tokenizer(filters="",char_level=True,lower=False)

tokenizer_i.fit_on_texts(train['input_text'].values)
# Fit the output tokenizer on BOTH decoder columns: the '\t' start marker is
# added only to `output_text_in` and the '\n' end marker only to
# `output_text_out`. This tokenizer encodes both columns, so a marker missing
# from its vocabulary would be dropped silently when the texts are encoded.
tokenizer_o.fit_on_texts(train['output_text_in'].tolist() + train['output_text_out'].tolist())

# Vocabulary sizes count distinct characters only (ids start at 1; 0 is left
# for padding).
vocab_size_input = len(tokenizer_i.word_index)
print('Input Vocab Size:', vocab_size_input)

vocab_size_output = len(tokenizer_o.word_index)
print('Output Vocab Size:', vocab_size_output)

Input Vocab Size: 75
Output Vocab Size: 68


In [None]:
# Character -> integer id lookup tables taken from the two fitted tokenizers.
input_vocab = tokenizer_i.word_index
output_vocab = tokenizer_o.word_index

## Data Pipeline

In [None]:
class Dataset:
    """Per-sample view over a frame of sentence pairs.

    Indexing returns the encoder input, decoder input and decoder target of
    one pair as zero-padded (padding at the end) int32 id arrays.

    Args:
        data: Frame with columns ``input_text``, ``output_text_in`` and
            ``output_text_out``.
        tokenizer_i: Tokenizer used for the encoder text.
        tokenizer_o: Tokenizer used for both decoder texts.
        max_len_enc: Padded length of the encoder sequence.
        max_len_dec: Padded length of both decoder sequences.
    """

    def __init__(self, data, tokenizer_i, tokenizer_o, max_len_enc, max_len_dec):
        self.encoder_inps = data['input_text'].values
        self.decoder_inps = data['output_text_in'].values
        self.decoder_outs = data['output_text_out'].values
        self.tokenizer_o = tokenizer_o
        self.tokenizer_i = tokenizer_i
        self.max_len_enc = max_len_enc
        self.max_len_dec = max_len_dec

    @staticmethod
    def _encode(tokenizer, text, max_len):
        """Tokenize one string and zero-pad it at the end to ``max_len`` ids."""
        sequence = tokenizer.texts_to_sequences([text])
        return pad_sequences(sequence, maxlen=max_len, dtype='int32', padding='post')

    def __getitem__(self, i):
        """Return ``(encoder_seq, decoder_inp_seq, decoder_out_seq)`` for pair ``i``.

        Results are kept in locals rather than stored on ``self`` so that
        lookups do not leave per-item state behind on the shared object.
        """
        encoder_seq = self._encode(self.tokenizer_i, self.encoder_inps[i], self.max_len_enc)
        decoder_inp_seq = self._encode(self.tokenizer_o, self.decoder_inps[i], self.max_len_dec)
        decoder_out_seq = self._encode(self.tokenizer_o, self.decoder_outs[i], self.max_len_dec)
        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that groups ``Dataset`` samples into batches.

    Each batch is ``([encoder_batch, decoder_input_batch], decoder_target_batch)``.
    Only full batches are served: ``__len__`` floors the sample count by
    ``batch_size``, so trailing samples that do not fill a batch are skipped.

    Args:
        dataset: Object exposing ``encoder_inps`` and integer indexing that
            returns three arrays per sample.
        batch_size: Number of samples per batch.
    """

    def __init__(self, dataset, batch_size=1):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        """Assemble batch number ``i``."""
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # Read sample positions through self.indexes so that the permutation
        # drawn in on_epoch_end actually changes which samples form a batch.
        data = [self.dataset[j] for j in self.indexes[start:stop]]

        # Every sample part has a leading axis of size 1; stacking on axis 1
        # and squeezing axis 0 yields one (batch, length) array per part.
        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):
        """Number of full batches per epoch."""
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch."""
        self.indexes = np.random.permutation(self.indexes)

In [None]:
# Wrap both splits; encoder and decoder sequences are padded to 110 ids and
# served in batches of 512.
# NOTE(review): preprocessing keeps texts of up to 109 characters and then adds
# two marker characters, so decoder strings can reach 111 characters -- one more
# than max_len_dec=110. Confirm how pad_sequences truncates such rows.
train_dataset = Dataset(train, tokenizer_i, tokenizer_o, 110, 110)
test_dataset  = Dataset(test, tokenizer_i, tokenizer_o, 110, 110)

train_dataloader = Dataloder(train_dataset, batch_size=512)
test_dataloader = Dataloder(test_dataset, batch_size=512)

# Sanity check: shapes of encoder input, decoder input and decoder target for
# the first batch of each loader.
print('Train Dataloader:', train_dataloader[0][0][0].shape, train_dataloader[0][0][1].shape, train_dataloader[0][1].shape)
print('Test Dataloader:', test_dataloader[0][0][0].shape, test_dataloader[0][0][1].shape, test_dataloader[0][1].shape)

Train Dataloader: (512, 110) (512, 110) (512, 110)
Test Dataloader: (512, 110) (512, 110) (512, 110)


## Model Building

In [None]:
class Encoder(tf.keras.Model):
    '''
    Encoder half of the sequence-to-sequence model.

    Embeds a batch of input id sequences and feeds them through one LSTM.
    Returns the LSTM's per-step outputs together with its final hidden state
    and final cell state.
    '''
    def __init__(self, in_vocab_size, embedding_dim, enc_units, input_length, name='Encoder'):
        super().__init__(name=name)
        # Only hyper-parameters are stored here; the layers are created in build().
        self.enc_units = enc_units
        self.input_length = input_length
        self.embedding_dim = embedding_dim
        self.in_vocab_size = in_vocab_size

    def build(self, input_shape):
        # Id 0 is treated as padding and masked out downstream (mask_zero=True).
        self.embedding = Embedding(
            input_dim=self.in_vocab_size,
            output_dim=self.embedding_dim,
            input_length=self.input_length,
            mask_zero=True,
            name="Encoder_Embedding",
        )
        self.lstm = LSTM(
            self.enc_units,
            return_state=True,
            return_sequences=True,
            name="Encoder_LSTM",
        )

    def call(self, input_sentences, training=True):
        # `training` is accepted but not forwarded to the layers.
        embedded = self.embedding(input_sentences)
        outputs, state_h, state_c = self.lstm(embedded)
        return outputs, state_h, state_c

#-------------------------------------------------------------------------------------------------------------------------------------
class Decoder(tf.keras.Model):
    '''
    Decoder half of the sequence-to-sequence model.

    Embeds a batch of target id sequences and runs them through one LSTM that
    starts from the supplied initial states. Returns the LSTM's per-step
    outputs together with its final hidden state and final cell state.
    '''
    def __init__(self, out_vocab_size, embedding_dim, dec_units, input_length, name='Decoder'):
        super().__init__(name=name)
        # Only hyper-parameters are stored here; the layers are created in build().
        self.input_length = input_length
        self.dec_units = dec_units
        self.embedding_dim = embedding_dim
        self.out_vocab_size = out_vocab_size

    def build(self, input_shape):
        # Id 0 is treated as padding and masked out downstream (mask_zero=True).
        self.embedding = Embedding(
            input_dim=self.out_vocab_size,
            output_dim=self.embedding_dim,
            input_length=self.input_length,
            mask_zero=True,
            name="Decoder_Embedding",
        )
        self.lstm = LSTM(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            name="Decoder_LSTM",
        )

    def call(self, target_sentences, initial_states):
        embedded = self.embedding(target_sentences)
        outputs, state_h, state_c = self.lstm(embedded, initial_state=initial_states)
        return outputs, state_h, state_c

#-------------------------------------------------------------------------------------------------------------------------------------
class Encoder_Decoder(tf.keras.Model):
    '''
    Full sequence-to-sequence model: Encoder -> Decoder -> per-step softmax.

    `in_vocab_size` and `out_vocab_size` are the numbers of distinct tokens.
    One extra slot is added wherever ids are consumed or produced, because
    token ids start at 1 and id 0 is the padding value.
    '''

    def __init__(self, encoder_inputs_length, decoder_inputs_length, in_vocab_size, out_vocab_size, embedding_dim, enc_units, dec_units, name='Encoder-Decoder'):
        super().__init__(name=name)
        self.encoder = Encoder(in_vocab_size=in_vocab_size+1, embedding_dim=embedding_dim, enc_units=enc_units, input_length=encoder_inputs_length)
        self.decoder = Decoder(out_vocab_size=out_vocab_size+1, embedding_dim=embedding_dim, dec_units=dec_units, input_length=decoder_inputs_length)
        # Target ids range over 0..out_vocab_size, so the classifier needs
        # out_vocab_size + 1 classes. With only out_vocab_size units the
        # highest token id is outside the class range expected by sparse
        # categorical cross-entropy.
        self.dense   = Dense(out_vocab_size + 1, activation='softmax', name='Dense')

    def call(self, data):
        # data is the pair (encoder ids, decoder input ids).
        encoder_ids, decoder_ids = data[0], data[1]

        # Only the encoder's final states are used; they seed the decoder LSTM.
        _, encoder_h, encoder_c = self.encoder(encoder_ids)
        decoder_output, _, _ = self.decoder(decoder_ids, [encoder_h, encoder_c])
        return self.dense(decoder_output)

#-------------------------------------------------------------------------------------------------------------------------------------
def build_model_lstm(name):
    """Create the LSTM encoder-decoder with this notebook's fixed settings.

    Sequence lengths of 110, 300-dimensional embeddings and 100 LSTM units on
    each side; vocabulary sizes are read from the module-level
    ``vocab_size_input`` / ``vocab_size_output``.

    Args:
        name: Name given to the Keras model.

    Returns:
        An ``Encoder_Decoder`` instance (not yet compiled).
    """
    settings = dict(
        encoder_inputs_length=110,
        decoder_inputs_length=110,
        in_vocab_size=vocab_size_input,
        out_vocab_size=vocab_size_output,
        embedding_dim=300,
        enc_units=100,
        dec_units=100,
    )
    return Encoder_Decoder(name=name, **settings)

In [None]:
def train_model(model, model_name):
    """Compile ``model``, fit it on the notebook's dataloaders and print its summary.

    Training uses Adam with sparse categorical cross-entropy for up to 20
    epochs, validating on ``test_dataloader``. Early stopping watches
    ``val_loss`` (patience 3, best weights restored); a reduce-on-plateau
    learning-rate callback is attached as well.

    Args:
        model: The Keras model to train.
        model_name: Accepted but not used by this function.
    """
    callbacks = [
        EarlyStopping(
            patience=3,
            verbose=1,
            min_delta=0.001,
            monitor='val_loss',
            mode='min',
            restore_best_weights=True,
        ),
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.99, patience=100),
    ]

    # Whole batches per epoch, matching the dataloaders' batch size of 512.
    steps_train = train.shape[0] // 512
    steps_val = test.shape[0] // 512

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss='sparse_categorical_crossentropy',
    )

    with tf.device('/device:GPU:0'):
        model.fit(
            train_dataloader,
            steps_per_epoch=steps_train,
            epochs=20,
            validation_data=test_dataloader,
            validation_steps=steps_val,
            callbacks=callbacks,
        )

    model.summary()

## Model Training

In [None]:
# Build a fresh encoder-decoder and train it; train_model prints the layer
# summary once fitting has finished.
model = build_model_lstm(name='LSTM_Encoder-Decoder_Scratch')
train_model(model, 'LSTM_Encoder-Decoder_Scratch')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Restoring model weights from the end of the best epoch.
Epoch 00003: early stopping
Model: "LSTM_Encoder-Decoder_Scratch"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder (Encoder)            multiple                  183200    
_________________________________________________________________
Decoder (Decoder)            multiple                  181100    
_________________________________________________________________
Dense (Dense)                multiple                  6868      
Total params: 371,168
Trainable params: 371,168
Non-trainable params: 0
_________________________________________________________________
