In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
# import seaborn as sns
import pandas as pd
import re
import os
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import datetime

In [2]:
data=pd.read_csv('sms_eng1.csv')

In [3]:
data=pd.read_csv('sms_eng1.csv')

In [4]:
from sklearn.model_selection import train_test_split
train, validation = train_test_split(data, test_size=0.01)

In [5]:
train.iloc[0]['english_inp']= str(train.iloc[0]['english_inp'])+' <end>'
train.iloc[0]['english_out']= str(train.iloc[0]['english_out'])+' <end>'

In [6]:
train.head()

Unnamed: 0,sms,english_inp,english_out
629,hmmm i dunno if she will but i dun thk shell b...,<start> hmm i dont know if she will but i dont...,hmm i dont know if she will but i dont think s...
1774,kenmy hp num is 0165460953,<start> ken my handphone number is 0165460953,ken my handphone number is 0165460953 <end>
1096,hey meet at 11 on msn,<start> hey meet at 11 on msn,hey meet at 11 on msn <end>
925,daddy going out tonight come home urself buy ...,<start> daddy is going out tonight you come ho...,daddy is going out tonight you come home yours...
1804,tt pests fathers hp laden u come faster ah,<start> that pests fathers handphone then you ...,that pests fathers handphone then you come fas...


In [7]:
tokenizer_sms = Tokenizer(oov_token=True)
tokenizer_sms.fit_on_texts(train["sms"].values)
tokenizer_eng=Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_eng.fit_on_texts(train['english_inp'].values)
vocab_size_eng=len(tokenizer_eng.word_index.keys())
print(vocab_size_eng)
vocab_size_sms=len(tokenizer_sms.word_index.keys())
print(vocab_size_sms)

3080
4320


In [10]:
import pickle
with open('drive/MyDrive/glove_vectors', 'rb') as f:
    glove = pickle.load(f)
    glove_words =  set(glove.keys())
from numpy import zeros
vocab = len(tokenizer_eng.word_index)+1
essay_mat = zeros((vocab, 300))
for word, i in tokenizer_eng.word_index.items():
    if word in glove_words:
        vector = glove[word]
        essay_mat[i] = vector
vocab1 = len(tokenizer_sms.word_index)+1
essay_mat1 = zeros((vocab1, 300))
for word, i in tokenizer_sms.word_index.items():
    if word in glove_words:
        vector = glove[word]
        essay_mat1[i] = vector

In [11]:
class Encoder(tf.keras.Model):
    '''
    Encoder model -- That takes a input sequence and returns output sequence
    '''

    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):

        #Initialize Embedding layer
        #Intialize Encoder LSTM layer
        super().__init__()
        self.vocab_size = inp_vocab_size
        self.embedding_dim = embedding_size
        self.input_length = input_length
        self.enc_units= lstm_size
        self.lstm_output = 0
        self.lstm_state_h=0
        self.lstm_state_c=0

    def build(self,input_shape):
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, input_length=self.input_length,trainable=False,weights=[essay_mat1] ,name="embedding_layer_encoder")
        self.lstm = LSTM(self.enc_units, return_state=True, return_sequences=True, name="Encoder_LSTM")


    def call(self,input_sequence,training=True):
        
      '''
          This function takes a sequence input and the initial states of the encoder.
          Pass the input_sequence input to the Embedding layer, Pass the embedding layer ouput to encoder_lstm
          returns -- All encoder_outputs, last time steps hidden and cell state
      '''
      input_embedd                           = self.embedding(input_sequence)
      self.lstm_output, self.lstm_state_h,self.lstm_state_c = self.lstm(input_embedd)
      return self.lstm_output, self.lstm_state_h,self.lstm_state_c    
    
    def initialize_states(self,batch_size):
      '''
      Given a batch size it will return intial hidden state and intial cell state.
      If batch size is 32- Hidden state is zeros of size [32,lstm_units], cell state zeros is of size [32,lstm_units]
      '''
      state_h=np.zeros([batch_size,self.enc_units])
      state_c=np.zeros([batch_size,self.enc_units])
      return state_h,state_c


In [12]:
class Attention(tf.keras.layers.Layer):
  '''
    Class the calculates score based on the scoring_function using Bahdanu attention mechanism.
  '''
  def __init__(self,scoring_function, att_units):

    super().__init__()
    self.scoring_function=scoring_function
    self.au=att_units
    # Please go through the reference notebook and research paper to complete the scoring functions

    if self.scoring_function=='dot':
      # Intialize variables needed for Dot score function here
      self.sm=tf.keras.layers.Softmax(axis=1)
      
    if scoring_function == 'general':
      # Intialize variables needed for General score function here
      self.d=Dense(self.au)
      self.sm=tf.keras.layers.Softmax(axis=1)
      
    elif scoring_function == 'concat':
      # Intialize variables needed for Concat score function here
      self.sm=tf.keras.layers.Softmax(axis=1)
      self.d1=Dense(self.au)
      self.d2=Dense(1)
      self.d3=Dense(self.au)
  
  
  def call(self,decoder_hidden_state,encoder_output):
    '''
      Attention mechanism takes two inputs current step -- decoder_hidden_state and all the encoder_outputs.
      * Based on the scoring function we will find the score or similarity between decoder_hidden_state and encoder_output.
        Multiply the score function with your encoder_outputs to get the context vector.
        Function returns context vector and attention weights(softmax - scores)
    '''
    
    if self.scoring_function == 'dot':
        # Implement Dot score function here
        decoder_hidden_state=tf.keras.layers.Reshape((decoder_hidden_state.shape[1],1))(decoder_hidden_state)
        dot_product=tf.keras.layers.Dot(axes=(2,1))([encoder_output,decoder_hidden_state])
        weight=self.sm(dot_product)
        con_vec=tf.keras.layers.Dot(axes=(1,1))([weight,encoder_output])
        con_vec=tf.math.reduce_sum(con_vec,axis=1)
        return con_vec,weight
    elif self.scoring_function == 'general':
        # Implement General score function here
        decoder_hidden_state=tf.keras.layers.Reshape((decoder_hidden_state.shape[1],1))(decoder_hidden_state)
        d1=tf.keras.layers.Dot(axes=(2,1))([encoder_output,decoder_hidden_state])
        weight=self.sm(d1)
        con_vec=tf.keras.layers.Dot(axes=(1,1))([weight,encoder_output])
        con_vec=tf.math.reduce_sum(con_vec,axis=1)
        return con_vec,weight
    elif self.scoring_function == 'concat':
        # Implement General score function here
        temp=self.d2(tf.keras.activations.tanh(self.d1(encoder_output)+self.d3(tf.expand_dims(decoder_hidden_state,1))))
        weight=self.sm(temp)
        con_vec=tf.keras.layers.Dot(axes=(1,1))([weight,encoder_output])
        con_vec=tf.math.reduce_sum(con_vec,axis=1)
        return con_vec,weight
    
    

In [13]:
class OneStepDecoder(tf.keras.Model):
  def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
      super().__init__()
      # Initialize decoder embedding layer, LSTM and any other objects needed
      self.vocab_size=tar_vocab_size
      self.embedding_size=embedding_dim
      self.input_length=input_length
      self.dec_units=dec_units
      self.score_fun=score_fun
      self.au=att_units
      self.embedding=Embedding(input_dim=self.vocab_size,output_dim=300,input_length=self.input_length,trainable=False,weights=[essay_mat],mask_zero=True,name='Embedding_layer_decoder')
      self.lstm_layer=LSTM(units=self.dec_units,return_sequences=True,return_state=True,name='Lstm_decoder')
      self.dense_layer=Dense(self.vocab_size)
      self.attention_layer=Attention(self.score_fun,self.au)

  def call(self,input_to_decoder, encoder_output, state_h,state_c):
    '''
        One step decoder mechanisim step by step:
      A. Pass the input_to_decoder to the embedding layer and then get the output(batch_size,1,embedding_dim)
      B. Using the encoder_output and decoder hidden state, compute the context vector.
      C. Concat the context vector with the step A output
      D. Pass the Step-C output to LSTM/GRU and get the decoder output and states(hidden and cell state)
      E. Pass the decoder output to dense layer(vocab size) and store the result into output.
      F. Return the states from step D, output from Step E, attention weights from Step -B
    '''

    con_vec,weight=self.attention_layer(state_h,encoder_output)
    target_embedding=self.embedding(input_to_decoder)
    temp=tf.expand_dims(con_vec,1)
    concat=tf.concat([target_embedding,temp],axis=2)
    out,hidden,cell=self.lstm_layer(concat)
    out = tf.reshape(out, (-1, out.shape[2]))
    out=self.dense_layer(out)      
    return out,hidden,cell,weight,con_vec


In [14]:
class Decoder(tf.keras.Model):
    def __init__(self,out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
      super().__init__()
      #Intialize necessary variables and create an object from the class onestepdecoder
      self.vocab_size=out_vocab_size
      self.embedding_dim=embedding_dim
      self.input_length=input_length
      self.dec_units=dec_units
      self.score_fun=score_fun
      self.au=att_units
      self.osd=OneStepDecoder(self.vocab_size,self.embedding_dim,self.input_length,self.dec_units,self.score_fun,self.au)

        
    def call(self, input_to_decoder,encoder_output,decoder_hidden_state,decoder_cell_state ):

        #Initialize an empty Tensor array, that will store the outputs at each and every time step
        #Create a tensor array as shown in the reference notebook
        
        #Iterate till the length of the decoder input
            # Call onestepdecoder for each token in decoder_input
            # Store the output in tensorarray
        # Return the tensor array
        
        temp = tf.TensorArray(tf.float32,size=len(input_to_decoder[0]),name='empty_tensor')
        for j in range(len(input_to_decoder[0])):
          out_one, hidden, cell,weight,con_vec = self.osd(input_to_decoder[:,j:j+1],encoder_output,decoder_hidden_state,decoder_cell_state)
          temp=temp.write(j,out_one)
        temp=tf.transpose(temp.stack(),[1,0,2])
        return temp


        
    

In [15]:
class encoder_decoder(tf.keras.Model):
  def __init__(self,int_len,out_len,  score_fun, att_units,batch_size):
    super().__init__()
    #Intialize objects from encoder decoder
    self.int_len=int_len
    self.out_len=out_len
    self.score=score_fun
    self.au=att_units
    self.batch_size=batch_size
    self.encoder = Encoder(inp_vocab_size=vocab_size_sms+1, embedding_size=300, input_length=self.int_len, lstm_size=256)
    self.decoder = Decoder(out_vocab_size=vocab_size_eng+1, embedding_dim=300,input_length=self.out_len,dec_units=256,score_fun=self.score,att_units=self.au)
    self.encoder_state_h,self.encoder_state_c=self.encoder.initialize_states(self.batch_size)
  def call(self,data):
    #Intialize encoder states, Pass the encoder_sequence to the embedding layer
    # Decoder initial states are encoder final states, Initialize it accordingly
    # Pass the decoder sequence,encoder_output,decoder states to Decoder
    # return the decoder output
    encoder_output, encoder_hidden, encoder_cell = self.encoder(data[0],[self.encoder_state_h,self.encoder_state_c])
    output                     = self.decoder(data[1],encoder_output,encoder_hidden,encoder_cell)
    return output 



In [16]:
class Dataset:
    def __init__(self, data, tokenizer_sms, tokenizer_eng, max_len):
        self.encoder_inps = data['sms'].values
        self.decoder_inps = data['english_inp'].values
        self.decoder_outs = data['english_out'].values
        self.tokenizer_eng = tokenizer_eng
        self.tokenizer_sms = tokenizer_sms
        self.max_len = max_len

    def __getitem__(self, i):
        self.encoder_seq = self.tokenizer_sms.texts_to_sequences([self.encoder_inps[i]]) # need to pass list of values
        self.decoder_inp_seq = self.tokenizer_eng.texts_to_sequences([self.decoder_inps[i]])
        self.decoder_out_seq = self.tokenizer_eng.texts_to_sequences([self.decoder_outs[i]])

        self.encoder_seq = pad_sequences(self.encoder_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_inp_seq = pad_sequences(self.decoder_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_out_seq = pad_sequences(self.decoder_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        return self.encoder_seq, self.decoder_inp_seq, self.decoder_out_seq

    def __len__(self): # your model.fit_gen requires this function
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):    
    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))


    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data_get = []
        for j in range(start, stop):
            data_get.append(self.dataset[j])

        batch = [np.squeeze(np.stack(samples, axis=1),axis=0) for samples in zip(*data_get)]
        # we are creating data like ([italian, english_inp], english_out) these are already converted into seq
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):  # your model.fit_gen requires this function
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [17]:
train_dataset = Dataset(train, tokenizer_sms, tokenizer_eng, 60)
test_dataset  = Dataset(validation, tokenizer_sms, tokenizer_eng, 60)

train_dataloader = Dataloder(train_dataset, batch_size=64)
test_dataloader = Dataloder(test_dataset, batch_size=64)


print(train_dataloader[0][0][0].shape, train_dataloader[0][0][1].shape, train_dataloader[0][1].shape)

(64, 60) (64, 60) (64, 60)


In [None]:
%load_ext tensorboard
log_dir="log/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir,histogram_freq=1, write_graph=True,write_grads=True)
#Create an object of encoder_decoder Model class, 
# Compile the model and fit the model
model  = encoder_decoder(int_len=60,out_len=60,score_fun='dot',att_units=64,batch_size=64)
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer,loss='sparse_categorical_crossentropy')
train_steps=train.shape[0]//64
valid_steps=validation.shape[0]//64
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=150, validation_data=test_dataloader, validation_steps=valid_steps,callbacks=[tensorboard])
model.summary()


The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/

In [18]:

def predict(input_sentence,scoring_fun):

  '''
  A. Given input sentence, convert the sentence into integers using tokenizer used earlier
  B. Pass the input_sequence to encoder. we get encoder_outputs, last time step hidden and cell state
  C. Initialize index of <start> as input to decoder. and encoder final states as input_states to onestepdecoder.
  D. till we reach max_length of decoder or till the model predicted word <end>:
         predictions, input_states, attention_weights = model.layers[1].onestepdecoder(input_to_decoder, encoder_output, input_states)
         Save the attention weights
         And get the word using the tokenizer(word index) and then store it in a string.
  E. Call plot_attention(#params)
  F. Return the predicted sentence
  '''
  encoder_sms=tokenizer_sms.texts_to_sequences([input_sentence])
  encoder_pad=pad_sequences(encoder_sms,maxlen=60,padding='post',dtype='int32')
  encoder_out,encoder_hidden,encoder_cell=model.layers[0](encoder_pad)
  index=tokenizer_eng.word_index['<start>']
  index=np.reshape(index,(1,1))
  out=[]
  for j in range(60):
    output, h, c, weight, con_vec=model.layers[1].osd(index, encoder_out, encoder_hidden, encoder_cell)
    decoder_out=model.layers[1](index,encoder_out,encoder_hidden,encoder_cell)
    prob=np.argmax(decoder_out)
    index=np.reshape(prob,(1,1))
    if prob!=0:
        out.append(tokenizer_eng.index_word[prob])
    if prob!=0:
        if tokenizer_eng.index_word[prob]=='<end>':
            break
  return ' '.join(out)

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [None]:

from nltk.translate import bleu_score
sms_inp=validation['sms'].values[:20]
eng_inp=validation['english_out'].values[:20]
score=[]
for i in range(20):
    prediction=predict(sms_inp[i],'dot')
    print(len(prediction))
    print('sms:',sms_inp[i])
    print('actual:',eng_inp[i])
    print('prediction:',prediction)
    score.append(bleu_score.sentence_bleu(eng_inp[i],prediction))
print('bleu_score:',np.average(score))

305
sms: wanna come sit with us at right row 23 corner
actual: do you want to come and sit with us at right row 23 corner <end>
prediction: difference head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head
305
sms: help me find millians mem card thk its on e dining table
actual: help me find millians mem card think its on the dining table <end>
prediction: difference head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head head
305
sms: she alone lah muz b w somebody meh
actual: she is alone she must be with somebody <end