##  1. Import Dependencies


In [59]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import spacy

In [3]:
import tensorflow as tf

In [4]:
tf.enable_eager_execution()

## 2. Load the dataset

In [5]:
#define load method
def load_dataset(path, feature_cols, target_cols):
    """Read a CSV file and return its feature and target columns as arrays.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    feature_cols : list of str
        Names of the columns returned as features.
    target_cols : list of str
        Names of the columns returned as targets.

    Returns
    -------
    question_array : numpy.ndarray
        2-D array with one row per CSV row and one column per feature column.
    labels_array : numpy.ndarray
        2-D array with one row per CSV row and one column per target column.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    feature_cols = list(feature_cols)
    target_cols = list(target_cols)

    # keep_default_na=False: empty cells and strings such as 'NA' are read
    # verbatim instead of being converted to NaN.
    dataset = pd.read_csv(path, keep_default_na=False)

    # Fail fast on a misspelled column; building a DataFrame with
    # columns=[...] would silently create an all-NaN column instead.
    missing = [col for col in feature_cols + target_cols
               if col not in dataset.columns]
    if missing:
        raise KeyError('columns not found in {}: {}'.format(path, missing))

    question_array = dataset[feature_cols].to_numpy()
    labels_array = dataset[target_cols].to_numpy()

    return question_array, labels_array

In [6]:
#dataset load path
base_path = '.'
dataset_path =  base_path + '/question-pairs-dataset/questions.csv'

#dataset feature and target columns
features = ['question1', 'question2']
target = ['is_duplicate']

In [7]:
#load the dataset
x, y = load_dataset(dataset_path, features, target)

In [8]:
#check dataset shape in feature(x) and target(y)
print('Question Pair:',x.shape)
print('Question Similarity:',y.shape,'\n')

Question Pair: (404351, 2)
Question Similarity: (404351, 1) 



## 3. Split the Dataset

In [9]:
#define method to split dataset
def split_dataset(x, y, train_ratio):
    """Split features and targets into leading train rows and trailing test rows.

    The split is positional (no shuffling): the first
    ``int(num_rows * train_ratio)`` rows form the training set and the
    remaining rows form the test set.

    Parameters
    ----------
    x : array-like with ``.shape``
        Features; rows are indexed along the first axis.
    y : array-like with ``.shape``
        Targets; must have the same number of rows as ``x``.
    train_ratio : float
        Fraction of rows assigned to the training set, in ``[0, 1]``.

    Returns
    -------
    x_train, y_train, x_test, y_test

    Raises
    ------
    AssertionError
        If ``x`` and ``y`` have a different number of rows.
    ValueError
        If ``train_ratio`` lies outside ``[0, 1]``.
    """
    assert x.shape[0] == y.shape[0], 'different num rows in feature and target'

    # Out-of-range ratios would otherwise be accepted silently: a negative
    # ratio turns into a negative slice index, and a ratio above 1 leaves the
    # test set empty.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            'train_ratio must be between 0 and 1, got {}'.format(train_ratio))

    num_rows_split = int(x.shape[0] * train_ratio)

    x_train = x[:num_rows_split]
    x_test = x[num_rows_split:]

    y_train = y[:num_rows_split]
    y_test = y[num_rows_split:]

    return x_train, y_train, x_test, y_test

In [10]:
#define test_train split ratio
split_ratio = 0.8

In [11]:
#train-test split dataset
x_train, y_train, x_test, y_test = split_dataset(x, y, split_ratio)

In [12]:
#verify shape of train and test splits
print('x_train & y_train:',x_train.shape, ' & ', y_train.shape)
print('x_test & y_test:',x_test.shape, ' & ', y_test.shape)

x_train & y_train: (323480, 2)  &  (323480, 1)
x_test & y_test: (80871, 2)  &  (80871, 1)


## 4. Preprocessing the Input

### 4.1 Load Pre-trained Model

In [13]:
#define pretrained model loader
def read_pretrained_model(glove_path):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector
    components. Words are indexed in sorted order starting at 1, so index 0
    never refers to a word.

    Parameters
    ----------
    glove_path : str
        Path to the embedding text file.

    Returns
    -------
    word_to_index : dict
        Maps each word to its 1-based position in the sorted vocabulary.
    index_to_word : dict
        Inverse of ``word_to_index``.
    word_to_vec_map : dict
        Maps each word to its vector as a ``float32`` numpy array.
    """
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 rather than the platform default encoding
    # (the published GloVe text files are UTF-8 -- confirm for custom files).
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()

            # A blank line (e.g. trailing newline) has no word to index.
            if not fields:
                continue

            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float32)

    word_to_index = {}
    index_to_word = {}

    # The dict keys are the vocabulary; sorting them gives a deterministic
    # word <-> index assignment.
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        word_to_index[w] = i
        index_to_word[i] = w

    return word_to_index, index_to_word, word_to_vec_map

In [14]:
#glove vector path
glove_path = './glove_model/glove.6B.300d.txt'

In [15]:
#loading glove vector
word_to_index, index_to_word, word_to_vec_map = read_pretrained_model(glove_path)

In [16]:
#Test glove vectors
word = 'security'
index = 323224

print('Index of ',word,' in vocabulary is ', word_to_index[word])
print('Word of ',index,' in vocabulary is ', index_to_word[index])
print('Vector of ',word,' in vocabulary is\n ', word_to_vec_map[word])

Index of  security  in vocabulary is  323224
Word of  323224  in vocabulary is  security
Vector of  security  in vocabulary is
  [-1.8194e-01  1.3781e-01  3.9300e-02 -2.0317e-01  3.7706e-01 -1.5607e-02
  1.1759e-01  1.0152e+00 -1.4270e-01 -2.7698e+00  1.1016e-01  1.8549e-02
  4.4524e-01 -3.3648e-01 -5.3132e-01  3.3976e-01  2.6947e-01  4.7103e-02
 -3.3890e-01  2.1926e-03  2.1345e-01 -2.6047e-01  1.9542e-01 -6.3285e-01
 -5.0128e-01  4.4029e-01  2.2583e-02  4.8802e-01 -2.8056e-01  4.5661e-01
  3.9620e-01 -3.1185e-01 -5.3025e-01  4.5025e-01 -2.9634e-02 -3.3689e-01
 -1.3732e-01  1.6462e-01 -5.4839e-01 -6.0662e-01 -2.2178e-02 -6.3760e-02
  3.3564e-01  6.5757e-02 -4.0945e-01  1.6017e-01 -3.6239e-01  1.3025e-01
 -7.0111e-02  1.2693e-01 -2.9299e-01 -5.9315e-02 -4.3517e-01 -2.9694e-02
  4.2646e-01 -2.3699e-01 -3.3968e-01  1.0792e-01 -1.7100e-01 -4.3971e-01
  2.6197e-01  2.8372e-01 -2.4098e-01 -4.8820e-03 -3.1007e-01 -1.7094e-01
  2.5328e-01  2.9205e-01  4.8540e-01 -4.5817e-01 -9.7190e-02 -1.8966

### 4.2 Convert Input Text to Indices

In [17]:
#load spacy model
#only tokenization is used below (tokens are lower-cased and looked up),
#so the tagger, parser and NER components are disabled to avoid running
#them on every question
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

In [18]:
#Define method for sentence into indices 
def sentences_to_indices(X, word_to_index, max_len, tokenizer=None):
    """Convert an array of sentences into zero-padded arrays of word indices.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of strings; ``X[i][k]`` is the k-th sentence of example i.
    word_to_index : dict
        Maps a lower-cased token to its integer index.
    max_len : int
        Number of index slots per sentence. Longer sentences are truncated
        to their first ``max_len`` tokens; shorter ones are padded with 0.
    tokenizer : callable, optional
        Called with a sentence and must return an iterable of tokens; each
        token is converted with ``str(token).lower()``. Defaults to the
        module-level spaCy pipeline ``nlp``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], X.shape[1], max_len)``. Tokens missing
        from ``word_to_index`` are encoded as 0, the same value used for
        padding.
    """
    if tokenizer is None:
        tokenizer = nlp

    m = X.shape[0]
    num_sentences = X.shape[1]

    # np.zeros defaults to float64, which is kept for compatibility with
    # existing callers.
    X_indices = np.zeros((m, num_sentences, max_len))

    for i in range(m):
        for k in range(num_sentences):
            sentence_words = [str(word).lower() for word in tokenizer(X[i][k])]

            # Truncate so a sentence longer than max_len cannot index past
            # the end of the output array.
            for j, w in enumerate(sentence_words[:max_len]):
                X_indices[i, k, j] = word_to_index.get(w, 0)

    return X_indices

In [19]:
x_index_train = sentences_to_indices(x_train, word_to_index, max_len=500)

In [20]:
x_index_test = sentences_to_indices(x_test, word_to_index, max_len=500)

In [29]:
x_index_train.shape

(323480, 2, 500)

## 5. Define the Model

In [30]:
#Define pre-trained embedding layer
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Build a frozen Keras Embedding layer initialised with given vectors.

    Parameters
    ----------
    word_to_vec_map : dict
        Maps each word to its embedding vector (1-D numpy array).
    word_to_index : dict
        Maps each word to the row it occupies in the embedding matrix.

    Returns
    -------
    tf.keras.layers.Embedding
        Non-trainable layer with ``mask_zero=True`` whose weights are the
        supplied vectors.

    Raises
    ------
    ValueError
        If ``word_to_vec_map`` is empty.
    """
    if not word_to_vec_map:
        raise ValueError(
            'word_to_vec_map is empty; cannot infer the embedding dimension')

    # One extra row: row 0 stays all-zero and is not assigned to any word
    # (assumes word indices start at 1, as produced by read_pretrained_model).
    vocab_len = len(word_to_index) + 1

    # Take the dimension from any vector rather than from a specific word
    # that may be absent from the vocabulary.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_len, emb_dim))

    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = tf.keras.layers.Embedding(vocab_len, emb_dim, trainable=False, mask_zero=True)

    # The layer must be built before weights can be assigned to it.
    embedding_layer.build((None,))

    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [31]:
#define text-encoding model
class EncodingModel(tf.keras.Model):
    """Shared-weight (siamese-style) encoder that scores a pair of questions.

    Both questions pass through the same embedding, two stacked LSTMs and a
    dense projection. The absolute difference of the two encodings is fed to
    a ReLU dense layer and a single sigmoid output unit.

    The embedding layer is built from the module-level ``word_to_vec_map``
    and ``word_to_index``, which must be defined before instantiation.
    """

    def __init__(self):
        super(EncodingModel, self).__init__()
        self.embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)

        self.lstm_layer1 = tf.keras.layers.LSTM(128, return_sequences = True)
        self.dropout = tf.keras.layers.Dropout(0.5)

        self.lstm_layer2 = tf.keras.layers.LSTM(128, return_sequences = False)

        self.dense = tf.keras.layers.Dense(128)

        self.dense_combined = tf.keras.layers.Dense(512, activation='relu')
        self.classification = tf.keras.layers.Dense(1, activation='sigmoid')

    def _encode(self, token_indices, training=None):
        """Encode one batch of index sequences with the shared layers."""
        x = self.embedding_layer(token_indices)
        x = self.lstm_layer1(x)
        x = self.dropout(x, training=training)
        x = self.lstm_layer2(x)
        x = self.dropout(x, training=training)
        return self.dense(x)

    def call(self, inputs, training=None):
        """Return the sigmoid score for each question pair.

        Parameters
        ----------
        inputs : tensor
            Indexed as ``inputs[:, 0, :]`` (question 1) and
            ``inputs[:, 1, :]`` (question 2).
        training : bool, optional
            Forwarded to the dropout layers so dropout is only active in
            training mode; ``None`` lets Keras decide.
        """
        x = self._encode(inputs[:, 0, :], training)            #For Question1
        y = self._encode(inputs[:, 1, :], training)            #For Question2

        z = tf.math.abs(tf.subtract(x, y))
        z = self.dense_combined(z)
        z = self.classification(z)

        return z

In [36]:
with tf.device('/CPU:0'):
    encoding_model = EncodingModel()

In [37]:
encoding_model.compile(loss = 'binary_crossentropy', optimizer = tf.train.AdamOptimizer(0.0025), metrics=['accuracy'])

## 6. Train the Model

In [38]:
#define checkpoint path
checkpoint_path = './model_files/weights.{epoch:02d}-{val_loss:.2f}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

#make sure the target directory exists before any checkpoint is written
os.makedirs(checkpoint_dir, exist_ok=True)

In [41]:
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, verbose=1, save_weights_only=True,
    # Save weights, every epoch
    period=1)

In [42]:
# Train for 6 epochs, saving weights through cp_callback.
# NOTE(review): validation_data is the same test split that the prediction
# section later evaluates on, so val_loss is not independent of the final
# reported metrics -- consider a separate validation split.
encoding_model.fit(x_index_train, y_train, batch_size=256, epochs=6, verbose=1, callbacks= [cp_callback], validation_data= (x_index_test, y_test))

Train on 323480 samples, validate on 80871 samples
Epoch 1/6
Epoch 00001: saving model to ./model_files/weights.01-0.52.ckpt
Instructions for updating:
Use tf.train.CheckpointManager to manage checkpoints rather than manually editing the Checkpoint proto.
Epoch 2/6
Epoch 00002: saving model to ./model_files/weights.02-0.51.ckpt
Epoch 3/6
Epoch 00003: saving model to ./model_files/weights.03-0.49.ckpt
Epoch 4/6
Epoch 00004: saving model to ./model_files/weights.04-0.51.ckpt
Epoch 5/6
Epoch 00005: saving model to ./model_files/weights.05-0.52.ckpt
Epoch 6/6
Epoch 00006: saving model to ./model_files/weights.06-0.49.ckpt


<tensorflow.python.keras.callbacks.History at 0x7f1022271f98>

In [43]:
encoding_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      multiple                  120000300 
_________________________________________________________________
lstm_2 (LSTM)                multiple                  219648    
_________________________________________________________________
dropout_1 (Dropout)          multiple                  0         
_________________________________________________________________
lstm_3 (LSTM)                multiple                  131584    
_________________________________________________________________
dense_3 (Dense)              multiple                  16512     
_________________________________________________________________
dense_4 (Dense)              multiple                  66048     
_________________________________________________________________
dense_5 (Dense)              multiple                  513       
Total para

## 7. Prediction on Test Set

In [44]:
y_predict = encoding_model.predict(x_index_test)

In [50]:
# Convert sigmoid scores to 0/1 labels: scores above 0.55 become 1.
# NOTE(review): the 0.55 threshold is hard-coded; no tuning step for it is
# visible in this notebook -- confirm how it was chosen.
y_predict_labels = (y_predict > 0.55).astype(int)

In [51]:
y_predict_labels

array([[0],
       [1],
       [1],
       ...,
       [0],
       [0],
       [1]])

## 8. Classification Metric

In [54]:
y_test.shape

(80871, 1)

In [63]:
conf_mat = metrics.confusion_matrix(y_test, y_predict_labels)

In [64]:
f1_score = metrics.f1_score(y_test, y_predict_labels)

In [65]:
conf_mat

array([[39788, 11987],
       [ 4673, 24423]])

In [66]:
f1_score

0.745672152169267