# Refer to the notebook on Siamese Networks and Batch Normalisation before using this notebook

# Why [Skip Connections](https://chatbotslife.com/resnets-highwaynets-and-densenets-oh-my-9bb15918ee32)


## Allow for training deeper networks.

## Effective gradient propagation to lower layers.

## ResNets

In [1]:
from IPython.display import Image
from IPython.core.display import HTML
# Use the raw-content URL: the github.com/.../blob/... address returns an HTML page
# rather than the PNG itself, so the figure would not render.
Image(url="https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/resnet.png")

In [2]:
import pandas as pd
import numpy as np
np.random.seed(1337)

In [3]:
import os

# Locations of the Quora question-pairs TSV and the GloVe vectors file.
# Each path can be overridden with an environment variable so the notebook runs on
# machines other than the author's; the original paths remain the defaults.
path_to_dataset = os.environ.get(
    "QUORA_DATASET_PATH",
    "/home/shagun/FortKnox/Quora/quora_duplicate_questions.tsv")
path_to_glove_vectors = os.environ.get(
    "GLOVE_VECTORS_PATH",
    "/home/shagun/models/GloVe/glove.6B.100d.txt")

In [4]:
# Padded question length and embedding vector size used throughout the notebook.
# See the exploratory notebook for how the max_len value was chosen.
max_len, embedding_dim = 50, 100

In [5]:
# Read the tab-separated question-pair file into a pandas DataFrame
df = pd.read_csv(path_to_dataset, sep="\t")
print("Total number of question pairs = ", str(df.shape[0]))

Total number of question pairs =  404290


In [6]:
# Draw five random rows to get a feel for the data
df_sample = df.sample(n=5)

df_sample

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
294721,294721,416674,416675,Who was the first person to draw the map of th...,What are some tricks to remember world map?,0
225481,225481,207353,333822,Why does hydrogen fluoride have a higher boili...,What is the boiling point of hydrogen flouride?,0
230318,230318,339814,339815,Which film will you watch on 12th August 2016:...,Which film will you watch this weekend: Rustom...,1
371157,371157,501762,501763,Is it better for health to drink a little bit ...,How do I really know if I'm an alcoholic or if...,0
147305,147305,232510,232511,What is the difference between evaporation and...,What is the main difference between something ...,0


## We will play with a very small sample of this dataset to save on time. Feel free to train the network on the entire data later.

In [7]:
# Keep a random subset of 10000 question pairs (this rebinds df to the subset)
df = df.sample(n=10000)

In [8]:
# Duplicate flags as a NumPy array of integers, one per question pair
duplicate_flags = df['is_duplicate'].map(int)
labels = np.asarray(list(duplicate_flags.values))

In [9]:
# Build parallel lists of the two questions (forced to str) and pair them up
first_question_list = [str(question) for question in df['question1']]
second_question_list = [str(question) for question in df['question2']]
question_list = list(zip(first_question_list, second_question_list))

In [10]:
# Number of question pairs kept after sampling
print(len(question_list))

10000


In [11]:
from IPython.display import Image
from IPython.core.display import HTML 
# Display the image at the URL below inline (the Keras/TensorFlow logo, going by the file name)
Image(url= "https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/keras-tensorflow-logo.jpg")
# Image taken from: https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html

In [49]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from utils.util import *
from keras.models import Model
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding, Input, GRU, Dense, Activation, Lambda, BatchNormalization
from keras.layers.merge import add, concatenate
from keras.optimizers import Adam

### Preprocess the data

In [50]:
# Fit one tokenizer on both question columns so they share a vocabulary
all_questions = first_question_list + second_question_list
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_questions)

In [51]:
# Convert each question list to word-id sequences padded/truncated to max_len
sequence1, sequence2 = (
    pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=max_len)
    for questions in (first_question_list, second_question_list)
)

In [52]:
# Both padded arrays must hold one row per question pair. Fail loudly on a mismatch
# instead of silently printing nothing and carrying on.
if len(sequence1) != len(sequence2):
    raise ValueError(
        "sequence1 and sequence2 differ in length: "
        + str(len(sequence1)) + " vs " + str(len(sequence2)))
print("Good to go")

Good to go


In [53]:
# Basic ML Preprocessing: shuffle the pairs, then hold out 30% of them for validation
indices = np.arange(len(sequence1))
np.random.shuffle(indices)
sequence1 = sequence1[indices]
sequence2 = sequence2[indices]
labels = labels[indices]
nb_validation_samples = int(0.3 * len(sequence1))

# Split on an explicit boundary index. Slicing with -nb_validation_samples breaks
# when that count is 0 (x[:-0] is empty and x[-0:] is the whole array), which
# happens for fewer than 4 samples.
nb_training_samples = len(sequence1) - nb_validation_samples

sequence1_train = sequence1[:nb_training_samples]
sequence2_train = sequence2[:nb_training_samples]
labels_train = labels[:nb_training_samples]
sequence1_val = sequence1[nb_training_samples:]
sequence2_val = sequence2[nb_training_samples:]
labels_val = labels[nb_training_samples:]

In [54]:
# Size of the training split
print("Number of training examples: " + str(sequence1_train.shape[0]))

Number of training examples: 7000


In [55]:
# Size of the validation split
print("Number of validation examples: " + str(sequence1_val.shape[0]))

Number of validation examples: 3000


### Load embeddings

In [56]:
# Load embeddings
# get_glove_embeddings is not defined in this notebook (presumably it comes from the
# utils.util star import). The result is only read through .get(word) when the
# embedding matrix is built, so it is presumably a word -> vector mapping — TODO confirm.
embeddings_index = get_glove_embeddings(path_to_glove_vectors)

### Preparing the embedding matrix which our model would use

In [57]:
# Build the (vocabulary size + 1) x embedding_dim matrix that the Embedding layer uses.
# Row r holds the vector of the word whose tokenizer index is r.
word_index = tokenizer.word_index
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # words not found in the embedding index keep their all-zeros row
        continue
    embedding_matrix[row] = vector

### Encode the question into a vector space using a Bidirectional GRU (or LSTM or whatever RNN you believe in)

In [58]:
def question_encoder():
    """Return the sentence encoder layer: a bidirectional GRU with 200 units.

    The two directions are merged with ``merge_mode='concat'`` and the layer is
    named ``"bidir_gru"``.
    """
    recurrent_layer = GRU(units=200)
    return Bidirectional(recurrent_layer, merge_mode='concat', name="bidir_gru")

### The next part is the core of this network and we will walk through it slowly

In [59]:
def create_question_network():
    """Build the question encoder network that create_network applies to both inputs.

    The network maps a padded sequence of word ids (length ``max_len``) to a
    feature vector: a non-trainable embedding layer initialised from
    ``embedding_matrix``, the bidirectional GRU from ``question_encoder``, then
    four Dense(64) + BatchNormalization stages. The last three stages add a skip
    connection from the previous stage's activation before the ReLU. The four
    stage activations are concatenated and batch-normalised to form the output.

    Reads the module-level ``max_len``, ``embedding_dim``, ``word_index`` and
    ``embedding_matrix``.

    Returns:
        A Keras ``Model`` from the int32 input layer to the normalised features.
    """
    # Create an input layer
    sequence_input = Input(shape=(max_len,), dtype='int32', name="input_layer")

    # Create an embedding layer; index 0 is padding and is masked out
    embedding_layer = Embedding(len(word_index) + 1,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                mask_zero=True,
                                trainable=False)

    # Use the embedding layer we just created
    embedded_sequences = embedding_layer(sequence_input)

    # Embeddings are for words, sentences use encoders
    encoded_question = question_encoder()(embedded_sequences)

    # Let's fully connect them (first stage has no skip connection)
    dense1 = Dense(64)(encoded_question)
    bn1 = BatchNormalization()(dense1)
    relu1 = Activation('relu')(bn1)

    # Make it deep
    dense2 = Dense(64)(relu1)
    bn2 = BatchNormalization()(dense2)
    # Skip connection
    res2 = add([relu1, bn2])
    relu2 = Activation('relu')(res2)

    # And deeper
    dense3 = Dense(64)(relu2)
    bn3 = BatchNormalization()(dense3)
    # Skip connection
    res3 = add([relu2, bn3])
    relu3 = Activation('relu')(res3)

    # Now we are in rhythm
    dense4 = Dense(64)(relu3)
    bn4 = BatchNormalization()(dense4)
    # Skip connection
    res4 = add([relu3, bn4])
    # Activate the residual sum (not bn4), so the fourth skip connection is
    # actually used, matching stages 2 and 3.
    relu4 = Activation('relu')(res4)

    features = concatenate([relu4, relu3, relu2, relu1])
    # Final BatchNormalization over the concatenated features
    output = BatchNormalization()(features)

    model = Model(inputs=sequence_input, outputs=output)

    return model

### Now we will make the siamese twin

In [60]:
def create_network():
    """Assemble the siamese model: one shared question network applied to two inputs.

    Both inputs have shape ``(max_len,)`` and pass through the same
    ``create_question_network()`` instance. The model output is the result of
    ``euclidean_distance`` applied to the two encoded questions.

    Returns:
        A Keras ``Model`` taking ``[input1, input2]`` and producing the distance.
    """
    # A single network instance is reused for both heads
    question_network = create_question_network()

    # One input per head of the network
    input1 = Input(shape=(max_len,))
    input2 = Input(shape=(max_len,))

    # Encode both inputs with the shared network
    processed_pair = [question_network(head_input) for head_input in (input1, input2)]

    # Distance between the two transformed inputs
    distance_layer = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)
    distance = distance_layer(processed_pair)

    return Model(inputs=[input1, input2], outputs=distance)

In [None]:
# Build the siamese model and compile it with the contrastive loss
model = create_network()
optimizer = Adam(lr=0.001, clipnorm=5)
model.compile(optimizer=optimizer, loss=contrastive_loss)

train_inputs = [sequence1_train, sequence2_train]
val_inputs = [sequence1_val, sequence2_val]

# Train one epoch at a time so that accuracy can be reported after every epoch
for i in range(10):
    model.fit(train_inputs, labels_train,
              validation_data=(val_inputs, labels_val),
              batch_size=128, epochs=1)

    model_labels_train = model.predict(train_inputs, batch_size=128)
    train_accuracy = compute_accuracy(model_labels_train, labels_train)
    print("Training accuracy: "+str(train_accuracy))

    model_labels_val = model.predict(val_inputs, batch_size=128)
    val_accuracy = compute_accuracy(model_labels_val, labels_val)
    print("Validation accuracy: "+str(val_accuracy))

Train on 7000 samples, validate on 3000 samples
Epoch 1/1
1536/7000 [=====>........................] - ETA: 79s - loss: 107.2517

In [None]:
# Boolean mask: True where the model's predicted distance for a validation pair is below 0.5
model_labels_val.ravel() < 0.5

In [None]:
# Fraction of validation pairs where (distance < 0.5) matches the label
predicted_close = model_labels_val.ravel() < 0.5
np.mean(predicted_close == labels_val)

In [None]:
# compute_accuracy takes the predictions and the true labels, as in the training
# loop; calling it with no arguments raises a TypeError.
compute_accuracy(model_labels_val, labels_val)