# Siamese Network

## Siamese Network measures similarity between two comparable items 

## Use cases
    
### [One-Shot Learning](https://hackernoon.com/one-shot-learning-with-siamese-networks-in-pytorch-8ddaab10340e)
### Are two photographs of the same person
### Are two questions paraphrases of each other

## General Architecture

In [1]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/siamese.jpeg")
# Image taken from: https://hackernoon.com/one-shot-learning-with-siamese-networks-in-pytorch-8ddaab10340e

## Important Features

### Similarity Vs Classification
### Weight sharing
### Feature representation

In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [3]:
path_to_dataset = "/home/shagun/FortKnox/Quora/quora_duplicate_questions.tsv"
path_to_glove_vectors = "/home/shagun/models/GloVe/glove.6B.100d.txt"

In [4]:
max_len = 50
embedding_dim = 100
# Refer the exploratory notebook to see how max_len value is arrived at

In [5]:
# Load the dataset into a pandas dataframe
df = pd.read_csv(path_to_dataset, delimiter="\t")
print("Total number of question pairs = ", str(len(df)))

Total number of question pairs =  404290


In [6]:
# Let us look at a sample of the dataset
df_sample = df.sample(5)

df_sample

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
8336,8336,16255,16256,"If someone has a domain name, but has not trad...",Are domain names trademarked?,0
393688,393688,21395,380773,How can I get venture capital funding for my c...,Chamath Palihapitiya: How do I get venture cap...,1
372110,372110,263922,502801,What challenges does a UX design student face ...,"When being interviewed for a UX design job, wh...",0
212223,212223,317257,317258,What is DIAC testing?,What is diac?,0
115276,115276,89327,52882,Why is education important to Jewish people? I...,Why is education valued in Jewish culture?,1


## We will play with a very small sample of this dataset to save on time. Feel free to train the network on the entire data later.

In [7]:
df = df.sample(1000)

In [8]:
labels = list(df['is_duplicate'].apply(lambda x: int(x)).values)
labels = np.asarray(labels)

In [9]:
# Create a list of all the question pairs
first_question_list = list(df['question1'].apply(lambda x: str(x)).values)
second_question_list = list(df['question2'].apply(lambda x: str(x)).values)
question_list = list(zip(first_question_list, second_question_list))

In [10]:
print(len(question_list))

1000


In [11]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/keras-tensorflow-logo.jpg")
# Image taken from: https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html

In [12]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from utils.util import *
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding, Input, GRU, Dense, Activation, Lambda, BatchNormalization
from keras.optimizers import Adam

Using TensorFlow backend.


### Preprocess the data

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(first_question_list + second_question_list)

In [14]:
sequence1 = pad_sequences(tokenizer.texts_to_sequences(first_question_list), maxlen=max_len)
sequence2 = pad_sequences(tokenizer.texts_to_sequences(second_question_list), maxlen=max_len)

In [15]:
if(len(sequence1) == len(sequence2)):
    print("Good to go")

Good to go


In [16]:
# Basic ML Preprocessing
indices = np.arange(len(sequence1))
np.random.shuffle(indices)
sequence1 = sequence1[indices]
sequence2 = sequence2[indices]
labels = labels[indices]
nb_validation_samples = int(0.3 * len(sequence1))

sequence1_train = sequence1[:-nb_validation_samples]
sequence2_train = sequence2[:-nb_validation_samples]
labels_train = labels[:-nb_validation_samples]
sequence1_val = sequence1[-nb_validation_samples:]
sequence2_val = sequence2[-nb_validation_samples:]
labels_val = labels[-nb_validation_samples:]

In [17]:
print("Number of training examples: " + str(len(sequence1_train)))

Number of training examples: 700


In [18]:
print("Number of validation examples: " + str(len(sequence1_val)))

Number of validation examples: 300


### Load embeddings

In [19]:
# Load embeddings
embeddings_index = get_glove_embeddings(path_to_glove_vectors)

### Preparing the embedding matrix which our model would use

In [20]:
# Preparing the embedding matrix which our model would use
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

### Encode the question into a vector space using a Bidirectional GRU (or LSTM or whatever RNN you believe in)

In [21]:
def question_encoder():
    return Bidirectional(GRU(units=200), merge_mode='concat', name="bidir_gru")

### The next part is the core of this network and we would walk through it slowly

In [22]:
def create_question_network():
    
#     Create an input layer
    sequence_input = Input(shape=(max_len,), dtype='int32', name="input_layer")
    
#     Create an embedding layer
    embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)
    
#     Use the embedding layer we just created
    embedded_sequences = embedding_layer(sequence_input)
    
#     Embeddings are for words, sentences uses encoders
    encoded_question = question_encoder()(embedded_sequences)
    
#     Lets fully connect them
    dense1 = Dense(128)(encoded_question)
    relu1 = Activation('relu')(dense1)
    
#    Make it deep
    dense2 = Dense(64)(relu1)
    relu2 = Activation('relu')(dense2)    

#     And Deeper
    dense3 = Dense(32)(relu2)
    relu3 = Activation('relu')(dense3)

#     Now we are in rythm
    dense4 = Dense(16)(relu3)
    tanh4 = Activation('relu')(dense4)
    
    output = BatchNormalization()(tanh4)
    
    model = Model(inputs=sequence_input, outputs=output)

    return model

### Now we will make the siamese twin

In [None]:
def create_network():
    # network definition
    question_network = create_question_network()
    
#     input to the first head of the network
    input1 = Input(shape=(max_len,))
    
#     input to the second head of the network
    input2 = Input(shape=(max_len,))
    
#     processing the first input
    processed1 = question_network(input1)
    
#     processing the second input
    processed2 = question_network(input2)
    
#     Computing the distance between the transformed inputs.
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed1, processed2])

    
    model = Model(inputs=[input1, input2], outputs=distance)
    return model

In [None]:
model = create_network()

optimizer = Adam(lr=0.001, clipnorm=5)
model.compile(loss=contrastive_loss, optimizer=optimizer)

for i in range(10):
    model.fit([sequence1_train, sequence2_train], labels_train,
       validation_data=([sequence1_val, sequence2_val], labels_val),
       batch_size=1, epochs=1)
    
    model_labels_train = model.predict([sequence1_train, sequence2_train], batch_size=128)
    print("Training accuracy: "+str(compute_accuracy(model_labels_train, labels_train)))
    
    model_labels_val = model.predict([sequence1_val, sequence2_val], batch_size=128)
    print("Validation accuracy: "+str(compute_accuracy(model_labels_val, labels_val)))

Train on 700 samples, validate on 300 samples
Epoch 1/1
Training accuracy: 0.632857142857
Validation accuracy: 0.643333333333
Train on 700 samples, validate on 300 samples
Epoch 1/1

In [None]:
model_labels_val.ravel() < 0.5

In [None]:
np.mean(np.equal(model_labels_val.ravel() < 0.5, labels_val))

In [None]:
compute_accuracy()