# Siamese Network

## A Siamese network measures the similarity between two comparable items

## Use cases
    
### [One-Shot Learning](https://hackernoon.com/one-shot-learning-with-siamese-networks-in-pytorch-8ddaab10340e)
### Are two photographs of the same person?
### Are two questions paraphrases of each other?

## General Architecture

In [38]:
![Siamese Network](https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/siamese.jpeg)

FileNotFoundError: [Errno 2] No such file or directory: 'https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/siamese.jpeg'

In [None]:
## Important Features

### Similarity vs. Classification
### Weight sharing
### Feature representation

In [5]:
from collections import Counter
from keras.layers import Embedding, Input, GRU, Dense, BatchNormalization, Activation, merge, Merge, add, concatenate, Lambda
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from utils.util import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [6]:
# Input file locations. Each one can be overridden with an environment
# variable so the notebook also runs outside the original author's machine;
# when the variable is unset the original hard-coded path is used.
import os

path_to_dataset = os.environ.get(
    "QUORA_DATASET_PATH",
    "/home/shagun/FortKnox/Quora/quora_duplicate_questions.tsv")
path_to_glove_vectors = os.environ.get(
    "GLOVE_VECTORS_PATH",
    "/home/shagun/models/GloVe/glove.6B.100d.txt")

In [7]:
# max_len: number of token ids every question is padded/truncated to.
# embedding_dim: width of each word vector (the GloVe file above is the 100d one).
max_len, embedding_dim = 50, 100

In [8]:
# Read the tab-separated question-pair file into a DataFrame and report how
# many pairs it contains.
df = pd.read_csv(path_to_dataset, sep="\t")
print("Total number of question pairs = ", len(df))

Total number of question pairs =  404290


In [18]:
# Peek at five randomly chosen rows. No random_state is given, so the rows
# shown change from run to run.
df_sample = df.sample(n=5)
df_sample

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
217940,217940,8040,11521,I am 19 years old girl and my height is 5'3. H...,How do I increase my height?,1
331352,331352,458237,458238,Which is the best yoga mat and why?,What are some of the best yoga mats on the mar...,1
279706,279706,399179,399180,What is your best meme?,What are some of the best memes on men?,0
297679,297679,420073,420074,Why does Quora have so many questions about ai...,Why are so many Quora questions about airplane...,1
120188,120188,194976,194977,Is there limitations on how often you can take...,How often are submariners allowed to take baths?,1


In [17]:
# Target vector: the is_duplicate column coerced to int, as a NumPy array.
duplicate_flags = df['is_duplicate'].apply(int)
labels = np.asarray(list(duplicate_flags.values))

In [10]:
# Pull both question columns out as plain Python strings (str() also turns any
# non-string cell value into text) and pair them up row by row.
first_question_list = [str(question) for question in df['question1'].values]
second_question_list = [str(question) for question in df['question2'].values]
question_list = list(zip(first_question_list, second_question_list))

In [11]:
# Sanity check: number of (question1, question2) pairs built above.
print(len(question_list))

404290


In [12]:
# Fit one tokenizer on every question from both columns so that both sides of
# a pair are mapped through the same vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([*first_question_list, *second_question_list])

In [14]:
# Convert each question list to integer id sequences and pad/truncate every
# sequence to max_len; sequence1 holds question1, sequence2 holds question2.
sequence1, sequence2 = (
    pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=max_len)
    for questions in (first_question_list, second_question_list)
)

In [15]:
# Both padded arrays must contain one row per question pair. Fail loudly on a
# mismatch instead of silently continuing with misaligned inputs.
assert len(sequence1) == len(sequence2), (
    "sequence1 and sequence2 differ in length: "
    + str(len(sequence1)) + " vs " + str(len(sequence2)))
print("Good to go")

Good to go


In [19]:
# Train/validation split: the last 30% of the pairs are held out.
indices = np.arange(len(sequence1))
# NOTE: shuffling is disabled, so the split follows the order of the rows in
# the dataset file. Call np.random.shuffle(indices) here (after seeding) if a
# random split is wanted.
sequence1 = sequence1[indices]
sequence2 = sequence2[indices]
labels = labels[indices]
nb_validation_samples = int(0.3 * len(sequence1))

# Slice at an explicit boundary rather than with a negative offset: when
# nb_validation_samples is 0, [:-0] would yield an empty training set and
# [-0:] would put every sample into validation.
split_at = len(sequence1) - nb_validation_samples

sequence1_train = sequence1[:split_at]
sequence2_train = sequence2[:split_at]
labels_train = labels[:split_at]
sequence1_val = sequence1[split_at:]
sequence2_val = sequence2[split_at:]
labels_val = labels[split_at:]

# Every sample must land in exactly one of the two partitions.
assert len(sequence1_train) + len(sequence1_val) == len(sequence1)

In [20]:
# Number of question pairs in the training partition.
len(sequence1_train)

283003

In [21]:
# Number of question pairs in the validation partition.
len(sequence1_val)

121287

In [22]:
# Load the pre-trained GloVe vectors from disk.
# NOTE(review): get_glove_embeddings is not defined in this notebook —
# presumably it comes from the `utils.util` star import. The result is used
# below via `.get(word)`, so it is expected to be dict-like (word -> vector).
embeddings_index = get_glove_embeddings(path_to_glove_vectors)

In [23]:
# Build the embedding lookup table: row i holds the GloVe vector of the word
# whose tokenizer id is i. One extra row is allocated because word_index has
# no entry for id 0.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Words missing from the GloVe index keep their all-zero row.
        continue
    embedding_matrix[row] = glove_vector

In [24]:
def question_encoder():
    """Return a new bidirectional GRU layer used to encode a question.

    The wrapped GRU has 200 units; the forward and backward outputs are
    concatenated. The layer is always named "bidir_gru".
    """
    recurrent_layer = GRU(units=200)
    return Bidirectional(recurrent_layer, merge_mode='concat', name="bidir_gru")
    
def create_question_network():
    """Build the sub-network that maps one padded question to a feature vector.

    Pipeline: int32 token ids of length ``max_len`` -> frozen embedding
    initialised from ``embedding_matrix`` -> bidirectional GRU encoder ->
    three Dense(128) + BatchNormalization stages (the second and third add a
    residual connection from the previous stage's ReLU output) -> the three
    ReLU outputs concatenated (newest first) -> a final BatchNormalization.

    Returns:
        A Keras ``Model`` taking the token-id input and producing the
        batch-normalised concatenated features.
    """
    def dense_bn(tensor):
        # Dense(128) followed by batch normalisation; activation is applied by the caller.
        projected = Dense(128)(tensor)
        return BatchNormalization()(projected)

    question_tokens = Input(shape=(max_len,), dtype='int32', name="input_layer")
    # Pre-trained vectors are kept fixed (trainable=False).
    glove_lookup = Embedding(len(word_index) + 1,
                             embedding_dim,
                             weights=[embedding_matrix],
                             input_length=max_len,
                             trainable=False)
    encoded_question = question_encoder()(glove_lookup(question_tokens))

    # Stage 1 has no residual connection.
    first_stage = dense_bn(encoded_question)
    stage_outputs = [Activation('relu')(first_stage)]

    # Stages 2 and 3: add the previous stage's activation before the ReLU.
    for _ in range(2):
        previous = stage_outputs[-1]
        normalised = dense_bn(previous)
        summed = add([previous, normalised])
        stage_outputs.append(Activation('relu')(summed))

    # Concatenate in the order stage 3, stage 2, stage 1.
    feats = concatenate(stage_outputs[::-1])
    normalised_feats = BatchNormalization()(feats)

    return Model(inputs=question_tokens, outputs=normalised_feats)

def create_network():
    """Build the Siamese model over a pair of padded questions.

    A single question network instance is applied to both inputs, so the two
    branches share all weights. The model's output is produced by a Lambda
    layer wrapping ``euclidean_distance`` applied to the two branch outputs.

    Returns:
        A Keras ``Model`` with inputs ``[input1, input2]`` (each of shape
        ``(max_len,)``) and the distance as output.
    """
    shared_encoder = create_question_network()

    left_input = Input(shape=(max_len,))
    right_input = Input(shape=(max_len,))

    # Same model object on both sides => shared weights.
    branch_outputs = [shared_encoder(left_input), shared_encoder(right_input)]

    distance_layer = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)
    return Model(inputs=[left_input, right_input], outputs=distance_layer(branch_outputs))

In [27]:
from keras.optimizers import RMSprop, SGD, Adam

# Build the Siamese model and train it with a contrastive loss.
# NOTE(review): contrastive_loss and compute_accuracy are not defined in this
# notebook — presumably they come from the `utils.util` star import.
model = create_network()
optimizer = Adam(lr=0.001)
model.compile(loss=contrastive_loss, optimizer=optimizer)

train_inputs = [sequence1_train, sequence2_train]
val_inputs = [sequence1_val, sequence2_val]

# Run 20 single-epoch fits so that accuracy on both partitions can be printed
# after every epoch.
for _ in range(20):
    model.fit(train_inputs, labels_train,
              validation_data=(val_inputs, labels_val),
              batch_size=128, epochs=1)

    model_labels_train = model.predict(train_inputs, batch_size=128)
    train_accuracy = compute_accuracy(model_labels_train, labels_train)
    print("Training accuracy: " + str(train_accuracy))

    # model_labels_val is inspected again in the cells below.
    model_labels_val = model.predict(val_inputs, batch_size=128)
    val_accuracy = compute_accuracy(model_labels_val, labels_val)
    print("Validation accuracy: " + str(val_accuracy))

Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.6
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples, validate on 30 samples
Epoch 1/1
Training accuracy: 0.685714285714
Validation accuracy: 0.633333333333
Train on 70 samples

KeyboardInterrupt: 

In [23]:
# Threshold the predicted distances at 0.5: True where the model's distance
# for a validation pair is below 0.5.
model_labels_val.ravel() < 0.5

array([False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False,  True, False, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True, False], dtype=bool)

In [24]:
# Fraction of validation pairs where "distance < 0.5" agrees with the
# is_duplicate label.
((model_labels_val.ravel() < 0.5) == labels_val).mean()

0.6333333333333333

In [None]:
# Validation accuracy of the most recent predictions. compute_accuracy is
# called with (predictions, labels), matching its use in the training loop;
# calling it without arguments would fail.
compute_accuracy(model_labels_val, labels_val)