# Triplet Network

## Extension of Siamese Network

## General Architecture

In [67]:
# Architecture diagram for the triplet network.
# Image taken from: https://hackernoon.com/one-shot-learning-with-siamese-networks-in-pytorch-8ddaab10340e
from IPython.core.display import HTML
from IPython.display import Image

Image(
    url="https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/tripletNetwork.png"
)

In [68]:
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so that later np.random calls in this
# notebook (e.g. the index shuffle done before the train/validation split)
# are repeatable from a fresh kernel.
# NOTE(review): the DataFrame.sample calls pass no random_state; presumably
# they also draw from this global NumPy state -- confirm for the pandas
# version in use.
np.random.seed(42)

In [69]:
import os

# Locations of the input files. The defaults are the original author's local
# paths; set QUORA_DATASET_PATH / GLOVE_VECTORS_PATH in the environment to run
# the notebook on another machine without editing this cell.
path_to_dataset = os.environ.get(
    "QUORA_DATASET_PATH",
    "/home/shagun/FortKnox/Quora/quora_duplicate_questions.tsv",
)
path_to_glove_vectors = os.environ.get(
    "GLOVE_VECTORS_PATH",
    "/home/shagun/models/GloVe/glove.6B.100d.txt",
)

In [70]:
# Width of each word vector; presumably chosen to match the 100-dimensional
# GloVe file configured in path_to_glove_vectors.
embedding_dim = 100
# Every question is padded/truncated to this many tokens.
# Refer the exploratory notebook to see how max_len value is arrived at
max_len = 50

In [71]:
# Read the tab-separated question-pair file into a pandas DataFrame and
# report how many rows (question pairs) it contains.
df = pd.read_csv(path_to_dataset, sep="\t")
print("Total number of question pairs = ", len(df))

Total number of question pairs =  404290


In [72]:
# Peek at five randomly chosen rows of the dataset.
df_sample = df.sample(n=5)

df_sample

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
339785,339785,23355,17761,How do I talk English fluently?,How could I improve my English pronunciation?,1
1775,1775,3534,3535,To Men: Do men still hand out their number and...,"For men: what do you think about ""boys don't c...",0
90667,90667,53021,152109,Where can I download Sherlock Holmes Season 4?,My life is like Sherlock Holmes season 2 and 3...,0
93625,93625,156519,156520,Why does Quora put neither the most upvoted no...,Is there a way to sort by the most upvoted ans...,0
98948,98948,164345,164346,What is consultancy services?,What does a consultancy service do?,0


## We will work with a very small sample of this dataset to save time. Feel free to train the network on the entire dataset later.

In [73]:
# Partition the question pairs by label: rows flagged is_duplicate == 1 are
# the duplicate pairs, rows flagged 0 are the non-duplicate pairs.
df_duplicate = df.loc[df['is_duplicate'] == 1]
df_nonduplicate = df.loc[df['is_duplicate'] == 0]
print("Number of duplicate question pairs: ", len(df_duplicate))

Number of duplicate question pairs:  149263


In [74]:
# Draw 1000 duplicate pairs and 1000 non-duplicate pairs to keep training fast.
# Note: `df` is rebound here from the full dataset to the duplicate sample, so
# re-running the earlier filtering cell after this one would filter the sample
# rather than the full data.
df = df_duplicate.sample(n=1000)
df2 = df_nonduplicate.sample(n=1000)

In [75]:
# Integer labels for the sampled pairs as a NumPy array. `df` holds only rows
# sampled from the duplicate pairs at this point, so every label is 1.
labels = np.asarray(df['is_duplicate'].apply(int).values)

In [76]:
# Build (anchor, positive, negative) question triplets.
# Anchor and positive are the two questions of a sampled duplicate pair (df);
# the negative is question1 of a sampled non-duplicate pair (df2), paired with
# the anchor purely by position. str() turns any non-string cell into text.
anchor_question_list = [str(question) for question in df['question1']]
positive_question_list = [str(question) for question in df['question2']]
negative_question_list = [str(question) for question in df2['question1']]

question_list = list(
    zip(anchor_question_list, positive_question_list, negative_question_list)
)

In [77]:
# Number of (anchor, positive, negative) triplets held in question_list.
print(len(question_list))

1000


In [78]:
# Keras / TensorFlow logo.
# Image taken from: https://blog.keras.io/keras-as-a-simplified-interface-to-tensorflow-tutorial.html
from IPython.core.display import HTML
from IPython.display import Image

Image(
    url="https://raw.githubusercontent.com/shagunsodhani/PyDataConf2017/master/assets/keras-tensorflow-logo.jpg"
)

In [79]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from utils.util import *
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding, Input, GRU, Dense, Activation, Lambda, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam

### Preprocess the data

In [80]:
# Fit a single tokenizer on the anchor, positive and negative questions
# together so that all three lists share one vocabulary (tokenizer.word_index
# is used later to size the embedding matrix).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(anchor_question_list + positive_question_list + negative_question_list)

In [81]:
# Convert each list of questions to integer id sequences and pad them to
# max_len, applying the same transformation to all three roles.
anchor_sequence, positive_sequence, negative_sequence = (
    pad_sequences(tokenizer.texts_to_sequences(questions), maxlen=max_len)
    for questions in (
        anchor_question_list,
        positive_question_list,
        negative_question_list,
    )
)

In [82]:
# Sanity check: the three arrays must contain the same number of rows, since
# row i of each one forms a single (anchor, positive, negative) triplet.
# The previous condition had a misplaced parenthesis and evaluated
# len(positive_sequence == len(negative_sequence)), i.e. the length of an
# element-wise comparison array, which never compared the two lengths.
if len(anchor_sequence) == len(positive_sequence) == len(negative_sequence):
    print("Good to go")

Good to go


In [83]:
# Basic ML Preprocessing
# Shuffle every array with one shared permutation so that row i of the anchor,
# positive, negative and label arrays still belongs to the same triplet.
indices = np.arange(len(anchor_sequence))
np.random.shuffle(indices)
anchor_sequence = anchor_sequence[indices]
positive_sequence = positive_sequence[indices]
negative_sequence = negative_sequence[indices]
labels = labels[indices]

# Hold out the last 30% of the shuffled rows for validation.
nb_validation_samples = int(0.3 * len(anchor_sequence))
# Split on an explicit non-negative index rather than `[:-n]` / `[-n:]`:
# with n == 0, `[:-0]` is empty and `[-0:]` is the whole array, which would
# leave the training set empty and put everything in validation.
nb_training_samples = len(anchor_sequence) - nb_validation_samples

anchor_sequence_train = anchor_sequence[:nb_training_samples]
positive_sequence_train = positive_sequence[:nb_training_samples]
negative_sequence_train = negative_sequence[:nb_training_samples]
labels_train = labels[:nb_training_samples]
anchor_sequence_val = anchor_sequence[nb_training_samples:]
positive_sequence_val = positive_sequence[nb_training_samples:]
negative_sequence_val = negative_sequence[nb_training_samples:]
labels_val = labels[nb_training_samples:]

In [84]:
# Report the size of the training split (rows of the anchor array).
print("Number of training examples: " + str(anchor_sequence_train.shape[0]))

Number of training examples: 700


In [85]:
# Report the size of the validation split (rows of the anchor array).
print("Number of validation examples: " + str(anchor_sequence_val.shape[0]))

Number of validation examples: 300


### Load embeddings

In [86]:
# Load embeddings
# get_glove_embeddings is not defined in this notebook; presumably it is
# provided by the `utils.util` star import. The result is later queried with
# `.get(word)`, so it is expected to behave like a word -> vector mapping
# (TODO confirm against utils/util.py).
embeddings_index = get_glove_embeddings(path_to_glove_vectors)

### Preparing the embedding matrix which our model would use

In [87]:
# Build the embedding matrix consumed by the model's Embedding layer.
# Row r holds the pretrained vector of the word whose tokenizer index is r;
# one extra row is allocated because the matrix is indexed directly by the
# tokenizer's word indices.
word_index = tokenizer.word_index
vocabulary_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocabulary_rows, embedding_dim))
for token, row in word_index.items():
    pretrained_vector = embeddings_index.get(token)
    if pretrained_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[row] = pretrained_vector

### Encode the question into a vector space using a Bidirectional GRU (or LSTM or whatever RNN you believe in)

In [88]:
def question_encoder():
    """Return a new bidirectional GRU layer used to encode a question.

    The wrapped GRU has 200 units and the two directions are merged with
    'concat'. The layer is always named "bidir_gru".
    """
    recurrent_layer = GRU(units=200)
    return Bidirectional(recurrent_layer, merge_mode='concat', name="bidir_gru")

### The next part is the core of this network and we would walk through it slowly

In [89]:
def create_question_network():
    """Build the network that maps one padded question to a fixed-size vector.

    Pipeline: an int32 input of `max_len` token ids -> a frozen Embedding layer
    initialised from `embedding_matrix` -> the bidirectional GRU returned by
    `question_encoder()` -> four Dense + ReLU stages of widths 128, 64, 32 and
    16 -> BatchNormalization.

    Reads the notebook-level names `max_len`, `word_index`, `embedding_dim`
    and `embedding_matrix`.

    Returns:
        A Keras `Model` with one (max_len,) int32 input whose output is the
        batch-normalised 16-unit representation of the question.
    """
    # Create an input layer
    sequence_input = Input(shape=(max_len,), dtype="int32", name="input_layer")

    # Create an embedding layer; trainable=False keeps the pretrained weights
    # fixed during training.
    embedding_layer = Embedding(len(word_index) + 1,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_len,
                                trainable=False)

    # Use the embedding layer we just created
    embedded_sequences = embedding_layer(sequence_input)

    # Embeddings are for words; whole sentences use an encoder
    encoded_question = question_encoder()(embedded_sequences)

    # Fully connected stack that narrows the representation step by step.
    # NOTE(review): the last activation was previously stored in a variable
    # named `tanh4` although it is a ReLU; ReLU is kept here to preserve
    # behaviour -- confirm whether tanh was intended.
    hidden = encoded_question
    for units in (128, 64, 32, 16):
        hidden = Dense(units)(hidden)
        hidden = Activation('relu')(hidden)

    output = BatchNormalization()(hidden)

    model = Model(inputs=sequence_input, outputs=output)

    return model

### Now we will build the triplet network: three inputs (anchor, positive, negative) that share one question network

In [92]:
from utils.util import *

def create_network():
    """Assemble the three-headed triplet model around one shared encoder.

    A single question network is created and applied to three separate
    (max_len,) inputs -- anchor, positive and negative, in that order -- so all
    three heads share the same weights. The three encodings are passed as a
    list to a Lambda layer wrapping `triplet_loss` (not defined in this
    notebook; presumably supplied by `utils.util`), declared with
    output_shape=(1,).

    Returns:
        A Keras `Model` with inputs [anchor, positive, negative] and the
        Lambda layer's result as its only output.
    """
    # One encoder instance, reused for every head so that weights are shared.
    shared_question_network = create_question_network()

    # Inputs for the anchor, positive and negative heads (created in that order).
    triplet_inputs = [Input(shape=(max_len,)) for _head in range(3)]

    # Run each input through the shared encoder.
    encoded_triplet = [shared_question_network(head_input) for head_input in triplet_inputs]

    # Combine the three encodings into the value the model outputs.
    distance = Lambda(triplet_loss, output_shape=(1, ))(encoded_triplet)

    return Model(inputs=triplet_inputs, outputs=distance)

In [112]:
# Build the triplet model and compile it with Adam (gradient norm clipped at 5).
model = create_network()

optimizer = Adam(lr=0.001, clipnorm=5)
model.compile(loss=identity_loss, optimizer=optimizer)

# Inputs are the three aligned sequence arrays. The targets are arrays of ones
# with one entry per example.
# NOTE(review): presumably `identity_loss` ignores these targets and optimises
# the model output directly -- confirm in utils/util.py.
train_inputs = [anchor_sequence_train, positive_sequence_train, negative_sequence_train]
train_targets = np.ones(len(anchor_sequence_train))
val_inputs = [anchor_sequence_val, positive_sequence_val, negative_sequence_val]
val_targets = np.ones(len(anchor_sequence_val))

# Ten passes over the data, issued as ten separate single-epoch fit calls.
for i in range(10):
    model.fit(train_inputs, train_targets,
              validation_data=(val_inputs, val_targets),
              batch_size=128, epochs=1)
    

Train on 700 samples, validate on 300 samples
Epoch 1/1
Train on 700 samples, validate on 300 samples
Epoch 1/1
128/700 [====>.........................] - ETA: 12s - loss: 0.4209

KeyboardInterrupt: 