# Installing TRAX

In [3]:
!pip install trax==1.3.1 #Use this version for this notebook 

Collecting trax==1.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/fe/d8/ad90a5c79804561bbbc5fd65a4cb6b6e735370225e777cfc46980a9dc479/trax-1.3.1-py2.py3-none-any.whl (347kB)
[K     |████████████████████████████████| 348kB 6.9MB/s 
Collecting tensor2tensor
[?25l  Downloading https://files.pythonhosted.org/packages/d6/7c/9e87d30cefad5cbc390bb7f626efb3ded9b19416b8160f1a1278da81b218/tensor2tensor-1.15.7-py2.py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.5MB 17.5MB/s 
[?25hCollecting funcsigs
  Downloading https://files.pythonhosted.org/packages/69/cb/f5be453359271714c01b9bd06126eaf2e368f1fddfff30818754b5ac2328/funcsigs-1.0.2-py2.py3-none-any.whl
Collecting tensorflow-text
[?25l  Downloading https://files.pythonhosted.org/packages/a0/86/22ad798f94d564c3e423758b60ddd3689e83ad629b3f31ff2ae45a6e3eed/tensorflow_text-2.4.3-cp36-cp36m-manylinux1_x86_64.whl (3.4MB)
[K     |████████████████████████████████| 3.4MB 37.5MB/s 
Collecting t5
[?25l  Downloadin

# Load packages
- Please note that Trax Numpy is referred to as fastnp while numpy remains as np

In [4]:
import os
import nltk
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import numpy as np
import pandas as pd
import random as rnd
nltk.download('punkt')

# set random seeds
trax.supervised.trainer_lib.init_random_number_generators(34)
rnd.seed(34)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.




# Load dataset of duplicate questions from Quora

In [5]:
data = pd.read_csv("C:\Users\SHIVAM\Downloads\questions.csv")
N=len(data)
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Split train and test sets

In [6]:
N_train = 300000
N_test  = 10*1024
data_train = data[:N_train]
data_test  = data[N_train:N_train+N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del(data) # remove to free memory

Train set: 300000 Test set: 10240


# Select only duplicate questions to construct the training set

In [7]:
td_index = (data_train['is_duplicate'] == 1).to_numpy()
print('select those index corresponding to duplicate pairs:', td_index)
td_index = [i for i, x in enumerate(td_index) if x]
print("selected pairs index", td_index)
print('number of duplicate questions: ', len(td_index))
print('indexes of first ten duplicate questions:', td_index[:10])

select those index corresponding to duplicate pairs: [False False False ... False  True False]
selected pairs index [5, 7, 11, 12, 13, 15, 16, 18, 20, 29, 31, 32, 38, 48, 49, 50, 51, 53, 58, 62, 65, 66, 67, 71, 72, 73, 74, 79, 84, 85, 86, 88, 92, 93, 95, 100, 104, 107, 113, 120, 122, 125, 127, 135, 136, 143, 144, 152, 156, 158, 159, 160, 163, 165, 168, 173, 175, 176, 178, 179, 180, 182, 185, 188, 189, 190, 191, 193, 194, 197, 198, 199, 200, 203, 209, 210, 215, 216, 219, 220, 221, 224, 226, 229, 235, 236, 238, 242, 243, 244, 246, 249, 250, 251, 253, 255, 260, 261, 262, 267, 269, 270, 273, 274, 275, 281, 284, 285, 286, 287, 288, 291, 293, 295, 296, 299, 304, 307, 308, 309, 312, 317, 318, 321, 322, 323, 326, 329, 331, 339, 341, 346, 347, 348, 349, 350, 353, 364, 365, 368, 373, 377, 380, 383, 390, 393, 394, 395, 397, 399, 400, 402, 403, 404, 405, 409, 410, 412, 415, 421, 422, 428, 430, 431, 432, 439, 442, 443, 445, 446, 450, 451, 457, 458, 459, 460, 461, 462, 464, 468, 476, 479, 483, 484, 

In [None]:
print(data_train['question1'][5])  #  Example of question duplicates (first one in data)
print(data_train['question2'][5])
print('is_duplicate: ', data_train['is_duplicate'][5])

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?
is_duplicate:  1


# Finalize training and test sets

In [8]:
Q1_train_words = np.array(data_train['question1'][td_index])
Q2_train_words = np.array(data_train['question2'][td_index])

Q1_test_words = np.array(data_test['question1'])
Q2_test_words = np.array(data_test['question2'])
y_test  = np.array(data_test['is_duplicate'])

In [None]:
print('TRAINING QUESTIONS:\n')
print('Question 1: ', Q1_train_words[0])
print('Question 2: ', Q2_train_words[0], '\n')
print('Question 1: ', Q1_train_words[5])
print('Question 2: ', Q2_train_words[5], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test_words[0])
print('Question 2: ', Q2_test_words[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



In [9]:
# create arrays
Q1_train = np.empty_like(Q1_train_words)
Q2_train = np.empty_like(Q2_train_words)

Q1_test = np.empty_like(Q1_test_words)
Q2_test = np.empty_like(Q2_test_words)

In [10]:
# Building the vocabulary with the train set - assign a unique integer index to each distinct word in the vocabulary
from collections import defaultdict

vocab = defaultdict(lambda: 0)
vocab['<PAD>'] = 1

for idx in range(len(Q1_train_words)):
    Q1_train[idx] = nltk.word_tokenize(Q1_train_words[idx])
    Q2_train[idx] = nltk.word_tokenize(Q2_train_words[idx])
    q = Q1_train[idx] + Q2_train[idx]
    for word in q:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
print('The length of the vocabulary is: ', len(vocab))

The length of the vocabulary is:  36342


In [None]:
print(vocab['<PAD>'])
print(vocab['Astrology'])
print(vocab['Astronomy'])  #not in vocabulary, returns 0

1
2
0


In [11]:
for idx in range(len(Q1_test_words)): 
    Q1_test[idx] = nltk.word_tokenize(Q1_test_words[idx])
    Q2_test[idx] = nltk.word_tokenize(Q2_test_words[idx])

In [None]:
print('Train set has reduced to: ', len(Q1_train) ) 
print('Test set length: ', len(Q1_test) ) 

Train set has reduced to:  111486
Test set length:  10240


Note that the vocabulary is built only using the train set

In [12]:
# Converting questions to array of integers
for i in range(len(Q1_train)):
    Q1_train[i] = [vocab[word] for word in Q1_train[i]]
    Q2_train[i] = [vocab[word] for word in Q2_train[i]]

        
for i in range(len(Q1_test)):
    Q1_test[i] = [vocab[word] for word in Q1_test[i]]
    Q2_test[i] = [vocab[word] for word in Q2_test[i]]

In [None]:
print('first question in the train set:\n')
print(Q1_train_words[0], '\n') 
print('encoded version:')
print(Q1_train[0],'\n')

print('first question in the test set:\n')
print(Q1_test_words[0], '\n')
print('encoded version:')
print(Q1_test[0]) 

first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
[32, 38, 4, 107, 65, 1015, 65, 11522, 21]


## Reserve a validation set out of the train set

In [13]:
cut_off = int(len(Q1_train)*.8)
train_Q1, train_Q2 = Q1_train[:cut_off], Q2_train[:cut_off]
val_Q1, val_Q2 = Q1_train[cut_off: ], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


# Data Generator

In [14]:
def data_generator(Q1, Q2, batch_size, pad=1, shuffle=True):
    """Generator function that yields batches of data

    Args:
        Q1 (list): List of transformed (to tensor) questions.
        Q2 (list): List of transformed (to tensor) questions.
        batch_size (int): Number of elements per batch.
        pad (int, optional): Pad character from the vocab. Defaults to 1.
        shuffle (bool, optional): If the batches should be randomnized or not. Defaults to True.
    Yields:
        tuple: Of the form (input1, input2) with types (numpy.ndarray, numpy.ndarray)
        NOTE: input1: inputs to your model [q1a, q2a, q3a, ...] i.e. (q1a,q1b) are duplicates
              input2: targets to your model [q1b, q2b,q3b, ...] i.e. (q1a,q2i) i!=a are not duplicates
    """

    input1 = []
    input2 = []
    idx = 0
    len_q = len(Q1)
    question_indexes = [*range(len_q)]
    
    if shuffle:
        rnd.shuffle(question_indexes)
    
    while True:
        if idx >= len_q:
            # if idx is greater than or equal to len_q, set idx accordingly 
            idx = 0
            # shuffle to get random batches if shuffle is set to True
            if shuffle:
                rnd.shuffle(question_indexes)
        
        # get questions at the `question_indexes[idx]` position in Q1 and Q2
        q1 = Q1[question_indexes[idx]]
        q2 = Q2[question_indexes[idx]]
        
        # increment idx by 1
        idx += 1
        input1.append(q1)
        input2.append(q2)
        if len(input1) == batch_size:
            # determine max_len as the longest question in input1 & input 2
            # take max of input1 & input2 and then max out of the two of them.
            
            #max_len = len(max(max(input1, input2, key=len), key=len))
            max_len = max(max([len(q) for q in input1]), max([len(q) for q in input2]))
            #print('longest question of length:',max_len)
            
            # pad to power-of-2
            max_len = 2**int(np.ceil(np.log2(max_len)))
            #print('padding up to max_len of', max_len)
            
            b1 = []
            b2 = []
            for q1, q2 in zip(input1, input2):
                q1 = q1 + (max_len-len(q1))*[vocab['<PAD>']]
                q2 = q2 + (max_len-len(q2))*[vocab['<PAD>']]
                b1.append(q1)
                b2.append(q2)
            yield np.array(b1), np.array(b2)
            # reset the batches
            input1, input2 = [], []

In [None]:
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))
print("First questions  : ",'\n', res1, '\n')
print("Second questions : ",'\n', res2)

First questions  :  
 [[  30   87   78  134 2131 1980   28   78  594   21    1    1    1    1
     1    1]
 [  30   55   78 3540 1460   28   56  253   21    1    1    1    1    1
     1    1]] 

Second questions :  
 [[  30  156   78  134 2131 9516   21    1    1    1    1    1    1    1
     1    1]
 [  30  156   78 3540 1460  131   56  253   21    1    1    1    1    1
     1    1]]


# Build Siamese Network

In [15]:
def Siamese(vocab_size=len(vocab), d_model=128, mode='train'):
    """Returns a Siamese model.

    Args:
        vocab_size (int, optional): Length of the vocabulary. Defaults to len(vocab).
        d_model (int, optional): Depth of the model. Defaults to 128.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to 'train'.

    Returns:
        trax.layers.combinators.Parallel: A Siamese model. 
    """

    def normalize(x):  # normalizes the vectors to have L2 norm 1
        return x / fastnp.sqrt(fastnp.sum(x * x, axis=-1, keepdims=True))
    
    q_processor = tl.Serial(                                       # Processor will run on Q1 and Q2.
                  tl.Embedding(vocab_size,d_model),                # Embedding layer
                  tl.LSTM(d_model),                                # LSTM layer
                  tl.Mean(axis=1),                                 # Mean over columns
                  tl.Fn('Normalize', lambda x: normalize(x))       # Apply normalize function
                  )                                                # Returns one vector of shape [batch_size, d_model].
    
    
    # Run on Q1 and Q2 in parallel.
    model = tl.Parallel(q_processor, q_processor)
    return model


In [16]:
# check your model
model = Siamese()
print(model)

Parallel_in2_out2[
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
  Serial[
    Embedding_41789_128
    LSTM_128
    Mean
    Normalize
  ]
]


# Define specific Loss function

In [17]:
def TripletLossFn(v1, v2, margin=0.25):
    """Custom Loss function.

    Args:
        v1 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q1.
        v2 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q2.
        margin (float, optional): Desired margin. Defaults to 0.25.

    Returns:
        jax.interpreters.xla.DeviceArray: Triplet Loss.
    """

    scores = fastnp.dot(v1, v2.T)  # pairwise cosine sim
    # calculate new batch size
    batch_size = len(scores)
    
    # use fastnp to grab all positive `diagonal` entries in `scores`
    positive = fastnp.diag(scores)  # the positive ones (duplicates)
    
    negative_without_positive = scores - fastnp.eye(batch_size) * 2
    
    closest_negative = fastnp.max(negative_without_positive, axis = 1)
    
    negative_zero_on_duplicate = (1 - fastnp.eye(batch_size)) * scores
    #negative_zero_on_duplicate = fastnp.multiply((1.0 - fastnp.eye(batch_size)), scores) 
    
    mean_negative = fastnp.sum(negative_zero_on_duplicate, axis=1) / (batch_size - 1)
    
    triplet_loss1 = fastnp.maximum(closest_negative - positive + margin, 0)
    triplet_loss2 = fastnp.maximum(mean_negative - positive + margin, 0)
    triplet_loss = fastnp.mean(triplet_loss1 + triplet_loss2)
    
    return triplet_loss

In [18]:
from functools import partial
def TripletLoss(margin=0.25):
    triplet_loss_fn = partial(TripletLossFn, margin=margin)
    return tl.Fn('TripletLoss', triplet_loss_fn)

# Generate training and validation data generators

In [19]:
batch_size = 256
train_generator = data_generator(train_Q1, train_Q2, batch_size, vocab['<PAD>'])
val_generator = data_generator(val_Q1, val_Q2, batch_size, vocab['<PAD>'])
print('train_Q1.shape ', train_Q1.shape)
print('val_Q1.shape   ', val_Q1.shape)

train_Q1.shape  (89188,)
val_Q1.shape    (22298,)


# Train the model

In [20]:
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

def train_model(Siamese, TripletLoss, lr_schedule, train_generator=train_generator, val_generator=val_generator, output_dir='model/'):
    """Training the Siamese Model

    Args:
        Siamese (function): Function that returns the Siamese model.
        TripletLoss (function): Function that defines the TripletLoss loss function.
        lr_schedule (function): Trax multifactor schedule function.
        train_generator (generator, optional): Training generator. Defaults to train_generator.
        val_generator (generator, optional): Validation generator. Defaults to val_generator.
        output_dir (str, optional): Path to save model to. Defaults to 'model/'.

    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """
    output_dir = os.path.expanduser(output_dir)


    train_task = training.TrainTask(
        labeled_data= train_generator,                  # Use generator (train)
        loss_layer= TripletLoss(),                      # Use triplet loss. Don't forget to instantiate this object
        optimizer= trax.optimizers.Adam(0.01),          # Don't forget to add the learning rate parameter
        lr_schedule= lr_schedule,                       # Use Trax multifactor schedule function
    )

    eval_task = training.EvalTask(
        labeled_data= val_generator,       # Use generator (val)
        metrics= [TripletLoss()],          # Use triplet loss. Don't forget to instantiate this object
    )

    training_loop = training.Loop(Siamese(),
                                  train_task,
                                  eval_task=eval_task,
                                  output_dir=output_dir)

    return training_loop

In [None]:
train_steps = 2500
training_loop = train_model(Siamese, TripletLoss, lr_schedule)
training_loop.run(train_steps)

Step      1: train TripletLoss |  0.49951738
Step      1: eval  TripletLoss |  0.49955446
Step    100: train TripletLoss |  0.43771967
Step    100: eval  TripletLoss |  0.34000629
Step    200: train TripletLoss |  0.22195224
Step    200: eval  TripletLoss |  0.15131871
Step    300: train TripletLoss |  0.13090034
Step    300: eval  TripletLoss |  0.12246029
Step    400: train TripletLoss |  0.10107065
Step    400: eval  TripletLoss |  0.10338214
Step    500: train TripletLoss |  0.08687650
Step    500: eval  TripletLoss |  0.09262031
Step    600: train TripletLoss |  0.07717504
Step    600: eval  TripletLoss |  0.09059331
Step    700: train TripletLoss |  0.07456116
Step    700: eval  TripletLoss |  0.07687782
Step    800: train TripletLoss |  0.05797867
Step    800: eval  TripletLoss |  0.06868526
Step    900: train TripletLoss |  0.05793841
Step    900: eval  TripletLoss |  0.06242508
Step   1000: train TripletLoss |  0.05792687
Step   1000: eval  TripletLoss |  0.05789907
Step   110

# Evaluate the model's performance

In [23]:
# Loading in the saved model
model = Siamese()
model.init_from_file('model/model.pkl.gz')

In [24]:
def classify(test_Q1, test_Q2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
    """Function to test the accuracy of the model.

    Args:
        test_Q1 (numpy.ndarray): Array of Q1 questions.
        test_Q2 (numpy.ndarray): Array of Q2 questions.
        y (numpy.ndarray): Array of actual target.
        threshold (float): Desired threshold.
        model (trax.layers.combinators.Parallel): The Siamese model.
        vocab (collections.defaultdict): The vocabulary used.
        data_generator (function): Data generator function. Defaults to data_generator.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model.
    """
    accuracy = 0
    for i in range(0, len(test_Q1), batch_size):
        # Call the data generator (built in Ex 01) with shuffle=False using next()
        q1, q2 = next(data_generator(test_Q1[i:i+batch_size], test_Q2[i:i+batch_size], batch_size, pad=vocab['<PAD>'], shuffle=False))
        # use batch size chuncks of actual output targets 
        y_test = y[i:i+batch_size]
        # Call the model
        v1, v2 = model([q1,q2])

        for j in range(batch_size):
            # take dot product to compute cos similarity of each pair of entries, v1[j], v2[j]
            d = fastnp.dot(v1[j], v2[j].T)
            # is d greater than the threshold?
            res = int(d > threshold)
            # increment accurancy if y_test is equal `res`
            accuracy += int(y_test[j] == res)
    # compute accuracy using accuracy and total length of test questions
    accuracy = accuracy / len(test_Q1)
    
    return accuracy

In [25]:
accuracy = classify(Q1_test,Q2_test, y_test, 0.7, model, vocab, batch_size = 512) 
print("Accuracy", accuracy)

Accuracy 0.7359375


# Test the model

In [26]:
def predict(question1, question2, threshold, model, vocab, data_generator=data_generator, verbose=False):
    """Function for predicting if two questions are duplicates.

    Args:
        question1 (str): First question.
        question2 (str): Second question.
        threshold (float): Desired threshold.
        model (trax.layers.combinators.Parallel): The Siamese model.
        vocab (collections.defaultdict): The vocabulary used.
        data_generator (function): Data generator function. Defaults to data_generator.
        verbose (bool, optional): If the results should be printed out. Defaults to False.

    Returns:
        bool: True if the questions are duplicates, False otherwise.
    """
    # use `nltk` word tokenize function to tokenize
    q1 = nltk.word_tokenize(question1)  
    q2 = nltk.word_tokenize(question2)  
    Q1, Q2 = [], []
    for word in q1:  # encode q1
        # increment by checking the 'word' index in `vocab`
        Q1 += [vocab[word]]
    for word in q2:  # encode q2
        # increment by checking the 'word' index in `vocab`
        Q2 += [vocab[word]]
        
    # Call the data generator (built in Ex 01) using next()
    # pass [Q1] & [Q2] as Q1 & Q2 arguments of the data generator. Set batch size as 1
    Q1, Q2 = next(data_generator([Q1],[Q2], batch_size=1, pad=vocab['<PAD>'], shuffle=False))
    # Call the model
    v1, v2 = model([Q1,Q2])
    # take dot product to compute cos similarity of each pair of entries, v1, v2
    d = fastnp.dot(v1,v2.T).item()
    # is d greater than the threshold?
    res = d>threshold
    
    if(verbose):
        print("Q1  = ", Q1, "\nQ2  = ", Q2)
        print("d   = ", d)
        print("res = ", res)

    return res

In [27]:
# try with your own questions
question1 = "When will I see you?"
question2 = "When can I see you again?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose = True)

Q1  =  [[585  76   4  46  53  21   1   1]] 
Q2  =  [[ 585   33    4   46   53 7287   21    1]]
d   =  0.8169271945953369
res =  True


True

In [28]:
question1 = "Do they enjoy eating the dessert?"
question2 = "Do they like hiking in the desert?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1  =  [[  443  1145  3158  1169    78 29071    21     1]] 
Q2  =  [[  443  1145    60 15323    28    78  7438    21]]
d   =  0.4396783113479614
res =  False


False

In [31]:
question1 = "do you enjoy watching football?"
question2 = "do you like football games?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1  =  [[  38   53 3158 1152 3484   21    1    1]] 
Q2  =  [[  38   53   60 3484 2868   21    1    1]]
d   =  0.6310033202171326
res =  False


False

In [32]:
question1 = "do you enjoy football?"
question2 = "do you like football games?"
# 1 means it is duplicated, 0 otherwise
predict(question1 , question2, 0.7, model, vocab, verbose=True)

Q1  =  [[  38   53 3158 3484   21    1    1    1]] 
Q2  =  [[  38   53   60 3484 2868   21    1    1]]
d   =  0.7026862502098083
res =  True


True