**This notebook is an assignment from the course: ['Natural Language Processing with Sequence Models'](https://www.coursera.org/learn/sequence-models-in-nlp)**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install trax
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp
import random as rnd

rnd.seed(11)

In [None]:
# Extract train.csv into the working directory using the standard library.
# ZipFile.extractall() overwrites an existing copy without prompting, so this
# cell can be re-run (or run via "Run All") without any interactive question.
import zipfile

with zipfile.ZipFile('../input/quora-question-pairs/train.csv.zip') as archive:
    archive.extractall('.')

In [None]:
# Load the extracted CSV into a DataFrame and preview its first rows.
df = pd.read_csv('./train.csv',low_memory=False)
df.head()

In [None]:
# Total number of rows loaded from train.csv.
len(df)

In [None]:
# Shuffle with a fixed seed so the 95/5 train/test split is identical on every
# run. The shuffled frame gets its own name and `df` is left untouched, so
# re-running this cell cannot reshuffle an already-shuffled frame.
cut_df = int(len(df)*0.95)

df_shuffled = df.sample(frac=1, random_state=11)

# Positional slicing: the first 95% of shuffled rows train, the rest test.
train,test = df_shuffled.iloc[:cut_df],df_shuffled.iloc[cut_df:]

In [None]:
# Row counts of the train and test partitions.
len(train),len(test)

In [None]:
# Training uses only the pairs labelled as duplicates (is_duplicate == 1);
# the row count left after filtering is displayed as the cell output.
train = train.loc[train['is_duplicate'] == 1]
len(train)

In [None]:
# Pull the raw question text out of the frames as numpy arrays.
# Missing question text is loaded by pandas as float NaN, which is not valid
# text input for the tokenizer; it is replaced with an empty string so every
# entry is a str while rows stay aligned with the labels.
Q1_train_words = np.array(train['question1'].fillna(''))
Q2_train_words = np.array(train['question2'].fillna(''))

Q1_test_words = np.array(test['question1'].fillna(''))
Q2_test_words = np.array(test['question2'].fillna(''))
# Ground-truth labels for the held-out pairs (1 = duplicate, per the filter
# applied to `train` earlier in the notebook).
y_test  = np.array(test['is_duplicate'])

In [None]:
# Location and file name of the subword vocabulary.
# NOTE(review): only `vocab_file` is passed to the tokenize calls in this
# notebook; `vocab_dir` is defined here but not referenced by them.
vocab_dir='gs://trax-ml/vocabs/'
vocab_file='en_32k.subword'

In [None]:
def _tokenize_questions(questions):
    """Tokenize an iterable of question strings into plain lists of token ids."""
    token_stream = trax.data.tokenize(iter(questions), vocab_file=vocab_file)
    return [list(token_ids) for token_ids in token_stream]


# Same tokenization for all four splits, in the same order as before.
Q1_train = _tokenize_questions(Q1_train_words)
Q2_train = _tokenize_questions(Q2_train_words)

Q1_test = _tokenize_questions(Q1_test_words)
Q2_test = _tokenize_questions(Q2_test_words)

In [None]:
# The first 80% of the tokenized duplicate pairs train the model; the
# remaining 20% are held out for validation.
cut_off = int(len(Q1_train)*.8)
train_Q1, val_Q1 = Q1_train[:cut_off], Q1_train[cut_off:]
train_Q2, val_Q2 = Q2_train[:cut_off], Q2_train[cut_off:]

In [None]:
# Display the tokenizer's padding id. data_generator below defaults to pad=0,
# so the value shown here should agree with that default.
trax.data.text_encoder.PAD_ID

In [None]:
def data_generator(Q1, Q2, batch_size, pad=0, shuffle=True):
    """Yield an endless stream of padded (Q1, Q2) batches.

    Pairs are visited in index order, or in a fresh random order on every pass
    when `shuffle` is true. When the data is exhausted the generator wraps
    around, so a batch may mix the end of one pass with the start of the next
    (and repeats pairs when `batch_size` exceeds `len(Q1)`).

    Args:
        Q1 (sequence): Tokenized first questions; each item is a sequence of ids
            (list, tuple or array).
        Q2 (sequence): Tokenized second questions, aligned with `Q1` by position.
        batch_size (int): Number of question pairs per batch.
        pad (int, optional): Id used to right-pad short questions. Defaults to 0.
        shuffle (bool, optional): Visit pairs in random order, using the
            module-level `rnd`. Defaults to True.

    Yields:
        tuple: Two numpy arrays of shape (batch_size, max_len), where max_len is
        the longest question in the batch rounded up to a power of two
        (and never less than 1).

    Raises:
        IndexError: On the first `next()` if `Q1` is empty.
    """
    len_q = len(Q1)
    question_indexes = [*range(len_q)]
    if shuffle:
        rnd.shuffle(question_indexes)

    input1, input2 = [], []
    idx = 0
    while True:
        # Wrap around (and reshuffle) once every pair has been served.
        if idx >= len_q:
            idx = 0
            if shuffle:
                rnd.shuffle(question_indexes)

        pair_index = question_indexes[idx]
        idx += 1

        # Copy into plain lists so that `+` below always means concatenation,
        # whether the caller supplied lists, tuples or numpy arrays.
        input1.append(list(Q1[pair_index]))
        input2.append(list(Q2[pair_index]))

        if len(input1) == batch_size:
            longest = max(len(question) for question in input1 + input2)
            # Round up to the next power of two with integer arithmetic.
            # Clamping to 1 keeps a batch of only-empty questions from
            # crashing (log2(0) is -inf) and gives it a single pad column.
            max_len = 1 << (max(longest, 1) - 1).bit_length()

            b1 = [question + [pad] * (max_len - len(question)) for question in input1]
            b2 = [question + [pad] * (max_len - len(question)) for question in input2]
            yield np.array(b1), np.array(b2)

            input1, input2 = [], []

In [None]:
# Smoke-test the generator: draw a single batch of two padded question pairs
# from the training split and print both sides.
batch_size = 2
res1, res2 = next(data_generator(train_Q1, train_Q2, batch_size))
print("First questions  : ",'\n', res1, '\n')
print("Second questions : ",'\n', res2)

In [None]:
def Siamese(vocab_size=33000, d_model=128, mode='train'):
    """Build the two-branch Siamese network.

    Each branch embeds the token ids, runs an LSTM, averages over axis 1 and
    scales the result to unit L2 norm. The same layer object is used for both
    branches.

    Args:
        vocab_size (int, optional): Size of the embedding vocabulary. Defaults to 33000.
        d_model (int, optional): Embedding and LSTM width. Defaults to 128.
        mode (str, optional): Accepted for interface compatibility; not used by
            any layer built here. Defaults to 'train'.

    Returns:
        trax.layers.combinators.Parallel: Model mapping (Q1, Q2) token batches
        to one vector per question of shape [batch_size, d_model].
    """

    def _unit_length(vectors):
        # Divide every vector by its own L2 norm along the last axis.
        squared_norm = fastnp.sum(vectors * vectors, axis=-1, keepdims=True)
        return vectors / fastnp.sqrt(squared_norm)

    encoder = tl.Serial(
        tl.Embedding(vocab_size, d_model),
        tl.LSTM(d_model),
        tl.Mean(axis=1),
        tl.Fn('Normalize', lambda x: _unit_length(x)),
    )

    # One encoder object, applied to Q1 and Q2 side by side.
    return tl.Parallel(encoder, encoder)

In [None]:
# Build the network with default sizes and display its layer structure.
model = Siamese();model

In [None]:
def TripletLossFn(v1, v2, margin=0.25):
    """Triplet loss with mean-negative and closest-negative terms.

    Row i of `v1` and row i of `v2` form the positive pair; every other row of
    `v2` in the batch serves as a negative for row i.

    Args:
        v1 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q1.
        v2 (numpy.ndarray): Array with dimension (batch_size, model_dimension) associated to Q2.
        margin (float, optional): Desired margin. Defaults to 0.25.

    Returns:
        jax.interpreters.xla.DeviceArray: Triplet Loss (scalar mean over the batch).
    """
    # Pairwise similarity matrix: scores[i, j] = v1[i] . v2[j].
    scores = fastnp.dot(v1, v2.T)
    batch_size = scores.shape[0]
    identity = fastnp.eye(batch_size)

    # Similarity of each true (duplicate) pair sits on the diagonal.
    positive = fastnp.diagonal(scores)

    # Subtract 2 on the diagonal so a positive can never win the row-wise max
    # (assuming unit-norm inputs, similarities lie in [-1, 1]); what remains
    # is the hardest negative per row. `axis` is a plain int because a list
    # is not accepted as an axis by every numpy-compatible backend.
    closest_negative = (scores - identity * 2.0).max(axis=1)

    # Average similarity to the negatives (diagonal zeroed out). With a single
    # pair there are no negatives; dividing by max(..., 1) yields 0 instead of
    # a 0/0 NaN that would poison the loss.
    n_negatives = max(batch_size - 1, 1)
    mean_negative = fastnp.sum((1.0 - identity) * scores, axis=1) / n_negatives

    # Hinge on the closest negative and on the mean negative.
    triplet_loss1 = fastnp.maximum(margin - positive + closest_negative, 0.0)
    triplet_loss2 = fastnp.maximum(margin - positive + mean_negative, 0.0)

    return fastnp.mean(triplet_loss1 + triplet_loss2)

In [None]:
from functools import partial
def TripletLoss(margin=0.25):
    """Wrap TripletLossFn, with `margin` bound, as a trax layer named 'TripletLoss'.

    Args:
        margin (float, optional): Margin forwarded to TripletLossFn. Defaults to 0.25.

    Returns:
        The layer produced by `tl.Fn` around the margin-bound loss function.
    """
    return tl.Fn('TripletLoss', partial(TripletLossFn, margin=margin))

In [None]:
# Endless batch streams for training and validation, 256 pairs per batch.
batch_size = 256
train_generator = data_generator(Q1=train_Q1, Q2=train_Q2, batch_size=batch_size)
val_generator = data_generator(Q1=val_Q1, Q2=val_Q2, batch_size=batch_size)

In [None]:
# Learning-rate schedule handed to the training task below.
# NOTE(review): presumably 400 warm-up steps and a peak rate of 0.01 — confirm
# the argument order against the trax documentation.
lr_schedule = trax.lr.warmup_and_rsqrt_decay(400, 0.01)

In [None]:
def train_model(Siamese, TripletLoss, lr_schedule, train_generator=train_generator, val_generator=val_generator, output_dir='model/'):
    """Assemble the trax training loop for the Siamese model.

    Args:
        Siamese (function): Factory returning the Siamese model.
        TripletLoss (function): Factory returning the triplet-loss layer.
        lr_schedule (function): Learning-rate schedule for the train task.
        train_generator (generator, optional): Stream of training batches.
            Defaults to the module-level `train_generator`.
        val_generator (generator, optional): Stream of validation batches.
            Defaults to the module-level `val_generator`.
        output_dir (str, optional): Directory for the loop's output; `~` is
            expanded. Defaults to 'model/'.

    Returns:
        trax.supervised.training.Loop: Configured (not yet run) training loop.
    """
    checkpoint_dir = os.path.expanduser(output_dir)

    # Optimisation side: triplet loss minimised with Adam under the schedule.
    fit_task = training.TrainTask(
        labeled_data=train_generator,
        loss_layer=TripletLoss(),
        optimizer=trax.optimizers.Adam(0.01),
        lr_schedule=lr_schedule,
    )

    # Evaluation side: the same loss measured on the validation stream.
    validation_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[TripletLoss()],
    )

    return training.Loop(
        Siamese(),
        fit_task,
        eval_tasks=validation_task,
        output_dir=checkpoint_dir,
    )

In [None]:
# Build the training loop with the default generators and output directory,
# then run it for `train_steps` steps.
train_steps = 1000
training_loop = train_model(Siamese, TripletLoss, lr_schedule)
training_loop.run(n_steps = train_steps)

In [None]:
def classify(test_Q1, test_Q2, y, threshold, model, data_generator=data_generator, batch_size=64):
    """Measure duplicate-detection accuracy of the model on labelled pairs.

    A pair is predicted to be a duplicate when the dot product of its two
    output vectors is strictly greater than `threshold`.

    Args:
        test_Q1 (sequence): Tokenized first questions.
        test_Q2 (sequence): Tokenized second questions, aligned with `test_Q1`.
        y (sequence): Actual targets (1 = duplicate), aligned with the questions.
        threshold (float): Similarity above which a pair counts as duplicate.
        model (trax.layers.combinators.Parallel): The Siamese model.
        data_generator (function): Data generator function. Defaults to data_generator.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Fraction of pairs whose prediction matches the target.

    Raises:
        ZeroDivisionError: If `test_Q1` is empty.
    """
    n_total = len(test_Q1)
    correct = 0.0

    for start in range(0, n_total, batch_size):
        stop = start + batch_size
        y_batch = y[start:stop]
        # The final chunk may hold fewer than `batch_size` pairs. The
        # generator still returns a full batch by wrapping around, so only the
        # first `n_valid` rows correspond to real, labelled pairs.
        n_valid = len(y_batch)

        # shuffle=False keeps the rows in the same order as the labels.
        q1, q2 = next(data_generator(test_Q1[start:stop], test_Q2[start:stop],
                                     batch_size, shuffle=False))
        v1, v2 = model([q1, q2])

        # Row-wise dot product of the paired vectors, computed for the whole
        # batch at once instead of one pair at a time.
        similarity = fastnp.sum(v1[:n_valid] * v2[:n_valid], axis=1)
        predicted = similarity > threshold
        correct += float(fastnp.sum(predicted == fastnp.array(y_batch)))

    return correct / n_total

In [None]:
# Trim the test set to a whole number of evaluation batches (39 batches of
# 512 pairs), matching the batch_size=512 used in the classify call below.
N_EVAL_PAIRS = 512*39
Q1_test = Q1_test[:N_EVAL_PAIRS]
Q2_test = Q2_test[:N_EVAL_PAIRS]
# np.asarray makes this line safe to re-run: on a second execution y_test is
# already a list, which has no .tolist() of its own.
y_test = np.asarray(y_test[:N_EVAL_PAIRS]).tolist()

In [None]:
# Rebuild the architecture and initialise it from the saved checkpoint file.
# NOTE(review): presumably this is the checkpoint written by the training loop
# (output_dir='model/') — confirm the file exists after training.
model = Siamese()
model.init_from_file('./model/model.pkl.gz')

In [None]:
# Score the held-out pairs with a similarity threshold of 0.7, evaluating
# 512 pairs per batch, and report the resulting accuracy.
accuracy = classify(Q1_test,Q2_test, y_test, 0.7, model, batch_size = 512)
print("Accuracy", accuracy)