In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
import torch

In [None]:
!pip install -U sentence-transformers

In [None]:
# Show full cell contents (no column-width truncation) and up to 100 rows.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import gc

import scipy
from sklearn.metrics import accuracy_score

In [None]:
# Identifier of the checkpoint handed to SentenceTransformer below.
model_name = "nli-distilroberta-base-v2"


In [None]:
# Instantiate the SentenceTransformer identified by `model_name`; this object is
# used for the baseline encoding, fine-tuned in place by `model.fit`, and then
# used again for the post-training and test-set encodings.
model = SentenceTransformer(model_name)

In [None]:
# Sanity check: encode a single short string and display the shape of the
# returned embedding.
model.encode('hi').shape

In [None]:
# Both CSVs come from the same Kaggle dataset directory. Resolve it once as an
# absolute path so the two reads cannot diverge when the working directory changes
# (the original mixed an absolute path with a cwd-relative '../input/...' path).
data_dir = "/kaggle/input/quora-question-pairs"

train = pd.read_csv(os.path.join(data_dir, "train.csv.zip"))
print(train.shape)
# `is_duplicate` is kept as loaded (no remapping of 0 to -1).
test = pd.read_csv(os.path.join(data_dir, "test.csv.zip"))
train.head(30)

In [None]:
# --- Run configuration -------------------------------------------------------
# Rows taken from the head of `train` for fine-tuning.
num_train_examples = 121_600
# Rows taken from the tail of `train` as the held-out evaluation set.
num_test_examples = 3_200
distance_metrics = "cosine_distance"
# Fine-tuning schedule.
num_epochs = 2
batch_size = 16

In [None]:
# Display the first question of the row labelled 0.
train.loc[0, 'question1']

In [None]:
# The evaluation cell holds out the LAST `num_test_examples` rows of `train`.
# Fail loudly if the training head would overlap that tail (label leakage) or
# run past the end of the frame.
assert num_train_examples + num_test_examples <= len(train), (
    "training rows would overlap the held-out evaluation rows"
)

# Build one InputExample per question pair from the first `num_train_examples`
# rows. Questions are coerced with str() so that missing values become the
# string 'nan' instead of a float; the label is the 0/1 duplicate flag as float.
# Iterating the three columns once replaces a per-row chained lookup.
train_head = train.iloc[:num_train_examples]
train_samples = [
    InputExample(texts=[str(question1), str(question2)], label=float(is_duplicate))
    for question1, question2, is_duplicate in zip(
        train_head['question1'], train_head['question2'], train_head['is_duplicate']
    )
]

train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Alternative objective that was tried:
# train_loss = losses.OnlineContrastiveLoss(model=model, margin=0.5)
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Preview the first rows of the `test` frame loaded above.
test.head()

In [None]:
test_samples = num_test_examples

# Held-out evaluation set: the last `test_samples` rows of `train`.
# Questions are coerced with str(), exactly as in the training cell, so a missing
# question becomes the string 'nan' rather than a float NaN in the sentence lists.
eval_rows = train.iloc[-test_samples:]
sentences1 = [str(question) for question in eval_rows['question1']]
sentences2 = [str(question) for question in eval_rows['question2']]
# Gold labels (0.0 / 1.0) for the held-out pairs.
scores = eval_rows['is_duplicate'].astype('float').tolist()

# Two evaluators over the same held-out pairs; only `evaluator2` is passed to
# `model.fit` further down.
evaluator1 = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
evaluator2 = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)


In [None]:
from sklearn.metrics import confusion_matrix
def thr_to_accuracy(thr, Y_test, predictions):
    """Return the NEGATED accuracy obtained by thresholding `predictions` at `thr`.

    The sign is flipped so the function can be handed to a minimiser
    (it is used with `scipy.optimize.fmin` to search for the best threshold).

    Parameters
    ----------
    thr : float or array-like
        Decision threshold; a score strictly greater than `thr` is predicted as 1.
    Y_test : sequence
        Ground-truth 0/1 labels.
    predictions : array-like
        Continuous scores (here: cosine similarities), one per label.

    Returns
    -------
    float
        `-accuracy_score(Y_test, thresholded_predictions)`.
    """
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # `int` yields the same integer dtype.
    predicted_labels = (np.asarray(predictions) > thr).astype(int)
    return -accuracy_score(Y_test, predicted_labels)

In [None]:
# Baseline (before training): embed both sides of every held-out pair with the
# model as currently loaded. `sentences1` is encoded first, then `sentences2`.
embeddings1, embeddings2 = (
    model.encode(side, convert_to_tensor=True)
    for side in (sentences1, sentences2)
)

In [None]:
# Cosine similarity between the two embedding sets. The result is treated as a
# square matrix by the following cell, which reads only its diagonal, i.e. the
# similarity of each sentence1[i] with its own partner sentence2[i].
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

In [None]:
# Per-pair similarities: take the diagonal once, keep it as a NumPy array (`d`)
# and as a plain Python list (`k`).
d = cosine_scores.diag().cpu().numpy()
k = d.tolist()

In [None]:
# Pair count and mean cosine similarity per gold label, before fine-tuning.
baseline_summary = pd.DataFrame({"labels": scores, 'cosine_sim': k})
print(baseline_summary.groupby("labels").agg({"cosine_sim": ["count", "mean"]}))

# Search for the similarity threshold that maximises accuracy
# (thr_to_accuracy returns the negated accuracy, so minimising it maximises accuracy).
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, d), x0=0.5)
print(best_thr)

# Hard 0/1 predictions at the selected threshold, reused for the confusion matrix.
pred_before = np.array(d > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true=scores, y_pred=pred_before))

In [None]:
# Fine-tune the model on the question pairs; the binary-classification evaluator
# over the held-out pairs is passed along so the library can evaluate during training.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator2,
    epochs=num_epochs,
    warmup_steps=100,
)

In [None]:
# Re-encode the held-out pairs, now with the fine-tuned model.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
# Compute cosine similarities; the diagonal holds the score of each
# (sentences1[i], sentences2[i]) pair.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
d = cosine_scores.diag().cpu().numpy()
k = d.tolist()

# Pair count and mean cosine similarity per gold label, after fine-tuning.
# Printed explicitly: a bare expression in the middle of a cell is evaluated but
# never displayed, so the original summary was silently discarded.
print(pd.DataFrame({"labels": scores, 'cosine_sim': k}).groupby("labels").agg({"cosine_sim": ["count", "mean"]}))

# Re-fit the decision threshold on the new similarity distribution
# (thr_to_accuracy returns the negated accuracy, so minimising it maximises accuracy).
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, d), x0=0.5)
print(best_thr)

# Hard 0/1 predictions at the selected threshold, reused for the confusion matrix.
pred_aft = np.array(d > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true=scores, y_pred=pred_aft))

In [None]:
# Score every test pair with the fine-tuned model, `batch_num` rows at a time so
# that only one chunk of embeddings is held in memory at once.
batch_num = 100000
ans = []
for start in range(0, len(test), batch_num):
    chunk = test.iloc[start:start + batch_num]
    # str() mirrors the training cell: a missing question becomes the string 'nan'.
    test1 = [str(question) for question in chunk['question1']]
    test2 = [str(question) for question in chunk['question2']]
    testembeddings1 = model.encode(test1, convert_to_tensor=True)
    testembeddings2 = model.encode(test2, convert_to_tensor=True)
    # Row-wise cosine similarity of each (question1, question2) pair. This replaces
    # building 10000 x 10000 all-vs-all matrices and keeping only their diagonal,
    # so no second level of sub-chunking is needed.
    pair_scores = torch.nn.functional.cosine_similarity(
        testembeddings1, testembeddings2, dim=1
    )
    ans.extend(pair_scores.cpu().numpy().tolist())

# One score per test row, in the original row order.
assert len(ans) == len(test), "number of scores does not match number of test rows"


In [None]:
# Assemble the submission: one similarity score per test_id, written without the
# DataFrame index.
submission_columns = {'test_id': test['test_id'], 'is_duplicate': ans}
pred = pd.DataFrame(submission_columns)
pred.to_csv('submission.csv', index=False)

In [None]:
# Side-by-side view of the held-out pairs: gold label plus the thresholded
# prediction before and after fine-tuning.
pred_df = pd.DataFrame(
    dict(
        sentences1=sentences1,
        sentences2=sentences2,
        y_true=scores,
        y_pred_before=pred_before,
        y_pred_after=pred_aft,
    )
)

In [None]:
# Pairs the baseline got wrong but the fine-tuned model gets right (first 50,
# transposed so each pair is a column).
wrong_before = pred_df.y_true != pred_df.y_pred_before
right_after = pred_df.y_true == pred_df.y_pred_after
print(pred_df[wrong_before & right_after].reset_index(drop=True).head(50).T)

In [None]:
# Pairs that are misclassified both before and after fine-tuning (first 50,
# transposed so each pair is a column).
wrong_before = pred_df.y_true != pred_df.y_pred_before
wrong_after = pred_df.y_true != pred_df.y_pred_after
print(pred_df[wrong_before & wrong_after].reset_index(drop=True).head(50).T)