In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show complete question text (no column-width truncation) and up to 100 rows per frame.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [None]:
!pip install -U sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import gc

import scipy
from sklearn.metrics import accuracy_score


In [None]:
# Base sentence encoder, loaded from a Kaggle dataset directory (judging by the
# directory name this is the distilbert-base-nli-stsb-mean-tokens checkpoint).
# Alternative checkpoint, currently disabled:
#model = SentenceTransformer('/kaggle/input/model-weights-sbert-trained-on-these-data/model_mnli/model_mnli/')
pretrained_model_dir = '/kaggle/input/distilbertbasenlistsbmeantokens/distilbert-base-nli-stsb-mean-tokens/'
model = SentenceTransformer(pretrained_model_dir)



In [None]:
# Sanity check: encode a single short string and display the embedding's shape.
probe_embedding = model.encode("hi")
probe_embedding.shape

In [None]:
# Read the zipped Quora question-pairs training CSV and report its dimensions.
train_csv_path = "/kaggle/input/quora-question-pairs/train.csv.zip"
train = pd.read_csv(train_csv_path)
print(train.shape)
# Labels are left as 0/1; the remap of 0 to -1 below is intentionally disabled.
#train['is_duplicate'] = train['is_duplicate'].replace(0,-1)

# Preview the first 30 rows.
train.head(n=30)


In [None]:
# Experiment configuration.
# Data budget: rows taken from the head of `train` for fine-tuning, and rows taken
# from its tail for evaluation.
num_train_examples, num_test_examples = 121_600, 3_200
# Label for the distance used; not read by any other cell visible in this notebook.
distance_metric = "cosine_distance"
# Optimisation schedule: passes over the training data and pairs per batch.
num_epochs, batch_size = 2, 32

In [None]:
#help(model.fit)

In [None]:
# Show the first question of the first pair (row label 0) as a raw string.
train.at[0, 'question1']

In [None]:
# Build the fine-tuning set from the first `num_train_examples` rows of `train`.
# Fail up front with a clear message instead of a KeyError part-way through.
if len(train) < num_train_examples:
    raise ValueError(
        f"train has only {len(train)} rows but num_train_examples={num_train_examples}"
    )

# One pass over the three column slices instead of three label lookups per row.
# str(...) turns missing questions (NaN) into the text 'nan' so every example
# carries two strings; int(...) gives the loss a plain integer 0/1 label.
train_samples = [
    InputExample(texts=[str(question_a), str(question_b)], label=int(duplicate_flag))
    for question_a, question_b, duplicate_flag in zip(
        train['question1'].iloc[:num_train_examples],
        train['question2'].iloc[:num_train_examples],
        train['is_duplicate'].iloc[:num_train_examples],
    )
]

train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Contrastive objective over the (question1, question2, is_duplicate) pairs.
train_loss = losses.OnlineContrastiveLoss(model=model, margin=0.5)

In [None]:
# Hold out the LAST `test_samples` rows of `train` for evaluation (the fine-tuning
# cell consumes rows from the head of the frame).
test_samples = num_test_examples

# tail(n) is used rather than [-n:] because [-0:] would select the whole column.
# Questions are cast to str exactly like the training examples, so a missing
# question (NaN) becomes the text 'nan' instead of reaching the encoder as a float.
sentences1 = train['question1'].tail(test_samples).astype(str).tolist()
sentences2 = train['question2'].tail(test_samples).astype(str).tolist()
scores = train['is_duplicate'].tail(test_samples).astype('int').tolist()

# Two evaluators over the same held-out pairs; only evaluator2 is passed to
# model.fit in the cells visible here.
evaluator1 = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
evaluator2 = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)

# ... Your other code to load training data



In [None]:
#model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=50, evaluator=evaluator2)

In [None]:
from sklearn.metrics import confusion_matrix
def thr_to_accuracy(thr, Y_test, predictions):
    """Return the NEGATED accuracy obtained by thresholding `predictions` at `thr`.

    The sign is flipped so that a minimiser (this notebook passes the function to
    scipy.optimize.fmin) ends up maximising accuracy.

    Parameters
    ----------
    thr : scalar or 1-element array
        Decision threshold; a prediction strictly greater than `thr` is labelled 1.
    Y_test : sequence of int
        Ground-truth 0/1 labels.
    predictions : array-like supporting ``predictions > thr``
        Similarity scores, one per label.

    Returns
    -------
    float
        ``-accuracy_score(Y_test, predicted_labels)``.
    """
    # The builtin `int` replaces `np.int`, an alias removed in NumPy 1.24
    # (accessing it raises AttributeError on current NumPy).
    predicted_labels = np.array(predictions > thr, dtype=int)
    return -accuracy_score(Y_test, predicted_labels)


In [None]:
# COSINE SIM before training

# Compute embeddings for both question lists with the model as loaded (no fine-tuning yet).
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
# Compute cosine similarities for all pairs; only the diagonal
# (sentences1[i] vs sentences2[i]) is scored below.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
# Extract the diagonal ONCE as a host-side NumPy array. Calling np.diag() on the
# tensor directly fails when the embeddings live on a GPU, and the original
# repeated that conversion four times.
pair_sims_before = cosine_scores.diagonal().detach().cpu().numpy()

# Per-label count and mean similarity (label 1 = duplicate pair).
print(pd.DataFrame({"labels": scores, 'cosine_sim': pair_sims_before.tolist()}).groupby("labels").agg({"cosine_sim":["count","mean"]}))

# Search for the similarity threshold that maximises accuracy on these pairs.
# NOTE(review): accuracy is piecewise-constant in the threshold, so a simplex
# search started at 0.5 may stall on a plateau; the threshold is also tuned on
# the same labels it is then scored against, so the numbers are optimistic.
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, pair_sims_before), x0=0.5)
print(best_thr)

# Binary predictions at the chosen threshold, kept for the error analysis later.
pred_before = (pair_sims_before > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true = scores, y_pred = pred_before))

In [None]:
# Release the raw DataFrame before fine-tuning: train_samples, sentences1,
# sentences2 and scores were already built from it in earlier cells.
# After this cell runs, any earlier cell that reads `train` needs the CSV reloaded first.
del train
gc.collect()
#np.diag(cosine_scores)

In [None]:
# Fine-tune the encoder on the contrastive objective, scoring with the
# binary-classification evaluator built from the held-out pairs.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,
    evaluator=evaluator2,
)

In [None]:
# COSINE SIM After training

# Compute embeddings for both question lists with the fine-tuned model.
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
# Compute cosine similarities for all pairs; only the diagonal
# (sentences1[i] vs sentences2[i]) is scored below.
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
# Extract the diagonal once as a host-side NumPy array; np.diag() on the tensor
# itself fails when the embeddings live on a GPU.
pair_sims_after = cosine_scores.diagonal().detach().cpu().numpy()

# Per-label count and mean similarity. This summary was a bare expression in the
# middle of the cell and was therefore never displayed; print it explicitly, as
# the pre-training cell does.
print(pd.DataFrame({"labels": scores, 'cosine_sim': pair_sims_after.tolist()}).groupby("labels").agg({"cosine_sim":["count","mean"]}))

# Re-tune the decision threshold for the fine-tuned embeddings.
# NOTE(review): accuracy is piecewise-constant in the threshold, so a simplex
# search started at 0.5 may stall on a plateau; the threshold is also tuned on
# the same labels it is then scored against.
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, pair_sims_after), x0=0.5)
print(best_thr)

# Binary predictions at the chosen threshold, kept for the error analysis below.
pred_aft = (pair_sims_after > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true = scores, y_pred = pred_aft))

In [None]:
## Error Analysis

# One row per held-out question pair: both questions, the true label, and the
# thresholded predictions made before and after fine-tuning.
prediction_columns = {
    "sentences1": sentences1,
    "sentences2": sentences2,
    "y_true": scores,
    "y_pred_before": pred_before,
    "y_pred_after": pred_aft,
}
pred_df = pd.DataFrame(prediction_columns)

In [None]:
# Pairs that were misclassified before fine-tuning and are classified correctly
# afterwards (first 50, transposed so each pair reads as a column).
wrong_before = pred_df.y_true != pred_df.y_pred_before
right_after = pred_df.y_true == pred_df.y_pred_after
fixed_pairs = pred_df[wrong_before & right_after].reset_index(drop=True)
print(fixed_pairs.head(50).T)

In [None]:
# Pairs that are misclassified both before and after fine-tuning
# (first 50, transposed so each pair reads as a column).
wrong_before = pred_df.y_true != pred_df.y_pred_before
wrong_after = pred_df.y_true != pred_df.y_pred_after
persistent_errors = pred_df[wrong_before & wrong_after].reset_index(drop=True)
print(persistent_errors.head(50).T)