In [None]:
import os, collections, random, itertools

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the Quora question-pairs training set and normalise the columns used
# by the rest of the notebook in a single chained expression.
df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip").assign(
    # Cast to str so that missing questions (NaN) become ordinary strings.
    question1=lambda frame: frame["question1"].astype(str),
    question2=lambda frame: frame["question2"].astype(str),
    # Shift both id columns down by one so they can serve as 0-based positions.
    qid1=lambda frame: frame["qid1"] - 1,
    qid2=lambda frame: frame["qid2"] - 1,
)

In [None]:
# Map every question id to its text, drawing from both sides of each pair.
# Rows are visited in order, qid1 before qid2, so if an id repeats, the text
# from its last occurrence is the one that is kept.
qid_to_question = {
    qid: text
    for left_id, right_id, left_text, right_text in zip(
        df["qid1"], df["qid2"], df["question1"], df["question2"]
    )
    for qid, text in ((left_id, left_text), (right_id, right_text))
}

# Lay the questions out in a list whose position equals the question id.
questions_by_idx = list(
    map(qid_to_question.__getitem__, range(max(qid_to_question) + 1))
)
# Sanity check: the list has exactly one entry per known question id.
assert len(questions_by_idx) == len(qid_to_question)

# Obtain BERT sentence embeddings

Various options for sentence transformers here: https://huggingface.co/sentence-transformers

We are using `bert-base-nli-stsb-mean-tokens` and `bert-large-nli-stsb-mean-tokens`

Reference: https://arxiv.org/pdf/1908.10084.pdf

- BERT performs better on the semantic textual similarity (STS) task than its variants (Table 2)
- Mean pooling works better than the alternative pooling strategies (Table 6)
- Training on STSb (the STS benchmark) improves performance on the STS task (Table 2)
- As far as I can tell, these models were not trained on the Quora dataset, so there should be no data leakage

In [None]:
!pip install sentence-transformers > /dev/null

In [None]:
from sentence_transformers import SentenceTransformer

#### Using `bert-base-nli-stsb-mean-tokens`

In [None]:
# Name of the pretrained sentence-transformers checkpoint (BERT-base variant);
# it is also reused later to build the output file name.
model_name = "bert-base-nli-stsb-mean-tokens"
# Build the sentence encoder from that checkpoint name.
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Embed every question with the currently loaded model.
# NOTE(review): the result is expected to hold one vector per entry of
# questions_by_idx, in the same order - confirm against the encode() docs.
sentence_vectors_by_idx = model.encode(sentences=questions_by_idx)

In [None]:
# Persist the embeddings under a file name keyed by the model name, then
# read them straight back from disk.
vectors_path = "sentence_vectors_{}.npy".format(model_name)
np.save(vectors_path, sentence_vectors_by_idx)
sentence_vectors_by_idx = np.load(vectors_path)
# Last expression of the cell: display the shape of the reloaded array.
sentence_vectors_by_idx.shape

#### Using `bert-large-nli-stsb-mean-tokens`

In [None]:
# Switch to the BERT-large checkpoint; this rebinds both `model_name` and
# `model`, replacing whatever was loaded before.
model_name = "bert-large-nli-stsb-mean-tokens"
# Build the sentence encoder from that checkpoint name.
model = SentenceTransformer(model_name_or_path=model_name)

In [None]:
# Re-embed every question with the currently loaded model; this overwrites
# the embeddings held in `sentence_vectors_by_idx`.
# NOTE(review): the result is expected to hold one vector per entry of
# questions_by_idx, in the same order - confirm against the encode() docs.
sentence_vectors_by_idx = model.encode(sentences=questions_by_idx)

In [None]:
# Write the embeddings to a file named after the current model, then load
# them back from that same file.
vectors_path = "sentence_vectors_{}.npy".format(model_name)
np.save(vectors_path, sentence_vectors_by_idx)
sentence_vectors_by_idx = np.load(vectors_path)
# Last expression of the cell: display the shape of the reloaded array.
sentence_vectors_by_idx.shape