# Importing the required dependencies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import torch
import datasets
from datasets import Dataset
from datasets import load_dataset
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar
from torch.utils.data import DataLoader
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses
import sentence_transformers
from transformers import BertTokenizer
import time

# Loading the dataset into a pandas dataframe

In [2]:
df = pd.read_csv('quora_duplicate_questions.tsv',sep = '\t')

# Data Preprocessing

In [3]:
df = df.sample(50000)

In [4]:
df.head(5)    # Viewing the dataset

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
95745,95745,159610,159611,How do you say whatever in Spanish?,How do you say 'wildfire' in Spanish?,0
84108,84108,142248,142249,Can I use satin paint in the feature wall of m...,I want to buy a wall painting as a gift to my ...,0
95744,95744,110572,113370,Why does China support Pakistan a country know...,"Why does US, Saudi Arabia and China support Pa...",0
282225,282225,17632,49754,What is the reason behind the sudden discontin...,What's the main reason behind 500 & 1000 rs no...,1
402961,402961,536505,536506,How much longer will my cold last?,How long does the average cold virus last?,0


In [5]:
df = df.drop('id',1)    # Dropping the not required id column
df = df.drop('qid1',1)  # Dropping the not required qid1 column
df = df.drop('qid2',1)  # Dropping the not required qid2 column

  df = df.drop('id',1)    # Dropping the not required id column
  df = df.drop('qid1',1)  # Dropping the not required qid1 column
  df = df.drop('qid2',1)  # Dropping the not required qid2 column


In [6]:
df.shape        # Dimension of the Dataset

(50000, 3)

In [7]:
df = df.dropna(how='any',axis=0)    # Dropping rows with null values

In [8]:
df.head(5)

Unnamed: 0,question1,question2,is_duplicate
95745,How do you say whatever in Spanish?,How do you say 'wildfire' in Spanish?,0
84108,Can I use satin paint in the feature wall of m...,I want to buy a wall painting as a gift to my ...,0
95744,Why does China support Pakistan a country know...,"Why does US, Saudi Arabia and China support Pa...",0
282225,What is the reason behind the sudden discontin...,What's the main reason behind 500 & 1000 rs no...,1
402961,How much longer will my cold last?,How long does the average cold virus last?,0


In [9]:
df.shape

(50000, 3)

# Splitting the dataframe into train and test

In [10]:
X = df.drop(["is_duplicate"],axis=1)
y = df["is_duplicate"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

# Converting the pandas dataframe into dataset of datasets library using its Dataset class

In [11]:
train_df = pd.concat([X_train,y_train],axis = 1)
test_df = pd.concat([X_test,y_test],axis = 1)
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)

# Fine Tuning Bert

In [12]:
# Converting the train data columns into an almost matching format with InputExample class
train_samples = []
for row in tqdm(train):
    train_samples.append(InputExample(
        texts=[row['question1'], row['question2']],
    ))

  0%|          | 0/40000 [00:00<?, ?it/s]

In [13]:
# Initializing the data loader
from sentence_transformers import datasets

batch_size = 32

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size)

In [14]:
# Initializing the model by using bert and pooler modules
bert = models.Transformer('sentence-transformers/bert-base-nli-mean-tokens')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [15]:
# We need to optimize our model that is ready and for that we initialize our loss as MNR loss
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

In [16]:
# Training our model with the loss and train for single epoch and warm up 10% of the training before
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_test_mnr2',
    show_progress_bar=True
) 



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1250 [00:00<?, ?it/s]

In [17]:
model = SentenceTransformer('./sbert_test_mnr2')

In [18]:
import datasets

sts = datasets.load_dataset('glue', 'stsb', split='validation')

sts

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /Users/nilaypatel/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/803k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /Users/nilaypatel/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [19]:
sts = sts.map(lambda x: {'label': x['label'] / 5.0})



  0%|          | 0/1500 [00:00<?, ?ex/s]

In [20]:
samples = []
for sample in sts:
    samples.append(InputExample(
        texts=[sample['sentence1'], sample['sentence2']],
        label=sample['label']
    ))

In [21]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    samples, write_csv=False
)

In [22]:
evaluator(model)

0.8478172504320958