### Quora question pairs - Use transformers for question comparison

Data: the Quora question pairs training set (`train.csv`), originally from Kaggle.

In [None]:
!pip install -U sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer, util

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

#### Prepare data - Quora questions

In [None]:
!wget https://github.com/ravi-ilango/aicamp-mar-2021/blob/main/lab2_3/train.csv.zip?raw=true -O train.csv.zip

!unzip train.csv.zip


In [None]:
# Read the unzipped training pairs into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Keep only the rows in which both questions are present.
both_present = df['question1'].notnull() & df['question2'].notnull()
df = df[both_present]

#### Explore data

In [None]:
# Whitespace-separated word count of each question1, computed with the
# vectorised .str accessor instead of a Python-level apply.
df['length1'] = df['question1'].str.split().str.len()

# Label the figure so it can be told apart from the question2 histogram.
ax = df['length1'].hist()
ax.set_xlabel('words in question1')
ax.set_ylabel('count')
ax.set_title('question1 length distribution');

In [None]:
# Whitespace-separated word count of each question2, computed with the
# vectorised .str accessor instead of a Python-level apply.
df['length2'] = df['question2'].str.split().str.len()

# Label the figure so it can be told apart from the question1 histogram.
ax = df['length2'].hist()
ax.set_xlabel('words in question2')
ax.set_ylabel('count')
ax.set_title('question2 length distribution');

In [None]:
# Class balance of the is_duplicate target over the full (null-filtered) data.
labels = df['is_duplicate'].values

plt.hist(labels)
plt.gca().set(xlabel='target', ylabel='count', title='target distribution')
# One tick per distinct label value.
plt.xticks(np.arange(np.unique(labels).size));

In [None]:
from wordcloud import WordCloud

# Concatenate every question1 into one space-separated text and render it
# as a word cloud.
train_qs = df['question1'].values
text = " ".join(train_qs.astype(str))
cloud = WordCloud(width=1440, height=1080).generate(text)

plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.axis('off')


In [None]:
# Same word-cloud rendering as for question1, this time over question2.
train_qs = df['question2'].values
text = " ".join(train_qs.astype(str))
cloud = WordCloud(width=1440, height=1080).generate(text)

plt.figure(figsize=(20, 15))
plt.imshow(cloud)
plt.axis('off')

In [None]:
# Balanced subset: 5000 rows drawn from each class, with a fixed seed per class
# (label 0 -> seed 12, label 1 -> seed 18); class 0 rows come first.
df_ = pd.concat([
    df[df['is_duplicate'] == label].sample(n=5000, random_state=seed)
    for label, seed in ((0, 12), (1, 18))
])

labels = df_['is_duplicate'].values

# Class balance of the sampled subset.
plt.hist(labels)
plt.gca().set(xlabel='target', ylabel='count', title='target distribution')
plt.xticks(np.arange(np.unique(labels).size));

#### Using sentence transformer

In [None]:
# Load the all-MiniLM-L6-v2 sentence-embedding model by its hub name.
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Toy corpus for paraphrase mining; the same call accepts a single list that
# may hold tens of thousands of sentences.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Mine paraphrase candidates from the list and show the first ten results.
paraphrases = util.paraphrase_mining(model, sentences)
paraphrases[:10]

#### Using the sentence transformer to score Quora question pairs

In [None]:
# Score each question pair with the cosine similarity of its two sentence
# embeddings, stored as one float per row.
#
# util.paraphrase_mining returns a list of [score, i, j] triplets, so calling it
# per row put a nested list in `score` (which cannot be compared with a numeric
# threshold) and invoked the model once for every row. Encoding each column in a
# single batched call and comparing row-wise avoids both problems.
q1_emb = model.encode(df_['question1'].tolist(), batch_size=64, show_progress_bar=True)
q2_emb = model.encode(df_['question2'].tolist(), batch_size=64, show_progress_bar=True)

# Row-wise cosine similarity: dot(a, b) / (|a| * |b|).
df_['score'] = np.sum(q1_emb * q2_emb, axis=1) / (
    np.linalg.norm(q1_emb, axis=1) * np.linalg.norm(q2_emb, axis=1)
)


In [None]:
# Preview the sampled pairs together with their new `score` column.
df_.head(n=5)

#### Accuracy of detection

In [None]:
from sklearn.metrics import classification_report

# Pairs scoring strictly above this cutoff are predicted to be duplicates (1);
# all others are predicted non-duplicates (0).
thresh_ = .7
y_pred = (df_['score'] > thresh_).astype(int)

y_true = df_['is_duplicate'].values
print(classification_report(y_true, y_pred))
