#### This notebook explores the duplicate and non-duplicate set of samples in the training set by using similarity scores of their sentence embeddings and also highlights incorrectly labelled cases.

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Show question text in full instead of truncating it in DataFrame output.
# None means "no limit"; the legacy value -1 was deprecated in pandas 1.0
# and is rejected by later pandas releases.
pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

In [None]:
! pip install bert-for-tf2

In [None]:
! du -sh /kaggle/input/quora-question-pairs/*

In [None]:
! unzip /kaggle/input/quora-question-pairs/train.csv.zip

In [None]:
# Size of train.csv

! du -sh ./*

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import bert

In [None]:
train_df = pd.read_csv("./train.csv")

In [None]:
train_df.head()

In [None]:
train_df.info()

### Distribution of classes - is duplicate / not-duplicate

In [None]:
train_df.groupby("is_duplicate").count()['id'].plot.bar()
plt.show()

### Null values

In [None]:
train_df.isnull().sum()

In [None]:
train_df[train_df['question1'].isnull()].head()

In [None]:
train_df[train_df['question2'].isnull()].head()

In [None]:
# Keep only the rows in which both questions are present

train_df = train_df.dropna(subset=['question1', 'question2'])

In [None]:
train_df.count()

### Exploring similarity scores between question pairs using Language-Agnostic-Bert-Sentence embedding model

[LaBSE](https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html) is a multilingual model, based on BERT, that produces sentence embeddings. It combines methods for obtaining sentence embeddings with Masked Language Model (MLM) and Translation Language Model pretrained encoders.

It is trained on Monolingual data and bilingual translation pairs.

In [None]:
LABSE_model_URL = "https://tfhub.dev/google/LaBSE/1"
MAX_SEQ_LENGTH = 64

The LaBSE model is available on TF Hub. It is loaded from there and wrapped as a callable object so that it can be used as a Keras layer. Its vocab_file is stored as a tf.saved_model.Asset, and the do_lower_case flag is stored as a tf.Variable object on the SavedModel.

In [None]:
## Define Model containing LABSE as Keras layers

def getModel(model_url, max_seq_length):
    """Wrap the TF Hub LaBSE encoder in a Keras model that emits sentence embeddings.

    Args:
        model_url: TF Hub handle of the LaBSE SavedModel.
        max_seq_length: Fixed length of each of the three integer input tensors.

    Returns:
        A ``(model, labse_layer)`` tuple. ``model`` maps
        ``[input_word_ids, input_mask, input_segment_ids]`` to the L2-normalised
        pooled output of the encoder; ``labse_layer`` is the underlying
        ``hub.KerasLayer``.
    """
    # trainable=True keeps the encoder weights updatable, so the same layer
    # can be fine-tuned for a downstream task.
    labse_layer = hub.KerasLayer(handle=model_url, trainable=True, name='labse')

    def _int_input(tensor_name):
        # All three encoder inputs share the same shape and dtype.
        return tf.keras.Input(shape=(max_seq_length, ), dtype=tf.int32, name=tensor_name)

    encoder_inputs = [
        _int_input('input_word_ids'),
        _int_input('input_mask'),
        _int_input('input_segment_ids'),
    ]

    # The layer returns two outputs; only the first (pooled) one is used.
    pooled, _ = labse_layer(encoder_inputs)

    # L2-normalise along axis 1 so each row is a unit-length sentence embedding.
    normalise = tf.keras.layers.Lambda(
        lambda x: tf.nn.l2_normalize(x, axis=1),
        name='l2_normalized_pooling',
    )
    sentence_embedding = normalise(pooled)

    embedding_model = tf.keras.Model(inputs=encoder_inputs, outputs=sentence_embedding)
    return embedding_model, labse_layer

In [None]:
## Input preparation
# Build the Keras embedding model and keep a handle on the underlying hub
# layer, whose SavedModel is read below for the tokenizer settings.
labse_model, labse_layer = getModel(LABSE_model_URL, MAX_SEQ_LENGTH)

# The hub layer used below is the one returned by getModel above; no separate
# KerasLayer is created in this cell.

vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()  # vocab file path, read from the SavedModel asset via .numpy()
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()   # lower-casing flag, read from the SavedModel via .numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

# Echo the resolved tokenizer settings.
print(vocab_file, do_lower_case)

In [None]:
labse_model.summary()

In [None]:
def create_input(input_strings, tokenizer, max_seq_length):
    """Turn raw strings into fixed-length word-id, mask and segment-id arrays.

    Each string is tokenized, wrapped in ``[CLS]`` / ``[SEP]``, converted to
    ids, and then truncated or right-padded with 0 to ``max_seq_length``.

    Args:
        input_strings: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_seq_length: Length of every output row.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` of int32 arrays, each of
        shape ``(len(input_strings), max_seq_length)``. The mask is 1 on real
        tokens and 0 on padding; segment ids are all 0.
    """
    # `tqdm` was referenced here but never imported anywhere in the notebook,
    # so the first call raised NameError. Import it locally and degrade to
    # plain iteration when the package is unavailable.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    input_ids_all, input_mask_all, segment_ids_all = [], [], []

    for input_string in progress(input_strings):
        # Tokenize input
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequence_length = min(len(input_ids), max_seq_length)

        # Truncation or padding. Over-long inputs are simply cut at
        # max_seq_length (which can drop the trailing [SEP]); this rule is kept
        # unchanged so embeddings stay comparable with previously saved ones.
        padding = [0] * (max_seq_length - sequence_length)

        input_ids_all.append(list(input_ids[:max_seq_length]) + padding)
        input_mask_all.append([1] * sequence_length + padding)
        segment_ids_all.append([0] * max_seq_length)

    # Explicit shape keeps empty input 2-D: (0, max_seq_length) rather than (0,).
    # int32 matches the dtype declared on the model's Keras inputs.
    shape = (len(input_ids_all), max_seq_length)
    return (
        np.array(input_ids_all, dtype=np.int32).reshape(shape),
        np.array(input_mask_all, dtype=np.int32).reshape(shape),
        np.array(segment_ids_all, dtype=np.int32).reshape(shape),
    )

In [None]:
train_df.columns

In [None]:
def encode(input_text):
    """Embed every string in ``input_text`` with the notebook-level LaBSE model.

    Uses the notebook globals ``tokenizer``, ``MAX_SEQ_LENGTH`` and
    ``labse_model``. Returns whatever ``labse_model.predict`` returns for the
    prepared inputs.
    """
    model_inputs = list(create_input(input_text, tokenizer, MAX_SEQ_LENGTH))
    return labse_model.predict(model_inputs)

Obtaining embeddings for each question in the pair

We can run the following code to obtain the embeddings. It takes ~30 minutes to run for 400K records on a single Tesla T4 GPU (16GB).

I have already run it and saved the results. Let's go to the next steps by importing the results file.


In [None]:
# sample_df = train_df
# question1_array = sample_df['question1'].values
# question2_array = sample_df['question2'].values

# %time question1_embeddings = encode(question1_array)

In [None]:
# %time question2_embeddings = encode(question2_array)

In [None]:
# question1_embeddings.shape, question2_embeddings.shape     # shape - ((404287, 768), (404287, 768))

Computing cosine similarity

In [None]:
# product = question1_embeddings * question2_embeddings
# print(product.shape)                                # shape (404287, 768)

# cosine_similarity = product.sum(axis=1)
# cosine_similarity.shape                             # (404287,) 

In [None]:
# cos_similarity = product.sum(axis=1)
# cos_similarity.shape

In [None]:
# sample_df['similarity'] = cos_similarity

In [None]:
!ls /kaggle/input/train-df-with-similarity-score/train_with_similarity_scores.csv

Importing the results file which has the original train.csv added with the cosine similarity between the question pair.

In [None]:
sample_df = pd.read_csv("/kaggle/input/train-df-with-similarity-score/train_with_similarity_scores.csv")

#### Plotting the distribution of similarity score for  duplicate and non-duplicate question pairs

In [None]:
# Overlay the similarity-score distributions of the two classes.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is
# kept here because the installed seaborn version is not known - consider
# sns.histplot(..., kde=True, stat="density") once seaborn >= 0.11 is confirmed.
sns.distplot(sample_df[sample_df['is_duplicate'] == 1.0]['similarity'], label="1", color='red')
sns.distplot(sample_df[sample_df['is_duplicate'] == 0.0]['similarity'], label="0", color='green')
plt.xlabel("Question pair cosine similarity ")
# The label= arguments above are only rendered once a legend is drawn;
# without it the red and green curves cannot be told apart.
plt.legend(title="is_duplicate")
plt.show()

In [None]:
# Similarity-score distribution per class, drawn from the full results frame.
sns.violinplot(data=sample_df, y='similarity', x='is_duplicate')
plt.ylabel("Question pair cosine similarity")
plt.show()

Based on the similarity scores obtained from the sentence embeddings of the pretrained LaBSE model, the median similarity score is higher than 0.5 for both duplicate and non-duplicate question pairs.

Let's look into more detail for each case next.

### Duplicate question pairs

In [None]:
sample_df[sample_df['is_duplicate'] == 1].describe()['similarity']

#### Duplicate Question pairs with similarity score < 0.5

In [None]:
# Similarity scores of the pairs labelled as duplicates.
duplicate_similarity = sample_df.loc[sample_df['is_duplicate'] == 1, 'similarity']

# Share of those pairs scoring below 0.5 (count() skips missing scores).
duplicate_low_simlarity_prop = (
    duplicate_similarity[duplicate_similarity < 0.5].count()
    / duplicate_similarity.count()
)

print(" {} percent of duplicate question pairs have similarity score less than 0.5 ".format(duplicate_low_simlarity_prop * 100))

In [None]:
sample_df[(sample_df['is_duplicate'] == 1) & (sample_df['similarity'] < 0.5)].describe()['similarity']

In [None]:
# Going through top question pairs with lowest similarity scores

sample_df[(sample_df['is_duplicate'] == 1)][['question1', 'question2','similarity']].sort_values(by='similarity').head(10)

In [None]:
# Going through top question pairs with highest similarity scores.

sample_df[(sample_df['is_duplicate'] == 1)][['question1', 'question2','similarity']].sort_values(by='similarity', ascending=False).head(10)

The question pairs with the highest similarity scores are identical to each other in the training dataset. Next, we'll explore the non-identical pairs.

In [None]:
# Going through top duplicate question pairs with the highest similarity scores below 0.8.

sample_df[(sample_df['is_duplicate'] == 1) & (sample_df['similarity'] < 0.8)][['question1', 'question2','similarity']].sort_values(by='similarity', ascending=False).head(10)

### Non-duplicate question pairs

In [None]:
sample_df[sample_df['is_duplicate'] == 0].describe()['similarity']

In [None]:
# Similarity scores of the pairs labelled as non-duplicates.
non_duplicate_similarity = sample_df.loc[sample_df['is_duplicate'] == 0, 'similarity']

# Share of those pairs scoring above 0.5 (count() skips missing scores).
non_duplicate_high_simlarity_prop = (
    non_duplicate_similarity[non_duplicate_similarity > 0.5].count()
    / non_duplicate_similarity.count()
)

print(" {} percent of Non-duplicate question pairs have similarity score more than 0.5 ".format(non_duplicate_high_simlarity_prop * 100))

In [None]:
sample_df[(sample_df['is_duplicate'] == 0) & (sample_df['similarity'] < 0.5)].describe()['similarity']

Non-duplicate question pairs with similarity score > 0.5

In [None]:
sample_df[(sample_df['is_duplicate'] == 0) & (sample_df['similarity'] > 0.5)].describe()['similarity']

While exploring, I came across the following top instances where the question pairs are duplicates but are labelled as non-duplicates.

In [None]:
sample_df[(sample_df['is_duplicate'] == 0) & (sample_df['similarity'] > 0.5)].sort_values(by='similarity', ascending=False).head(23)

True non-duplicate cases

In [None]:
# Non-duplicate pairs scoring above 0.5 ...
is_non_duplicate = sample_df['is_duplicate'] == 0
is_similar = sample_df['similarity'] > 0.5
sample_df_v2 = sample_df[is_non_duplicate & is_similar]
# ... of which the ten highest-scoring pairs at or below 0.9 are shown.
sample_df_v2.loc[sample_df_v2['similarity'].le(0.9)].sort_values(by='similarity', ascending=False).head(10)

#### References
* https://github.com/Taaniya/natural-language-understanding/blob/master/Explore_Language_Agnostic_BERT_Sentence_Embedding.ipynb
* https://tfhub.dev/google/LaBSE/1