In [None]:
# Start from a clean namespace: -f skips the confirmation prompt, -s does a "soft" reset (history is kept).
%reset -sf

# Dataset Visualisation

This notebook is used solely to generate visualisations of the dataset.

In [None]:
import os, collections, random, itertools

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# List every file available under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the training pairs. Casting the question columns to str turns missing
# values into the literal string "nan", so later text handling never sees a float.
df = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
df[["question1", "question2"]] = df[["question1", "question2"]].astype(str)
# Shift the qids down by one (presumably 1-based in the CSV — confirm) so they
# can be used directly as 0-based indices.
df[["qid1", "qid2"]] = df[["qid1", "qid2"]] - 1
# One past the largest qid, i.e. the number of slots needed to index every qid.
maxidx = int(max(df["qid1"].max(), df["qid2"].max())) + 1

In [None]:
# Peek at 10 randomly chosen rows (no random_state is set, so the rows differ between runs).
df.sample(10)

### Indexing the questions

In [None]:
# Lookup from qid to question text. Rows are visited in order and the first
# question of a row is stored before the second, so a qid that appears several
# times keeps the text from its last occurrence.
qid_to_question = {}
for first_qid, second_qid, first_text, second_text in zip(
    df["qid1"], df["qid2"], df["question1"], df["question2"]
):
    qid_to_question.update({first_qid: first_text, second_qid: second_text})

### Simple Analysis of the dataset

In [None]:
# Headline statistics of the labelled pairs.
num_duplicate_pairs = sum(df["is_duplicate"])
print("Number of questions", len(qid_to_question))
print("Number of duplicate pairs", num_duplicate_pairs)
# The share of duplicate *pairs* must be taken over the number of pairs (rows of df),
# not over the number of distinct questions.
print("Percentage of pairs that are duplicate {:.3f}%".format(num_duplicate_pairs / len(df) * 100))

In [None]:
# For every question, collect the questions it has been paired with
# (qid_to_labelled_qids) and the subset of those labelled as duplicates
# (qid_to_duplicate_qids). Both relations are stored symmetrically.
qid_to_labelled_qids = collections.defaultdict(set)
qid_to_duplicate_qids = collections.defaultdict(set)
for left_qid, right_qid, duplicate_flag in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
    relations = [qid_to_labelled_qids]
    if duplicate_flag:
        relations.append(qid_to_duplicate_qids)
    for relation in relations:
        relation[left_qid].add(right_qid)
        relation[right_qid].add(left_qid)

In [None]:
# Per question: how many other questions it was paired with, and how many of
# those pairs were labelled duplicate.
label_sizes = [len(qid_to_labelled_qids[qid]) for qid in qid_to_question]
duplicate_sizes = [len(qid_to_duplicate_qids[qid]) for qid in qid_to_question]

plt.figure(figsize=(14,4))
plt.title("Number of labels for each question, and how many of which are duplicate")

for sizes, series_label in ((label_sizes, "total"), (duplicate_sizes, "duplicate")):
    # One unit-width bin per integer size 0..max(sizes).
    count, bins = np.histogram(sizes, bins=range(max(sizes)+2))
    # Weight each bin by its size (number of questions -> number of labels).
    count = count*bins[:-1]
    # Bin 0 is dropped: after the weighting it is always zero.
    plt.bar(bins[1:-1], count[1:], width=1, label=series_label)

plt.xlim(0,50)
plt.xlabel("Count")
plt.ylabel("Dataset Frequency")
plt.legend()
plt.show()

In [None]:
# The x-axis of the plot above is cut at 50; print the 20 largest values (ascending) to show the tail.
print("Largest label sizes:", sorted(label_sizes)[-20:])
print("Largest duplicate sizes:", sorted(duplicate_sizes)[-20:])

### Connecting similar questions

In [None]:
import typing


class DisjointSet:
    """Union-find over the integers ``0 .. n-1``.

    Adapted from https://github.com/not522/ac-library-python/blob/master/atcoder/dsu.py

    ``parent_or_size[i]`` is negative when ``i`` is the leader of its set, in
    which case its absolute value is the size of the set; otherwise it holds
    the index of ``i``'s parent.
    """

    def __init__(self, n: int = 0) -> None:
        """Create ``n`` singleton sets."""
        self._n = n
        self.parent_or_size = [-1] * n

    def union(self, a: int, b: int) -> int:
        """Merge the sets containing ``a`` and ``b`` and return the resulting leader."""
        assert 0 <= a < self._n
        assert 0 <= b < self._n

        keep, absorb = self.leader(a), self.leader(b)
        if keep == absorb:
            return keep

        # Sizes are stored negated, so a larger stored value means a smaller set;
        # the smaller set is always attached under the larger one.
        if self.parent_or_size[keep] > self.parent_or_size[absorb]:
            keep, absorb = absorb, keep

        self.parent_or_size[keep] += self.parent_or_size[absorb]
        self.parent_or_size[absorb] = keep
        return keep

    def same(self, a: int, b: int) -> bool:
        """Return True when ``a`` and ``b`` belong to the same set."""
        assert 0 <= a < self._n
        assert 0 <= b < self._n

        return self.leader(a) == self.leader(b)

    def find(self, a: int) -> int:
        """Alias of :meth:`leader`."""
        return self.leader(a)

    def leader(self, a: int) -> int:
        """Return the leader of ``a``'s set, shortening the visited path on the way."""
        assert 0 <= a < self._n

        table = self.parent_or_size
        parent = table[a]
        while parent >= 0:
            grandparent = table[parent]
            if grandparent < 0:
                # The parent is itself a leader.
                return parent
            # Re-point ``a`` at its grandparent and continue from there.
            table[a] = grandparent
            a = grandparent
            parent = table[a]

        return a

    def size(self, a: int) -> int:
        """Return the number of elements in the set containing ``a``."""
        assert 0 <= a < self._n

        return -self.parent_or_size[self.leader(a)]

    def groups(self) -> typing.List[typing.List[int]]:
        """Return every set as a list of members.

        Members are in ascending order within a group, and groups are ordered
        by the index of their leader.
        """
        buckets: typing.List[typing.List[int]] = [[] for _ in range(self._n)]
        for member in range(self._n):
            buckets[self.leader(member)].append(member)

        return [bucket for bucket in buckets if bucket]

In [None]:
# Union the two questions of every pair labelled duplicate, so that each
# resulting set groups questions connected through duplicate labels.
disjoint_set = DisjointSet(maxidx)
for qid1, qid2, is_duplicate in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
    if not is_duplicate:
        continue
    disjoint_set.union(qid1, qid2)

### Counting the number of inconsistent nonduplicate labels

In [None]:
# A pair labelled "not duplicate" is inconsistent when its two questions
# nevertheless ended up in the same duplicate group. Count such pairs and
# print the first nine as examples.
cnt = 0
for qid1, qid2, is_duplicate in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
    if is_duplicate or not disjoint_set.same(qid1, qid2):
        continue
    cnt += 1
    if cnt < 10:
        print(qid_to_question[qid1], "\n", qid_to_question[qid2], "\n")
print(cnt)

### Visualising the group size of similar questions

In [None]:
# Size of every duplicate group; a question with no duplicate forms a group of size 1.
group_sizes = np.array(list(map(len, disjoint_set.groups())))
# One unit-width bin per integer size 0..max(group_sizes).
count, bins = np.histogram(group_sizes, bins=range(group_sizes.max() + 2))
# Weight each bin by its size (number of groups -> number of questions).
count = count * bins[:-1]

plt.figure(figsize=(14, 4))
# Sizes 0 and 1 are left out of the bars.
plt.bar(bins[2:-1], count[2:], width=1)
plt.title("Group size of similar questions")
plt.xlabel("Group Size")
plt.ylabel("Dataset Frequency")
plt.xlim(0, 50)
plt.show()

In [None]:
# The x-axis of the plot above is cut at 50; print the 20 largest group sizes (ascending) to show the tail.
print("Largest group sizes:", sorted(group_sizes)[-20:])

### Counting the number of augmented connections

In [None]:
# Connections are counted as unordered question pairs.
# duplicate_sizes counts every labelled duplicate pair once per endpoint, so halve the total.
initial_connection_count = sum(duplicate_sizes) // 2
# Treating every group as fully connected, a group of n questions holds n*(n-1)/2 pairs
# (the sum of the group sizes would only give the number of questions).
final_connection_count = sum(int(n) * (int(n) - 1) // 2 for n in group_sizes)
# (labelled duplicate pairs, pairs implied by the groups, newly implied pairs)
initial_connection_count, final_connection_count, final_connection_count - initial_connection_count

### Visualise distribution of overlapping word count for duplicate pairs

In [None]:
# define tokenisation process

import pickle, functools
# Pre-computed tokens from the tokenise pipeline dataset; presumably a mapping
# from qid to its list of tokens — confirm against the notebook that wrote it.
qid_to_tokens_preprocessed_filename = "../input/quora-question-pairs-tokenise-pipeline/qid_to_processed_token_list_tokenise_then_spellcheck.pkl"
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(qid_to_tokens_preprocessed_filename, "rb") as f:
    qid_to_tokens_preprocessed = pickle.load(f)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokens ignored when comparing questions: the NLTK stopwords plus the question mark.
# NOTE(review): words() is called without a language argument — presumably this returns
# the stopwords of every language in the corpus; confirm whether "english" was intended.
stopword_set = {"?", *stopwords.words()}

@functools.lru_cache(maxsize=None)
def tokenise_qid(qid, qid_to_tokens_preprocessed=qid_to_tokens_preprocessed):
    """Return the tokens of the question identified by ``qid``.

    When ``qid_to_tokens_preprocessed`` is non-empty, the tokens are looked up
    there; otherwise the question text is lower-cased and tokenised with
    ``word_tokenize``. Results are memoised on the call arguments, so every
    argument that is passed explicitly must be hashable.
    """
    if not qid_to_tokens_preprocessed:
        return word_tokenize(qid_to_question[qid].lower())
    return qid_to_tokens_preprocessed[qid]

In [None]:
def _count_shared_content_tokens(qid1, qid2):
    """Count the distinct tokens two questions share, ignoring anything in stopword_set."""
    shared_tokens = set(tokenise_qid(qid1)) & set(tokenise_qid(qid2))
    return len(shared_tokens - stopword_set)


# Overlap for every pair of questions that fall in the same duplicate group.
groups = disjoint_set.groups()
overlap_count_duplicate = []
for group in tqdm.tqdm(groups):
    for qid1, qid2 in itertools.combinations(group, r=2):
        overlap_count_duplicate.append(_count_shared_content_tokens(qid1, qid2))

# Baseline: overlap for 20000 randomly drawn question pairs.
# random.sample needs a sequence; passing a dict keys view raises TypeError on
# Python 3.11+, so materialise the qids as a list first.
all_qids = list(qid_to_question)
sample1 = random.sample(all_qids, 20000)
sample2 = random.sample(all_qids, 20000)
overlap_count_random = [
    _count_shared_content_tokens(qid1, qid2) for qid1, qid2 in zip(sample1, sample2)
]

In [None]:
# Overlay the two normalised histograms so their shapes can be compared
# even though the sample sizes differ.
plt.figure(figsize=(14,4))
for overlap_counts, series_label in (
    (overlap_count_duplicate, "duplicate pair"),
    (overlap_count_random, "random pair"),
):
    plt.hist(overlap_counts, bins=range(15), density=True, alpha=0.5, label=series_label)
plt.title("Distribution of overlapping non-root word tokens for duplicate pairs and random pairs")
plt.legend()
plt.show()

### Understand the most frequent non-stopword tokens

In [None]:
from nltk.corpus import stopwords
# Rebuilds the stopword set from NLTK alone, so the "?" entry that was added when the
# set was first built is no longer part of it.
# NOTE(review): the next cell repeats this exact statement — one of the two looks redundant.
stopword_set = set(stopwords.words())

In [None]:
import pickle
import random

from nltk.corpus import stopwords
stopword_set = {*stopwords.words()}

# NOTE(review): this loads the "spellcheck_then_tokenise" variant, whereas the overlap
# analysis above loaded "tokenise_then_spellcheck" — confirm the difference is intended.
with open("../input/quora-question-pairs-tokenise-pipeline/qid_to_processed_token_list_spellcheck_then_tokenise.pkl", "rb") as f:
    qid_to_tokens = pickle.load(f)

# Alternative: load a pre-built index instead of inverting qid_to_tokens below.
# with open("../input/quora-question-pairs-tokenise-pipeline/token_to_qid_tokenise_then_spellcheck.pkl", "rb") as f:
#     token_to_qids = pickle.load(f)

# Inverted index: token -> set of qids whose token list contains it.
token_to_qids = collections.defaultdict(set)
for qid in qid_to_tokens:
    for token in qid_to_tokens[qid]:
        token_to_qids[token].add(qid)

In [None]:
# The 20 non-stopword tokens contained in the most questions, as (question count, token).
# NOTE(review): "?" is only left out if it is absent from the token lists or present in
# stopword_set at this point — confirm.
sorted(
    (
        (len(qids), token)
        for token, qids in token_to_qids.items()
        if token not in stopword_set
    ),
    reverse=True,
)[:20]

### Visualise distribution of the number of questions to compare against

In [None]:
# For 10000 random questions, measure how many questions share at least one
# non-stopword token with them (the set includes the question itself whenever
# it has such a token), alongside their number of unique tokens.
token_length_sizes = []
considered_set_sizes = []
# random.sample needs a sequence; passing a dict keys view raises TypeError on
# Python 3.11+, so materialise the qids as a list first.
for qid in tqdm.tqdm(random.sample(list(qid_to_tokens), 10000)):
    unique_tokens = set(qid_to_tokens[qid])
    considered_set = set()
    for token in unique_tokens - stopword_set:
        # .get avoids inserting empty entries into the defaultdict for unknown tokens
        # (some tokens may be missing from the index, probably from the test set).
        considered_set.update(token_to_qids.get(token, ()))
    token_length_sizes.append(len(unique_tokens))
    considered_set_sizes.append(len(considered_set))

In [None]:
# Distribution of the candidate-set size, in bins of 1000 questions.
plt.figure(figsize=(14,4))
plt.hist(considered_set_sizes, bins=np.arange(0,70000,1000), density=True)
plt.title("How many other questions has at least one common non-rootword token")
plt.xlabel("Query comparison size")
plt.ylabel("Density")
# No legend: there is a single unlabelled series, so plt.legend() would only emit a
# "no artists with labels" warning.
plt.show()

In [None]:
# One point per sampled question; the low alpha makes dense regions show up darker.
plt.figure(figsize=(14,4))
plt.scatter(considered_set_sizes, token_length_sizes, alpha=0.1)
plt.xlabel("Query comparison size")
plt.ylabel("Number of unique tokens")
plt.title("Relationship between number of unique tokens and query comparison size")
# Only the right limit is fixed; the left limit is left to matplotlib.
plt.xlim(right=70000)
plt.show()

### Visualise distribution of sentence vectors

In [None]:
# model_name = "bert-base-nli-stsb-mean-tokens"
# sentence_vectors = np.load(f"../input/quora-question-pairs-bert-sentence-vectors/sentence_vectors_{model_name}.npy")

In [None]:
with open("../input/quora-question-pairs-tokenise-pipeline/qid_to_vec_trf.pkl", 'rb') as f:
    qid_to_vec = pickle.load(f)
# Stack the vectors in ascending key order.
# NOTE(review): later cells index sentence_vectors directly by qid, which assumes the
# keys are exactly 0..N-1 with no gaps — confirm.
sentence_vectors = np.array([qid_to_vec[key] for key in sorted(qid_to_vec.keys())])

In [None]:
# Select the largest duplicate group(s) and flatten them into one array of qids.
NUM_LARGEST_GROUPS = 1
largest_groups = sorted(disjoint_set.groups(), key=len)[-NUM_LARGEST_GROUPS:]
qids_of_largest_groups = np.array(list(itertools.chain.from_iterable(largest_groups)))

In [None]:
from sklearn.decomposition import PCA
# Project the sentence vectors onto two principal components for plotting.
pca = PCA(n_components=2)
sentence_pca = pca.fit_transform(sentence_vectors)

In [None]:
from matplotlib import collections  as mc

def plot_2d_distribution(vectors_2d, qids_to_connect, title=""):
    """Scatter all 2-D points and overlay labelled duplicate pairs among selected qids.

    Args:
        vectors_2d: array-like indexed by qid; the first two components of each
            row are used as x and y. Rows may be NaN; the axis limits are
            computed with NaN-aware statistics.
        qids_to_connect: iterable of qids. A red segment is drawn for every
            pair in ``df`` that is labelled duplicate and has both qids in it.
        title: title of the figure.
    """
    points = np.asarray(vectors_2d)[:, :2]
    qids_to_connect = set(qids_to_connect)

    plt.figure(figsize=(10,8))
    plt.scatter(points[:, 0], points[:, 1], s=1, alpha=0.1)

    lines = []
    for qid1, qid2, is_duplicate in zip(df["qid1"], df["qid2"], df["is_duplicate"]):
        if is_duplicate and qid1 in qids_to_connect and qid2 in qids_to_connect:
            lines.append([points[qid1], points[qid2]])

    if lines:
        # Longer segments are drawn thinner so that they do not dominate the picture.
        lc = mc.LineCollection(lines, color="red", alpha=0.2,
                               linewidths=[1/(0.1 + (a-c)**2 + (b-d)**2)**0.5 for (a,b),(c,d) in lines])
        plt.gca().add_collection(lc)
    plt.title(title)

    # Zoom to mean +/- 4 standard deviations on each axis.
    mx, my = np.nanmean(points, axis=0)
    sx, sy = np.nanstd(points, axis=0)
    plt.xlim(mx - 4*sx, mx + 4*sx)
    plt.ylim(my - 4*sy, my + 4*sy)  # upper bound is centred on the y mean (was mx)
    plt.show()

In [None]:
# Draw the 2-D PCA projection and highlight the labelled duplicate pairs inside the selected group(s).
plot_2d_distribution(sentence_pca, qids_of_largest_groups, 
                     "Plot of PCA projection of all word embeddings, {} largest group(s) highlighted".format(NUM_LARGEST_GROUPS))

In [None]:
!git clone https://github.com/DmitryUlyanov/Multicore-TSNE.git > /dev/null && cd Multicore-TSNE/ && pip install . > /dev/null && rm -rf ./Multicore-TSNE 

In [None]:
# Select the 20 largest duplicate groups and flatten them into one array of qids.
NUM_LARGEST_GROUPS = 20
largest_groups = sorted(disjoint_set.groups(), key=len)[-NUM_LARGEST_GROUPS:]
qids_of_largest_groups = np.array(list(itertools.chain.from_iterable(largest_groups)))

In [None]:
from MulticoreTSNE import MulticoreTSNE as TSNE

# Fit t-SNE on the highlighted groups plus 20000 randomly sampled questions only.
# random.sample needs a sequence; passing a dict keys view raises TypeError on
# Python 3.11+, so materialise the qids as a list first.
sampled_qids = random.sample(list(qid_to_question), 20000)
qids_to_fit_tsne = np.array(sorted(set(qids_of_largest_groups) | set(sampled_qids)))

# scikit-learn's t-SNE documentation recommends reducing dense data to about 50
# dimensions (e.g. with PCA) before running t-SNE. The result is kept under its own
# name so that the 2-D `sentence_pca` projection is not overwritten.
sentence_pca_50d = PCA(n_components=50).fit_transform(sentence_vectors[qids_to_fit_tsne])
tsne = TSNE(n_jobs=4)

# Rows of questions that were not fitted stay NaN.
sentence_tsne = np.empty((sentence_vectors.shape[0], 2))
sentence_tsne[:] = np.nan
sentence_tsne[qids_to_fit_tsne] = tsne.fit_transform(sentence_pca_50d)

In [None]:
# Draw the t-SNE embedding and highlight the labelled duplicate pairs inside the selected groups.
plot_2d_distribution(sentence_tsne, qids_of_largest_groups,
                     "Plot of T-SNE projection of {} largest groups and 20000 other questions".format(NUM_LARGEST_GROUPS))

#### Duplicate questions with largest cosine distance of sentence vectors

In [None]:
import itertools
from scipy.spatial.distance import cosine

# Cosine distance between the sentence vectors of every pair of questions that share a
# duplicate group, as (distance, qid1, qid2) tuples sorted in ascending order.
distances = sorted(
    (cosine(sentence_vectors[qid1], sentence_vectors[qid2]), qid1, qid2)
    for group in tqdm.tqdm(disjoint_set.groups())
    for qid1, qid2 in itertools.combinations(group, r=2)
)

In [None]:
# distances is sorted in ascending order, so the last 10 entries are the grouped pairs
# whose sentence vectors are furthest apart.
for distance,qid1,qid2 in distances[-10:]:
    print(f"Distance: {distance:.2f}\n{qid_to_question[qid1]}\n{qid_to_question[qid2]}\n")