In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import sys
import random
from itertools import product

# load datasets

In [2]:
# Read the labelled question pairs and discard the "id" column in one step;
# later cells unpack each remaining row positionally.
data_pairs = pd.read_csv("./data/train.csv").drop(columns="id")
data_pairs.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Table of individual questions (columns `qid` and `question`), used later to
# fetch the text for augmented pairs.
data_questions = pd.read_csv("./data/data_questions.csv")
data_questions.head(5)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


# create graphs

In [4]:
# Undirected adjacency lists over question ids: one edge per pair flagged as duplicate.
graph_similar = defaultdict(list)
# Every duplicate pair stored once, normalised to (smaller qid, larger qid).
original_similar_pairs = set()
for qid1, qid2, _, _, is_duplicate in data_pairs.itertuples(index=False, name=None):
    if is_duplicate:
        # Register the edge in both directions so the graph can be walked from either end.
        graph_similar[qid1].append(qid2)
        graph_similar[qid2].append(qid1)
        original_similar_pairs.add((qid1, qid2) if qid1 < qid2 else (qid2, qid1))

In [5]:
# Undirected adjacency lists over question ids: one edge per pair flagged as NOT duplicate.
graph_non_similar = defaultdict(list)
# Every non-duplicate pair stored once, normalised to (smaller qid, larger qid).
original_non_similar_pairs = set()
for qid1, qid2, _, _, is_duplicate in data_pairs.itertuples(index=False, name=None):
    if not is_duplicate:
        # Register the edge in both directions, mirroring the duplicate graph.
        graph_non_similar[qid1].append(qid2)
        graph_non_similar[qid2].append(qid1)
        original_non_similar_pairs.add((qid1, qid2) if qid1 < qid2 else (qid2, qid1))

# calculate components

In [6]:
# Connected components of the duplicate graph.
# labels:     qid -> component number (0 means "no component assigned").
# components: component number -> list of member qids, in visitation order.
labels = defaultdict(int)
components = defaultdict(list)

def visit(node, level):
    """Assign component number `level` to every question reachable from `node`.

    Performs an iterative depth-first walk over `graph_similar`. Each question
    that is still unlabelled gets `level` in `labels` and is appended to
    `components[level]`; already-labelled questions are skipped.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if labels[current] == 0:
            labels[current] = level
            components[level].append(current)
            pending.extend(graph_similar[current])

# Component numbers start at 1 so that 0 stays reserved for "unlabelled".
level = 0
for node in graph_similar:
    if labels[node] != 0:
        continue
    level += 1
    visit(node, level)

# positive augmentations

In [7]:
# (smaller qid, larger qid) -> number of duplicate links separating the two questions.
# Keys that were never written fall back to 200.
transitive_positive_neighbours = defaultdict(lambda: 200)

def find_positive_transitive_neighbours(node):
    """Record the hop distance from `node` to every question reachable via duplicate links.

    Expands `graph_similar` breadth-first, one layer at a time, and stores the
    layer number under the normalised (smaller qid, larger qid) key in
    `transitive_positive_neighbours`. Direct duplicates therefore get 1.
    """
    seen = {node}
    frontier = [node]
    hops = 0
    while frontier:
        hops += 1
        # Gather the complete next layer first; `seen` is only updated afterwards,
        # so a question reachable from several frontier members may appear more than once.
        reached = [
            candidate
            for current in frontier
            for candidate in graph_similar[current]
            if candidate not in seen
        ]
        for candidate in reached:
            seen.add(candidate)
            key = (node, candidate) if node < candidate else (candidate, node)
            transitive_positive_neighbours[key] = hops

        # Deduplicate before the next expansion.
        frontier = list(set(reached))

for node in graph_similar:
    find_positive_transitive_neighbours(node)

In [8]:
# Three buckets of inferred duplicate pairs: index 0 holds distance 2,
# index 1 holds distance 3, index 2 holds distance 4 or more.
augmented_positive_pairs = [[], [], []]

for pair, depth in transitive_positive_neighbours.items():
    if depth != 1:  # distance 1 pairs are the originally labelled duplicates
        bucket = min(depth, 4) - 2
        augmented_positive_pairs[bucket].append(pair)

In [9]:
# Number of inferred duplicate pairs in each distance bucket.
list(map(len, augmented_positive_pairs))

[55490, 17920, 5875]

# negative augmentations

In [10]:
# For every labelled non-duplicate pair whose questions carry different
# component labels, pair up all members of the two components.
augmented_negative_pairs = set()

for qid1, qid2 in original_non_similar_pairs:
    k1, k2 = labels[qid1], labels[qid2]
    if k1 != k2:
        members_1 = components[k1]
        members_2 = components[k2]
        # Store each cross-component pair normalised to (smaller qid, larger qid).
        augmented_negative_pairs.update(
            (a, b) if a < b else (b, a)
            for a, b in product(members_1, members_2)
        )

# Keep only pairs that were not already labelled as non-duplicates.
augmented_negative_pairs = list(augmented_negative_pairs - original_non_similar_pairs)

In [11]:
# Number of inferred non-duplicate pairs (original labelled pairs excluded).
len(augmented_negative_pairs)

153543

# build a dataset containing all possible pairs

In [12]:
# Look question text up by its qid rather than by row position: positional
# access (`iloc[qid - 1]`) silently returns the wrong text if `data_questions`
# is not contiguous and sorted by qid, and is slow because every lookup
# materialises a full row. A missing qid now raises KeyError instead.
question_by_qid = dict(zip(data_questions["qid"], data_questions["question"]))

# Start with the originally labelled pairs, in file order. Rows are copied
# positionally: qid1, qid2, question1, question2, is_duplicate.
all_pairs = [list(row) for row in data_pairs.itertuples(index=False, name=None)]

# Augmented groups with the label each one receives: every positive distance
# bucket is labelled 1, the inferred negatives are labelled 0.
labelled_augmentations = [(bucket, 1) for bucket in augmented_positive_pairs]
labelled_augmentations.append((augmented_negative_pairs, 0))

for id_pairs, label in labelled_augmentations:
    for qid1, qid2 in id_pairs:
        all_pairs.append(
            [qid1, qid2, question_by_qid[qid1], question_by_qid[qid2], label]
        )

In [13]:
# Assemble the original and augmented rows into a single frame.
all_pairs_dataset = pd.DataFrame(
    all_pairs,
    columns=["qid1", "qid2", "question1", "question2", "is_duplicate"],
)
len(all_pairs_dataset.index)

637118

In [14]:
# Write the combined dataset to disk without the row index.
all_pairs_dataset.to_csv("./data/all_possible_pairs.csv", index=False)