In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import sys
import random
from itertools import product
from sklearn.model_selection import train_test_split

# load datasets

In [2]:
# Load the training split and keep a random subsample of 50,000 pairs.
# random_state is fixed so that the subsample -- and with it every graph,
# augmentation and exported CSV derived from it below -- is identical after a
# kernel restart.
data_pairs = pd.read_csv("./data/train_split.csv")
data_pairs, _ = train_test_split(data_pairs, train_size=50000, random_state=42)
data_pairs.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate,pair_id
19485,332128,332129,What are some venture capital firms that focus...,What are some venture capital firms that focus...,0,224133
330988,413782,413783,Are Odias considered to be North Indians?,Why are odias considered as north Indians when...,1,292229
83911,25020,458354,What are some websites similar to Coursera?,Are Coursera programs recognised?,0,331454
105254,112361,53189,Why are electron orbitals shaped the way they ...,What is the easier way to understand the conce...,1,64682
248524,113720,113721,What should be the minimum TDS level for the d...,Is water with TDS around 9 and ph at 7.5 fit f...,0,65533


In [28]:
# Number of pairs labelled as duplicates in the training subsample.
int((data_pairs["is_duplicate"] == 1).sum())

18424

In [29]:
# Number of pairs labelled as non-duplicates in the training subsample.
int((data_pairs["is_duplicate"] == 0).sum())

31576

In [3]:
# Load the table of all question pairs. Its row index is what qids_to_index
# (built in the next cell) stores as the identifier of a (qid1, qid2) pair.
all_data_pairs = pd.read_csv("./data/all_possible_pairs.csv")
all_data_pairs.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Map every (qid1, qid2) pair, in the order it is stored in all_data_pairs, to
# the index of its row.
# itertuples(name=None) yields plain (index, qid1, qid2, question1, question2,
# is_duplicate) tuples; unlike iterrows it does not build a Series per row, so
# it is far cheaper on a large table while producing the same keys and values.
qids_to_index = {
    (qid1, qid2): index
    for index, qid1, qid2, _question1, _question2, _is_duplicate
    in all_data_pairs.itertuples(name=None)
}

def get_pair_index(qid1, qid2):
    """Return the index stored in ``qids_to_index`` for a pair of question ids.

    The pair is looked up as ``(qid1, qid2)`` first; if that key is absent the
    swapped order ``(qid2, qid1)`` is used instead.

    Raises:
        KeyError: if the pair is not present in either order.
    """
    forward = (qid1, qid2)
    key = forward if forward in qids_to_index else (qid2, qid1)
    return qids_to_index[key]

In [5]:
# Load the validation split and collect its question pairs as order-normalised
# (lower_id, higher_id) tuples, so membership tests do not depend on which
# question was listed first.
valid_data_pairs = pd.read_csv("./data/valid_split.csv")

# itertuples(index=False, name=None) yields one plain tuple per row (six columns
# are expected, qid1 and qid2 first) without building a Series per row as
# iterrows does.
valid_pairs = {
    (min(qid1, qid2), max(qid1, qid2))
    for qid1, qid2, _, _, _, _ in valid_data_pairs.itertuples(index=False, name=None)
}

In [6]:
# Load the test split and collect its question pairs as order-normalised
# (lower_id, higher_id) tuples, so membership tests do not depend on which
# question was listed first.
test_data_pairs = pd.read_csv("./data/test_split.csv")

# itertuples(index=False, name=None) yields one plain tuple per row (six columns
# are expected, qid1 and qid2 first) without building a Series per row as
# iterrows does.
test_pairs = {
    (min(qid1, qid2), max(qid1, qid2))
    for qid1, qid2, _, _, _, _ in test_data_pairs.itertuples(index=False, name=None)
}

# create graphs

In [7]:
# Undirected graph of duplicate questions: question id -> list of question ids
# it is labelled a duplicate of. Each duplicate pair is also recorded once as an
# order-normalised (lower_id, higher_id) tuple.
graph_similar = defaultdict(list)
original_similar_pairs = set()

# itertuples yields plain row tuples (qid1, qid2, question1, question2,
# is_duplicate, pair_id) and avoids the per-row Series that iterrows builds.
for qid1, qid2, _, _, is_duplicate, _ in data_pairs.itertuples(index=False, name=None):
    if not is_duplicate:
        continue

    # Store the edge in both directions so traversals can start from either end.
    graph_similar[qid1].append(qid2)
    graph_similar[qid2].append(qid1)

    original_similar_pairs.add((min(qid1, qid2), max(qid1, qid2)))

In [8]:
# Undirected graph of non-duplicate questions: question id -> list of question
# ids it is labelled as different from. Each non-duplicate pair is also recorded
# once as an order-normalised (lower_id, higher_id) tuple.
graph_non_similar = defaultdict(list)
original_non_similar_pairs = set()

# itertuples yields plain row tuples (qid1, qid2, question1, question2,
# is_duplicate, pair_id) and avoids the per-row Series that iterrows builds.
for qid1, qid2, _, _, is_duplicate, _ in data_pairs.itertuples(index=False, name=None):
    if is_duplicate:
        continue

    # Store the edge in both directions.
    graph_non_similar[qid1].append(qid2)
    graph_non_similar[qid2].append(qid1)

    original_non_similar_pairs.add((min(qid1, qid2), max(qid1, qid2)))

# calculate components

In [9]:
# labels: question id -> component number, where 0 means "not assigned yet".
# components: component number -> list of question ids carrying that number.
labels = defaultdict(int)
components = defaultdict(list)

def visit(node, level):
    """Assign ``level`` to every unlabelled question reachable from ``node``.

    Walks ``graph_similar`` iteratively with an explicit stack. Each question
    that still has label 0 gets ``level`` written to ``labels`` and is appended
    to ``components[level]``; already labelled questions are skipped.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if labels[current] == 0:
            labels[current] = level
            components[level].append(current)
            pending.extend(graph_similar[current])

# Number the components 1, 2, 3, ... in the order their first question is met.
level = 0
for node in graph_similar:
    if labels[node] != 0:
        continue
    level += 1
    visit(node, level)

# positive augmentations

In [10]:
# (lower_id, higher_id) -> number of duplicate edges on the shortest path between
# the two questions.
# NOTE(review): the default value 200 is never read in this cell; presumably it
# is a "very far apart" sentinel for lookups elsewhere -- confirm.
transitive_positive_neighbours = defaultdict(lambda: 200)

def find_positive_transitive_neighbours(node):
    """Record the hop distance from ``node`` to every question reachable from it.

    Expands ``graph_similar`` one level at a time starting at ``node``. Each
    newly reached question is stored in ``transitive_positive_neighbours`` under
    the order-normalised pair key, with the current level number as its value
    (1 for direct neighbours, 2 for neighbours of neighbours, and so on).
    """
    seen = {node}
    frontier = [node]
    depth = 0
    while frontier:
        depth += 1

        # Collect the whole next level before marking anything as seen, so a
        # question reachable from several frontier members may appear repeatedly.
        candidates = []
        for current in frontier:
            for neighbour in graph_similar[current]:
                if neighbour not in seen:
                    candidates.append(neighbour)

        for neighbour in candidates:
            seen.add(neighbour)
            pair_key = (min(node, neighbour), max(node, neighbour))
            transitive_positive_neighbours[pair_key] = depth

        # Drop the repeats before expanding the next level.
        frontier = list(set(candidates))

for node in graph_similar:
    find_positive_transitive_neighbours(node)

In [11]:
# Bucket the transitive pairs by hop distance:
#   index 0 -> distance 2, index 1 -> distance 3, index 2 -> distance 4 or more.
# Pairs at distance 1 are skipped.
augmented_positive_pairs = [[], [], []]

for pair, depth in transitive_positive_neighbours.items():
    if depth != 1:
        bucket_index = min(depth, 4) - 2
        augmented_positive_pairs[bucket_index].append(pair)

In [12]:
# Size of each distance bucket.
list(map(len, augmented_positive_pairs))

[10070, 10615, 14973]

In [13]:
# Sanity check: every stored pair key has the smaller question id first.
print(all(a < b for a, b in transitive_positive_neighbours))

True


# negative augmentations

In [14]:
# For every original non-duplicate pair whose questions carry different component
# labels, pair each member of the first component with each member of the second.
# Pairs whose two questions share a label are skipped.
# NOTE(review): a question with no duplicate edge has label 0 and an empty
# component list, so a pair involving such a question contributes nothing here --
# confirm that this is intended.
augmented_negative_pairs = set()

for qid1, qid2 in original_non_similar_pairs:
    label_a, label_b = labels[qid1], labels[qid2]
    if label_a != label_b:
        for member_a, member_b in product(components[label_a], components[label_b]):
            augmented_negative_pairs.add((min(member_a, member_b), max(member_a, member_b)))

# Keep only the pairs that are not already original non-duplicate pairs.
augmented_negative_pairs = list(augmented_negative_pairs.difference(original_non_similar_pairs))

In [15]:
# Number of augmented negative pairs, before overlaps with the test and
# validation splits are removed in the following cells.
len(augmented_negative_pairs)

5410

# remove augmentations that are in the test and validation sets

In [16]:
# Drop every augmented pair that also occurs in the test split.
for bucket_index, bucket in enumerate(augmented_positive_pairs):
    augmented_positive_pairs[bucket_index] = list(set(bucket).difference(test_pairs))
augmented_negative_pairs = list(set(augmented_negative_pairs).difference(test_pairs))

# Remaining sizes: one count per positive bucket, then the negative count.
print(list(map(len, augmented_positive_pairs)))
print(len(augmented_negative_pairs))

[9416, 10050, 14496]
5406


In [17]:
# Drop every augmented pair that also occurs in the validation split.
for bucket_index, bucket in enumerate(augmented_positive_pairs):
    augmented_positive_pairs[bucket_index] = list(set(bucket).difference(valid_pairs))
augmented_negative_pairs = list(set(augmented_negative_pairs).difference(valid_pairs))

# Remaining sizes: one count per positive bucket, then the negative count.
print(list(map(len, augmented_positive_pairs)))
print(len(augmented_negative_pairs))

[8779, 9514, 13996]
5404


# create dataset for each experiment

In [18]:
def get_pairs(step, fraction_negative, seed=None):
    """Assemble the ``[pair index, label]`` rows for one experiment.

    The result starts with every pair of ``data_pairs`` and its original label.
    The pairs of the first ``step`` buckets of ``augmented_positive_pairs`` are
    then appended with label 1, followed by a random sample of
    ``augmented_negative_pairs`` with label 0.

    Args:
        step: number of leading buckets of ``augmented_positive_pairs`` to
            include.
        fraction_negative: number of augmented negatives to add, expressed as a
            multiple of the number of augmented positives that were added. The
            amount is capped at the number of augmented negatives available;
            0 adds none.
        seed: optional seed for the negative sampling. With the default ``None``
            the sample is drawn from NumPy's global random state, as before;
            pass an integer to get the same sample on every call.

    Returns:
        A list of ``[pair_index, label]`` lists, where ``pair_index`` comes from
        ``get_pair_index``.

    Raises:
        KeyError: if a pair is unknown to ``get_pair_index`` in either order.
        IndexError: if ``step`` exceeds the number of positive buckets.
    """
    # itertuples yields plain row tuples (qid1, qid2, question1, question2,
    # is_duplicate, pair_id) and avoids the per-row Series that iterrows builds.
    pairs = [
        [get_pair_index(qid1, qid2), is_duplicate]
        for qid1, qid2, _, _, is_duplicate, _ in data_pairs.itertuples(index=False, name=None)
    ]

    positive_count = 0
    for i in range(step):
        bucket = augmented_positive_pairs[i]
        pairs.extend([get_pair_index(qid1, qid2), 1] for qid1, qid2 in bucket)
        positive_count += len(bucket)

    # Return before touching the random generator when no negatives are wanted.
    if fraction_negative == 0:
        return pairs

    to_add_negative = min(
        int(fraction_negative * positive_count),
        len(augmented_negative_pairs),
    )

    # A private RandomState is used only when a seed is given, so the default
    # call keeps drawing from (and advancing) the global NumPy generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    shuffled_negatives = rng.permutation(augmented_negative_pairs)
    for qid1, qid2 in shuffled_negatives[:to_add_negative]:
        pairs.append([get_pair_index(qid1, qid2), 0])

    return pairs

def get_dataset(step, fraction_negative, seed=None):
    """Return the rows of ``get_pairs`` as a DataFrame.

    Args:
        step: forwarded to ``get_pairs``.
        fraction_negative: forwarded to ``get_pairs``.
        seed: forwarded to ``get_pairs``; ``None`` keeps the unseeded behaviour.

    Returns:
        A DataFrame with the columns ``pair_id`` and ``is_duplicate``.
    """
    pairs = get_pairs(step, fraction_negative, seed=seed)
    return pd.DataFrame(pairs, columns=["pair_id", "is_duplicate"])

In [19]:
# Export one dataset per (step, fraction) combination. product() walks the grid
# in the order (1, 0.0), (1, 1.0), (1, 2.0), (2, 0.0), ..., (3, 2.0).
# The augmented negatives are sampled from NumPy's global random state, so it is
# seeded here to make the exported files identical on every run of this cell.
np.random.seed(0)

for step, fraction in product([1, 2, 3], [0.0, 1.0, 2.0]):
    experiment_dataset = get_dataset(step, fraction)
    experiment_dataset.to_csv(f"./data/experiments_small/instance_{step}_{fraction:.2f}.csv", index=False)

In [30]:
# Save the subsampled training pairs; the DataFrame index is not written.
data_pairs.to_csv("./data/train_split_small.csv", index=False)