In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import sys
import random
from itertools import product

# load datasets

In [2]:
data_pairs = pd.read_csv("./data/train_split.csv")
data_pairs.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate,pair_id
0,2395,138789,"If the universe is ""expanding"", is my room exp...",If the universe is expanding does it mean matt...,1,81779
1,19200,7446,How can I slowly lose weight?,What should you do if you want to lose a lot o...,1,266887
2,477381,477382,What are the best shoes to hardstyle shuffle? ...,What are the best shoes for rock climbing?,0,348793
3,325056,325057,What will be best place in Brisbane to hire ef...,Where can I hire professional painting contrac...,1,218454
4,259926,259927,When will Lenovo K3 Note get the Android 7 upg...,When will Lenovo K3 Note get the Android 6 upg...,0,167686


In [3]:
all_data_pairs = pd.read_csv("./data/all_possible_pairs.csv")
all_data_pairs.head()

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
qids_to_index = dict()

for index, (qid1, qid2, question1, question2, is_duplicate) in all_data_pairs.iterrows():
    qids_to_index[(qid1, qid2)] = index

def get_pair_index(qid1, qid2):
    if (qid1, qid2) in qids_to_index:
        return qids_to_index[(qid1, qid2)]
    return qids_to_index[(qid2, qid1)]

In [5]:
valid_data_pairs = pd.read_csv("./data/valid_split.csv")
valid_pairs = set()
for _, (qid1, qid2, _, _, is_duplicate, _) in valid_data_pairs.iterrows():
    lower_id = min(qid1, qid2)
    higher_id = max(qid1, qid2)
    valid_pairs.add((lower_id, higher_id))

In [6]:
test_data_pairs = pd.read_csv("./data/test_split.csv")
test_pairs = set()
for _, (qid1, qid2, _, _, is_duplicate, _) in test_data_pairs.iterrows():
    lower_id = min(qid1, qid2)
    higher_id = max(qid1, qid2)
    test_pairs.add((lower_id, higher_id))

In [7]:
data_questions = pd.read_csv("./data/data_questions.csv")
data_questions.head()

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


# create graphs

In [8]:
graph_similar = defaultdict(lambda: [])
original_similar_pairs = set()
for _, (qid1, qid2, _, _, is_duplicate, _) in data_pairs.iterrows():
    if not is_duplicate: continue

    graph_similar[qid1].append(qid2)
    graph_similar[qid2].append(qid1)

    lower_id = min(qid1, qid2)
    higher_id = max(qid1, qid2)
    original_similar_pairs.add((lower_id, higher_id))

In [9]:
graph_non_similar = defaultdict(lambda: [])
original_non_similar_pairs = set()
for _, (qid1, qid2, _, _, is_duplicate, _) in data_pairs.iterrows():
    if is_duplicate: continue

    graph_non_similar[qid1].append(qid2)
    graph_non_similar[qid2].append(qid1)

    lower_id = min(qid1, qid2)
    higher_id = max(qid1, qid2)
    original_non_similar_pairs.add((lower_id, higher_id))

# calculate components

In [10]:
labels = defaultdict(lambda : 0)
components = defaultdict(lambda: [])

def visit(node, level):
    stack = [node]
    while len(stack) > 0:
        v = stack.pop()
        if labels[v] != 0: continue
        labels[v] = level
        components[level].append(v)
        stack += graph_similar[v]

level = 0
for node in graph_similar:
    if labels[node] == 0:
        level += 1
        visit(node, level)

# pozitivne augmentacije

In [11]:
transitive_positive_neighbours = defaultdict(lambda : 200)

def find_positive_transitive_neighbours(node):
    depth = 1
    visited = {node}
    queue = [node]
    while len(queue):
        next_neighbours = [u for v in queue for u in graph_similar[v] if u not in visited]
        for neighbour in next_neighbours:
            visited.add(neighbour)
            lower_id, higher_id = min(node, neighbour), max(node, neighbour)
            transitive_positive_neighbours[(lower_id, higher_id)] = depth

        depth += 1
        queue = list(set(next_neighbours))

for node in graph_similar:
    find_positive_transitive_neighbours(node)

In [12]:
augmented_positive_pairs = [[], [], []]

for pair, depth in transitive_positive_neighbours.items():
    if depth == 1: continue
    depth = min(depth, 4)
    augmented_positive_pairs[depth - 2].append(pair)

In [13]:
[len(pairs) for pairs in augmented_positive_pairs]

[60941, 19303, 6657]

# negativne augmentacije

In [14]:
augmented_negative_pairs = set()

for qid1, qid2 in original_non_similar_pairs:
    k1 = labels[qid1]
    k2 = labels[qid2]
    if k1 == k2: continue

    k1_components = components[k1]
    k2_components = components[k2]
    for qid1_new, qid2_new in product(k1_components, k2_components):
        lower_id = min(qid1_new, qid2_new)
        higher_id = max(qid1_new, qid2_new)
        augmented_negative_pairs.add((lower_id, higher_id))

augmented_negative_pairs = augmented_negative_pairs.difference(original_non_similar_pairs)
augmented_negative_pairs = list(augmented_negative_pairs)

In [15]:
len(augmented_negative_pairs)

121263

# makni augmentacije koje su u testi valid setu

In [16]:
augmented_positive_pairs[0] = list(set(augmented_positive_pairs[0]).difference(test_pairs))
augmented_positive_pairs[1] = list(set(augmented_positive_pairs[1]).difference(test_pairs))
augmented_positive_pairs[2] = list(set(augmented_positive_pairs[2]).difference(test_pairs))
augmented_negative_pairs = list(set(augmented_negative_pairs).difference(test_pairs))

print([len(pairs) for pairs in augmented_positive_pairs])
print(len(augmented_negative_pairs))

[54643, 19149, 6639]
121211


In [17]:
augmented_positive_pairs[0] = list(set(augmented_positive_pairs[0]).difference(valid_pairs))
augmented_positive_pairs[1] = list(set(augmented_positive_pairs[1]).difference(valid_pairs))
augmented_positive_pairs[2] = list(set(augmented_positive_pairs[2]).difference(valid_pairs))
augmented_negative_pairs = list(set(augmented_negative_pairs).difference(valid_pairs))

print([len(pairs) for pairs in augmented_positive_pairs])
print(len(augmented_negative_pairs))

[48418, 18990, 6621]
121166


# primjeri augmentacija

pozitivni parovi

In [18]:
for i in range(50):
    qid1, qid2 = augmented_positive_pairs[0][i * 10]
    first_question = data_questions.iloc[qid1 - 1].question
    second_question = data_questions.iloc[qid2 - 1].question
    print(first_question)
    print(second_question)
    print()

How do we know if the surgical strikes by India in POK are real and not a misinformation?
Was surgical strike really happened in POK on morning of 29th September?

What are some great beginner ways to learn JavaScript?
What are the online resources for learning javascript?

What are some important things about finance everyone should know?
What should one know about spending money?

Why can't India ban the import of Chinese products in India?
Will China be hurt if India bans imports from it?

What are ways I can increase my height (I'm a ftm Asian)?
What are the ways to gain height?

What is the difference between front end and back end website development?
What is the difference between a front-end developer and back-end developer?

Will Trump's win affect the matriculation of students who wish to be graduate from USA?
How will Donald J Trump's presidency affect the opportunities offered to non US students?

How do I recover password for Gmail password without security questions?
I ma

In [19]:
for i in range(50):
    qid1, qid2 = augmented_positive_pairs[1][i * 10]
    first_question = data_questions.iloc[qid1 - 1].question
    second_question = data_questions.iloc[qid2 - 1].question
    print(first_question)
    print(second_question)
    print()

How do I get over the trauma of a breakup?
How do I move on after a very bad breakup?

What do you think of the decision by the Indian government to demonetize 500 and 1000 rupee notes?
Do you think scraping Rs. 500 and Rs. 100 notes marks the beginning of curbing of the black money laundering in India?

How do I study well without getting distracted?
What are some ways to keep yourself from getting distracted while working or studying?

Will Donald Trump or Hillary Clinton win the 2016 US presidential election?
Who will be the next POTUS - Hillary or Donald?

If a question doesn't need improvement, why does Quora mess with your questions?
Why is my question marked as needing improvement when it is perfectly clear and well written?

What’s the easiest most painless way to die?
How can I commit suicide without any pain?

Instagram (product): How can I know who visits my Instagram profile?
How can I see who my boyfriend views on instagram?

Can we be immortal?
Do you think mankind will e

In [20]:
positive_examples_list_3_step = list(augmented_positive_pairs[2])
for i in range(50):
    qid1, qid2 = augmented_positive_pairs[2][i * 10]
    first_question = data_questions.iloc[qid1 - 1].question
    second_question = data_questions.iloc[qid2 - 1].question
    print(first_question)
    print(second_question)
    print()

Which movies should you watch right now?
Which is the one movie you will recommend me to watch and why?

What is your view on the recent demonetisation of higher value currencies in India?
Is the decision to abandon Rs. 500 and Rs. 1000 denominations notes by PM Modi justified? Will it help in any way to curb the Black Money?

What are some extremely early signs of pregnancy?
What are the definite pregnancy symptoms?

Why do Indians get less medals in Olympics?
What are the reasons for India's poor performance at Olympics considering that India's population is only second to China?

How do prepare for IAS for a fresher?
What is a study plan for taking UPSC exams for IAS/IFS/IRS/IPS this year?

Can you make your penis larger at the age of 27?
How can I enlarge my penis?

How does it feel when a penis enters a vagina, from either partner's point of view?
What does sex feel like for women and men?

What is your view on the recent demonetisation of higher value currencies in India?
What ar

negativni parovi

In [21]:
for i in range(50):
    qid1, qid2 = augmented_negative_pairs[i * 10]
    first_question = data_questions.iloc[qid1 - 1].question
    second_question = data_questions.iloc[qid2 - 1].question
    print(first_question)
    print(second_question)
    print()

How can I improve my English vocabulary and writing skills?
What books do you suggest would help improve English writing skills?

What makes a girl love a boy?
How do I get a girlfriend?

Why Narendra Modi banned 500 and 1000 notes in India?
India's Prime Minister removed 500 and 1000 rupee notes from circulation. Is this a good way to curb the spread of black money?

How can someone lose weight quickly?
How do I lose 15 kilograms in a period of 3 months?

How can I be a better person and improve my questions on Quora?
My every question is marked as "needing improvements". How do I resolve this and get proper answers?

What is a good way to get over depression?
What should I do if I am getting depressed?

How can I lose my weight from 55 kg to 50 kg within two month?
What are some healthy weight gaining diet plans?

What are the repercussions of 500 and 1000 rupee notes not being legal tender anymore?
What will be the effect of the ban of 500 and 1000 rupee notes be on the stock market

# create dataset for each experiment

In [17]:
def get_pairs(step, fraction_negative):
    pairs = []
    for _, (qid1, qid2, _, _, is_duplicate, _) in data_pairs.iterrows():
        pairs.append([get_pair_index(qid1, qid2), is_duplicate])

    positive_count = 0
    for i in range(step):
        for qid1, qid2 in augmented_positive_pairs[i]:
            pairs.append([get_pair_index(qid1, qid2), 1])
        positive_count += len(augmented_positive_pairs[i])

    if fraction_negative == 0: return pairs

    total_number_of_augmented = len(augmented_negative_pairs)
    to_add_negative = int(fraction_negative * positive_count)
    to_add_negative = min(to_add_negative, total_number_of_augmented)
    random_permutation = np.random.permutation(augmented_negative_pairs)
    for qid1, qid2 in random_permutation[:to_add_negative]:
        pairs.append([get_pair_index(qid1, qid2), 0])

    return pairs

def get_dataset(step, fraction_negative):
    pairs = get_pairs(step, fraction_negative)
    return pd.DataFrame(pairs, columns=["pair_id", "is_duplicate"])

In [18]:
step, fraction = 1, 0.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [19]:
step, fraction = 1, 1.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [20]:
step, fraction = 1, 2.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [21]:
step, fraction = 2, 0.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [22]:
step, fraction = 2, 1.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [23]:
step, fraction = 2, 2.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [24]:
step, fraction = 3, 0.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [25]:
step, fraction = 3, 1.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)

In [26]:
step, fraction = 3, 2.0
experiment_dataset = get_dataset(step, fraction)
experiment_dataset.to_csv(f"./data/experiments/instance_{step}_{fraction:.2f}.csv", index=False)