In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import sys

In [2]:
# Load the labelled question pairs and discard the row-id column in one step.
data_pairs = pd.read_csv("./data/train.csv").drop(columns=["id"])
data_pairs.head(n=5)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Table of individual questions (columns shown in the preview: qid, question).
data_questions = pd.read_csv("./data/data_questions.csv")
data_questions.head(n=5)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
1,2,What is the step by step guide to invest in sh...
2,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
3,4,What would happen if the Indian government sto...
4,5,How can I increase the speed of my internet co...


In [4]:
# Flatten both id columns into a single 1-D array, then keep each question id once.
all_pair_ids = data_pairs[["qid1", "qid2"]].to_numpy().ravel()
qids = pd.unique(all_pair_ids)

print(f"broj parova: {len(data_pairs)}")
print(f"broj pitanja: {len(qids)}")

broj parova: 404290
broj pitanja: 537933


# Analiza tranzitivnih ovisnosti

Na dataset se može gledati kao na graf:
pitanje je vrh, a similarity == 1 je brid.
Augmentacija je onda dodavanje parova svakog drugog susjeda, trećeg, ...
Maksimalna augmentacija za similar primjere bila bi da nađemo sve komponente grafa i dodamo sve parove u svakoj komponenti.

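Još možemo razmotriti drugi graf kojem je brid similarity == 0 i dodati tranzitivne parove koji nisu similar (oprez: različitost sama po sebi nije tranzitivna; siguran je zaključak samo: ako je A similar B, a B nije similar C, onda A nije similar C).
Možda bi imalo smisla dodavati jedan ne-similar primjer za svaki similar, ili možda dva, da održimo dataset balansiranim.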

In [5]:
# Undirected "is duplicate" graph stored as an adjacency list:
# question_id -> list of question_ids labelled as duplicates of it.
graph_similar = defaultdict(list)

# Every labelled duplicate pair, stored as (lower_id, higher_id) so that a
# membership test does not depend on the order the two ids appear in a row.
original_similar_pairs = set()

for _, (qid1, qid2, question1, question2, is_duplicate) in data_pairs.iterrows():
    if is_duplicate:
        # Add the edge in both directions because similarity is symmetric.
        graph_similar[qid1].append(qid2)
        graph_similar[qid2].append(qid1)

        # Fixed: the ids were previously compared with themselves
        # (min(qid1, qid1) / max(qid2, qid2)), which left the pair in row
        # order instead of normalising it to (lower, higher).
        lower_id = min(qid1, qid2)
        higher_id = max(qid1, qid2)
        original_similar_pairs.add((lower_id, higher_id))

In [6]:
# Collect every pair of questions that are two "similar" hops apart
# (source -> middle -> target) and are not already a labelled duplicate pair.
# Pairs are stored as (lower_id, higher_id).
second_neighbors = set()
for source, adjacent in graph_similar.items():
    for middle in adjacent:
        for target in graph_similar[middle]:
            # Walking back to the starting question is not a new pair.
            if target != source:
                candidate = (min(source, target), max(source, target))
                if candidate not in original_similar_pairs:
                    second_neighbors.add(candidate)

In [7]:
# Turn the collection of pairs into an ascending list so it can be indexed by position.
second_neighbors = sorted(second_neighbors)
print(f"second neighbours size: {len(second_neighbors)}")
second_neighbors

second neighbours size: 95840


[(25, 114035),
 (26, 114035),
 (28, 50277),
 (31, 1100),
 (31, 2066),
 (31, 2067),
 (31, 6079),
 (31, 6938),
 (31, 7751),
 (31, 7752),
 (31, 8577),
 (31, 8578),
 (31, 11434),
 (31, 16474),
 (31, 17171),
 (31, 24203),
 (31, 24204),
 (31, 24960),
 (31, 32509),
 (31, 32957),
 (31, 36835),
 (31, 36836),
 (31, 37617),
 (31, 38502),
 (31, 44686),
 (31, 46127),
 (31, 51528),
 (31, 53535),
 (31, 64848),
 (31, 77287),
 (31, 83195),
 (31, 93146),
 (31, 106091),
 (31, 120229),
 (31, 132922),
 (31, 140488),
 (31, 167907),
 (31, 218253),
 (31, 221900),
 (31, 249733),
 (31, 258354),
 (31, 333813),
 (31, 528360),
 (32, 1100),
 (32, 2066),
 (32, 2067),
 (32, 6079),
 (32, 6080),
 (32, 7751),
 (32, 7752),
 (32, 8577),
 (32, 8578),
 (32, 11434),
 (32, 12544),
 (32, 16474),
 (32, 17171),
 (32, 24203),
 (32, 24204),
 (32, 24960),
 (32, 32509),
 (32, 32957),
 (32, 36835),
 (32, 36836),
 (32, 38502),
 (32, 44686),
 (32, 51528),
 (32, 53535),
 (32, 64848),
 (32, 77287),
 (32, 81385),
 (32, 83195),
 (32, 88834

In [8]:
# Print the text of the first 8 augmented pairs, with a blank line after each pair.
# NOTE(review): looks up question `qid` at row position qid - 1 — assumes
# data_questions is ordered by qid without gaps; confirm against the CSV.
for position in range(8):
    qid1, qid2 = second_neighbors[position]
    question_texts = (
        data_questions.iloc[qid1 - 1]["question"],
        data_questions.iloc[qid2 - 1]["question"],
    )
    print(*question_texts, sep="\n")
    print()

What can make Physics easy to learn?
Is there a way to make learning physics easier?

How can you make physics easy to learn?
Is there a way to make learning physics easier?

What was your first sexual experience?
What is your first sexual experience?

What would a Trump presidency mean for current international master’s students on an F1 visa?
How will Trump's presidency affect Indian students who are planning to do a PhD in the US?

What would a Trump presidency mean for current international master’s students on an F1 visa?
I am an Indian, planning to go to US for MS (a STEM course) this January. If Trump wins, how will that affect my future in US?

What would a Trump presidency mean for current international master’s students on an F1 visa?
How is Trump becoming the president affect the Indians applying for an MS in the US (Mech)?

What would a Trump presidency mean for current international master’s students on an F1 visa?
Now that Donald Trump is President, will international stu

In [9]:
# Print 8 augmented pairs taken at positions 0, 100, ..., 700 of the sorted list,
# with a blank line after each pair.
# NOTE(review): looks up question `qid` at row position qid - 1 — assumes
# data_questions is ordered by qid without gaps; confirm against the CSV.
for position in range(0, 800, 100):
    qid1, qid2 = second_neighbors[position]
    question_texts = (
        data_questions.iloc[qid1 - 1]["question"],
        data_questions.iloc[qid2 - 1]["question"],
    )
    print(*question_texts, sep="\n")
    print()

What can make Physics easy to learn?
Is there a way to make learning physics easier?

Why are so many Quora users posting questions that are readily answered on Google?
Why use Quora when Google answers almost everything?

Why do people ask Quora questions which can be answered easily by Google?
Why do people waste time waiting for answers on Quora rather than Google a question and get an instant answer?

I was suddenly logged off Gmail. I can't remember my Gmail password and just realized the recovery email is no longer alive. What can I do?
How do I gain access to my gmail when I don't have access to the phone number or recovery email?

How I can speak English fluently?
What are some ways to improve English?

How can I learn to speak English fluently?
What is the best way of improving spoken English?

What were the major effects of the cambodia earthquake, and how do these effects compare to the Kamchatca earthquakes in 1952?
What were the major effects of the cambodia earthquake, an

# Provjera je li procurio neki primjer koji je već bio u datasetu

In [10]:
# Rebuild the set of labelled duplicate pairs, normalised to (lower_id, higher_id),
# so it can be intersected with the augmented pairs, which are stored in that order.
original_similar_pairs = set()
for _, (qid1, qid2, question1, question2, is_duplicate) in data_pairs.iterrows():
    if is_duplicate:
        # Fixed: the ids were previously compared with themselves
        # (min(qid1, qid1) / max(qid2, qid2)), so pairs kept their row order
        # and a leaked pair listed as (higher, lower) could never be detected.
        lower_id = min(qid1, qid2)
        higher_id = max(qid1, qid2)
        original_similar_pairs.add((lower_id, higher_id))

# Augmented (second-neighbour) pairs as a set for the intersection check.
# NOTE: "paris" is a typo for "pairs"; the name is kept because later cells read it.
augmented_paris = set(second_neighbors)

In [11]:
# Three sizes, one per line: labelled duplicate rows, distinct labelled pairs,
# and augmented pairs.
print(
    data_pairs["is_duplicate"].sum(),
    len(original_similar_pairs),
    len(augmented_paris),
    sep="\n",
)

149263
149263
95840


In [12]:
# Pairs present in both the labelled set and the augmented set.
intersection = original_similar_pairs & augmented_paris
print(len(intersection))

0


# broj komponenti

In [13]:
# labels: (question_id - 1) -> component number; 0 means "not visited yet".
labels = defaultdict(int)
# component_count: component number -> number of questions labelled with it.
component_count = defaultdict(int)

def visit(node, level):
    """Label every question reachable from `node` with component number `level`.

    Iterative depth-first traversal over the module-level `graph_similar`.
    Each newly reached question `v` gets `labels[v - 1] = level` and increases
    `component_count[level]` by one; questions whose label is already non-zero
    are skipped.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if labels[current - 1] == 0:
            labels[current - 1] = level
            component_count[level] += 1
            # Queue all neighbours; already-labelled ones are skipped when popped.
            pending.extend(graph_similar[current])

# Walk over every question in the graph; each one that is still unlabelled
# starts a new component, which `visit` then labels in full.
level = 0
for node in graph_similar:
    if labels[node - 1] != 0:
        continue
    level += 1
    visit(node, level)

# `level` now equals the number of components that were started.
print(level)

60460


In [14]:
# Component sizes in ascending order.
component_counts = sorted(component_count.values())
print(component_counts)

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 