In [10]:
import numpy as np
import pickle
import pandas as pd

# Adjacency map filled in by generate_edge_scores:
#   edge_weights[qid_a][qid_b] -> score of the question pair (stored both ways).
edge_weights = {}

# Per-word weight table, queried below with `.get(word, 0)`.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('word_weights.pickle', 'rb') as handle:
    weights = pickle.load(handle)


def get_weighted_edge_score(row, word_weights=None):
    """Weighted word-overlap score between the two questions of a pair.

    Both questions are lower-cased and split on whitespace.  The score is

        weight(shared words) / (weight(q1) + weight(q2) - weight(shared words))

    where each distinct shared word is counted once in the numerator, while
    the per-question totals count every token occurrence.  Words missing
    from the weight table contribute 0.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string values under "question1" and "question2".
    word_weights : dict, optional
        Word -> weight table.  Defaults to the module-level ``weights``.

    Returns
    -------
    number
        The ratio described above, or 0 when either question has no tokens
        or when the total weight is zero (no token of either question has a
        non-zero weight), which previously caused a 0/0 division.
    """
    if word_weights is None:
        word_weights = weights

    q1_words = row["question1"].lower().split()
    q2_words = row["question2"].lower().split()

    if not q1_words or not q2_words:
        return 0

    def total_weight(words):
        return np.sum([word_weights.get(w, 0) for w in words])

    common_words_score = total_weight(set(q1_words).intersection(q2_words))
    all_words_score = (total_weight(q1_words) + total_weight(q2_words)
                       - common_words_score)

    # Nothing weighted on either side: define the score as 0 instead of
    # dividing by zero.
    if all_words_score == 0:
        return 0

    # float() forces true division even if every weight is an integer.
    return float(common_words_score) / all_words_score


def generate_edge_scores(row):
    """Record the score of one question pair in the global ``edge_weights``.

    Reads ``row["qid1"]`` and ``row["qid2"]``, makes sure both ids have an
    adjacency dict, then stores the value of ``get_weighted_edge_score(row)``
    symmetrically under both directions.  Returns None.
    """
    first_id, second_id = row["qid1"], row["qid2"]

    # Create the adjacency dicts up front, before the score is computed.
    first_neighbours = edge_weights.setdefault(first_id, {})
    second_neighbours = edge_weights.setdefault(second_id, {})

    score = get_weighted_edge_score(row)

    # Undirected edge: the same score is stored in both directions.
    first_neighbours[second_id] = score
    second_neighbours[first_id] = score
    

# Load both question-pair tables; missing values become "" so the question
# columns are always strings when they are lower-cased and split.
df_train = pd.read_csv('./train.csv').fillna("")
df_test = pd.read_csv('./df_test_with_qid.csv').fillna("")

# generate_edge_scores is called only for its side effect on edge_weights and
# returns None, so the Series that apply builds is discarded here rather than
# being left as the cell's displayed result.
for question_pairs in (df_train, df_test):
    question_pairs.apply(generate_edge_scores, axis=1)

In [27]:
# Rank (word, weight) pairs from heaviest to lightest and preview the top five.
best_words = list(weights.items())
best_words.sort(key=lambda word_and_weight: word_and_weight[1], reverse=True)
best_words[:5]

[('murwara', 9.999000099990002e-05),
 ('considered,', 9.999000099990002e-05),
 ('considered.', 9.999000099990002e-05),
 ('pieces/rest', 9.999000099990002e-05),
 ('throwaround', 9.999000099990002e-05)]

[(1, {2: 0}),
 (2, {1: 0, 3525799: 0}),
 (3,
  {4: 0,
   282170: 0,
   380197: 0,
   488853: 0,
   1368761: 0,
   1547732: 0,
   2067378: 0,
   4044637: 0}),
 (4, {3: 0, 32881: 0, 2600306: 0}),
 (5, {6: 0, 3714518: 0})]

In [17]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [29]:
# Persist the adjacency map so later steps can reload it without recomputing.
with open('edge_weights.pickle', 'wb') as handle:
    pickle.dump(edge_weights, handle)

In [31]:
# Preview five adjacency entries.  dict.items() is a non-subscriptable view on
# Python 3, so materialise it as a list first (also valid on Python 2).
list(edge_weights.items())[:5]

[(1, {2: 0.50824638344898565}),
 (2, {1: 0.50824638344898565, 3525799: 0.16420049355875502}),
 (3,
  {4: 0.28510058567647312,
   282170: 0.00090753822116321746,
   380197: 0.43365615813260561,
   488853: 0.00069954674008697716,
   1368761: 0.23244737234635146,
   1547732: 0.1740334403267679,
   2067378: 0.63117983576257164,
   4044637: 0.15246564726666398}),
 (4,
  {3: 0.28510058567647312,
   32881: 0.41756849386861888,
   2600306: 0.25595373966109775}),
 (5, {6: 0.13975017652410615, 3714518: 0.4498198877247821})]