In [1]:
#Features taken from Kaggle discussions:


import pandas as pd
import numpy as np
from collections import defaultdict
import numpy as np
import math

# Directory containing train.csv / test.csv.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
data_folder = '/home/sidsvash26/kaggle_quora/data/'

train_df = pd.read_csv(data_folder + 'train.csv', header=0)
test_df = pd.read_csv(data_folder + 'test.csv', header=0)

# Stack the question pairs of train and test into a single frame.
# ignore_index=True renumbers the rows 0..n-1; this replaces the previous
# reset_index(drop='index'), which passed a string where a bool is expected
# and only worked because a non-empty string is truthy.
ques = pd.concat(
    [train_df[['question1', 'question2']], test_df[['question1', 'question2']]],
    axis=0,
    ignore_index=True,
)


In [2]:
# Undirected question graph: every distinct question value is a node and each
# (question1, question2) row adds an edge, stored as a set of neighbours.
q_dict = defaultdict(set)
# Walk the two columns in lockstep instead of doing a label lookup on the
# Series for every row (ques.question1[i]), which is much slower per element.
for first_question, second_question in zip(ques['question1'], ques['question2']):
    q_dict[first_question].add(second_question)
    q_dict[second_question].add(first_question)
    

In [3]:
#Leaky Features:
def q1_freq(row):
    """Return the number of neighbours recorded in ``q_dict`` for ``row['question1']``."""
    neighbours = q_dict[row['question1']]
    return len(neighbours)

def q2_freq(row):
    """Return the number of neighbours recorded in ``q_dict`` for ``row['question2']``."""
    neighbours = q_dict[row['question2']]
    return len(neighbours)

def q1q2_intersect(row):
    """Count the neighbours that the row's two questions have in common in ``q_dict``."""
    first_neighbours = set(q_dict[row['question1']])
    second_neighbours = set(q_dict[row['question2']])
    shared = first_neighbours & second_neighbours
    return len(shared)

def q1q2_jaccard(row):
    """Return the Jaccard similarity of the two questions' neighbour sets.

    The value is ``|N(q1) & N(q2)| / |N(q1) | N(q2)|`` where ``N(q)`` is the
    neighbour collection stored in ``q_dict`` for question ``q``.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['question1']`` and ``row['question2']``.

    Returns
    -------
    float
        Similarity in ``[0, 1]``; ``0.0`` when both neighbour sets are empty
        (previously this case raised ``ZeroDivisionError``).
    """
    # Build each neighbour set once instead of once per operand.
    first_neighbours = set(q_dict[row['question1']])
    second_neighbours = set(q_dict[row['question2']])

    union_size = len(first_neighbours | second_neighbours)
    if union_size == 0:
        # Neither question has any neighbour: define the similarity as 0.
        return 0.0
    return len(first_neighbours & second_neighbours) / union_size
    
def pagerank(max_iter=40, d=0.85, graph=None):
    """Compute PageRank-style scores for every node of an adjacency mapping.

    Scores are updated in place during each sweep, so a node processed later
    in a sweep already sees the current sweep's values of nodes processed
    earlier. The scores are not re-normalised after the sweeps.

    Parameters
    ----------
    max_iter : int, default 40
        Number of full sweeps over all nodes.
    d : float, default 0.85
        Damping factor; ``(1 - d) / num_nodes`` is the constant term added to
        every node's score.
    graph : mapping of node -> collection of neighbour nodes, optional
        Adjacency structure. Defaults to the module-level ``q_dict``. Every
        neighbour is expected to be a key of the mapping itself.

    Returns
    -------
    dict
        Maps each node to its score; empty when the graph has no nodes.
    """
    if graph is None:
        graph = q_dict

    num_nodes = len(graph)
    if num_nodes == 0:
        return {}

    # Initializing -- every node gets a uniform value!
    pagerank_dict = {node: 1 / num_nodes for node in graph}

    for _ in range(max_iter):
        for node in graph:
            local_pr = 0
            # Each neighbour contributes its score split evenly over its own edges.
            for neighbor in graph[node]:
                local_pr += pagerank_dict[neighbor] / len(graph[neighbor])

            pagerank_dict[node] = (1 - d) / num_nodes + d * local_pr

    return pagerank_dict

def get_pagerank_q1(row):
    """Look up the score stored in ``pagerank_dict`` for ``row['question1']``."""
    question = row["question1"]
    return pagerank_dict[question]

def get_pagerank_q2(row):
    """Look up the score stored in ``pagerank_dict`` for ``row['question2']``."""
    question = row["question2"]
    return pagerank_dict[question]

# Compute the scores once and keep them in the module-level dict that
# get_pagerank_q1 / get_pagerank_q2 read from.
print("Creating Page rank dictionary")
pagerank_dict = pagerank()
print("Done!!")

    

Creating Page rank dictionary
Done!!


In [4]:
#Building Train Features:
train_df['q1q2_intersect'] = train_df.apply(q1q2_intersect, axis=1, raw=True)
train_df['q1q2_jaccard'] = train_df.apply(q1q2_jaccard, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

print('Creating page rank variables...')
train_df['q1_pagerank'] = train_df.apply(get_pagerank_q1, axis=1, raw=True)
train_df['q2_pagerank'] = train_df.apply(get_pagerank_q2, axis=1, raw=True)

train_df['q1_pagerank_log'] = train_df['q1_pagerank'].map(lambda x: math.log(x))
train_df['q2_pagerank_log'] = train_df['q2_pagerank'].map(lambda x: math.log(x))



Creating page rank variables...


In [6]:
#Building Test Features:
print('Building test features...')
test_df['q1q2_intersect'] = test_df.apply(q1q2_intersect, axis=1, raw=True)
test_df['q1q2_jaccard'] = test_df.apply(q1q2_jaccard, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1, raw=True)

print('Creating page rank variables...')
test_df['q1_pagerank'] = test_df.apply(get_pagerank_q1, axis=1, raw=True)
test_df['q2_pagerank'] = test_df.apply(get_pagerank_q2, axis=1, raw=True)

test_df['q1_pagerank_log'] = test_df['q1_pagerank'].map(lambda x: math.log(x))
test_df['q2_pagerank_log'] = test_df['q2_pagerank'].map(lambda x: math.log(x))
print('Done')

Building test features...
Creating page rank variables...
Done


In [7]:
# Preview the first ten test rows together with the newly added feature columns.
test_df.head(n=10)

Unnamed: 0,test_id,question1,question2,q1q2_intersect,q1q2_jaccard,q1_freq,q2_freq,q1_pagerank,q2_pagerank,q1_pagerank_log,q2_pagerank_log
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?,0,0.0,2,2,2.366495e-07,2.193292e-07,-15.256686,-15.332692
2,2,What but is the best way to send money from Ch...,What you send money to China?,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
3,3,Which food not emulsifiers?,What foods fibre?,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
5,5,How are the two wheeler insurance from Bharti ...,I admire I am considering of buying insurance ...,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
6,6,How can I reduce my belly fat through a diet?,How can I reduce my lower belly fat in one month?,26,0.8125,28,30,4.173158e-07,4.541163e-07,-14.689423,-14.604913
7,7,"By scrapping the 500 and 1000 rupee notes, how...",How will the recent move to declare 500 and 10...,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
8,8,What are the how best books of all time?,What are some of the military history books of...,0,0.0,1,1,2.088105e-07,2.088105e-07,-15.381839,-15.381839
9,9,After 12th years old boy and I had sex with a ...,Can a 14 old guy date a 12 year old girl?,0,0.0,1,2,1.608405e-07,3.047504e-07,-15.642853,-15.003773


In [8]:
#Saving features:
import pickle

train_X1 = np.array(train_df[['q1q2_intersect', 'q1q2_jaccard', 'q1_freq', 'q2_freq', 
                                'q1_pagerank', 'q2_pagerank','q1_pagerank_log', 'q2_pagerank_log']])

test_X1 = np.array(test_df[['q1q2_intersect', 'q1q2_jaccard', 'q1_freq', 'q2_freq', 
                                'q1_pagerank', 'q2_pagerank','q1_pagerank_log', 'q2_pagerank_log']])

pickle.dump(train_X1, open(data_folder + 'feats9_all_magic.sav', 'wb'))
pickle.dump(test_X1, open(data_folder + 'feats9_all_magic_for_test.sav', 'wb'))