In [2]:
import pickle
import random
import collections
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [18]:
def loadfile(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened in binary mode and decoded with ``encoding='latin1'``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only point this at pickle files you trust.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='latin1')

In [3]:
# 2016 Q2 weights (per the file name); later cells use its keys as the set of
# edges present in the second quarter.
with open('./data/2016_q2_newman-weights.pickle', mode='rb') as pickle_in:
    q2_newman_w = pickle.load(pickle_in, encoding='latin1')

In [4]:
# Contents of q1_4000_users.pickle; not referenced by the other visible cells.
with open('./data/q1_4000_users.pickle', mode='rb') as pickle_in:
    users = pickle.load(pickle_in, encoding='latin1')

In [5]:
# Per-pair scores from newman_cn.pickle (presumably common neighbours -- TODO confirm).
with open('./data/newman_cn.pickle', mode='rb') as pickle_in:
    q1_cn = pickle.load(pickle_in, encoding='latin1')

In [6]:
# Per-pair scores from newman_adam.pickle (presumably Adamic-Adar -- TODO confirm).
with open('./data/newman_adam.pickle', mode='rb') as pickle_in:
    q1_aa = pickle.load(pickle_in, encoding='latin1')

In [7]:
# Per-pair scores from newman_jaccard.pickle (Jaccard coefficient, per the file name).
with open('./data/newman_jaccard.pickle', mode='rb') as pickle_in:
    q1_jc = pickle.load(pickle_in, encoding='latin1')

In [8]:
# Per-pair scores from newman_pa.pickle (presumably preferential attachment -- TODO confirm).
# Its keys are also passed as the `core_pairs` argument in later cells.
with open('./data/newman_pa.pickle', mode='rb') as pickle_in:
    q1_pa = pickle.load(pickle_in, encoding='latin1')

In [9]:
# 2016 Q1 weights (per the file name); later cells use its keys as the set of
# edges that already exist in the first quarter.
with open('./data/2016_q1_newman-weights.pickle', mode='rb') as pickle_in:
    q1_newman_w = pickle.load(pickle_in, encoding='latin1')

In [10]:
def check_accuracy(q1_w, q2_w, score, core_pairs):
    """Precision of the top-n scored candidate edges against the edges that really appeared.

    Parameters
    ----------
    q1_w : mapping
        Keyed by the edges of the earlier snapshot; only its keys are used.
    q2_w : mapping
        Keyed by the edges of the later snapshot; only its keys are used.
    score : mapping
        Candidate edge -> numeric score; higher scores are ranked first.
    core_pairs : iterable
        Only new edges that also appear here count as ground truth.

    Returns
    -------
    float
        Fraction of the predicted edges that are true new edges, where the
        prediction is the ``n`` best-scored candidates absent from ``q1_w``
        and ``n`` is the number of true new edges. Returns ``0.0`` when no
        edge is predicted (previously a ``ZeroDivisionError``).
    """
    known_edges = q1_w.keys()

    # Ground truth: edges in q2 that are not in q1, restricted to the core pairs.
    new_edges = set(q2_w.keys()).difference(known_edges).intersection(core_pairs)
    n = len(new_edges)

    # Candidates are every scored pair that is not already an edge in q1.
    candidates = [(edge, value) for edge, value in score.items()
                  if edge not in known_edges]

    # Stable descending sort, so ties keep the iteration order of `score`.
    top_n = sorted(candidates, key=lambda item: item[1], reverse=True)[:n]
    predicted_edges = {edge for edge, _ in top_n}

    # Nothing predicted (n == 0 or no candidates): report zero precision
    # instead of dividing by zero.
    if not predicted_edges:
        return 0.0

    return len(predicted_edges & new_edges) / len(predicted_edges)

In [11]:
# Top-n precision when candidate pairs are ranked by the Jaccard scores.
check_accuracy(q1_newman_w, q2_newman_w, score=q1_jc, core_pairs=q1_pa.keys())

0.15958474167069048

In [166]:
# Top-n precision when candidate pairs are ranked by the q1_pa scores
# (presumably preferential attachment -- TODO confirm).
check_accuracy(q1_newman_w, q2_newman_w, score=q1_pa, core_pairs=q1_pa.keys())

0.019099737110359997

In [167]:
# Top-n precision when candidate pairs are ranked by the q1_cn scores
# (presumably common neighbours -- TODO confirm).
check_accuracy(q1_newman_w, q2_newman_w, score=q1_cn, core_pairs=q1_pa.keys())

0.15694243253393422

In [168]:
# Top-n precision when candidate pairs are ranked by the q1_aa scores
# (presumably Adamic-Adar -- TODO confirm).
check_accuracy(q1_newman_w, q2_newman_w, score=q1_aa, core_pairs=q1_pa.keys())

0.16659960298299265

## Construct the feature matrix for supervised learning

In [13]:
# Candidate pairs for the classifier: every pair scored in q1_pa that is not
# already a key (edge) of the Q1 weights. Whether a pair really becomes an
# edge is decided later from the Q2 weights.
nce = list(
    set(q1_pa.keys()).difference(q1_newman_w.keys())
)

In [14]:
def combine_scores(edges, score_list):
    """Stack several per-edge score lookups into one 2-D float array.

    Parameters
    ----------
    edges : sequence
        Keys to look up in every mapping of ``score_list``.
    score_list : sequence of mappings
        Each maps an edge to a numeric score.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(edges), len(score_list))``; entry ``[i, j]`` holds
        ``score_list[j][edges[i]]``.
    """
    scores = np.zeros((len(edges), len(score_list)))
    for row, edge in enumerate(edges):
        for col, lookup in enumerate(score_list):
            scores[row, col] = lookup[edge]
    return scores

In [16]:
# Feature matrix: one row per candidate pair, columns in the order aa, cn, jc, pa.
X = combine_scores(edges=nce, score_list=[q1_aa, q1_cn, q1_jc, q1_pa])

# Label: True when the candidate pair is a key of the Q2 weights.
y = [pair in q2_newman_w.keys() for pair in nce]

## Combine with user behavior and demographic features

In [53]:
# Per-user table; the next cell reads its country_code, months_since_1970
# and commit_times attributes.
user_details = loadfile(path='./data/user_details.pickle')

In [48]:
# Plain-dict lookups keyed by the index of `user_details`
# (presumably the user id -- TODO confirm).
user_country = user_details['country_code'].to_dict()
user_created = user_details['months_since_1970'].to_dict()
user_commit = user_details['commit_times'].to_dict()

In [54]:
def get_user_details(edges, detail_list, progress_every=10000):
    """Build three pairwise user features for every node pair in ``edges``.

    Parameters
    ----------
    edges : sequence
        Node pairs; items ``[0]`` and ``[1]`` of each pair are used as lookup keys.
    detail_list : sequence of mappings
        The first three entries are read:
        ``detail_list[0]`` is compared for equality between the two nodes,
        ``detail_list[1]`` must be numeric (absolute difference is taken),
        ``detail_list[2]`` must be orderable (its minimum is compared to 0).
    progress_every : int or None, optional
        Print a progress line every ``progress_every`` pairs. The default of
        10000 matches the previous hard-coded behaviour; pass ``None`` or ``0``
        to silence the output entirely.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(edges), len(detail_list))``. Column 0 is 1.0 when both
        nodes share the same ``detail_list[0]`` value, column 1 is the absolute
        difference of their ``detail_list[1]`` values, column 2 is 1.0 when the
        smaller of their ``detail_list[2]`` values equals 0. Any further
        columns stay zero.
    """
    details = np.zeros((len(edges), len(detail_list)))
    for i, pair in enumerate(edges):
        # Optional progress trace; disabled when progress_every is None or 0.
        if progress_every and not i % progress_every:
            print (f'Processing {i/1000}k th node pair')
        uid, vid = pair[0], pair[1]
        details[i, 0] = int(detail_list[0][uid] == detail_list[0][vid])
        details[i, 1] = abs(detail_list[1][uid] - detail_list[1][vid])
        details[i, 2] = min(detail_list[2][uid], detail_list[2][vid]) == 0
    return details

In [55]:
# Pairwise user features for every candidate pair, in the column order
# country match, account-age gap, zero-commit flag.
details = get_user_details(
    edges=nce,
    detail_list=[user_country, user_created, user_commit],
)

7739.733k node pairs to process
Processing 0.0k th node pair
Processing 10.0k th node pair
Processing 20.0k th node pair
Processing 30.0k th node pair
Processing 40.0k th node pair
Processing 50.0k th node pair
Processing 60.0k th node pair
Processing 70.0k th node pair
Processing 80.0k th node pair
Processing 90.0k th node pair
Processing 100.0k th node pair
Processing 110.0k th node pair
Processing 120.0k th node pair
Processing 130.0k th node pair
Processing 140.0k th node pair
Processing 150.0k th node pair
Processing 160.0k th node pair
Processing 170.0k th node pair
Processing 180.0k th node pair
Processing 190.0k th node pair
Processing 200.0k th node pair
Processing 210.0k th node pair
Processing 220.0k th node pair
Processing 230.0k th node pair
Processing 240.0k th node pair
Processing 250.0k th node pair
Processing 260.0k th node pair
Processing 270.0k th node pair
Processing 280.0k th node pair
Processing 290.0k th node pair
Processing 300.0k th node pair
Processing 310.0k 

Processing 2590.0k th node pair
Processing 2600.0k th node pair
Processing 2610.0k th node pair
Processing 2620.0k th node pair
Processing 2630.0k th node pair
Processing 2640.0k th node pair
Processing 2650.0k th node pair
Processing 2660.0k th node pair
Processing 2670.0k th node pair
Processing 2680.0k th node pair
Processing 2690.0k th node pair
Processing 2700.0k th node pair
Processing 2710.0k th node pair
Processing 2720.0k th node pair
Processing 2730.0k th node pair
Processing 2740.0k th node pair
Processing 2750.0k th node pair
Processing 2760.0k th node pair
Processing 2770.0k th node pair
Processing 2780.0k th node pair
Processing 2790.0k th node pair
Processing 2800.0k th node pair
Processing 2810.0k th node pair
Processing 2820.0k th node pair
Processing 2830.0k th node pair
Processing 2840.0k th node pair
Processing 2850.0k th node pair
Processing 2860.0k th node pair
Processing 2870.0k th node pair
Processing 2880.0k th node pair
Processing 2890.0k th node pair
Processi

Processing 5190.0k th node pair
Processing 5200.0k th node pair
Processing 5210.0k th node pair
Processing 5220.0k th node pair
Processing 5230.0k th node pair
Processing 5240.0k th node pair
Processing 5250.0k th node pair
Processing 5260.0k th node pair
Processing 5270.0k th node pair
Processing 5280.0k th node pair
Processing 5290.0k th node pair
Processing 5300.0k th node pair
Processing 5310.0k th node pair
Processing 5320.0k th node pair
Processing 5330.0k th node pair
Processing 5340.0k th node pair
Processing 5350.0k th node pair
Processing 5360.0k th node pair
Processing 5370.0k th node pair
Processing 5380.0k th node pair
Processing 5390.0k th node pair
Processing 5400.0k th node pair
Processing 5410.0k th node pair
Processing 5420.0k th node pair
Processing 5430.0k th node pair
Processing 5440.0k th node pair
Processing 5450.0k th node pair
Processing 5460.0k th node pair
Processing 5470.0k th node pair
Processing 5480.0k th node pair
Processing 5490.0k th node pair
Processi

In [57]:
# Append the pairwise user features to the topological score columns.
# X is rebuilt from its first N_SCORE_COLS columns (the aa, cn, jc, pa scores
# produced by combine_scores) so that re-running this cell does not stack
# `details` onto X a second time.
N_SCORE_COLS = 4
assert X.shape[1] in (N_SCORE_COLS, N_SCORE_COLS + details.shape[1]), (
    "unexpected number of columns in X; rebuild X from combine_scores first"
)
X = np.concatenate((X[:, :N_SCORE_COLS], details), axis=1)

In [59]:
# Hold out 30% of the candidate pairs, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
    stratify=y,
)

In [60]:
# Randomly undersample the training split. The sampler is seeded (same seed as
# the train/test split) so the resampled training set is reproducible across
# kernel restarts.
rus = RandomUnderSampler(random_state=1234)

# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and later
# removed; prefer the new name but fall back so older installs keep working.
_resample = getattr(rus, 'fit_resample', None) or rus.fit_sample
X_under, y_under = _resample(X_train, y_train)

## Build baseline models (logistic regression, decision tree, random forest, XGBoost)

In [61]:
# Standardise the undersampled training features; `ssX` keeps the fitted
# statistics so the same scaling can be applied to other data later.
ssX = StandardScaler().fit(X_under)
X_scaled = ssX.transform(X_under)

In [None]:
# Candidate classifiers. Each entry is paired positionally with the
# hyper-parameter grid at the same index in `param_choices`.
models = [
    ('logistic', LogisticRegression),
    ('tree', DecisionTreeClassifier),
    ('forest', RandomForestClassifier),
    ('xgboost', XGBClassifier),
]

param_choices = [
    {   # logistic regression
        'C': np.logspace(-3, 6, 12),
        'penalty': ['l1', 'l2'],
        # Pin a solver that supports both penalties: the default solver of
        # scikit-learn >= 0.22 ('lbfgs') rejects penalty='l1'.
        'solver': ['liblinear'],
    },
    {   # decision tree
        'max_depth': [1, 2, 3, 4, 5],
        'min_samples_leaf': [1, 3, 5],
        # Fixed seed so repeated runs pick the same splits.
        'random_state': [1234],
    },
    {   # random forest
        'criterion': ['gini', 'entropy'],
        'n_estimators': [40, 50, 60],
        'min_samples_leaf': [1, 3],
        'min_samples_split': [2, 5],
        # Fixed seed so the bootstrap samples are reproducible.
        'random_state': [1234],
    },
    {   # xgboost
        'max_depth': [3, 4, 5],
        'n_estimators': [1, 50, 100, 200],
        'objective': ['binary:logistic'],
    },
]

# zip() would silently drop a model or a grid if the two lists fell out of sync.
assert len(models) == len(param_choices), "models and param_choices must align"

# Exhaustive 5-fold grid search per model on the scaled, undersampled training
# data; fitted searches are kept in `grids`, keyed by model name.
grids = {}
for (name, model), params in zip(models, param_choices):
    grid = GridSearchCV(model(), params, scoring='accuracy', cv=5, n_jobs=-1)
    grid.fit(X_scaled, y_under)
    print("{}: best score: {}".format(name, grid.best_score_))
    grids[name] = grid