In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import random
import pickle



# Read data

In [2]:
def create_df(filename="train.txt"):
    data = {'Source':[], 'Sink':[]}
    with open(filename, "r") as f:
        for raw_line in f:
            line = raw_line.strip().split("\t")
            data['Source'] += [line[0]]*(len(line)-1)
            data['Sink'] += line[1:]
    pd_data = pd.DataFrame(data=data)
    pd_data[['Source', 'Sink']] = pd_data[['Source', 'Sink']].apply(pd.to_numeric)
    pd_data = pd_data.drop_duplicates(keep=False)
    return pd_data

In [3]:
def read_sub():
    with open('test-public.txt', 'r') as f:
        # skip the header
        f.readline()
        data = {'Source':[], 'Sink':[]}
        for raw_line in f:
            line = raw_line.strip().split("\t")
            data['Source'].append(int(line[1]))
            data['Sink'].append(int(line[2]))
        return pd.DataFrame(data=data)

In [4]:
data = create_df()

In [5]:
data.shape

(23888876, 2)

In [6]:
data.to_csv("to_processed_train.csv", header=False, index=False)

In [7]:
G = nx.read_edgelist('to_processed_train.csv',delimiter=',',create_using=nx.Graph(),nodetype=int)
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 4842581
Number of edges: 23358551
Average degree:   9.6471


In [8]:
g = nx.read_edgelist('processed_train.csv',delimiter=',',create_using=nx.DiGraph(),nodetype=int)


In [9]:
print(nx.info(g))

Name: 
Type: DiGraph
Number of nodes: 3084179
Number of edges: 8350000
Average in degree:   2.7074
Average out degree:   2.7074


# Split training and testing data

In [10]:
with open("new_data/train-mix.p", "rb") as f:
    train_mix = pickle.load(f)


In [11]:
from sklearn.model_selection import train_test_split
link = [[i[1], i[2]] for i in train_mix]
label = [i[3] for i in train_mix]
X_desire_train, X_desire_test, y_desire_train, y_desire_test = train_test_split(link, label, test_size=0.2)

In [12]:
print(len(X_desire_train), len(y_desire_train), len(X_desire_test), len(y_desire_test))

16000 16000 4000 4000


In [13]:
X_train_features = pd.DataFrame()
X_test_features = pd.DataFrame()
X_desire_train = pd.DataFrame(X_desire_train, columns=['Source', 'Sink'])
X_desire_test = pd.DataFrame(X_desire_test, columns=['Source', 'Sink'])
X_desire_test

Unnamed: 0,Source,Sink
0,1655093,812186
1,3974423,3289673
2,3848803,4263818
3,4382293,2614161
4,2497970,4539763
...,...,...
3995,646475,1640325
3996,1591102,351452
3997,1669475,4624407
3998,3466523,3765415


# Compute features

In [14]:
with open("new_data/cosin_dict_add_self_t10.p", "rb") as f:
    cosin_dict = pickle.load(f)
cosin_dict
# (4030539, 354904) in cosin_dict

{(2712039, 218222): 0.48248525325073366,
 (3296670, 3092525): 0.06423439460626545,
 (1569537, 281790): 0.1881782469302815,
 (3750582, 528038): 0.06808569838994395,
 (3253990, 3008062): 0.14736310918744813,
 (4126485, 2978899): 0.017931948989144783,
 (2353654, 1863046): 0.25851310324167864,
 (1227313, 109141): 0.1623700001988141,
 (685695, 2410046): 0.2768087433726595,
 (3262992, 2677316): 0.03367833706857019,
 (2323088, 2309976): 0.09396398230865341,
 (2466503, 4800253): 0.014925967280514044,
 (878044, 2541835): 0.0,
 (4672710, 3783690): 0.0007120511703484632,
 (2806975, 1202129): 0.04558103047994419,
 (3079468, 3325422): 0.0,
 (10090, 545853): 0.0,
 (4550069, 3248447): 0.0,
 (4786270, 3205531): 0.02025670009319049,
 (2198488, 1971518): 0.19662171063331382,
 (3063001, 1659272): 0.0,
 (3869033, 2572933): 0.34671057824608065,
 (1896126, 2402775): 0.07933180134909378,
 (3063973, 1380739): 0.17817532123357485,
 (3228493, 1304921): 0.0,
 (3545321, 3708550): 0.06866532857850899,
 (1550614, 2

In [15]:
(209445, 230750) in cosin_dict

False

In [16]:
with open("new_data/sub_cosin_dict_add_sm_t10.p", "rb") as f:
    sub_cosin_dict = pickle.load(f)

In [17]:
with open("new_data/random_walk_train.p", "rb") as f:
    random_walk_train = pickle.load(f)

In [18]:
with open("new_data/random_walk_sub.p", "rb") as f:
    random_walk_sub = pickle.load(f)

In [19]:
def jaccard_followees(a, b, train_graph=g):
    try:
        if len(set(train_graph.successors(a))) == 0 or len(set(train_graph.successors(b))) == 0:
            return 0
        else:
            intersection = len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b))))
            union = len(set(train_graph.successors(a)).union(set(train_graph.successors(b))))
            return intersection/union
    except:
        return 0
    return sim

def jaccard_followers(a,b, train_graph=g):
    try:
        if set(train_graph.predecessors(a)) == 0 or len(set(g.predecessors(b))) == 0:
            return 0
        else:
            intersection = len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b))))
            union = len(set(train_graph.predecessors(a)).union(set(train_graph.predecessors(b))))
            return intersection/union
    except:
        return 0

In [20]:
import math
#for followees
def cosine_followees(a, b, train_graph=g):
    try:
        if len(set(train_graph.successors(a))) == 0 or len(set(train_graph.successors(b))) == 0:
            print("yes")
            return 0
        else:
            return (len(set(train_graph.successors(a)).intersection(set(train_graph.successors(b)))))/\
                                    (math.sqrt(len(set(train_graph.successors(a)))*len((set(train_graph.successors(b))))))
    except:
        return 0
    
def cosine_followers(a, b, train_graph=g):
    try:
        
        if len(set(train_graph.predecessors(a))) == 0 or len(set(train_graph.predecessors(b))) == 0:
            return 0
        else:
            print(len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))
            return (len(set(train_graph.predecessors(a)).intersection(set(train_graph.predecessors(b)))))/\
                                     (math.sqrt(len(set(train_graph.predecessors(a))))*(len(set(train_graph.predecessors(b)))))
    except:
        return 0

In [21]:
def compute_shortest_path_length(a, b, train_graph=G):
    p = 99
    try:
        # if the edge already exist, we first remove the edge which let our model better understand the graph
        if train_graph.has_edge(a,b):
            train_graph.remove_edge(a,b)
            p= nx.shortest_path_length(train_graph,source=a,target=b)
            train_graph.add_edge(a,b)
        else:
            p= nx.shortest_path_length(train_graph,source=a,target=b)
        return p
    except:
        return 99

In [22]:
def preferencial_attchment(a, b, graph=g):
    try:
        a_set = set(graph.successors(a)).union(graph.predecessors(a))
        b_set = set(graph.successors(b)).union(graph.predecessors(b))
    except:
        return 0
    return len(a_set)*len(b_set)
    

In [118]:
def sorensen(a, b, graph=g):
    try:
        a_set = set(graph.successors(a)).union(graph.predecessors(a))
        b_set = set(graph.successors(b)).union(graph.predecessors(b))
    except:
        return 0
    
    return 2 * len(a_set & b_set) / (len(a_set) + len(b_set))


In [23]:
def hub_depressed(a, b, graph=g):
    try:
        s1 = set(graph.successors(a)).union(set(graph.predecessors(a)))
        s2 = set(graph.successors(b)).union(set(graph.predecessors(b)))
        neighbour = len(s1.intersection(s2))
    except:
        neighbour = 0
        
    if neighbour == 0:
        return 0
    
    anext = 0
    bnext = 0
    try:
        anext = len(list(graph.successors(a)))
    except:
        pass
    
    try:
        bnext = len(list(graph.successors(b)))
    except:
        pass
    
    min_two = min(anext, bnext)
    
    if min_two == 0:
        return 0
    else:
        return neighbour/min_two

In [24]:
def hub_promoted(a, b, graph=g):
    try:
#         neighbour = len(set(graph.successors(a)).intersection(graph.successors(b)))
        s1 = set(graph.successors(a)).union(set(graph.predecessors(a)))
        s2 = set(graph.successors(b)).union(set(graph.predecessors(b)))
        neighbour = len(s1.intersection(s2))
    except:
        neighbour = 0
        
    if neighbour == 0:
        return 0
    
    anext = 0
    bnext = 0
    try:
        anext = len(list(graph.successors(a)))
    except:
        pass
    
    try:
        bnext = len(list(graph.successors(b)))
    except:
        pass
    
    max_two = max(anext, bnext)
    
    if max_two == 0:
        return 0
    else:
        return neighbour/max_two

In [25]:
def lhn(nei, pre):
    if nei == 0 or pre == 0:
        return 0
    return nei/pre

In [26]:
def common_neighbours(a, b, graph=g):
    try:
        a_set = set(graph.successors(a)).union(graph.predecessors(a))
        b_set = set(graph.successors(b)).union(graph.predecessors(b))
    except:
        return 0
    
    return len(a_set.intersection(b_set))

In [27]:
X_train_features['jaccard_followers'] = X_desire_train.apply(lambda row:jaccard_followers(row['Source'], row['Sink']), axis=1)
X_test_features['jaccard_followers'] = X_desire_test.apply(lambda row:jaccard_followers(row['Source'], row['Sink']),axis=1)

#mapping jaccrd followees to train and test data
X_train_features['jaccard_followees'] = X_desire_train.apply(lambda row:jaccard_followees(row['Source'], row['Sink']), axis=1)
X_test_features['jaccard_followees'] = X_desire_test.apply(lambda row:jaccard_followees(row['Source'], row['Sink']), axis=1)

In [28]:
X_train_features['cosin_rec'] = X_desire_train.apply(lambda row: cosin_dict[(row['Source'], row['Sink'])], axis=1)
X_test_features['cosin_rec'] = X_desire_test.apply(lambda row: cosin_dict[(row['Source'], row['Sink'])], axis=1)

In [29]:
def compute_features_stage1(df_final):
    #calculating # of followers followees for source and destination
    #calculating intersection of followers and followees for source and destination
    num_followers_s=[]
    num_followees_s=[]
    num_followers_d=[]
    num_followees_d=[]
    inter_followers=[]
    inter_followees=[]
    
    for i, row in df_final.iterrows():
        try:
            s1 = set(g.predecessors(row['Source']))
            s2 = set(g.predecessors(row['Sink']))
        except:
            s1 = set()
            s2 = set()
            
        try:
            d1 = set(g.successors(row['Source']))
            d2 = set(g.successors(row['Sink']))
        except:
            d1 = set()
            d2 = set()

        num_followers_s.append(len(s1))
        num_followees_s.append(len(d1))

        num_followers_d.append(len(s2))
        num_followees_d.append(len(d2))
        
        inter_followers.append(len(s1.intersection(s2)))
        inter_followees.append(len(d1.intersection(d2)))

    return num_followers_s, num_followers_d, num_followees_s, num_followees_d,inter_followers,inter_followees

X_train_features['num_followers_s'], X_train_features['num_followers_d'], \
X_train_features['num_followees_s'], X_train_features['num_followees_d'], \
X_train_features['inter_followers'], X_train_features['inter_followees'] = compute_features_stage1(X_desire_train)

X_test_features['num_followers_s'], X_test_features['num_followers_d'], \
X_test_features['num_followees_s'], X_test_features['num_followees_d'], \
X_test_features['inter_followers'], X_test_features['inter_followees'] = compute_features_stage1(X_desire_test)

In [30]:
X_train_features['shortest_path'] = X_desire_train.apply(lambda row: compute_shortest_path_length(row['Source'], row['Sink']), axis=1)
X_test_features['shortest_path'] = X_desire_test.apply(lambda row: compute_shortest_path_length(row['Source'], row['Sink']), axis=1)

In [31]:
X_train_features['c_nei'] = X_desire_train.apply(lambda row: common_neighbours(row['Source'], row['Sink']), axis=1)
X_test_features['c_nei'] = X_desire_test.apply(lambda row: common_neighbours(row['Source'], row['Sink']), axis=1)

In [85]:
X_train_features['flow2'] = X_desire_train.apply(lambda row: random_walk_train.get((row['Source'], row['Sink']), [0])[0], axis=1)
X_test_features['flow2'] = X_desire_test.apply(lambda row: random_walk_train.get((row['Source'], row['Sink']), [0])[0], axis=1)

In [86]:
X_train_features['flow3'] = X_desire_train.apply(lambda row: random_walk_train.get((row['Source'], row['Sink']), [0,0])[1], axis=1)
X_test_features['flow3'] = X_desire_test.apply(lambda row: random_walk_train.get((row['Source'], row['Sink']), [0,0])[1], axis=1)

In [32]:
X_train_features['prefer'] = X_desire_train.apply(lambda row: preferencial_attchment(row['Source'], row['Sink']), axis=1)
X_test_features['prefer'] = X_desire_test.apply(lambda row: preferencial_attchment(row['Source'], row['Sink']), axis=1)

In [130]:
X_train_features['sor'] = X_desire_train.apply(lambda row: sorensen(row['Source'], row['Sink']), axis=1)
X_test_features['sor'] = X_desire_test.apply(lambda row: sorensen(row['Source'], row['Sink']), axis=1)

In [87]:
mappings = [
    hub_depressed, 
    hub_promoted
]

for f in tqdm(mappings, position=0, leave=True):
    X_train_features[f.__name__] = X_desire_train.apply(lambda row: f(row['Source'], row['Sink']), axis=1)
    X_test_features[f.__name__] = X_desire_test.apply(lambda row:f(row['Source'], row['Sink']),axis=1)

100%|██████████| 2/2 [00:22<00:00, 11.21s/it]


In [33]:
X_train_features['lhn'] = X_train_features.apply(lambda row: lhn(row['c_nei'], row['prefer']), axis=1)
X_test_features['lhn'] = X_test_features.apply(lambda row: lhn(row['c_nei'], row['prefer']), axis=1)

In [34]:
X_desire_train

Unnamed: 0,Source,Sink
0,4856212,2230237
1,1939867,1524883
2,4162561,323165
3,3581859,1150700
4,3096420,3628285
...,...,...
15995,296722,2326908
15996,1442657,711848
15997,3349730,649471
15998,4131768,572832


In [109]:
common_neighbours(4710933,1771100,g)

7

# NX features

In [45]:
def res_allo_index(a,b,graph=G):
    a = nx.resource_allocation_index(G,[(a, b)])
    try:
        for u,v,p in a:
            return p
    except:
        return 0

In [46]:
def jar_coe(a,b,graph=G):
    a = nx.jaccard_coefficient(G,[(a, b)])
    try:
        for u,v,p in a:
            return p
    except:
        return 0

In [47]:
def adamic_adar_index(a,b,graph=G):
    a = nx.adamic_adar_index(G,[(a, b)])
    try:
        for u,v,p in a:
            return p
    except:
        return 0

In [48]:
def preferential_attachment(a,b,graph=G):
    a = nx.preferential_attachment(G,[(a, b)])
    try:
        for u,v,p in a:
            return p
    except:
        return 0

In [49]:
desire_train_feature = list()
for i in range(len(X_desire_train)):
    features = {
                'reasource_allo_index': res_allo_index(X_desire_train['Source'][i], X_desire_train['Sink'][i], G),
                'jarccard_coef': jar_coe(X_desire_train['Source'][i], X_desire_train['Sink'][i], G),
                'adamic_adar_index': adamic_adar_index(X_desire_train['Source'][i], X_desire_train['Sink'][i], G),
                'preferential_attachment': preferential_attachment(X_desire_train['Source'][i], X_desire_train['Sink'][i], G)
    }
    desire_train_feature.append(features)

In [50]:
train_fea_df = pd.DataFrame(desire_train_feature)
train_fea_df.describe()

Unnamed: 0,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
count,16000.0,16000.0,16000.0,16000.0
mean,0.072149,0.013657,1.710092,838394.7
std,1.561703,0.034148,20.153034,26686310.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,954.0
50%,3e-05,0.001888,0.113775,7263.0
75%,0.005084,0.011236,0.888693,48162.0
max,187.018371,0.494118,2322.840422,2840350000.0


In [51]:
desire_test_feature = list()
for i in range(len(X_desire_test)):
    features = {
                'reasource_allo_index': res_allo_index(X_desire_test['Source'][i], X_desire_test['Sink'][i], G),
                'jarccard_coef': jar_coe(X_desire_test['Source'][i], X_desire_test['Sink'][i], G),
                'adamic_adar_index': adamic_adar_index(X_desire_test['Source'][i], X_desire_test['Sink'][i], G),
                'preferential_attachment': preferential_attachment(X_desire_test['Source'][i], X_desire_test['Sink'][i], G)
    }
    desire_test_feature.append(features)

In [52]:
test_fea_df = pd.DataFrame(desire_test_feature)
test_fea_df

Unnamed: 0,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
0,0.000001,0.001585,0.073847,24231
1,0.000000,0.000000,0.000000,3550
2,0.050171,0.002445,0.877863,9804
3,0.000000,0.000000,0.000000,224
4,0.000000,0.000000,0.000000,0
...,...,...,...,...
3995,0.008113,0.011425,4.398665,531781
3996,0.000075,0.003891,0.105323,13160
3997,0.000000,0.000000,0.000000,11172
3998,0.011197,0.025000,0.634174,1248


In [123]:
all_train_features = pd.concat([X_train_features, train_fea_df], axis=1)
all_train_features.describe()

Unnamed: 0,jaccard_followers,jaccard_followees,cosin_rec,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,shortest_path,...,lhn,flow2,flow3,hub_depressed,hub_promoted,sor,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
count,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,...,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0,16000.0
mean,0.041698,0.001214,0.136861,19.816312,31.170938,308.7365,1362.875438,2.33925,0.475062,6.726,...,0.009451,0.0003489046,0.000281619,0.01113,0.010811,73672.14,0.072149,0.013657,1.710092,838394.7
std,0.084653,0.014728,0.146697,41.425706,71.136443,5734.420441,25188.794319,6.31246,6.219871,19.950183,...,0.04545,0.00292444,0.00253321,0.217121,0.470735,2214410.0,1.561703,0.034148,20.153034,26686310.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.020255,2.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.0,1.692402e-09,0.0,0.0,0.0,9.0,0.0,0.0,0.0,954.0
50%,0.0,0.0,0.086516,7.0,9.0,0.0,0.0,0.0,0.0,2.0,...,0.0,1.191617e-07,3.14725e-09,0.0,0.0,84.0,3e-05,0.001888,0.113775,7263.0
75%,0.051282,0.0,0.209223,21.0,30.0,0.0,0.0,2.0,0.0,3.0,...,0.004253,1.072719e-05,7.044665e-06,0.0,0.0,600.0,0.005084,0.011236,0.888693,48162.0
max,1.0,0.47561,0.925499,772.0,795.0,385859.0,759391.0,274.0,287.0,99.0,...,1.0,0.1666667,0.1666667,21.0,59.0,184826000.0,187.018371,0.494118,2322.840422,2840350000.0


In [124]:
all_test_features = pd.concat([X_test_features, test_fea_df], axis=1)
all_test_features.describe()

Unnamed: 0,jaccard_followers,jaccard_followees,cosin_rec,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,shortest_path,...,lhn,flow2,flow3,hub_depressed,hub_promoted,sor,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,...,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0
mean,0.042241,0.001086,0.135453,19.96125,31.00925,547.144,1169.85325,2.38225,0.43075,7.13575,...,0.009473,0.0003177261,0.0002586743,0.016332,0.022049,90225.48,0.053542,0.013809,1.560653,827598.9
std,0.083576,0.012643,0.147926,43.526353,71.336782,13849.236064,22523.29752,6.851122,4.838911,20.805818,...,0.038629,0.002100487,0.001716675,0.359137,0.934449,3202765.0,0.311941,0.035988,6.077154,12557150.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0183,2.0,1.0,0.0,0.0,0.0,0.0,2.0,...,0.0,1.281157e-09,0.0,0.0,0.0,8.0,0.0,0.0,0.0,794.25
50%,0.0,0.0,0.083583,7.0,8.0,0.0,0.0,0.0,0.0,2.0,...,0.0,1.09743e-07,1.581598e-09,0.0,0.0,78.0,2.2e-05,0.001584,0.100644,6808.0
75%,0.051282,0.0,0.206926,21.0,29.0,0.0,0.0,2.0,0.0,3.0,...,0.004668,9.081371e-06,6.344299e-06,0.0,0.0,602.75,0.00459,0.011206,0.854112,46438.75
max,1.0,0.29703,0.923102,772.0,772.0,759391.0,759391.0,155.0,121.0,99.0,...,1.0,0.04928596,0.04928018,21.0,59.0,184557500.0,7.941534,0.647059,151.256696,449316600.0


In [884]:
# from sklearn.preprocessing import Normalizer
# scaler = Normalizer(norm='l2')

# scaler.fit(all_train_features)
# all_train_features = scaler.transform(all_train_features)

# scaler.fit(all_test_features)
# all_test_features = scaler.transform(all_test_features)

In [125]:
pickle.dump(X_desire_train, open('save/X_desire_train_data1.p','wb'))
pickle.dump(y_desire_train, open('save/y_desire_train_data_label1.p','wb'))
pickle.dump(X_desire_test, open('save/X_desire_test_data1.p','wb'))
pickle.dump(y_desire_test, open('save/y_desire_test_data_label1.p','wb'))

In [126]:
pickle.dump(all_train_features, open('save/all_train_features1.p','wb'))
pickle.dump(all_test_features, open('save/all_test_features1.p','wb'))

# Build Model

In [91]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(all_train_features, y_desire_train)
# lr_clf.fit(X_train_features, y_desire_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [92]:
lr_clf.score(all_test_features, y_desire_test)
# lr_clf.score(X_test_features, y_desire_test)

0.50125

In [73]:
from sklearn.metrics import roc_auc_score
def evaluate_roc_auc(clf, features, labels):
    predicted = clf.predict_proba(features)

    # check which class corresponds to positive links
    positive_column = list(clf.classes_).index(1)
    return roc_auc_score(labels, predicted[:, positive_column])

In [93]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth=10, random_state=90051, n_jobs=-1)
rf_clf.fit(all_train_features, y_desire_train)
# rf_clf.fit(X_train_features, y_desire_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=90051,
                       verbose=0, warm_start=False)

In [94]:
rf_clf.score(all_test_features, y_desire_test)
# rf_clf.score(X_test_features, y_desire_test)

0.85075

In [982]:
importances = rf_clf.feature_importances_
importances

array([1.41947926e-02, 1.28865552e-04, 9.13911375e-02, 2.26704699e-02,
       4.76901516e-02, 3.75759789e-03, 4.79726697e-03, 8.26624794e-03,
       9.55759002e-05, 6.57976410e-02, 1.04121847e-02, 3.99208399e-02,
       6.15069278e-04, 3.53766400e-03, 1.36974242e-02, 1.32724587e-01,
       1.47883850e-01, 1.70422842e-01, 5.70587031e-02, 8.06543317e-02,
       8.42827577e-02])

In [109]:
from sklearn.model_selection import GridSearchCV

base = RandomForestClassifier()
rf_param = {
#     "max_depth":[7,9,11,13],
    "max_depth":[15,17,19],
#     "min_samples_leaf":[1,3,5],
    "min_samples_leaf":[1],
#     "min_samples_split":[2,4,6,8,10],
    "min_samples_split":[4,6],
#     "max_features":["sqrt", "log2"],
    "max_features":["log2"],
#     "n_estimators":[100]
    "n_estimators":[1000,1200,1400,1600]
}


rf_grid = GridSearchCV(base, rf_param, n_jobs=-1, scoring='roc_auc')
rf_grid.fit(all_train_features, y_desire_train)
print(rf_grid.score(all_test_features, y_desire_test))

0.9333645841146028


In [979]:
importances = rf_grid.best_estimator_.feature_importances_

In [56]:
rf_grid.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=1600,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [981]:
importances

array([1.98330121e-02, 1.39174242e-04, 9.94969115e-02, 2.94160918e-02,
       5.72694138e-02, 6.79110552e-03, 6.06283093e-03, 9.80153065e-03,
       1.53878134e-04, 6.29048877e-02, 1.04874733e-02, 4.83532047e-02,
       3.01134059e-04, 4.61230874e-03, 1.88372160e-02, 1.24670832e-01,
       1.02001207e-01, 1.72028930e-01, 6.17423266e-02, 6.87002824e-02,
       9.63962491e-02])

In [128]:
import lightgbm

In [129]:
lgb_clf_dart = lightgbm.LGBMClassifier(boosting_type='dart',learning_rate=0.15,
                                               subsample=0.5,
                                               num_leaves=20)

In [130]:
lgb_clf_gbdt = lightgbm.LGBMClassifier(boosting_type='gbdt',learning_rate=0.1,
                                               subsample=0.5,max_depth=4,
                                               num_leaves=20)

In [131]:
lgb_clf_rf = lightgbm.LGBMClassifier(boosting_type='rf',bagging_freq=1,
                                               bagging_fraction=0.75,feature_fraction=0.75,
                                               num_leaves=20)

In [132]:
lgb_clf_dart.fit(all_train_features, y_desire_train)

LGBMClassifier(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.15, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=20, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=0.5, subsample_for_bin=200000, subsample_freq=0)

In [636]:
evaluate_roc_auc(lgb_clf_dart, all_test_features, y_desire_test)

0.9349279034514254

In [110]:
evaluate_roc_auc(rf_grid, all_test_features, y_desire_test)

0.9333645841146028

In [133]:
from sklearn.ensemble import VotingClassifier
voting = VotingClassifier(
#     estimators=[('rf', rf_grid), ('xgb', xgb_grid), ('ada', ada_clf)]
    estimators=[
        ('lgb_dart', lgb_clf_dart), 
        ('lgb_gbdt', lgb_clf_gbdt), 
        ('lgb_rf', lgb_clf_rf),
        
    ]
    , voting='soft')

In [134]:
voting.fit(all_train_features, y_desire_train)

VotingClassifier(estimators=[('lgb_dart',
                              LGBMClassifier(boosting_type='dart',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             learning_rate=0.15, max_depth=-1,
                                             min_child_samples=20,
                                             min_child_weight=0.001,
                                             min_split_gain=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             num_leaves=20, objective=None,
                                             random_state=None, reg_alpha=0.0,
                                             reg_lambda=0.0, silent=True,...
                                             importance_type='split',
                                  

In [135]:
print(voting.score(all_test_features, y_desire_test))

0.85925


In [136]:
evaluate_roc_auc(voting, all_test_features, y_desire_test)

0.929566703400492

# Generate Submission

In [58]:
sub_data = read_sub()

In [59]:
sub_data.head()

Unnamed: 0,Source,Sink
0,3563811,3600160
1,2052043,1401960
2,4517994,1690636
3,1660006,4349447
4,581111,1882617


In [60]:
X_sub = pd.DataFrame()

In [61]:
X_sub['jaccard_followers'] = sub_data.apply(
    lambda row:jaccard_followers(row['Source'], row['Sink'], train_graph=g), 
    axis=1)

#mapping jaccrd followees to train and test data
X_sub['jaccard_followees'] = sub_data.apply(
    lambda row:jaccard_followees(row['Source'], row['Sink'], train_graph=g), 
    axis=1)

In [62]:
X_sub['cosin_rec'] = sub_data.apply(lambda row: sub_cosin_dict[(row['Source'], row['Sink'])], axis=1)

In [63]:
X_sub['num_followers_s'], X_sub['num_followers_d'], \
X_sub['num_followees_s'], X_sub['num_followees_d'], \
X_sub['inter_followers'], X_sub['inter_followees'] = compute_features_stage1(sub_data)

In [64]:
X_sub['shortest_path'] = sub_data.apply(lambda row: compute_shortest_path_length(row['Source'], row['Sink']), axis=1)

In [65]:
X_sub['c_nei'] = sub_data.apply(lambda row: common_neighbours(row['Source'], row['Sink']), axis=1)

In [66]:
X_sub['prefer'] = sub_data.apply(lambda row: preferencial_attchment(row['Source'], row['Sink']), axis=1)

In [131]:
X_sub['sor'] = sub_data.apply(lambda row: sorensen(row['Source'], row['Sink']), axis=1)

In [95]:
X_sub['flow2'] = sub_data.apply(lambda row: random_walk_sub[(row['Source'], row['Sink'])][0], axis=1)

In [96]:
X_sub['flow3'] = sub_data.apply(lambda row: random_walk_sub[(row['Source'], row['Sink'])][1], axis=1)

In [97]:
mappings = [
    hub_depressed, 
    hub_promoted
]

for f in tqdm(mappings, position=0, leave=True):
    X_sub[f.__name__] = sub_data.apply(lambda row: f(row['Source'], row['Sink']), axis=1)
    X_sub[f.__name__] = sub_data.apply(lambda row:f(row['Source'], row['Sink']),axis=1)

100%|██████████| 2/2 [00:04<00:00,  2.11s/it]


In [67]:
X_sub['lhn'] = X_sub.apply(lambda row: lhn(row['c_nei'], row['prefer']), axis=1)

In [98]:
submission_feature = list()
for i in range(len(sub_data)):
    features = {
                'reasource_allo_index': res_allo_index(sub_data['Source'][i], sub_data['Sink'][i], G),
                'jarccard_coef': jar_coe(sub_data['Source'][i], sub_data['Sink'][i], G),
                'adamic_adar_index': adamic_adar_index(sub_data['Source'][i], sub_data['Sink'][i], G),
                'preferential_attachment': preferential_attachment(sub_data['Source'][i], sub_data['Sink'][i], G)
    }
    submission_feature.append(features)

In [99]:
submission_fea_df = pd.DataFrame(submission_feature)
submission_fea_df

Unnamed: 0,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
0,0.000000,0.000000,0.000000,667
1,0.000000,0.000000,0.000000,666
2,0.004624,0.011152,0.462729,4335
3,0.000028,0.003670,0.178806,18396
4,0.000000,0.000000,0.000000,920
...,...,...,...,...
1995,0.000000,0.000000,0.000000,116
1996,0.000049,0.006061,0.100722,5125
1997,0.000000,0.000000,0.000000,58
1998,0.000000,0.000000,0.000000,171


In [128]:
all_sub_features = pd.concat([X_sub, submission_fea_df], axis=1)
all_sub_features

Unnamed: 0,jaccard_followers,jaccard_followees,cosin_rec,num_followers_s,num_followers_d,num_followees_s,num_followees_d,inter_followers,inter_followees,shortest_path,...,lhn,flow2,flow3,hub_depressed,hub_promoted,sor,reasource_allo_index,jarccard_coef,adamic_adar_index,preferential_attachment
0,0.000000,0.0,0.013077,1,10,0,0,0,0,3,...,0.000000,0.000000e+00,0.000000e+00,0.0,0.000000,10,0.000000,0.000000,0.000000,667
1,0.000000,0.0,0.009110,7,2,0,0,0,0,3,...,0.000000,0.000000e+00,8.123854e-08,0.0,0.000000,14,0.000000,0.000000,0.000000,666
2,0.071429,0.0,0.088616,10,5,0,0,1,0,2,...,0.020000,7.365540e-05,7.788287e-05,0.0,0.000000,50,0.004624,0.011152,0.462729,4335
3,0.047619,0.0,0.175508,10,12,0,0,1,0,2,...,0.008333,5.578524e-08,7.085983e-08,0.0,0.000000,120,0.000028,0.003670,0.178806,18396
4,0.000000,0.0,0.023031,2,23,0,0,0,0,3,...,0.000000,0.000000e+00,6.330539e-09,0.0,0.000000,46,0.000000,0.000000,0.000000,920
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.0,0.018680,1,1,0,0,0,0,3,...,0.000000,0.000000e+00,1.270053e-08,0.0,0.000000,1,0.000000,0.000000,0.000000,116
1996,0.021739,0.0,0.004964,33,14,95,0,1,0,2,...,0.000649,0.000000e+00,8.028945e-09,0.0,0.010526,1540,0.000049,0.006061,0.100722,5125
1997,0.000000,0.0,0.034330,5,2,0,0,0,0,3,...,0.000000,0.000000e+00,1.424484e-07,0.0,0.000000,10,0.000000,0.000000,0.000000,58
1998,0.000000,0.0,0.011626,2,1,0,0,0,0,3,...,0.000000,0.000000e+00,1.138948e-09,0.0,0.000000,2,0.000000,0.000000,0.000000,171


In [129]:
pickle.dump(all_sub_features, open('save/all_sub_features1.p','wb'))

In [843]:
# from sklearn.preprocessing import Normalizer
# scaler = Normalizer(norm='l2')

# scaler.fit(all_sub_features)
# all_sub_features = scaler.transform(all_sub_features)


In [181]:
# X_sub['cosin_rec'] = sub_data.apply(lambda row: sub_cosin_dict[(row['Source'], row['Sink'])], axis=1)

In [969]:
with open("save/all_sub_features.p", "wb") as f:
    pickle.dump(all_sub_features, f)

In [101]:
all_sub_features.shape

(2000, 21)

In [111]:
# y_sub = voting.predict_proba(all_sub_features)
y_sub = rf_grid.predict_proba(all_sub_features)

In [112]:
y_sub[0:15]

array([[0.90723864, 0.09276136],
       [0.88313184, 0.11686816],
       [0.00561666, 0.99438334],
       [0.82503714, 0.17496286],
       [0.93084603, 0.06915397],
       [0.72072096, 0.27927904],
       [0.89827279, 0.10172721],
       [0.96766158, 0.03233842],
       [0.87978143, 0.12021857],
       [0.93212339, 0.06787661],
       [0.71810801, 0.28189199],
       [0.88170535, 0.11829465],
       [0.00583333, 0.99416667],
       [0.86686246, 0.13313754],
       [0.96209116, 0.03790884]])

In [113]:
# rf_clf.predict(all_sub_features)[:15]
rf_grid.predict(all_sub_features)[:15]

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [114]:
# sum(rf_clf.predict(all_sub_features))
sum(rf_grid.predict(all_sub_features))

829

In [115]:
result = pd.DataFrame(data={"Id":range(1,len(y_sub)+1), "Predicted":[x[1] for x in y_sub]})

In [116]:
result.to_csv('new_feature.csv', index=False)

In [117]:
result.head()

Unnamed: 0,Id,Predicted
0,1,0.092761
1,2,0.116868
2,3,0.994383
3,4,0.174963
4,5,0.069154


In [4]:
result_1 = pd.read_csv("submission_file/nx_submissioin_rf_with_flow.csv")

In [5]:
result_2 = pd.read_csv("submission_file/nx_submissioin_rf_norm.csv")

In [7]:
diff_list = []
for i in range(len(result_2)):
    if abs(result_1.iloc[i]['Predicted'] - result_2.iloc[i]['Predicted']) >= 0.5:
        diff_list.append(i)

In [8]:
len(diff_list)

4