In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm
import random
import pickle

In [2]:
with open("data/matrix.p", "rb") as f:
    matrix = pickle.load(f)

In [3]:
with open("data/train-mix.p", "rb") as f:
    data = pickle.load(f)

In [4]:
data

[[3966, 2712039, 218222, 1],
 [13826, 3296670, 3092525, 0],
 [18559, 1569537, 281790, 0],
 [14849, 3750582, 528038, 0],
 [3676, 3253990, 3008062, 1],
 [14879, 4126485, 2978899, 0],
 [6342, 2353654, 1863046, 1],
 [17047, 1227313, 109141, 0],
 [1595, 685695, 2410046, 1],
 [12578, 3262992, 2677316, 0],
 [3546, 2323088, 2309976, 1],
 [8157, 2466503, 4800253, 1],
 [5010, 878044, 2541835, 1],
 [7265, 4672710, 3783690, 1],
 [12900, 2806975, 1202129, 0],
 [12087, 3079468, 3325422, 0],
 [12682, 10090, 545853, 0],
 [11214, 4550069, 3248447, 0],
 [18558, 4786270, 3205531, 0],
 [2483, 2198488, 1971518, 1],
 [8905, 3063001, 1659272, 1],
 [5280, 3869033, 2572933, 1],
 [19560, 1896126, 2402775, 0],
 [14346, 3063973, 1380739, 0],
 [9280, 3228493, 1304921, 1],
 [18631, 3545321, 3708550, 0],
 [17997, 1550614, 2349803, 0],
 [3004, 2052207, 2055238, 1],
 [18820, 2360726, 3050762, 0],
 [18545, 4749438, 4346941, 0],
 [8471, 2595454, 1577010, 1],
 [19420, 4696379, 428622, 0],
 [17591, 4559284, 3591170, 0],
 

In [135]:
id_to_pair = {}

for i in data:
    id_to_pair[i[0]] = (i[1], i[2])

In [27]:
df = pd.DataFrame()

In [36]:
link = [[i[1], i[2]] for i in data]
label = [i[3] for i in data]

In [32]:
df['Source'] = [i[1] for i in data]
df['Sink'] = [i[2] for i in data]
df['Label'] = [i[3] for i in data]

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(link, label, test_size=0.1)

In [42]:
from features_generator import *

In [48]:
train_features = pd.DataFrame()
test_features = pd.DataFrame()

In [49]:
temp_train = []
for i in tqdm(X_train):
    temp.append(common_neighbour(i[0], i[1], matrix))
train_features['c_nei'] = temp

100%|██████████| 18000/18000 [00:00<00:00, 276833.48it/s]


In [79]:
!open .

In [51]:
from multiprocessing import Pool

def feature_generator(datum):

    id, a, b, l = datum

    # node feature
    a_in = indegree(a, matrix)
    a_out = outdegree(a, matrix)
    b_in = indegree(b, matrix)
    b_out = indegree(b, matrix)

    # neighbouring feature
    neighbour = common_neighbour(a, b, matrix)
    jac = jaccard(neighbour, a, b, matrix)
    dice = dice_idx(neighbour, a, b, matrix)
    p_a = pref_attach(a, b, matrix)
    cos = cosine_sim(neighbour, p_a)
    lhn = LHN(neighbour, p_a)
    adar = adamic_adar(a, b, matrix)
    ra = resource_allocation(a, b, matrix)
    reverse = reverse_link(a, b, matrix)
    hp = hub_promoted(neighbour, a, b, matrix)
    hd = hub_depressed(neighbour, a, b, matrix)

    # path feature
    #sim_r = sim_rank(a, b, matrix, 0)

    flow2, flow3 = propflow3(a, b, matrix)
    #print(flow)
    #return flow
    return [id,a_in,a_out,b_in,b_out,neighbour,jac,dice,p_a,cos,lhn,adar,reverse,hp,hd,flow2,flow3,l]

def logger(res):
    train_test.append(res)
    if len(train_test) % (len(data)//100) == 0:
        print("{:.2%} done".format(len(train_test)/len(data)))


train_test = []
print("start")
pool = Pool(processes=4)
for item in data:
    pool.apply_async(feature_generator, args=[item], callback=logger)
pool.close()
pool.join()
print("end")

train_test = np.array(train_test)
print(train_test.shape)

start
1.00% done
2.00% done
3.00% done
4.00% done
5.00% done
6.00% done
7.00% done
8.00% done
9.00% done
10.00% done
11.00% done
12.00% done
13.00% done
14.00% done
15.00% done
16.00% done
17.00% done
18.00% done
19.00% done
20.00% done
21.00% done
22.00% done
23.00% done
24.00% done
25.00% done
26.00% done
27.00% done
28.00% done
29.00% done
30.00% done
31.00% done
32.00% done
33.00% done
34.00% done
35.00% done
36.00% done
37.00% done
38.00% done
39.00% done
40.00% done
41.00% done
42.00% done
43.00% done
44.00% done
45.00% done
46.00% done
47.00% done
48.00% done
49.00% done
50.00% done
51.00% done
52.00% done
53.00% done
54.00% done
55.00% done
56.00% done
57.00% done
58.00% done
59.00% done
60.00% done
61.00% done
62.00% done
63.00% done
64.00% done
65.00% done
66.00% done
67.00% done
68.00% done
69.00% done
70.00% done
71.00% done
72.00% done
73.00% done
74.00% done
75.00% done
76.00% done
77.00% done
78.00% done
79.00% done
80.00% done
81.00% done
82.00% done
83.00% done
84.00% 

In [156]:
len(sub_data)

2000

In [6]:
from multiprocessing import Pool
from features_generator import propflow3

def feature_generator(pair):
    a, b = pair
    flow2, flow3 = propflow3(a, b, matrix)
    #print(flow)
    #return flow
    return [a, b, flow2,flow3]

def logger(res):
    random_walk_sub[(res[0], res[1])] = (res[2], res[3])
    if len(random_walk_sub) % (len(sub_data)//100) == 0:
        print("{:.2%} done".format(len(random_walk_sub)/(len(sub_data))))

random_walk_sub = {}
print("start")
pool = Pool(processes=6)
for pair in sub_data.to_numpy():
    pool.apply_async(feature_generator, args=[pair], callback=logger)
pool.close()
pool.join()
print("end")

print(len(random_walk_sub))

start
1.00% done
2.00% done
3.00% done
4.00% done
5.00% done
6.00% done
7.00% done
8.00% done
9.00% done
10.00% done
11.00% done
12.00% done
13.00% done
14.00% done
15.00% done
16.00% done
17.00% done
18.00% done
19.00% done
20.00% done
21.00% done
22.00% done
23.00% done
24.00% done
25.00% done
26.00% done
27.00% done
28.00% done
29.00% done
30.00% done
31.00% done
32.00% done
33.00% done
34.00% done
35.00% done
36.00% done
37.00% done
38.00% done
39.00% done
40.00% done
41.00% done
42.00% done
43.00% done
44.00% done
45.00% done
46.00% done
47.00% done
48.00% done
49.00% done
50.00% done
51.00% done
52.00% done
53.00% done
54.00% done
55.00% done
56.00% done
57.00% done
58.00% done
59.00% done
60.00% done
61.00% done
62.00% done
63.00% done
64.00% done
65.00% done
66.00% done
67.00% done
68.00% done
69.00% done
70.00% done
71.00% done
72.00% done
73.00% done
74.00% done
75.00% done
76.00% done
77.00% done
78.00% done
79.00% done
80.00% done
81.00% done
82.00% done
83.00% done
84.00% 

In [8]:
with open("data/random_walk_sub", "wb") as f:
    pickle.dump(random_walk_sub, f)

In [None]:
# train_test.dump("data/data.npy")

In [129]:
train_test = np.load("data/data.npy", allow_pickle=True)

In [146]:
random_walk_train = {}
for i in train_test[1:]:
    pair = id_to_pair[i[0]]
    # flow2, flow3
    random_walk_train[pair] = [i[-2], i[-3]]

In [148]:
with open('data/random_walk_train.p', "wb") as f:
    pickle.dump(random_walk_train, f)

In [152]:
len(data)

20000

In [151]:
len(random_walk_train)

19926

In [90]:
train_test_reduced = [i[5:] for i in train_test]

In [91]:
len(train_test_reduced)

20000

In [92]:
X_train, X_test, y_train, y_test = train_test_split(train_test_reduced, label, test_size=0.1)

In [93]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [94]:
lr_clf.score(X_test, y_test)

0.462

In [74]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

base = RandomForestClassifier(n_estimators=100)
parameters = {
    "max_depth":[9,11,13,15,17],
    "min_samples_leaf":[1,3,5],
    "min_samples_split":[2,4,6,8,10],
}
model = GridSearchCV(base, parameters, n_jobs=-1)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

NameError: name 'X_train' is not defined

In [96]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.5)
ada.fit(X_train, y_train)
print(roc_auc_score(y_test, np.squeeze(ada.predict_proba(X_test)[:,1])))

0.6077165337005779


In [76]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer(norm='l1')

scaler.fit(X_train)
X_train = scaler.transform(X_train)
scaler.fit(X_test)
X_test = scaler.transform(X_test)

# Generate Rec Sim

In [9]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import json
import pickle

train = {} # key: src   value: [dest1, dest2, ...]
vector_count = {} # indegree count

print ("reading train set")

with open("train.txt") as trainfile:
    for i, line in tqdm(enumerate(trainfile), position=0, leave=True):
        line_list = [int(k) for k in line[:-1].split("\t")]
        a = line_list[0]
        train[a] = []
        for b in line_list[1:]:
            train[a].append(b)
            vector_count[b] = vector_count.get(b,0)+1
        train[a] = list(set(train[a]))


print ("--------complete")
print ("generating dictionary")

0it [00:00, ?it/s]

reading train set


20000it [00:44, 452.42it/s] 

--------complete
generating dictionary





In [10]:
import pandas as pd
def read_sub():
    with open('test-public.txt', 'r') as f:
        # skip the header
        f.readline()
        data = {'Source':[], 'Sink':[]}
        for raw_line in f:
            line = raw_line.strip().split("\t")
            data['Source'].append(int(line[1]))
            data['Sink'].append(int(line[2]))
        return pd.DataFrame(data=data)

In [60]:
# generate new node set
# filter by indegree threshold
threshold = 10
new_set = set()
for i in vector_count:
    if vector_count[i] > threshold:
        new_set.add(i)

In [61]:
# add all source node
for i in train:
    new_set.add(i)

In [62]:
with open("data/train-mix.p","rb") as f:
    test = pickle.load(f)

In [63]:
# add all the node in testing to the new set
for _, j, k, _ in test:
    new_set.add(j)
    new_set.add(k)

In [64]:
sub_data = read_sub()

In [65]:
for source, sink in tqdm(sub_data.to_numpy(), position=0, leave=True):
    new_set.add(source)
    new_set.add(sink)

100%|██████████| 2000/2000 [00:00<00:00, 388056.07it/s]


In [66]:
sources_vector = list(train.keys())
source_to_index = {}

for i, j in enumerate(sources_vector):
    source_to_index[j] = i

In [67]:
from copy import deepcopy

In [68]:
new_train = deepcopy(train)
# remvoe existing link to prevent overfitting
for _, j, k, label in tqdm(test, position=0, leave=True):
    try:
        if label == 1:
            new_train[j].remove(k)
    except:
        print(j, k)

100%|██████████| 20000/20000 [00:00<00:00, 93813.84it/s]

526124 2291110
4754912 1080728
2052207 2055238
3069376 4366991
1934306 1613948
4487750 4759117
3344156 1051683
2881345 4299671
2169223 1829372
1332914 2306235
1934306 1613948
2727319 2082826
2169223 1829372
3240892 2015027
3789096 1115656
4496641 3638272
2292499 1255199
2700064 940665
2015161 2720688
1877454 3151904
3037367 86923
1870393 2635670
4246803 1893694
3412097 847110
1297643 3361377
4089639 2102581
2540494 899343
2181281 2757277
2540494 899343
1804572 4499618
2730668 2718230
1718468 1025098
2757448 4062558
4638959 2863144
1885790 389412
1332914 2306235
3240892 2015027
4497774 4272512
3540235 4467395
3256502 868229
1383948 4414047
3344156 1051683
1343794 2000752
3598745 2688275
291439 3091337
4001704 4522122
313799 2256983
2134565 2347332
4824678 4088634
871961 511838
63018 636171
1804572 4499618
2935645 1863043
1343794 2000752
2167647 2041022
3736991 4056937
2987310 402852
4496641 3638272
2094402 3779776
1262239 3166986
34959 2336938
944725 2305992
1442782 2676175
2423135 5590




In [69]:
sink_source = {}

for source in tqdm(sources_vector, position=0, leave=True):
    sinks = new_train[source]
    for sink in sinks:
        if sink in new_set:
            value = sink_source.get(sink, set())
            value.add(source_to_index[source])
            sink_source[sink] = value

100%|██████████| 20000/20000 [00:20<00:00, 977.95it/s] 


In [70]:
from scipy.sparse import csr_matrix

In [71]:
sink_to_vec = {}

for sink in tqdm(sink_source.keys(), position=0, leave=True):
    vec = np.zeros(20000)
    vec[list(sink_source[sink])] = 1/len(sink_source[sink])
    sink_to_vec[sink] = csr_matrix(vec)

100%|██████████| 376040/376040 [03:17<00:00, 1908.48it/s]


In [34]:
# source_to_vec = {}

# for source in tqdm(sources_vector, position=0, leave=True):
#     vec = csr_matrix(np.zeros(20000))
#     counter = 0
    
#     for sink in new_train[source]:
#         if sink not in new_set:
#             continue
#         vec += sink_to_vec[sink]
#         counter += 1
        
#     source_to_vec[source] = vec / max(counter, 1)

  1%|▏         | 295/20000 [01:35<3:05:30,  1.77it/s]

KeyboardInterrupt: 

  1%|▏         | 295/20000 [01:45<3:05:30,  1.77it/s]

In [72]:
test_source = set()
for _, source, _, _ in test:
    test_source.add(source)

In [73]:
test_source_to_vec = {}

for source in tqdm(test_source, position=0, leave=True):
    vec = csr_matrix(np.zeros(20000))
    counter = 0
    
    for sink in new_train[source]:
        if sink not in new_set:
            continue
        vec += sink_to_vec[sink]
        counter += 1
    
    # add self reference
    if source in sink_to_vec:
        vec += sink_to_vec[source]
        counter += 1
        
    test_source_to_vec[source] = vec / max(counter, 1)

100%|██████████| 12450/12450 [18:01<00:00, 11.51it/s] 


In [74]:
with open("data/test_source_to_vec_add_self_t10.p", "wb") as f:
    pickle.dump(test_source_to_vec, f)

In [75]:
cosin_dict = {}

for _, source, sink, _ in tqdm(test, position=0, leave=True):
    source_vec = test_source_to_vec[source]
    sink_vec = sink_to_vec.get(sink, np.array([np.zeros(20000)]))
    sim = cosine_similarity(X=source_vec, Y=sink_vec)[0][0]
    cosin_dict[(source, sink)] = sim

100%|██████████| 20000/20000 [00:14<00:00, 1422.32it/s]


In [76]:
with open("data/cosin_dict_add_self_t10.p", "wb") as f:
    pickle.dump(cosin_dict, f)

### Creating features for submission data

In [77]:
# TODO : add whole training graph source here
sub_sink_source = {}

for source in tqdm(train.keys(), position=0, leave=True):
    sinks = train[source]
    for sink in sinks:
        if sink in new_set:
            value = sub_sink_source.get(sink, set())
            value.add(source_to_index[source])
            sub_sink_source[sink] = value

100%|██████████| 20000/20000 [00:39<00:00, 508.73it/s] 


In [78]:
# sub_sink_to_vec = {}

# for sink in tqdm(sub_sink_source.keys(), position=0, leave=True):
#     vec = np.zeros(20000)
#     vec[list(sub_sink_source[sink])] = 1/len(sub_sink_source[sink])
#     sub_sink_to_vec[sink] = csr_matrix(vec)

In [79]:
sub_sink_to_vec = {}

def sub_sink_to_vec_func(sink):
    if sink in sub_sink_to_vec:
        return sub_sink_to_vec[sink]
    
    vec = np.zeros(20000)
    if sink in sub_sink_source:
        vec[list(sub_sink_source[sink])] = 1/len(sub_sink_source[sink])
    else:
        print(sink, "not in sink source")
    
    res = csr_matrix(vec)
    sub_sink_to_vec[sink] = res
    return res

In [80]:
sub_source_to_vec = {}

for source in tqdm(sub_data['Source'], position=0, leave=True):
    vec = csr_matrix(np.zeros(20000))
    counter = 0
    
    for sink in train[source]:
        if sink not in new_set:
            continue
        vec += sub_sink_to_vec_func(sink)
        counter += 1
        
    # add self reference
    if source in sub_sink_source:
        vec += sub_sink_to_vec_func(source)
        counter += 1
        
    sub_source_to_vec[source] = vec / max(counter, 1)

100%|██████████| 2000/2000 [12:26<00:00,  2.68it/s]  


In [81]:
sub_cosin_dict = {}

for source, sink in tqdm(sub_data.to_numpy(), position=0, leave=True):
    source_vec = sub_source_to_vec[source]
#     sink_vec = sub_sink_to_vec.get(sink, np.array([np.zeros(20000)]))
    sink_vec = sub_sink_to_vec_func(sink)
    sim = cosine_similarity(X=source_vec, Y=sink_vec)[0][0]
    sub_cosin_dict[(source, sink)] = sim

100%|██████████| 2000/2000 [00:01<00:00, 1322.21it/s]


In [82]:
sub_cosin_dict

{(3563811, 3600160): 0.01307710829136364,
 (2052043, 1401960): 0.009109753359933925,
 (4517994, 1690636): 0.08861559349005146,
 (1660006, 4349447): 0.17550832482870476,
 (581111, 1882617): 0.02303121606161453,
 (1039683, 1365102): 0.0,
 (4017576, 1291219): 0.03773880331992168,
 (1859935, 1059674): 0.025408184618404786,
 (4337137, 3648097): 0.02224321461486829,
 (2682050, 3501635): 0.07921580777618813,
 (4153157, 2634625): 0.008000039022997563,
 (1704301, 684494): 0.010051714453427495,
 (1190760, 500235): 0.28757414756756083,
 (1462670, 2590058): 0.0,
 (3648736, 1493425): 0.014265352837094588,
 (1374847, 1120944): 0.19875610171130118,
 (986556, 6342): 0.3430375426317033,
 (1079774, 3300386): 0.003695962561532072,
 (1578087, 4017095): 0.18177654303034646,
 (2711285, 1957200): 0.29366024277693975,
 (1461955, 2560869): 0.0,
 (2997746, 2961801): 0.14250834632621096,
 (627416, 2273729): 0.07403976595788088,
 (1689105, 2572994): 0.06537889990565485,
 (662454, 4397219): 0.2714192110528774,
 (3

In [83]:
with open("data/sub_cosin_dict_add_sm_t10.p", "wb") as f:
    pickle.dump(sub_cosin_dict, f)

In [125]:
!open .

In [69]:
labels = np.array([i[-1] for i in test])

In [70]:
labels.reshape(-1,1)

array([[1],
       [0],
       [0],
       ...,
       [1],
       [0],
       [1]])

In [72]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.5)
ada.fit(,)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.5,
                   n_estimators=200, random_state=None)

In [75]:
print(roc_auc_score(labels[-2000:], np.squeeze(ada.predict_proba([[i] for i in test_with_cosin_feature[-2000:]])[:,1])))

0.7788922459617613


In [76]:
from sklearn.model_selection import GridSearchCV

base = RandomForestClassifier(n_estimators=100)
parameters = {
    "max_depth":[5,7,9,11,13],
    "min_samples_leaf":[1,3,5],
    "min_samples_split":[2,4,6,8,10],
    "max_features":["sqrt", "log2"]
}

rf_grid = GridSearchCV(base, parameters, n_jobs=-1)
rf_grid.fit([[i] for i in test_with_cosin_feature[:-2000]], labels[:-2000])
print(rf_grid.score([[i] for i in test_with_cosin_feature[-2000:]], labels[-2000:]))



0.723


# Sink-wise modelling

In [6]:
id2v = list(new_set) # [v1, v2, ...]
v2id = {} # key: node    value: index of the node in id2v
for i, j in enumerate(id2v):
    v2id[j] = i
    
print ("length of new set:")
print (len(new_set))

length of new set:
376317


In [7]:
test[0]

[3966, 2712039, 218222, 1]

In [8]:
# generate new node id dictionary
new_train = {} # key: index    value: set of connected nodes after filtering
for i in train:
    # i is source node, train[i] is sink node from i-th source node
                                        # select sink node that in our new_set
    new_train[v2id[i]] = set([v2id[j] for j in train[i] if j in new_set])

new_test = {} # key: training sample id (old id)    value: [new id for source, id for sink] 
for i, j, k,_ in test:
    # i is id, j is source, k is sink
    new_test[i] = [v2id[j], v2id[k]]
    # remove true edge
    if v2id[k] in new_train[v2id[j]]:
        new_train[v2id[j]].remove(v2id[k])

In [9]:
tA = new_train.copy()
tB = {}
for i in new_train:
    if i not in tA[i]:
        # add self connection for each node
        tA[i].add(i)
    for j in new_train[i]:
        tB[j] = tB.get(j,set([]))
        tB[j].add(i)

print ("now processing...")

now processing...


In [128]:
len(tA.keys())

20000

In [10]:
def sim(pair, tA, tB, l):
    vi, vj = pair
    tempA = np.zeros(l)
    tempB = np.zeros(l)
    tempA[list(tA[vi])] = 1/len(tA[vi])
    
    if vj in tB:
        for i in tB[vj]:
            tempB[list(tA[i])] += 1/len(tB[vj])/len(tA[i])
    return cosine_similarity([tempA, tempB])[0][1]
    #return tempA,tempB

In [133]:
len(new_test.keys())

20000

In [11]:
l = len(new_set)
res = {}
for i in tqdm(new_test, ascii=True):
    source, sink = new_test[i]
    res[i] = []
    res[i].append(sim([source, sink], tA, tB, l))
#     res[i].append(sim([vj,vi], tB, tA, l))

with open("data/rec_sim.pickle","wb") as f:
    pickle.dump(res, f)

  8%|7         | 1544/20000 [07:40<48:49,  6.30it/s]  

KeyboardInterrupt: 