In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
from py2neo import Graph
graph = Graph("bolt://localhost", auth=("neo4j", "sjsu123"))
print(graph)

<Graph database=<Database uri='bolt://localhost:7687' secure=False user_agent='py2neo/4.1.3 neo4j-python/1.6.3 Python/3.6.9-final-0 (win32)'> name='data'>


In [3]:
# positive samples
train_positive = graph.run("""
MATCH (u1:user)-[:FOLLOWS]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2, 1 AS label
""").to_data_frame()
print("len", len(train_positive))
train_positive.sample(5)

len 2102


Unnamed: 0,node1,node2,label
1133,34,345,1
1908,401,556,1
1238,74,382,1
1548,600,466,1
1322,371,403,1


In [4]:
# negative samples
train_negative = graph.run("""
MATCH (u1:user), (u2:user) WHERE NOT (u1)-[:FOLLOWS]-(u2)
RETURN id(u1) AS node1, id(u2) AS node2, 0 AS label
""").to_data_frame()

# 381,678
print("len", len(train_negative))
train_negative = train_negative.sample(2102)
train_negative.sample(5)

len 380207


Unnamed: 0,node1,node2,label
231541,377,329,0
368684,601,127,0
22206,36,120,0
310278,505,510,0
120149,195,501,0


In [5]:
training_df = train_negative.append(train_positive, ignore_index=True)
print("len", len(training_df))
training_df.sample(10)

len 4204


Unnamed: 0,node1,node2,label
3865,613,515,1
870,542,27,0
3417,199,402,1
3977,91,546,1
1588,58,116,0
1038,212,553,0
2694,191,182,1
2596,377,164,1
2343,585,65,1
2541,276,146,1


In [6]:
train_all = graph.run("""
MATCH (u1:user)-[r:TRAIN]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2, r.flag AS label
""").to_data_frame()
print("len", len(train_all))
train_all.sample(5)

len 20501


Unnamed: 0,node1,node2,label
13485,390,438,0
2012,517,65,0
714,478,27,0
8909,103,281,0
8930,105,282,0


In [7]:
from node2vec import Node2Vec

In [8]:
train_data = graph.run("""
MATCH (u1:user)-[r:TRAINVEC]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2
""").to_data_frame()
print("len", len(train_data))

train_data = nx.from_pandas_edgelist(train_data, "node1", "node2", create_using=nx.Graph())


len 619


In [9]:
node2vec = Node2Vec(train_data, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=7, min_count=1)

x = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(train_all['node1'], train_all['node2'])]


Computing transition probabilities: 100%|█████████████████████████████████████████| 620/620 [00:00<00:00, 22199.09it/s]
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 50/50 [00:14<00:00,  3.47it/s]
  


In [10]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(np.array(x), train_all['label'], 
                                                test_size = 0.3, 
                                                random_state = 35)
lr = LogisticRegression(class_weight="balanced")
print("xtrain", len(xtrain))
print("xtest", len(xtest))
lr.fit(xtrain, ytrain)
predictions = lr.predict_proba(xtest)
roc_auc_score(ytest, predictions[:,1])

xtrain 14350
xtest 6151


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8176922210693114

In [11]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, 
                                    random_state=0)
classifier.fit(xtrain, ytrain)
predictions = classifier.predict_proba(xtest)
roc_auc_score(ytest, predictions[:,1])

0.8495149730942786

In [47]:
import lightgbm as lgbm
ytrain = pd.to_numeric(ytrain, errors='coerce')
ytest = pd.to_numeric(ytest, errors='coerce')
train_data = lgbm.Dataset(xtrain, ytrain)
test_data = lgbm.Dataset(xtest, ytest)
# define parameters
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'num_threads' : 2,
    'seed' : 76
}
# train lightGBM model
model = lgbm.train(parameters,
                   train_data,
                   valid_sets=test_data,
                   num_boost_round=1000,
                   early_stopping_rounds=20)

[1]	valid_0's auc: 0.726174
Training until validation scores don't improve for 20 rounds
[2]	valid_0's auc: 0.769021
[3]	valid_0's auc: 0.790313
[4]	valid_0's auc: 0.805087
[5]	valid_0's auc: 0.814897
[6]	valid_0's auc: 0.824414
[7]	valid_0's auc: 0.829259
[8]	valid_0's auc: 0.835931
[9]	valid_0's auc: 0.840983
[10]	valid_0's auc: 0.845668
[11]	valid_0's auc: 0.850604
[12]	valid_0's auc: 0.854409
[13]	valid_0's auc: 0.856402
[14]	valid_0's auc: 0.856884
[15]	valid_0's auc: 0.858523
[16]	valid_0's auc: 0.862026
[17]	valid_0's auc: 0.863562
[18]	valid_0's auc: 0.864544
[19]	valid_0's auc: 0.867736
[20]	valid_0's auc: 0.868032
[21]	valid_0's auc: 0.871753
[22]	valid_0's auc: 0.874201
[23]	valid_0's auc: 0.876726
[24]	valid_0's auc: 0.880661
[25]	valid_0's auc: 0.882768
[26]	valid_0's auc: 0.885015
[27]	valid_0's auc: 0.88771
[28]	valid_0's auc: 0.89016
[29]	valid_0's auc: 0.890693
[30]	valid_0's auc: 0.89175
[31]	valid_0's auc: 0.892816
[32]	valid_0's auc: 0.894929
[33]	valid_0's auc: 0.8