In [1]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import re
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
from py2neo import Graph
graph = Graph("bolt://localhost", auth=("neo4j", "sjsu123"))
print(graph)

<Graph database=<Database uri='bolt://localhost:7687' secure=False user_agent='py2neo/4.1.3 neo4j-python/1.6.3 Python/3.6.9-final-0 (win32)'> name='data'>


In [27]:
# positive samples
train_positive = graph.run("""
MATCH (u1:user)-[:FOLLOWS]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2, 1 AS label
""").to_data_frame()
print("len", len(train_positive))
train_positive.sample(5)

len 2102


Unnamed: 0,node1,node2,label
1759,105,511,1
332,310,105,1
310,575,95,1
774,276,219,1
1741,539,511,1


In [28]:
# negative samples
train_negative = graph.run("""
MATCH (u1:user), (u2:user) WHERE NOT (u1)-[:FOLLOWS]-(u2)
RETURN id(u1) AS node1, id(u2) AS node2, 0 AS label
""").to_data_frame()

# 381,678
print("len", len(train_negative))
train_negative = train_negative.sample(2102)
train_negative.sample(5)

len 380207


Unnamed: 0,node1,node2,label
213778,348,380,0
243916,397,428,0
61571,100,235,0
49903,81,246,0
365899,596,411,0


In [30]:
training_df = train_negative.append(train_positive, ignore_index=True)
print("len", len(training_df))
training_df.sample(10)

len 4204


Unnamed: 0,node1,node2,label
2583,137,155,1
2318,217,58,1
77,606,209,0
3940,559,538,1
3255,446,355,1
2337,130,64,1
1386,126,492,0
1635,552,70,0
1194,310,358,0
1967,341,338,0


In [36]:
train_all = graph.run("""
MATCH (u1:user)-[r:TRAIN]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2, r.flag AS label
""").to_data_frame()
print("len", len(train_all))
train_all.sample(5)

len 20501


Unnamed: 0,node1,node2,label
17510,156,532,0
16633,320,510,0
6160,31,200,0
20378,93,617,0
10839,342,356,0


In [60]:
from node2vec import Node2Vec

<networkx.classes.graph.Graph at 0x1a28d72af28>

In [64]:
train_data = graph.run("""
MATCH (u1:user)-[r:TRAINVEC]->(u2:user)
RETURN id(u1) AS node1, id(u2) AS node2
""").to_data_frame()
print("len", len(train_data))

train_data = nx.from_pandas_edgelist(train_data, "node1", "node2", create_using=nx.Graph())


len 619


In [65]:
node2vec = Node2Vec(train_data, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=7, min_count=1)

x = [(n2w_model[str(i)]+n2w_model[str(j)]) for i,j in zip(train_all['node1'], train_all['node2'])]
#len(x)

Computing transition probabilities: 100%|█████████████████████████████████████████| 620/620 [00:00<00:00, 22201.56it/s]
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 50/50 [00:13<00:00,  3.63it/s]
  


In [67]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(np.array(x), train_all['label'], 
                                                test_size = 0.3, 
                                                random_state = 35)
lr = LogisticRegression(class_weight="balanced")
print("xtrain", len(xtrain))
print("xtest", len(xtest))
lr.fit(xtrain, ytrain)
predictions = lr.predict_proba(xtest)
roc_auc_score(ytest, predictions[:,1])

xtrain 14350
xtest 6151


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.815360751924358

In [70]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, 
                                    random_state=0)
classifier.fit(xtrain, ytrain)
predictions = classifier.predict_proba(xtest)
roc_auc_score(ytest, predictions[:,1])

0.8467584900511289