In [820]:
import getpass
import json
import os

import numpy as np
import pandas as pd
from graphdatascience import GraphDataScience

# neo4j desktop (5.3.0, m1)

In [821]:
# Connection settings for the Neo4j instance. They are read from the environment
# so that credentials are not stored in the notebook; host and user fall back to
# the local defaults.
host = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# Prompt interactively only when NEO4J_PASSWORD is not set (or is empty).
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass(f"Neo4j password for {user}: ")

gds = GraphDataScience(host, auth=(user, password))
print(gds.version())

2.3.2


# (preprocessing) Map subject names to integer class ids for the ML pipeline

In [822]:
# Subject names listed in the order of the integer id each one is assigned.
subject_names = [
    "Neural_Networks",
    "Rule_Learning",
    "Reinforcement_Learning",
    "Probabilistic_Methods",
    "Theory",
    "Genetic_Algorithms",
    "Case_Based",
]
# Subject name -> integer id (its position in the list above).
subject_to_id = {name: position for position, name in enumerate(subject_names)}

# (opt1) project graph in memory by pandas read_csv

In [823]:
# Disabled alternative loader, kept for reference: reads both CSV files with
# pandas, adds each citation in both directions, and builds the in-memory
# graph directly with gds.alpha.graph.construct.
# cora_papers = "data/cora_papers.csv"
# cora_cites = "data/cora_cites.csv"
# papers = pd.read_csv(cora_papers, header=None)
# cites = pd.read_csv(cora_cites, header=None)
# nodes = pd.DataFrame().assign(
#     nodeId=papers[0],
#     labels="paper",
#     subject=papers[1].replace(subject_to_id),
#     features=papers.iloc[:, 2:].apply(list, axis=1),
# )
# dir_relationships = pd.DataFrame().assign(sourceNodeId=cites[0], targetNodeId=cites[1], relationshipType="CITES")
# inv_relationships = pd.DataFrame().assign(sourceNodeId=cites[1], targetNodeId=cites[0], relationshipType="CITES")
# relationships = pd.concat([dir_relationships, inv_relationships]).drop_duplicates()
# G = gds.alpha.graph.construct("cora-graph", nodes, relationships)

# (opt2) write to neo4j by LOAD CSV cypher

In [824]:
HOLDOUT_NODES = 10  # number of papers relabeled as UnclassifiedPaper (for test)

# Both CSV files should be placed in the Neo4j import/ directory.
#
# The subject mapping and the holdout size are passed as query parameters
# ($subject_to_id, $holdout_nodes) rather than being formatted into the Cypher
# text, so no quoting tricks are needed to turn the Python dict into a map.
#
# NOTE(review): MERGE matches on the :Paper label, and the holdout nodes are
# relabeled afterwards, so re-running this cell without first deleting the nodes
# presumably creates the holdout papers a second time — confirm, and run it
# against a database that does not already contain them.
load_nodes = """
  LOAD CSV FROM "file:///cora_papers.csv" AS row
  WITH
    $subject_to_id AS subject_to_id,
    toInteger(row[0]) AS nodeId,
    row[1] AS subject,
    toIntegerList(row[2..]) AS features
  MERGE (p:Paper {nodeId: nodeId, subject: subject_to_id[subject], features: features})
  WITH p LIMIT $holdout_nodes
  REMOVE p:Paper
  SET p:UnclassifiedPaper
"""
# Connects every pair of loaded nodes named by a row of the citation file.
load_relationships = """
  LOAD CSV FROM "file:///cora_cites.csv" AS row
  MATCH (n), (m)
  WHERE n.nodeId = toInteger(row[0]) AND m.nodeId = toInteger(row[1])
  MERGE (n)-[:CITES]->(m)
"""
gds.run_cypher(
    load_nodes,
    params={"subject_to_id": subject_to_id, "holdout_nodes": HOLDOUT_NODES},
)
gds.run_cypher(load_relationships)

# Create the projected graph containing both classified and unclassified nodes

In [825]:
# Drop any projection left over under the same name before projecting again.
if gds.graph.exists("cora-graph")["exists"]:
    gds.graph.get("cora-graph").drop()

# Paper nodes carry the `subject` target; UnclassifiedPaper nodes are projected
# with `features` only.
node_projection = {
    "Paper": {"properties": ["features", "subject"]},
    "UnclassifiedPaper": {"properties": ["features"]},
}
relationship_projection = {
    "CITES": {"orientation": "UNDIRECTED", "aggregation": "SINGLE"},
}
G, _ = gds.graph.project("cora-graph", node_projection, relationship_projection)

# Sanity-check the size of the projected graph.
assert G.node_count() == 2708
assert G.relationship_count() == 10556

# (opt) Add graphical info as node properties (bad)

In [826]:
# Disabled experiment, kept for reference: writes triangle count, local
# clustering coefficient, label-propagation partition and the smallest Louvain
# community id as node properties, then re-projects "cora-graph" with those
# properties included.
# gds.triangleCount.write(
#     G,
#     nodeLabels=["Paper", "UnclassifiedPaper"],
#     relationshipTypes=["CITES"],
#     writeProperty="triangles"
# )

# gds.localClusteringCoefficient.write(
#     G,
#     nodeLabels=["Paper", "UnclassifiedPaper"],
#     relationshipTypes=["CITES"],
#     writeProperty="coefficient"
# )

# gds.labelPropagation.write(
#     G,
#     nodeLabels=["Paper", "UnclassifiedPaper"],
#     relationshipTypes=["CITES"],
#     writeProperty="partition"
# )

# query = """
# CALL gds.louvain.stream("cora-graph", {
#   nodeLabels: ["Paper", "UnclassifiedPaper"],
#   relationshipTypes: ["CITES"],
#   includeIntermediateCommunities: true
# })
# YIELD nodeId, intermediateCommunityIds
# WITH gds.util.asNode(nodeId) AS node, intermediateCommunityIds[0] AS smallestCommunity
# SET node.louvain = smallestCommunity
# """
# gds.run_cypher(query)

# G.drop()
# G, _ = gds.graph.project(
#     "cora-graph",
#     {"Paper": {"properties": ["features", "subject", "triangles", "coefficient", "partition", "louvain"]}, "UnclassifiedPaper": {"properties": ["features", "triangles", "coefficient", "partition", "louvain"]}},
#     {"CITES": {"orientation": "UNDIRECTED", "aggregation": "SINGLE"}},
# )

# 1. Create the pipeline (baseline)

In [827]:
node_pipeline, _ = gds.beta.pipeline.nodeClassification.create("cora-pipeline")

# Baseline: the model is trained on the `features` property as-is.
node_pipeline.selectFeatures(["features"])

split_config = {"testFraction": 0.2, "validationFolds": 5}
node_pipeline.configureSplit(**split_config)

# Model candidate to train (other options: addRandomForest, addMLP).
logistic_regression_config = {"maxEpochs": 200, "penalty": (0.0, 0.5)}
node_pipeline.addLogisticRegression(**logistic_regression_config)

# Explicitly set the number of trials for autotuning (default = 10).
node_pipeline.configureAutoTuning(maxTrials=5)

name                                                     cora-pipeline
nodePropertySteps                                                   []
featureProperties                                           [features]
splitConfig                {'testFraction': 0.2, 'validationFolds': 5}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

# 1. Train --> model

In [828]:
# (opt) train_estimate can be used to estimate the resources needed for training the model
train_config = dict(
    targetNodeLabels=["Paper"],
    modelName="cora-pipeline-model",
    targetProperty="subject",
    metrics=["F1_WEIGHTED"],
    randomSeed=42,
    concurrency=4,
)
model, stats = node_pipeline.train(G, **train_config)

# F1_WEIGHTED score reported under the "test" key of the training stats
test_f1_weighted = stats["modelInfo"]["metrics"]["F1_WEIGHTED"]["test"]
print(test_f1_weighted)
# # print all stats
# print(stats.to_json(indent=2))

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

0.7287325951256632


# 1. Use model to predict

In [829]:
classes = stats["modelInfo"]["classes"]
print("Class labels:", classes)

predicted = model.predict_stream(
    G,
    modelName="cora-pipeline-model",
    includePredictedProbabilities=True,
    targetNodeLabels=["UnclassifiedPaper"],
)


def _confidence_percent(row):
    """Return the probability of the row's predicted class as a floored percentage."""
    # The probability is looked up at the position the predicted class has in `classes`.
    class_position = classes.index(row["predictedClass"])
    return np.floor(row["predictedProbabilities"][class_position] * 100)


predicted["confidence"] = predicted.apply(_confidence_percent, axis=1)
predicted

Class labels: [0, 1, 2, 3, 4, 5, 6]


Unnamed: 0,nodeId,predictedClass,predictedProbabilities,confidence
0,22645,0,"[0.43331478320672756, 0.0709012925654062, 0.02...",43.0
1,22646,5,"[0.14882552121488493, 0.1024869703594684, 0.04...",20.0
2,22647,2,"[0.25041203265697, 0.04005138780445345, 0.4716...",47.0
3,22648,2,"[0.029807497345911937, 0.03168648772215201, 0....",70.0
4,22649,3,"[0.07802459101179882, 0.05023837133685244, 0.0...",61.0
5,22650,5,"[0.10291879029602394, 0.16396779287525398, 0.1...",18.0
6,22651,6,"[0.08536939302774893, 0.0555098977096025, 0.11...",49.0
7,22652,0,"[0.47943332008874884, 0.035735334307960304, 0....",47.0
8,22653,0,"[0.28146539850321994, 0.07296843717126493, 0.0...",28.0
9,22654,4,"[0.15135384337526053, 0.05617760124948276, 0.1...",47.0


# 1. Verify the prediction (baseline)

In [830]:
# Fetch the database nodes behind the predictions and join their stored
# `subject` next to the predicted class.
predicted_node_ids = predicted.nodeId.to_list()
nodes = gds.util.asNodes(predicted_node_ids)
subject_records = [(node.id, node["subject"]) for node in nodes]
nodes_df = pd.DataFrame(subject_records, columns=["nodeId", "subject"])
predicted.merge(nodes_df, on="nodeId")

Unnamed: 0,nodeId,predictedClass,predictedProbabilities,confidence,subject
0,22645,0,"[0.43331478320672756, 0.0709012925654062, 0.02...",43.0,0
1,22646,5,"[0.14882552121488493, 0.1024869703594684, 0.04...",20.0,1
2,22647,2,"[0.25041203265697, 0.04005138780445345, 0.4716...",47.0,2
3,22648,2,"[0.029807497345911937, 0.03168648772215201, 0....",70.0,2
4,22649,3,"[0.07802459101179882, 0.05023837133685244, 0.0...",61.0,3
5,22650,5,"[0.10291879029602394, 0.16396779287525398, 0.1...",18.0,3
6,22651,6,"[0.08536939302774893, 0.0555098977096025, 0.11...",49.0,4
7,22652,0,"[0.47943332008874884, 0.035735334307960304, 0....",47.0,0
8,22653,0,"[0.28146539850321994, 0.07296843717126493, 0.0...",28.0,0
9,22654,4,"[0.15135384337526053, 0.05617760124948276, 0.1...",47.0,4


# 2. Create the pipeline (fastrp)

In [831]:
node_pipeline_fastrp, _ = gds.beta.pipeline.nodeClassification.create("cora-pipeline-fastrp")

# Node-property step: fastRP stores its result in the `embedding` property.
fastrp_config = dict(
    mutateProperty="embedding",
    embeddingDimension=512,
    propertyRatio=1.0,
    randomSeed=42,
    featureProperties=["features"],
    # Exception: this step uses contextNodeLabels and contextRelationshipTypes
    # (not nodeLabels and relationshipTypes).
    contextNodeLabels=["Paper", "UnclassifiedPaper"],
)
node_pipeline_fastrp.addNodeProperty("fastRP", **fastrp_config)

# With the node embeddings available as features, the original raw `features` are no longer used.
node_pipeline_fastrp.selectFeatures(["embedding"])
# node_pipeline_fastrp.selectFeatures(["embedding", "triangles", "coefficient", "partition", "louvain"])

split_config = {"testFraction": 0.2, "validationFolds": 5}
node_pipeline_fastrp.configureSplit(**split_config)

logistic_regression_config = {"maxEpochs": 200, "penalty": (0.0, 0.5)}
node_pipeline_fastrp.addLogisticRegression(**logistic_regression_config)

node_pipeline_fastrp.configureAutoTuning(maxTrials=5)

name                                              cora-pipeline-fastrp
nodePropertySteps    [{'name': 'gds.fastRP.mutate', 'config': {'ran...
featureProperties                                          [embedding]
splitConfig                {'testFraction': 0.2, 'validationFolds': 5}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

# 2. Train --> model_fastrp

In [832]:
# Train the fastRP pipeline on the labeled Paper nodes.
train_config_fastrp = dict(
    targetNodeLabels=["Paper"],
    modelName="cora-pipeline-model-fastrp",
    targetProperty="subject",
    metrics=["F1_WEIGHTED"],
    randomSeed=42,
    concurrency=4,
)
model_fastrp, stats_fastrp = node_pipeline_fastrp.train(G, **train_config_fastrp)

# F1_WEIGHTED score reported under the "test" key of the training stats
test_f1_weighted_fastrp = stats_fastrp["modelInfo"]["metrics"]["F1_WEIGHTED"]["test"]
print(test_f1_weighted_fastrp)

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

0.8323674420565268


# 2. Use model_fastrp to predict

In [833]:
# Streaming alternative, kept for reference:
# predicted_fastrp = model_fastrp.predict_stream(
#     G, modelName="cora-pipeline-model-fastrp", includePredictedProbabilities=True, targetNodeLabels=["UnclassifiedPaper"],
# )

# Store the predictions for the UnclassifiedPaper nodes as properties of the projected graph.
mutate_config_fastrp = dict(
    mutateProperty="predictedClass_fastrp",
    modelName="cora-pipeline-model-fastrp",
    predictedProbabilityProperty="predictedProbabilities_fastrp",
    targetNodeLabels=["UnclassifiedPaper"],
)
model_fastrp.predict_mutate(G, **mutate_config_fastrp)

# Stream the predicted class back for the UnclassifiedPaper nodes.
predicted_fastrp = gds.graph.streamNodeProperty(G, "predictedClass_fastrp", ["UnclassifiedPaper"])
predicted_fastrp

Node Classification Predict Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,nodeId,propertyValue
0,22645,0
1,22646,1
2,22647,2
3,22648,2
4,22649,3
5,22650,3
6,22651,4
7,22652,0
8,22653,0
9,22654,4


# 2. Verify the prediction (fastrp)

In [834]:
# Join the stored `subject` of each predicted node next to the fastRP prediction.
fastrp_node_ids = predicted_fastrp.nodeId.to_list()
nodes = gds.util.asNodes(fastrp_node_ids)
subject_records = [(node.id, node["subject"]) for node in nodes]
nodes_df = pd.DataFrame(subject_records, columns=["nodeId", "subject"])
predicted_fastrp.merge(nodes_df, on="nodeId")

Unnamed: 0,nodeId,propertyValue,subject
0,22645,0,0
1,22646,1,1
2,22647,2,2
3,22648,2,2
4,22649,3,3
5,22650,3,3
6,22651,4,4
7,22652,0,0
8,22653,0,0
9,22654,4,4


# 3. Create the pipeline (hashGNN)

In [835]:
node_pipeline_hashgnn, _ = gds.beta.pipeline.nodeClassification.create("cora-pipeline-hashgnn")

# Node-property step: HashGNN stores its result in the `embedding` property.
hashgnn_config = dict(
    mutateProperty="embedding",
    iterations=4,
    heterogeneous=True,
    embeddingDensity=512,
    neighborInfluence=0.7,
    featureProperties=["features"],
    randomSeed=42,
    # Exception: this step uses contextNodeLabels and contextRelationshipTypes
    # (not nodeLabels and relationshipTypes).
    contextNodeLabels=["Paper", "UnclassifiedPaper"],
    # binarizeFeatures={"dimension": 8, "threshold": 0}, # too bad !!!
)
node_pipeline_hashgnn.addNodeProperty("beta.hashgnn", **hashgnn_config)

# With the node embeddings available as features, the original raw `features` are no longer used.
node_pipeline_hashgnn.selectFeatures(["embedding"])
# node_pipeline_hashgnn.selectFeatures(["embedding", "triangles", "coefficient", "partition", "louvain"])

split_config = {"testFraction": 0.2, "validationFolds": 5}
node_pipeline_hashgnn.configureSplit(**split_config)

logistic_regression_config = {"maxEpochs": 200, "penalty": (0.0, 0.5)}
node_pipeline_hashgnn.addLogisticRegression(**logistic_regression_config)

node_pipeline_hashgnn.configureAutoTuning(maxTrials=5)

name                                             cora-pipeline-hashgnn
nodePropertySteps    [{'name': 'gds.beta.hashgnn.mutate', 'config':...
featureProperties                                          [embedding]
splitConfig                {'testFraction': 0.2, 'validationFolds': 5}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

# 3. Train --> model_hashgnn

In [836]:
# Train the HashGNN pipeline on the labeled Paper nodes.
train_config_hashgnn = dict(
    targetNodeLabels=["Paper"],
    modelName="cora-pipeline-model-hashgnn",
    targetProperty="subject",
    metrics=["F1_WEIGHTED"],
    randomSeed=42,
    concurrency=4,
)
model_hashgnn, stats_hashgnn = node_pipeline_hashgnn.train(G, **train_config_hashgnn)

# F1_WEIGHTED score reported under the "test" key of the training stats
test_f1_weighted_hashgnn = stats_hashgnn["modelInfo"]["metrics"]["F1_WEIGHTED"]["test"]
print(test_f1_weighted_hashgnn)

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

0.874485088125894


# 3. Use model_hashgnn to predict

In [837]:
# Store the predictions for the UnclassifiedPaper nodes as properties of the projected graph.
mutate_config_hashgnn = dict(
    mutateProperty="predictedClass_hashgnn",
    modelName="cora-pipeline-model-hashgnn",
    predictedProbabilityProperty="predictedProbabilities_hashgnn",
    targetNodeLabels=["UnclassifiedPaper"],
)
model_hashgnn.predict_mutate(G, **mutate_config_hashgnn)

# Stream the predicted class back for the UnclassifiedPaper nodes.
predicted_hashgnn = gds.graph.streamNodeProperty(G, "predictedClass_hashgnn", ["UnclassifiedPaper"])
predicted_hashgnn

Node Classification Predict Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,nodeId,propertyValue
0,22645,0
1,22646,1
2,22647,2
3,22648,2
4,22649,3
5,22650,3
6,22651,4
7,22652,0
8,22653,0
9,22654,4


# 3. Verify the prediction (hashgnn)

In [838]:
# Join the stored `subject` of each predicted node next to the HashGNN prediction.
hashgnn_node_ids = predicted_hashgnn.nodeId.to_list()
nodes = gds.util.asNodes(hashgnn_node_ids)
subject_records = [(node.id, node["subject"]) for node in nodes]
nodes_df = pd.DataFrame(subject_records, columns=["nodeId", "subject"])
predicted_hashgnn.merge(nodes_df, on="nodeId")

Unnamed: 0,nodeId,propertyValue,subject
0,22645,0,0
1,22646,1,1
2,22647,2,2
3,22648,2,2
4,22649,3,3
5,22650,3,3
6,22651,4,4
7,22652,0,0
8,22653,0,0
9,22654,4,4


# Write back to neo4j

In [839]:
# Write the predicted-class properties of both embedding models from the
# projected graph back to the UnclassifiedPaper nodes.
prediction_properties = ["predictedClass_fastrp", "predictedClass_hashgnn"]
gds.graph.nodeProperties.write(
    G,
    node_properties=prediction_properties,
    node_labels=["UnclassifiedPaper"],
)

writeMillis                                                       17
graphName                                                 cora-graph
nodeProperties       [predictedClass_fastrp, predictedClass_hashgnn]
propertiesWritten                                                 20
Name: 0, dtype: object

# (postprocessing) free up memory

In [840]:
# Release everything this notebook created: pipelines, trained models, the
# in-memory projection, and the loaded nodes in the database.
# The delete is limited to the two labels this notebook assigns, so unrelated
# data in the same database is left untouched.
query = """
MATCH (n)
WHERE n:Paper OR n:UnclassifiedPaper
DETACH DELETE n
"""
try:
    for resource in (
        node_pipeline,
        node_pipeline_fastrp,
        node_pipeline_hashgnn,
        model,
        model_fastrp,
        model_hashgnn,
        G,
    ):
        resource.drop()
    gds.run_cypher(query)
finally:
    # Close the connection even if one of the drops or the delete fails.
    gds.close()