In [439]:
import pandas as pd
import numpy as np
import json
from graphdatascience import GraphDataScience

In [440]:
# neo4j desktop m1, 5.3.0
host = "bolt://localhost:7687"
user = "neo4j"
password= "j4oenj4oen"

gds = GraphDataScience(host, auth=(user, password))
print(gds.version())

2.3.2


In [441]:
# # (opt1) project graph in memory by pandas read_csv
# cora_papers = "data/cora_papers.csv"
# cora_cites = "data/cora_cites.csv"
# papers = pd.read_csv(cora_papers, header=None)
# cites = pd.read_csv(cora_cites, header=None)

In [442]:
# # create nodes dataframe from csv
subject_to_id = {
    "Neural_Networks": 0,
    "Rule_Learning": 1,
    "Reinforcement_Learning": 2,
    "Probabilistic_Methods": 3,
    "Theory": 4,
    "Genetic_Algorithms": 5,
    "Case_Based": 6,
}
# nodes = pd.DataFrame().assign(
#     nodeId=papers[0],
#     labels="paper",
#     subject=papers[1].replace(subject_to_id),
#     features=papers.iloc[:, 2:].apply(list, axis=1),
# )
# nodes.head()

In [443]:
# # create edges dataframe from csv
# dir_relationships = pd.DataFrame().assign(sourceNodeId=cites[0], targetNodeId=cites[1], relationshipType="CITES")
# inv_relationships = pd.DataFrame().assign(sourceNodeId=cites[1], targetNodeId=cites[0], relationshipType="CITES")
# relationships = pd.concat([dir_relationships, inv_relationships]).drop_duplicates()
# relationships.head()

In [444]:
# # create in-memory graph
# G = gds.alpha.graph.construct("cora-graph", nodes, relationships)
# gds.graph.list()

In [445]:
# (opt2) write to neo4j by LOAD CSV cypher
# file should be placed in import/ if using e.g., LOAD CSV FROM $url AS row FIELDTERMINATOR ' ' in cypher ('url': 'file:///...')
subject_map = json.dumps(subject_to_id).replace('"', "`")
HOLDOUT_NODES = 10

load_nodes = f"""
  LOAD CSV FROM "file:///cora_papers.csv" AS row
  WITH 
    {subject_map} AS subject_to_id,
    toInteger(row[0]) AS nodeId, 
    row[1] AS subject, 
    toIntegerList(row[2..]) AS features
  MERGE (p:Paper {{nodeId: nodeId, subject: subject_to_id[subject], features: features}})
  WITH p LIMIT {HOLDOUT_NODES}
  REMOVE p:Paper
  SET p:UnclassifiedPaper
"""

load_relationships = f"""
  LOAD CSV FROM "file:///cora_cites.csv" AS row
  MATCH (n), (m) 
  WHERE n.nodeId = toInteger(row[0]) AND m.nodeId = toInteger(row[1])
  MERGE (n)-[:CITES]->(m)
"""

# Load nodes and relationships on Neo4j
gds.run_cypher(load_nodes)
gds.run_cypher(load_relationships)

In [446]:
# Create the projected graph containing both classified and unclassified nodes
G, _ = gds.graph.project(
    "cora-graph",
    {"Paper": {"properties": ["features", "subject"]}, "UnclassifiedPaper": {"properties": ["features"]}},
    {"CITES": {"orientation": "UNDIRECTED", "aggregation": "SINGLE"}},
)

assert G.node_count() == 2708
assert G.relationship_count() == 10556

In [447]:
# Create the pipeline
node_pipeline, _ = gds.beta.pipeline.nodeClassification.create("cora-pipeline")

# "Mark" some node properties that will be used as features
node_pipeline.selectFeatures(["features"])
# If needed, change the train/test split ratio and the number of folds
# for k-fold cross-validation
node_pipeline.configureSplit(testFraction=0.2, validationFolds=5)
# Add a model candidate to train (addLogisticRegression, addRandomForest, addMLP)
node_pipeline.addLogisticRegression(maxEpochs=200, penalty=(0.0, 0.5))
# Explicit set the number of trials for autotuning (default = 10)
node_pipeline.configureAutoTuning(maxTrials=5)

name                                                     cora-pipeline
nodePropertySteps                                                   []
featureProperties                                           [features]
splitConfig                {'testFraction': 0.2, 'validationFolds': 5}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [448]:
# (opt) use train_estimate to estimate the resources needed for training the model
model, stats = node_pipeline.train(
    G,
    targetNodeLabels=["Paper"],
    modelName="cora-pipeline-model",
    targetProperty="subject",
    metrics=["F1_WEIGHTED"],
    randomSeed=42,
    concurrency=4,
)
print(stats["modelInfo"]["metrics"]["F1_WEIGHTED"]["test"])

# # print all stats
# print(stats.to_json(indent=2))

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

0.7287325951256631


In [449]:
# use the model to predict
classes = stats["modelInfo"]["classes"]
print("Class labels:", classes)
predicted = model.predict_stream(
    G, modelName="cora-pipeline-model", includePredictedProbabilities=True, targetNodeLabels=["UnclassifiedPaper"]
)
# Calculate the confidence percentage for the predicted class
predicted["confidence"] = predicted.apply(
    lambda row: np.floor(row["predictedProbabilities"][classes.index(row["predictedClass"])] * 100), axis=1
)
predicted

Class labels: [0, 1, 2, 3, 4, 5, 6]


Unnamed: 0,nodeId,predictedClass,predictedProbabilities,confidence
0,2064,0,"[0.43331478320672767, 0.07090129256540618, 0.0...",43.0
1,2065,5,"[0.148825521214885, 0.10248697035946847, 0.047...",20.0
2,2066,2,"[0.2504120326569696, 0.04005138780445343, 0.47...",47.0
3,2067,2,"[0.02980749734591193, 0.031686487722152004, 0....",70.0
4,2068,3,"[0.07802459101179884, 0.05023837133685244, 0.0...",61.0
5,2069,5,"[0.10291879029602394, 0.16396779287525406, 0.1...",18.0
6,2070,6,"[0.08536939302774896, 0.055509897709602504, 0....",49.0
7,2071,0,"[0.4794333200887491, 0.03573533430796032, 0.06...",47.0
8,2072,0,"[0.28146539850322, 0.07296843717126493, 0.0966...",28.0
9,2073,4,"[0.1513538433752605, 0.05617760124948276, 0.16...",47.0


In [450]:
# compare results

# Retrieve node information from Neo4j using the node IDs from the prediction result
nodes = gds.util.asNodes(predicted.nodeId.to_list())

# Create a new DataFrame containing node IDs along with node properties
nodes_df = pd.DataFrame([(node.id, node["subject"]) for node in nodes], columns=["nodeId", "subject"])

# Merge with the prediction result on node IDs, to check the predicted value
# against the original subject
#
# NOTE: This could also be replaced by just appending `node["subject"]` as a
# Series since the node order would not change, but a proper merge (or join)
# is clearer and less prone to errors.
predicted.merge(nodes_df, on="nodeId")

Unnamed: 0,nodeId,predictedClass,predictedProbabilities,confidence,subject
0,2064,0,"[0.43331478320672767, 0.07090129256540618, 0.0...",43.0,0
1,2065,5,"[0.148825521214885, 0.10248697035946847, 0.047...",20.0,1
2,2066,2,"[0.2504120326569696, 0.04005138780445343, 0.47...",47.0,2
3,2067,2,"[0.02980749734591193, 0.031686487722152004, 0....",70.0,2
4,2068,3,"[0.07802459101179884, 0.05023837133685244, 0.0...",61.0,3
5,2069,5,"[0.10291879029602394, 0.16396779287525406, 0.1...",18.0,3
6,2070,6,"[0.08536939302774896, 0.055509897709602504, 0....",49.0,4
7,2071,0,"[0.4794333200887491, 0.03573533430796032, 0.06...",47.0,0
8,2072,0,"[0.28146539850322, 0.07296843717126493, 0.0966...",28.0,0
9,2073,4,"[0.1513538433752605, 0.05617760124948276, 0.16...",47.0,4


In [451]:
# Create the pipeline fastrp
node_pipeline_fastrp, _ = gds.beta.pipeline.nodeClassification.create("cora-pipeline-fastrp")
# List all pipelines
gds.beta.pipeline.list()

node_pipeline_fastrp.addNodeProperty(
    "fastRP",
    mutateProperty="embedding",
    embeddingDimension=512,
    propertyRatio=1.0,
    randomSeed=42,
    featureProperties=["features"],
    contextNodeLabels=["Paper", "UnclassifiedPaper"],
)

# With the node embeddings available as features, we no longer use the original raw `features`.
node_pipeline_fastrp.selectFeatures(["embedding"])

# Configure the pipeline as before
node_pipeline_fastrp.configureSplit(testFraction=0.2, validationFolds=5)
node_pipeline_fastrp.addLogisticRegression(maxEpochs=200, penalty=(0.0, 0.5))
node_pipeline.configureAutoTuning(maxTrials=5)

name                                                     cora-pipeline
nodePropertySteps                                                   []
featureProperties                                           [features]
splitConfig                {'testFraction': 0.2, 'validationFolds': 5}
autoTuningConfig                                      {'maxTrials': 5}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [452]:
# Perform the actual training
model_fastrp, stats_fastrp = node_pipeline_fastrp.train(
    G,
    targetNodeLabels=["Paper"],
    modelName="cora-pipeline-model-fastrp",
    targetProperty="subject",
    metrics=["F1_WEIGHTED"],
    randomSeed=42,
    concurrency=4,
)
print(stats_fastrp["modelInfo"]["metrics"]["F1_WEIGHTED"]["test"])

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

0.8323674420565268


In [453]:
# predicted_fastrp = model_fastrp.predict_stream(
#     G, modelName="cora-pipeline-model-fastrp", includePredictedProbabilities=True, targetNodeLabels=["UnclassifiedPaper"],
# )

model_fastrp.predict_mutate(
    G,
    mutateProperty="predictedClass",
    modelName="cora-pipeline-model-fastrp",
    predictedProbabilityProperty="predictedProbabilities",
    targetNodeLabels=["UnclassifiedPaper"],
)

predicted_fastrp = gds.graph.streamNodeProperty(G, "predictedClass", ["UnclassifiedPaper"])
predicted_fastrp

Node Classification Predict Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,nodeId,propertyValue
0,2064,0
1,2065,1
2,2066,2
3,2067,2
4,2068,3
5,2069,3
6,2070,4
7,2071,0
8,2072,0
9,2073,4


In [454]:
# compare results

# Retrieve node information from Neo4j using the node IDs from the prediction result
nodes = gds.util.asNodes(predicted_fastrp.nodeId.to_list())

# Create a new DataFrame containing node IDs along with node properties
nodes_df = pd.DataFrame([(node.id, node["subject"]) for node in nodes], columns=["nodeId", "subject"])

# Merge with the prediction result on node IDs, to check the predicted value
# against the original subject
#
# NOTE: This could also be replaced by just appending `node["subject"]` as a
# Series since the node order would not change, but a proper merge (or join)
# is clearer and less prone to errors.
predicted_fastrp.merge(nodes_df, on="nodeId")

Unnamed: 0,nodeId,propertyValue,subject
0,2064,0,0
1,2065,1,1
2,2066,2,2
3,2067,2,2
4,2068,3,3
5,2069,3,3
6,2070,4,4
7,2071,0,0
8,2072,0,0
9,2073,4,4


In [455]:
# write back to neo4j
gds.graph.nodeProperties.write(
    G,
    node_properties=["predictedClass"],
    node_labels=["UnclassifiedPaper"],
)

writeMillis                        25
graphName                  cora-graph
nodeProperties       [predictedClass]
propertiesWritten                  10
Name: 0, dtype: object

In [456]:
node_pipeline.drop()
node_pipeline_fastrp.drop()
model.drop()
model_fastrp.drop()

# free up memory
G.drop()

query = """
MATCH (n) DETACH DELETE n
"""
gds.run_cypher(query)
gds.close()