In [1]:
import os
import pandas as pd
import numpy as np
from graphdatascience import GraphDataScience
from dotenv import load_dotenv, find_dotenv

pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [2]:
# Utilize python-dotenv to load AuraDS credentials securely

load_dotenv(find_dotenv())

NEO4J_AURA_URI = os.getenv("NEO4J_AURA_URI")
NEO4J_AURA_USR = os.getenv("NEO4J_AURA_USR")
NEO4J_AURA_PWD = os.getenv("NEO4J_AURA_PWD")

gds = GraphDataScience(endpoint=NEO4J_AURA_URI,
                       auth=(NEO4J_AURA_USR, NEO4J_AURA_PWD),
                       aura_ds=True)

In [3]:
# Call a GDS method to verify version

print(gds.version())

2.1.5


In [26]:
# Check Projected Graphs

gds.run_cypher("CALL gds.graph.list();")

# Check Pipelines
gds.run_cypher("CALL gds.beta.pipeline.list();")

# Check Models
gds.run_cypher("CALL gds.beta.model.list();")

Unnamed: 0,modelInfo,trainConfig,graphSchema,loaded,stored,creationTime,shared
0,"{'pipeline': {'nodePropertySteps': [{'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'iterationWeights': [0.8, 1, 1, 1], 'embeddingDimension': 256, 'mutateProperty': 'embedding'}}, {'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank'}}, {'name': 'gds.betweenness.mutate', 'config': {'mutateProperty': 'betweenness'}}], 'featureSteps': [{'name': 'HADAMARD', 'config': {'nodeProperties': ['embedding', 'pageRank', 'betweenness']}}]}, 'modelName': 'lp-pipeline-model', 'modelType': 'LinkPrediction', 'metrics': {'AUCPR': {'test': 0.7426907040948699, 'outerTrain': 0.7697504245867193, 'validation': {'avg': 0.9503151747573046, 'min': 0.871381625279237, 'max': 0.9966501752216039}, 'train': {'avg': 0.959932645328202, 'min': 0.953883287355528, 'max': 0.9711247646872891}}}, 'bestParameters': {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.1, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.01, 'learningRate': 0.001}}","{'pipeline': 'pipe', 'randomSeed': 42, 'graphName': 'compound', 'jobId': '76114c78-d828-4373-956b-0dbd6f903a6f', 'modelName': 'lp-pipeline-model', 'negativeClassWeight': 1.0, 'metrics': ['AUCPR'], 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'concurrency': 4, 'username': None}","{'graphProperties': {}, 'relationships': {'TREATS_CtD': {}}, 'nodes': {'Disease': {}, 'Compound': {}}}",False,True,2022-07-15T13:49:26.516842000+00:00,False


# Create Link Prediction Pipeline

In [27]:
gds.run_cypher("CALL gds.beta.model.drop('lp-pipeline-model')")

q = f'''
CALL gds.beta.pipeline.linkPrediction.create('pipe')
YIELD name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace;
'''
print(q)
gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.create('pipe')
YIELD name, nodePropertySteps, featureSteps, splitConfig, autoTuningConfig, parameterSpace;



Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,[],[],"{'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}",{'maxTrials': 10},"{'RandomForest': [], 'LogisticRegression': []}"


# Add Fast RP Embeddings
- fastRP algo in mutate mode (before training) ensures the embedding property can be used as an input for link features

In [28]:
q = '''
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  iterationWeights: [0.8, 1, 1, 1],
  //normalizationStrength: 0.5,
  randomSeed: 42
});
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  iterationWeights: [0.8, 1, 1, 1],
  //normalizationStrength: 0.5,
  randomSeed: 42
});



Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,"[{'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'iterationWeights': [0.8, 1, 1, 1], 'embeddingDimension': 256, 'mutateProperty': 'embedding'}}]",[],"{'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}",{'maxTrials': 10},"{'RandomForest': [], 'LogisticRegression': []}"


# Add pageRank Feature

In [29]:
q = '''
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'pageRank', {
    mutateProperty: 'pageRank'
    });
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'pageRank', {
    mutateProperty: 'pageRank'
    });



Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,"[{'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'iterationWeights': [0.8, 1, 1, 1], 'embeddingDimension': 256, 'mutateProperty': 'embedding'}}, {'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank'}}]",[],"{'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}",{'maxTrials': 10},"{'RandomForest': [], 'LogisticRegression': []}"


# Add Betweenness Centrality Feature

In [30]:
q = '''
CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'betweenness', {
    mutateProperty: 'betweenness'
    });
    '''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'betweenness', {
    mutateProperty: 'betweenness'
    });
    


Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
0,pipe,"[{'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'iterationWeights': [0.8, 1, 1, 1], 'embeddingDimension': 256, 'mutateProperty': 'embedding'}}, {'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank'}}, {'name': 'gds.betweenness.mutate', 'config': {'mutateProperty': 'betweenness'}}]",[],"{'negativeSamplingRatio': 1.0, 'testFraction': 0.1, 'validationFolds': 3, 'trainFraction': 0.1}",{'maxTrials': 10},"{'RandomForest': [], 'LogisticRegression': []}"


In [31]:
q = '''
CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
    nodeProperties: ['embedding', 'pageRank', 'betweenness']
    }) YIELD featureSteps;
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
    nodeProperties: ['embedding', 'pageRank', 'betweenness']
    }) YIELD featureSteps;



Unnamed: 0,featureSteps
0,"[{'name': 'HADAMARD', 'config': {'nodeProperties': ['embedding', 'pageRank', 'betweenness']}}]"


# Split Train Test

In [32]:
q = '''
CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
    testFraction: 0.3,
    trainFraction: 0.3,
    negativeSamplingRatio: 1.33,
    validationFolds: 7
    }) YIELD splitConfig;
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {
    testFraction: 0.3,
    trainFraction: 0.3,
    negativeSamplingRatio: 1.33,
    validationFolds: 7
    }) YIELD splitConfig;



Unnamed: 0,splitConfig
0,"{'negativeSamplingRatio': 1.33, 'testFraction': 0.3, 'validationFolds': 7, 'trainFraction': 0.3}"


# Configure LP Model Parameters

In [33]:
q1 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression(
    'pipe',
    {penalty: 0.001, tolerance: 0.01,  maxEpochs: 500})
YIELD parameterSpace;
'''

q2 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

q3 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0,
    tolerance: 0.01,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

q4 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.01,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

q5 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.01,
    tolerance: 0.01,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

q6 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.1,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

q7 = '''
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.1,
    tolerance: 0.01,
    maxEpochs: 500
    }) YIELD parameterSpace;
    '''

In [34]:
# Execute LP Model Params
print(q1)
gds.run_cypher(q1)

print(q2)
gds.run_cypher(q2)

print(q3)
gds.run_cypher(q3)

print(q4)
gds.run_cypher(q4)

print(q5)
gds.run_cypher(q5)

print(q6)
gds.run_cypher(q6)

print(q7)
gds.run_cypher(q7)


CALL gds.beta.pipeline.linkPrediction.addLogisticRegression(
    'pipe',
    {penalty: 0.001, tolerance: 0.01,  maxEpochs: 500})
YIELD parameterSpace;


CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    

CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0,
    tolerance: 0.01,
    maxEpochs: 500
    }) YIELD parameterSpace;
    

CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.01,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    

CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.01,
    tolerance: 0.01,
    maxEpochs: 500
    }) YIELD parameterSpace;
    

CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe', {
    penalty: 0.1,
    tolerance: 0.001,
    maxEpochs: 500
    }) YIELD parameterSpace;
    

CALL gds.beta.pipeline.linkPre

Unnamed: 0,parameterSpace
0,"{'RandomForest': [], 'LogisticRegression': [{'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.001, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.01, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.001, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.01, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.01, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.001, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.01, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.01, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.1, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.001, 'learningRate': 0.001}, {'maxEpochs': 500, 'minEpochs': 1, 'penalty': 0.1, 'patience': 1, 'methodName': 'LogisticRegression', 'batchSize': 100, 'tolerance': 0.01, 'learningRate': 0.001}]}"


# Create Graph Projection

In [36]:
# Drop in memory projection (if exists)
gds.run_cypher('call gds.graph.drop("compound");')

################################################################
# Create in-memory graph of (:Compound)-[:TREATS_CtD]-(:Disease)
q = '''
CALL gds.graph.project('compound', 
    ['Compound', 'Disease'],
    {TREATS_CtD: {orientation: 'UNDIRECTED'}});
'''
print(q)

gds.run_cypher(q)


CALL gds.graph.project('compound', 
    ['Compound', 'Disease'],
    {TREATS_CtD: {orientation: 'UNDIRECTED'}});



Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'Disease': {'label': 'Disease', 'properties': {}}, 'Compound': {'label': 'Compound', 'properties': {}}}","{'TREATS_CtD': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'TREATS_CtD', 'properties': {}}}",compound,1571,1392,32


# Train Link Prediction Model

In [65]:
gds.run_cypher("call gds.alpha.pipeline.drop('lp-pipeline-model')")

gds.run_cypher("call gds.alpha.model.load('lp-pipeline-model') YIELD modelName")

gds.run_cypher("call gds.beta.pipeline.list()")

q = '''
CALL gds.beta.pipeline.linkPrediction.train('compound', {
    pipeline: 'pipe',
    modelName: 'lp-pipeline-model',
    randomSeed: 42
    }) YIELD modelInfo
RETURN modelInfo.bestParameters AS winningModel,
       modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
       modelInfo.metrics.AUCPR.test AS testGraphScore;
'''
print(q)

gds.run_cypher(q)

Unnamed: 0,modelName
0,lp-pipeline-model


# Stream Results

In [66]:
q = '''
CALL gds.beta.pipeline.linkPrediction.predict.stream('compound', {
    modelName: 'lp-pipeline-model',
    topN: 5,
    threshold: 0.2
    }) YIELD node1, node2, probability 
RETURN gds.util.asNode(node1).id AS n1, 
       gds.util.asNode(node2).id AS n2, 
       probability
ORDER BY probability DESC, n1;
'''
print(q)

results = gds.run_cypher(q)

df = pd.DataFrame(data=results)


CALL gds.beta.pipeline.linkPrediction.predict.stream('compound', {
    modelName: 'lp-pipeline-model',
    topN: 5,
    threshold: 0.2
    }) YIELD node1, node2, probability 
RETURN gds.util.asNode(node1).id AS n1, 
       gds.util.asNode(node2).id AS n2, 
       probability
ORDER BY probability DESC, n1;



In [68]:
df.probability

0    0.519388
1    0.519341
2    0.519155
3    0.519144
4    0.519141
Name: probability, dtype: float64

# Write Predictions to Projected Graph

In [69]:
q = '''
CALL gds.beta.pipeline.linkPrediction.predict.mutate('compound', {
    modelName: 'lp-pipeline-model',
    relationshipTypes: ['TREATS_CtD'],
    mutateRelationshipType: 'TREATS_CtD_EXHAUSTIVE_PREDICTED',
    topN: 5,
    threshold: 0.45
    }) YIELD relationshipsWritten, samplingStats;
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.predict.mutate('compound', {
    modelName: 'lp-pipeline-model',
    relationshipTypes: ['TREATS_CtD'],
    mutateRelationshipType: 'TREATS_CtD_EXHAUSTIVE_PREDICTED',
    topN: 5,
    threshold: 0.45
    }) YIELD relationshipsWritten, samplingStats;



Unnamed: 0,relationshipsWritten,samplingStats
0,10,"{'linksConsidered': 1232539, 'strategy': 'exhaustive'}"


In [70]:
q = '''
CALL gds.beta.pipeline.linkPrediction.predict.mutate('compound', {
    modelName: 'lp-pipeline-model',
    relationshipTypes: ['TREATS_CtD'],
    mutateRelationshipType: 'TREATS_CtD_APPROX_PREDICTED',
    sampleRate: 0.5,
    topK: 1,
    randomJoins: 2,
    maxIterations: 3,
    // necessary for deterministic results
    concurrency: 1,
    randomSeed: 42
    }) YIELD relationshipsWritten, samplingStats;
'''
print(q)

gds.run_cypher(q)


CALL gds.beta.pipeline.linkPrediction.predict.mutate('compound', {
    modelName: 'lp-pipeline-model',
    relationshipTypes: ['TREATS_CtD'],
    mutateRelationshipType: 'TREATS_CtD_APPROX_PREDICTED',
    sampleRate: 0.5,
    topK: 1,
    randomJoins: 2,
    maxIterations: 3,
    // necessary for deterministic results
    concurrency: 1,
    randomSeed: 42
    }) YIELD relationshipsWritten, samplingStats;



Unnamed: 0,relationshipsWritten,samplingStats
0,3142,"{'linksConsidered': 15141, 'didConverge': False, 'strategy': 'approximate', 'ranIterations': 3}"


# Save Model To Disk

In [71]:
q = '''
CALL gds.alpha.model.store('lp-pipeline-model') 
YIELD modelName, storeMillis;
'''
print(q)

gds.run_cypher(q)


CALL gds.alpha.model.store('lp-pipeline-model') 
YIELD modelName, storeMillis;



Unnamed: 0,modelName,storeMillis
0,lp-pipeline-model,926


# Load Model From Disk

In [73]:
q = '''
CALL gds.alpha.model.load('lp-pipeline-model')
YIELD
  modelName,
  loadMillis;
'''
print(q)
gds.run_cypher(q)


CALL gds.alpha.model.load('lp-pipeline-model')
YIELD
  modelName,
  loadMillis;



Unnamed: 0,modelName,loadMillis
0,lp-pipeline-model,0


### Done

# Misc Path Finding Graph Algos

In [74]:
q = '''
MATCH (source:Compound {name: 'Amphetamine'}), (target:Disease {name: 'urinary bladder cancer'})
CALL gds.shortestPath.dijkstra.stream('compound', {
    sourceNode: source,
    targetNode: target
    })
YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
RETURN
    index,
    gds.util.asNode(sourceNode).name AS sourceNodeName,
    gds.util.asNode(targetNode).name AS targetNodeName,
    totalCost,
    [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodeNames,
    costs,
    nodes(path) as path
ORDER BY index;
'''
print(q)

gds.run_cypher(q)


MATCH (source:Compound {name: 'Amphetamine'}), (target:Disease {name: 'urinary bladder cancer'})
CALL gds.shortestPath.dijkstra.stream('compound', {
    sourceNode: source,
    targetNode: target
    })
YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
RETURN
    index,
    gds.util.asNode(sourceNode).name AS sourceNodeName,
    gds.util.asNode(targetNode).name AS targetNodeName,
    totalCost,
    [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodeNames,
    costs,
    nodes(path) as path
ORDER BY index;



Unnamed: 0,index,sourceNodeName,targetNodeName,totalCost,nodeNames,costs,path
0,0,Amphetamine,urinary bladder cancer,7.0,"[Amphetamine, Zidovudine, acquired immunodeficiency syndrome, Nevirapine, Prednisone, psoriatic arthritis, Methotrexate, urinary bladder cancer]","[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]","[(name, Id, cui), (name, Id, cui), (name, Id, cui), (name, Id, cui), (name, Id, cui), (name, Id, cui), (name, Id, cui), (name, Id, cui)]"
