In [41]:
import os

import pandas as pd
from neo4j import GraphDatabase

In [42]:
# Connection settings for the Neo4j instance used by every cell below.
# Each value can be overridden through an environment variable so that real
# credentials never have to be saved in the notebook; the fallbacks are the
# original local-development values.
URI      = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
creds    = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "password"),
)
driver   = GraphDatabase.driver(URI, auth=creds)

In [43]:
def run_cypher(cypher, results=False, parameters=None):
    """Run one Cypher statement in a fresh session of the module-level ``driver``.

    Args:
        cypher: Cypher query text.
        results: When true, return the fetched records; otherwise return ``None``.
        parameters: Optional dict of query parameters, referenced as ``$name``
            inside ``cypher``. Lets callers hand values to the server instead
            of formatting them into the query string.

    Returns:
        The value of ``Result.data()`` (one dict per record) when ``results``
        is true, else ``None``.
    """
    with driver.session() as session:
        # Fetch the records while the session is still open.
        records = session.run(cypher, parameters).data()
    return records if results else None

In [28]:
# Declare a uniqueness constraint named `proteins` on the `id` property of
# :Protein nodes. `IF NOT EXISTS` is part of the statement, so the query text
# itself allows for the constraint already being present.
c = '''
// Create constraints
CREATE CONSTRAINT proteins IF NOT EXISTS ON (p:Protein) ASSERT p.id IS UNIQUE;
'''
# Called without results=True, so nothing is returned or displayed.
run_cypher(c)

## Create Nodes

In [35]:
# Query template for creating Protein nodes from one column of the CSV.
# `{index}` is filled in with the column number by str.format; the doubled
# braces are literal braces of the Cypher map.
c = '''
    // Load node list
    LOAD CSV FROM 'https://raw.githubusercontent.com/seankrobinson/Protein-Interaction_Link-Prediction/master/PP-Pathways_ppi.csv' AS line
    WITH line
    MERGE (p:Protein {{id: line[{index}]}})
    RETURN COUNT(DISTINCT p)
    '''

# Run the template once for column 0 and once for column 1, printing the
# distinct-node count each query returns.
for i in (0, 1):
    result = run_cypher(c.format(index=i), results=True)
    print(result)

[{'COUNT(distinct p)': 15849}]
[{'COUNT(distinct p)': 19341}]


## Create Edges

In [38]:
# For every CSV row, look up the Protein named in column 0 and the Protein
# named in column 1, then MERGE a directed INTERACTS_WITH relationship from
# the first to the second. The query returns the number of relationships
# it matched or created.
c = '''
// Load edge list
LOAD CSV FROM 'https://raw.githubusercontent.com/seankrobinson/Protein-Interaction_Link-Prediction/master/PP-Pathways_ppi.csv' AS line
WITH line
MATCH (p1:Protein {id: line[0]})
WITH line, p1
MATCH (p2:Protein {id: line[1]})
MERGE (p1)-[r:INTERACTS_WITH]->(p2)
RETURN COUNT(r)
'''
result = run_cypher(c, results=True)
print(result)

[{'COUNT(r)': 342353}]
[{'COUNT(r)': 342353}]


In [85]:
# Start from a clean slate. An unconditional drop raises when no 'pipe'
# entry exists (e.g. on a fresh database), which would stop a
# "Restart & Run All", so the drop is issued only when the catalog reports
# the entry as present.
pipe_exists = run_cypher(
    "CALL gds.beta.model.exists('pipe') YIELD exists RETURN exists",
    results=True,
)
if pipe_exists and pipe_exists[0]['exists']:
    run_cypher("CALL gds.beta.model.drop('pipe')")

# Create an empty link-prediction pipeline named 'pipe'; the returned record
# shows its (initially empty) steps and default split/parameter settings.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.create('pipe')
'''
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.create('pipe')



[{'name': 'pipe',
  'nodePropertySteps': [],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'trainFraction': 0.1,
   'testFraction': 0.1,
   'validationFolds': 3},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001,
    'concurrency': 4}]}]

In [86]:
# Add a 'fastRP' node-property step to pipeline 'pipe'. The step is configured
# with mutateProperty 'embedding', embeddingDimension 256 and a fixed
# randomSeed of 42.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  randomSeed: 42
})
'''
# Echo the query, then run it; the last expression displays the pipeline
# description returned by the procedure.
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  randomSeed: 42
})



[{'name': 'pipe',
  'nodePropertySteps': [{'name': 'gds.fastRP.mutate',
    'config': {'randomSeed': 42,
     'embeddingDimension': 256,
     'mutateProperty': 'embedding'}}],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'trainFraction': 0.1,
   'testFraction': 0.1,
   'validationFolds': 3},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001,
    'concurrency': 4}]}]

In [87]:
# Add a 'hadamard' link-feature step to pipeline 'pipe', computed over the
# 'embedding' node property (the property name used by the fastRP step).
# Only the `featureSteps` column of the procedure result is yielded.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding']
}) YIELD featureSteps
'''
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding']
}) YIELD featureSteps



[{'featureSteps': [{'name': 'HADAMARD',
    'config': {'nodeProperties': ['embedding']}}]}]

In [88]:
# Override the split settings of pipeline 'pipe': test and train fractions
# of 0.3 each and 7 validation folds. Only `splitConfig` is yielded so the
# output shows the effective split configuration.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.3,
  trainFraction: 0.3,
  validationFolds: 7
})
YIELD splitConfig
'''
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.3,
  trainFraction: 0.3,
  validationFolds: 7
})
YIELD splitConfig



[{'splitConfig': {'negativeSamplingRatio': 1.0,
   'trainFraction': 0.3,
   'testFraction': 0.3,
   'validationFolds': 7}}]

In [89]:
# Register three candidate parameter sets for model selection on pipeline
# 'pipe': tolerance 0.001, tolerance 0.01, and maxEpochs 500. Each map only
# names the value it overrides; the recorded output shows the remaining keys
# filled in by the procedure.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.configureParams('pipe',
  [{tolerance: 0.001}, {tolerance: 0.01}, {maxEpochs: 500}]
) YIELD parameterSpace
'''
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.configureParams('pipe',
  [{tolerance: 0.001}, {tolerance: 0.01}, {maxEpochs: 500}]
) YIELD parameterSpace



[{'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001},
   {'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.01},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001}]}]

In [90]:
# List the graphs currently held in the GDS graph catalog. No YIELD clause is
# given, so every column of each catalog entry is returned and displayed.
run_cypher('call gds.graph.list', results=True)

[{'degreeDistribution': {'p99': 310,
   'min': 1,
   'max': 2132,
   'mean': 31.762582919701256,
   'p90': 80,
   'p50': 10,
   'p999': 698,
   'p95': 132,
   'p75': 32},
  'graphName': 'proteins',
  'database': 'neo4j',
  'memoryUsage': '59 MiB',
  'sizeInBytes': 62304804,
  'nodeProjection': {'Protein': {'properties': {}, 'label': 'Protein'}},
  'relationshipProjection': {'INTERACTS_WITH': {'orientation': 'UNDIRECTED',
    'aggregation': 'DEFAULT',
    'type': 'INTERACTS_WITH',
    'properties': {}}},
  'nodeQuery': None,
  'relationshipQuery': None,
  'nodeCount': 21557,
  'relationshipCount': 1362009,
  'nodeFilter': None,
  'relationshipFilter': None,
  'density': 0.002931051712164911,
  'creationTime': neo4j.time.DateTime(2022, 2, 21, 10, 39, 25.327303, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>),
  'modificationTime': neo4j.time.DateTime(2022, 2, 21, 10, 39, 43.94862, tzinfo=<DstTzInfo 'America/Chicago' CST-1 day, 18:00:00 STD>),
  'schema': {'relationships': {

In [91]:
# Remove any previous 'proteins' projection so the cell can be re-run.
# The second argument (failIfMissing = false) makes the drop a no-op instead
# of an error when the graph is not in the catalog, e.g. on a fresh database.
run_cypher("CALL gds.graph.drop('proteins', false)")

# Project all :Protein nodes and their INTERACTS_WITH relationships into an
# in-memory graph named 'proteins'. The relationships are projected as
# UNDIRECTED.
c = '''
// Create in-memory graph of (Protein)-[:INTERACTS_WITH]-(Protein)
CALL gds.graph.create(
    'proteins',
    'Protein',
    {INTERACTS_WITH: 
        {
            orientation: 'UNDIRECTED'
        }
    }
)
'''
run_cypher(c, results=True)

[{'nodeProjection': {'Protein': {'properties': {}, 'label': 'Protein'}},
  'relationshipProjection': {'INTERACTS_WITH': {'orientation': 'UNDIRECTED',
    'aggregation': 'DEFAULT',
    'type': 'INTERACTS_WITH',
    'properties': {}}},
  'graphName': 'proteins',
  'nodeCount': 21557,
  'relationshipCount': 684706,
  'createMillis': 33}]

In [92]:
# Training registers its result under a fixed model name, so a second run of
# this cell would collide with the model left behind by the first. Drop a
# previous 'lp-pipeline-model' when the catalog reports one, mirroring the
# drop-then-create pattern used for the pipeline and the graph projection.
model_exists = run_cypher(
    "CALL gds.beta.model.exists('lp-pipeline-model') YIELD exists RETURN exists",
    results=True,
)
if model_exists and model_exists[0]['exists']:
    run_cypher("CALL gds.beta.model.drop('lp-pipeline-model')")

# Train pipeline 'pipe' on the in-memory graph 'proteins' with a fixed random
# seed, and report the winning parameter set together with the AUCPR scores
# on the train and test sets taken from the returned modelInfo.
c = '''
CALL gds.alpha.ml.pipeline.linkPrediction.train('proteins', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  randomSeed: 42
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore
'''
print(c)
run_cypher(c, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.train('proteins', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  randomSeed: 42
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore



[{'winningModel': {'useBiasFeature': True,
   'maxEpochs': 100,
   'minEpochs': 1,
   'penalty': 0.0,
   'patience': 1,
   'batchSize': 100,
   'tolerance': 0.001,
   'concurrency': 4},
  'trainGraphScore': 0.9166697161186019,
  'testGraphScore': 0.9143585845345259}]