## Imports

In [187]:
import os

import pandas as pd
from neo4j import GraphDatabase

## Connection

In [188]:
# Connection settings. Each value can be overridden through an environment
# variable (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so real credentials do not
# have to be written into the notebook; the fallbacks are the values that were
# previously hard-coded here.
URI      = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
creds    = (os.environ.get("NEO4J_USER", "neo4j"),
            os.environ.get("NEO4J_PASSWORD", "password"))
driver   = GraphDatabase.driver(URI, auth=creds)

drop = False # Drop existing graphs/models if already in memory

In [189]:
def run_cypher(cypher, results=False, params=None):
    """Run one Cypher statement through the module-level ``driver``.

    Parameters
    ----------
    cypher : str
        The Cypher text to execute.
    results : bool, default False
        When True the fetched records are returned; when False the statement
        is still executed but ``None`` is returned.
    params : dict, optional
        Values for ``$name`` placeholders in ``cypher``. Forwarded to
        ``session.run`` as ``parameters``; ``None`` (the default) runs the
        statement without parameters, exactly as before.

    Returns
    -------
    The value of ``.data()`` on the query result when ``results`` is True,
    otherwise ``None``.
    """
    with driver.session() as session:
        # Fetch the records inside the ``with`` block, before the session is closed.
        records = session.run(cypher, parameters=params).data()
    return records if results else None

## Dataset
[Human protein-protein interaction network](https://snap.stanford.edu/biodata/datasets/10000/10000-PP-Pathways.html)

In [190]:
# Peek at the first rows of the raw edge list. The file is read with
# header=None, so pandas labels the two columns 0 and 1.
ppi_url = 'https://raw.githubusercontent.com/seankrobinson/Protein-Interaction_Link-Prediction/master/PP-Pathways_ppi.csv'
ppi_preview = pd.read_csv(ppi_url, header=None)
ppi_preview.head()

Unnamed: 0,0,1
0,1394,2778
1,6331,17999
2,122704,54460
3,2597,2911
4,4790,79155


## Import Dataset

In [191]:
# Ensure a uniqueness constraint on Protein.id exists (IF NOT EXISTS makes the
# statement safe to re-run).
constraint_query = '''
// Create constraints
CREATE CONSTRAINT proteins IF NOT EXISTS ON (p:Protein) ASSERT p.id IS UNIQUE;
'''
run_cypher(constraint_query)

## Create Nodes

In [192]:
# Cypher template that MERGEs one Protein node per value in a given CSV column.
# Doubled braces are literal braces for str.format; {index} is the column number.
node_template = '''
    // Load node list
    LOAD CSV FROM 'https://raw.githubusercontent.com/seankrobinson/Protein-Interaction_Link-Prediction/master/PP-Pathways_ppi.csv' AS line
    WITH line
    MERGE (p:Protein {{id: line[{index}]}})
    RETURN COUNT(DISTINCT p)
    '''

# Run the template once for each of the two CSV columns and show the node count.
for column in (0, 1):
    node_counts = run_cypher(node_template.format(index=column), results=True)
    print(node_counts)

[{'COUNT(DISTINCT p)': 15849}]
[{'COUNT(DISTINCT p)': 19341}]


## Create Edges

In [193]:
# For every CSV row, look up both Protein nodes and MERGE an INTERACTS_WITH
# relationship from the first to the second; print the relationship count.
edge_query = '''
// Load node list
LOAD CSV FROM 'https://raw.githubusercontent.com/seankrobinson/Protein-Interaction_Link-Prediction/master/PP-Pathways_ppi.csv' AS line
WITH line
MATCH (p1:Protein {id: line[0]})
WITH line, p1
MATCH (p2:Protein {id: line[1]})
MERGE (p1)-[r:INTERACTS_WITH]->(p2)
RETURN COUNT(r)
'''
edge_counts = run_cypher(edge_query, results=True)
print(edge_counts)

[{'COUNT(r)': 342353}]


## Create Link Prediction Pipeline

In [194]:
# When `drop` is set, remove the stored 'pipe' entry first so it can be recreated.
if drop:
    run_cypher("CALL gds.beta.model.drop('pipe')", results=True)

# Create an empty link-prediction pipeline named 'pipe' and display its config.
create_pipeline_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.create('pipe')
'''
print(create_pipeline_query)
run_cypher(create_pipeline_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.create('pipe')



[{'name': 'pipe',
  'nodePropertySteps': [],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'testFraction': 0.1,
   'validationFolds': 3,
   'trainFraction': 0.1},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001}]}]

## Add FastRP Embeddings

In [195]:
# Add a fastRP node-property step to 'pipe'; it writes a 256-dimensional
# embedding to the 'embedding' property using a fixed random seed.
fastrp_step_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  iterationWeights: [0.8, 1, 1, 1],
  normalizationStrength: 0.5,
  randomSeed: 42
})
'''
print(fastrp_step_query)
run_cypher(fastrp_step_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {
  mutateProperty: 'embedding',
  embeddingDimension: 256,
  iterationWeights: [0.8, 1, 1, 1],
  normalizationStrength: 0.5,
  randomSeed: 42
})



[{'name': 'pipe',
  'nodePropertySteps': [{'name': 'gds.fastRP.mutate',
    'config': {'randomSeed': 42,
     'normalizationStrength': 0.5,
     'iterationWeights': [0.8, 1, 1, 1],
     'embeddingDimension': 256,
     'mutateProperty': 'embedding'}}],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'testFraction': 0.1,
   'validationFolds': 3,
   'trainFraction': 0.1},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001}]}]

## Add PageRank Feature

In [196]:
# Add a pageRank node-property step to 'pipe', stored under 'pageRank'.
pagerank_step_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'pageRank', 
{
  mutateProperty: 'pageRank'
})
'''
print(pagerank_step_query)
run_cypher(pagerank_step_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'pageRank', 
{
  mutateProperty: 'pageRank'
})



[{'name': 'pipe',
  'nodePropertySteps': [{'name': 'gds.fastRP.mutate',
    'config': {'randomSeed': 42,
     'normalizationStrength': 0.5,
     'iterationWeights': [0.8, 1, 1, 1],
     'embeddingDimension': 256,
     'mutateProperty': 'embedding'}},
   {'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank'}}],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'testFraction': 0.1,
   'validationFolds': 3,
   'trainFraction': 0.1},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001}]}]

## Add Betweenness Centrality Feature

In [197]:
# Add a betweenness node-property step to 'pipe', stored under 'betweenness'.
betweenness_step_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'betweenness', 
{
  mutateProperty: 'betweenness'
})
'''
print(betweenness_step_query)
run_cypher(betweenness_step_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addNodeProperty('pipe', 'betweenness', 
{
  mutateProperty: 'betweenness'
})



[{'name': 'pipe',
  'nodePropertySteps': [{'name': 'gds.fastRP.mutate',
    'config': {'randomSeed': 42,
     'normalizationStrength': 0.5,
     'iterationWeights': [0.8, 1, 1, 1],
     'embeddingDimension': 256,
     'mutateProperty': 'embedding'}},
   {'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank'}},
   {'name': 'gds.betweenness.mutate',
    'config': {'mutateProperty': 'betweenness'}}],
  'featureSteps': [],
  'splitConfig': {'negativeSamplingRatio': 1.0,
   'testFraction': 0.1,
   'validationFolds': 3,
   'trainFraction': 0.1},
  'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001}]}]

In [198]:
# Add a 'hadamard' link-feature step over the three node properties configured
# above; only the resulting featureSteps are yielded back.
link_feature_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding', 'pageRank', 'betweenness']
}) YIELD featureSteps
'''
print(link_feature_query)
run_cypher(link_feature_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.addFeature('pipe', 'hadamard', {
  nodeProperties: ['embedding', 'pageRank', 'betweenness']
}) YIELD featureSteps



[{'featureSteps': [{'name': 'HADAMARD',
    'config': {'nodeProperties': ['embedding', 'pageRank', 'betweenness']}}]}]

## Split Train/Test

In [199]:
# Set the test/train fractions and the number of validation folds on 'pipe'.
# negativeSamplingRatio is commented out inside the Cypher, so it is not set here.
split_config_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.3,
  trainFraction: 0.3,
  //negativeSamplingRatio: 1.33,
  validationFolds: 7
  
})
YIELD splitConfig
'''
print(split_config_query)
run_cypher(split_config_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.configureSplit('pipe', {
  testFraction: 0.3,
  trainFraction: 0.3,
  //negativeSamplingRatio: 1.33,
  validationFolds: 7
  
})
YIELD splitConfig



[{'splitConfig': {'negativeSamplingRatio': 1.0,
   'testFraction': 0.3,
   'validationFolds': 7,
   'trainFraction': 0.3}}]

## Configure Model Params

In [200]:
# Define the candidate parameter sets for 'pipe': three penalty values crossed
# with two tolerance values, all with maxEpochs 500.
param_space_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.configureParams('pipe',
  [{penalty:0, tolerance: 0.001, maxEpochs: 500},
   {penalty:0, tolerance: 0.01, maxEpochs: 500},
   {penalty:0.01, tolerance: 0.001, maxEpochs: 500},
   {penalty:0.01, tolerance: 0.01, maxEpochs: 500}, 
   {penalty:0.1, tolerance: 0.001, maxEpochs: 500}, 
   {penalty:0.1, tolerance: 0.01, maxEpochs: 500} ]
) YIELD parameterSpace
'''
print(param_space_query)
run_cypher(param_space_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.configureParams('pipe',
  [{penalty:0, tolerance: 0.001, maxEpochs: 500},
   {penalty:0, tolerance: 0.01, maxEpochs: 500},
   {penalty:0.01, tolerance: 0.001, maxEpochs: 500},
   {penalty:0.01, tolerance: 0.01, maxEpochs: 500}, 
   {penalty:0.1, tolerance: 0.001, maxEpochs: 500}, 
   {penalty:0.1, tolerance: 0.01, maxEpochs: 500} ]
) YIELD parameterSpace



[{'parameterSpace': [{'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.01},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.01,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.01,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.01},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.1,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.001},
   {'useBiasFeature': True,
    'maxEpochs': 500,
    'minEpochs': 1,
    'penalty': 0.1,
    'patience': 1,
    'batchSize': 100,
    'tolerance': 0.01}]}]

## Create graph projection

In [201]:
# When `drop` is set, remove the existing 'proteins' projection first.
if drop:
    run_cypher('call gds.graph.drop("proteins")')

# Project Protein nodes and INTERACTS_WITH relationships into the in-memory
# graph 'proteins', treating the relationships as undirected.
projection_query = '''
// Create in-memory graph of (Protein)-[:INTERACTS_WITH]-(Protein)
CALL gds.graph.create(
    'proteins',
    'Protein',
    {INTERACTS_WITH: 
        {
            orientation: 'UNDIRECTED'
        }
    }
)
'''
run_cypher(projection_query, results=True)

[{'nodeProjection': {'Protein': {'label': 'Protein', 'properties': {}}},
  'relationshipProjection': {'INTERACTS_WITH': {'orientation': 'UNDIRECTED',
    'aggregation': 'DEFAULT',
    'type': 'INTERACTS_WITH',
    'properties': {}}},
  'graphName': 'proteins',
  'nodeCount': 21557,
  'relationshipCount': 684706,
  'createMillis': 556}]

## Train link prediction model

In [202]:
# When `drop` is set, remove the previously trained model first.
if drop:
    run_cypher("call gds.beta.model.drop('lp-pipeline-model')", results=True)

# Train 'lp-pipeline-model' on the 'proteins' graph with pipeline 'pipe' and
# return the best parameters plus the AUCPR scores for train and test.
train_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.train('proteins', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  randomSeed: 42
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore
'''
print(train_query)
run_cypher(train_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.train('proteins', {
  pipeline: 'pipe',
  modelName: 'lp-pipeline-model',
  randomSeed: 42
}) YIELD modelInfo
RETURN
  modelInfo.bestParameters AS winningModel,
  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,
  modelInfo.metrics.AUCPR.test AS testGraphScore



[{'winningModel': {'useBiasFeature': True,
   'maxEpochs': 500,
   'minEpochs': 1,
   'penalty': 0.1,
   'patience': 1,
   'batchSize': 100,
   'tolerance': 0.001},
  'trainGraphScore': 0.8340738539501519,
  'testGraphScore': 0.8328571795535504}]

## Stream Results

In [None]:
# Stream predicted links from 'lp-pipeline-model' (topN 5, threshold 0.3),
# mapping node ids back to Protein.id and sorting by probability.
# The records are kept in `results` for the following cell.
predict_query = '''
CALL gds.alpha.ml.pipeline.linkPrediction.predict.stream('proteins', {
  modelName: 'lp-pipeline-model',
  topN: 5,
  threshold: 0.3
})
 YIELD node1, node2, probability


 RETURN gds.util.asNode(node1).id AS protein1, gds.util.asNode(node2).id AS protein2, probability
 ORDER BY probability DESC, protein1
'''
print(predict_query)
results = run_cypher(predict_query, results=True)


CALL gds.alpha.ml.pipeline.linkPrediction.predict.stream('proteins', {
  modelName: 'lp-pipeline-model',
  topN: 5,
  threshold: 0.3
})
 YIELD node1, node2, probability


 RETURN gds.util.asNode(node1).id AS protein1, gds.util.asNode(node2).id AS protein2, probability
 ORDER BY probability DESC, protein1



In [None]:
# Tabulate the streamed prediction records for display.
predictions_df = pd.DataFrame(results)
predictions_df