In [97]:
import os
from py2neo import Graph, Node, Relationship
from graphdatascience import GraphDataScience

# Get Neo4j client
graph = Graph("bolt://localhost:7687", auth=("neo4j", "linkprediction"))


##drop functions for pipeline, prediction graph, and model, comment out function call if not needed
drop_pipeline = f'''
    CALL gds.beta.pipeline.drop('pipe', False)
'''
drop_prediction_graph = f'''
    CALL gds.graph.drop('prediction_graph', False)
'''
drop_model = f'''
    CALL gds.beta.model.drop('lp_model', False)
'''
graph.run(drop_pipeline)
graph.run(drop_prediction_graph)
graph.run(drop_model)


modelInfo,trainConfig,graphSchema,loaded,stored,creationTime,shared
"{modelName: 'lp_model', modelType: 'LinkPrediction', metrics: {AUCPR: {test: 0.6709057731834295, validation: {min: 0.5704058578324726, max: 0.6711996490445161, avg: 0.6217107117431012}, outerTrain: 0.6705448036189562, train: {min: 0.5323667010394065, max: 0.6905591018026928, avg: 0.6189393935419113}}}, pipeline: {nodePropertySteps: [{name: 'gds.fastRP.mutate', config: {randomSeed: 42, mutateProperty: 'fastRP', contextRelationshipTypes: [], embeddingDimension: 56, contextNodeLabels: []}}, {name: 'gds.degree.mutate', config: {mutateProperty: 'degree', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.alpha.scaleProperties.mutate', config: {mutateProperty: 'scaledDegree', contextRelationshipTypes: [], nodeProperties: ['degree'], scaler: 'Mean', contextNodeLabels: []}}, {name: 'gds.articleRank.mutate', config: {mutateProperty: 'articleRank', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.closeness.mutate', config: {mutateProperty: 'closeness', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.influenceMaximization.celf.mutate', config: {mutateProperty: 'celf', contextRelationshipTypes: [], seedSetSize: 20, contextNodeLabels: []}}, {name: 'gds.pageRank.mutate', config: {mutateProperty: 'pageRank', contextRelationshipTypes: [], contextNodeLabels: []}}], featureSteps: [{name: 'HADAMARD', config: {nodeProperties: ['fastRP', 'scaledDegree', 'articleRank', 'pageRank']}}]}, bestParameters: {maxDepth: 2147483647, criterion: 'GINI', minSplitSize: 2, minLeafSize: 1, numberOfSamplesRatio: 0.0007, methodName: 'RandomForest', numberOfDecisionTrees: 100}, nodePropertySteps: [{name: 'gds.fastRP.mutate', config: {randomSeed: 42, mutateProperty: 'fastRP', contextRelationshipTypes: [], embeddingDimension: 56, contextNodeLabels: []}}, {name: 'gds.degree.mutate', config: {mutateProperty: 'degree', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.alpha.scaleProperties.mutate', config: {mutateProperty: 'scaledDegree', contextRelationshipTypes: [], nodeProperties: ['degree'], scaler: 'Mean', contextNodeLabels: []}}, {name: 'gds.articleRank.mutate', config: {mutateProperty: 'articleRank', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.closeness.mutate', config: {mutateProperty: 'closeness', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.influenceMaximization.celf.mutate', config: {mutateProperty: 'celf', contextRelationshipTypes: [], seedSetSize: 20, contextNodeLabels: []}}, {name: 'gds.pageRank.mutate', config: {mutateProperty: 'pageRank', contextRelationshipTypes: [], contextNodeLabels: []}}], featureSteps: [{name: 'HADAMARD', config: {nodeProperties: ['fastRP', 'scaledDegree', 'articleRank', 'pageRank']}}]}","{randomSeed: 42, targetRelationshipType: 'HAS_KEYWORD', jobId: 'b6871eb7-f4e4-48d6-b451-9986dda9320b', graphName: 'prediction_graph', sudo: false, negativeClassWeight: 1.0, storeModelToDisk: false, modelName: 'lp_model', logProgress: true, metrics: ['AUCPR'], pipeline: 'pipe', concurrency: 4, sourceNodeLabel: '*', targetNodeLabel: '*'}","{graphProperties: {}, nodes: {Keyword: {}, Dataset: {}, Investigator: {}, Instrument: {}, Platform: {}}, relationships: {HAS_KEYWORD: {}}}",True,False,datetime('2023-11-29T22:22:01.924653000-05:00'),False


In [98]:
#create prediction graph

format_relationships = '''
{
HAS_KEYWORD: {orientation: 'UNDIRECTED'}
}
'''

create_prediction_graph = f'''
    CALL gds.graph.project(
        'prediction_graph', 
        ['Dataset', 'Keyword', 'Platform', 'Instrument', 'Investigator'],
        {format_relationships}
    )
'''

graph.run(create_prediction_graph)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Keyword: {label: 'Keyword', properties: {}}, Dataset: {label: 'Dataset', properties: {}}, Investigator: {label: 'Investigator', properties: {}}, Instrument: {label: 'Instrument', properties: {}}, Platform: {label: 'Platform', properties: {}}}","{HAS_KEYWORD: {aggregation: 'DEFAULT', orientation: 'UNDIRECTED', indexInverse: false, properties: {}, type: 'HAS_KEYWORD'}}",prediction_graph,2399,40500,6


In [99]:
#create/config pipeline
create_pipeline = f'''
    CALL gds.beta.pipeline.linkPrediction.create('pipe')
'''

graph.run(create_pipeline)

name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
pipe,[],[],"{testFraction: 0.1, validationFolds: 3, trainFraction: 0.1, negativeSamplingRatio: 1.0}",{maxTrials: 10},"{MultilayerPerceptron: [], RandomForest: [], LogisticRegression: []}"


In [100]:
#split config
split_config = '''
{
    testFraction: 0.6,
    trainFraction: 0.25,
    validationFolds: 3
}
'''

splits = f'''
    CALL gds.beta.pipeline.linkPrediction.configureSplit('pipe', {split_config})
'''

graph.run(splits)

name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
pipe,[],[],"{testFraction: 0.6, validationFolds: 3, trainFraction: 0.25, negativeSamplingRatio: 1.0}",{maxTrials: 10},"{MultilayerPerceptron: [], RandomForest: [], LogisticRegression: []}"


In [101]:
#add features to pipeline object before training, need at least one

#fastrp
fastrp_config = '''
{
    mutateProperty: 'fastRP', 
    embeddingDimension: 56, 
    randomSeed: 42
}
'''

fastrp = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'fastRP', {fastrp_config})
'''

graph.run(fastrp)

#degree
degree_config = '''
{
    mutateProperty: 'degree'
}
'''

degree = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'degree', {degree_config})
'''

graph.run(degree)

#scaled degree
scaled_config = '''
{
    nodeProperties: ['degree'],
    mutateProperty: 'scaledDegree', 
    scaler: 'Mean'
}
'''

scaled = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'alpha.scaleProperties', {scaled_config})
'''

graph.run(scaled)

#article rank (less emphasize on low degree nodes)
article_rank_config = '''
{
    mutateProperty: 'articleRank'
}
'''

article_rank = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'articleRank', {article_rank_config})
'''

graph.run(article_rank)

#closeness (prioritizing nodes that are topoligically close to all other nodes)
closeness_config = '''
{
    mutateProperty: 'closeness'
}
'''

closeness = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'closeness', {closeness_config})
'''

graph.run(closeness)

#CELF, set of nodes with maximum spread
celf_config = '''
{
    mutateProperty: 'celf',
    seedSetSize: 20
}
'''

celf = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'influenceMaximization.celf', {celf_config})
'''

graph.run(celf)

#page rank, quality of connected links (more emphasize on nodes that have links with other high degree nodes)
page_rank_config = '''
{
    mutateProperty: 'pageRank'
}
'''

page_rank = f'''
    CALL gds.beta.pipeline.linkPrediction.addNodeProperty('pipe', 'pageRank', {page_rank_config})
'''

graph.run(page_rank)


#add features
node_properties = '''
{
    nodeProperties: ['fastRP', 'scaledDegree', 'articleRank', 'pageRank']
}
'''


add_link_features = f'''
    CALL gds.beta.pipeline.linkPrediction.addFeature('pipe', 'HADAMARD', {node_properties})
'''

graph.run(add_link_features)


name,nodePropertySteps,featureSteps,splitConfig,autoTuningConfig,parameterSpace
pipe,"[{name: 'gds.fastRP.mutate', config: {randomSeed: 42, mutateProperty: 'fastRP', contextRelationshipTypes: [], embeddingDimension: 56, contextNodeLabels: []}}, {name: 'gds.degree.mutate', config: {mutateProperty: 'degree', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.alpha.scaleProperties.mutate', config: {mutateProperty: 'scaledDegree', contextRelationshipTypes: [], nodeProperties: ['degree'], scaler: 'Mean', contextNodeLabels: []}}, {name: 'gds.articleRank.mutate', config: {mutateProperty: 'articleRank', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.closeness.mutate', config: {mutateProperty: 'closeness', contextRelationshipTypes: [], contextNodeLabels: []}}, {name: 'gds.influenceMaximization.celf.mutate', config: {mutateProperty: 'celf', contextRelationshipTypes: [], seedSetSize: 20, contextNodeLabels: []}}, {name: 'gds.pageRank.mutate', config: {mutateProperty: 'pageRank', contextRelationshipTypes: [], contextNodeLabels: []}}]","[{name: 'HADAMARD', config: {nodeProperties: ['fastRP', 'scaledDegree', 'articleRank', 'pageRank']}}]","{testFraction: 0.6, validationFolds: 3, trainFraction: 0.25, negativeSamplingRatio: 1.0}",{maxTrials: 10},"{MultilayerPerceptron: [], RandomForest: [], LogisticRegression: []}"


In [102]:
#add model candidate(s)

logistic_regression = f'''
   CALL gds.beta.pipeline.linkPrediction.addLogisticRegression('pipe')
'''

#graph.run(logistic_regression)

#numberOfSamplesRatio comes from the true class ratio of the database, to 
#counteract class imbalance. The true class ratio of the database is calcuated as 
empty_brace = '''
{
numberOfSamplesRatio: .07
}
'''

random_forest = f'''
    CALL gds.beta.pipeline.linkPrediction.addRandomForest('pipe', {empty_brace})
'''

graph.run(random_forest)

MLP = f'''
    CALL gds.alpha.pipeline.linkPrediction.addMLP('pipe')
'''

#graph.run(MLP)

In [103]:
#train pipeline 0.00711986262

train_config = '''
{
pipeline: 'pipe',
modelName: 'lp_model',
targetRelationshipType: 'HAS_KEYWORD',
metrics: ['AUCPR'],
randomSeed: 42
}
'''

train_pipeline = f'''
    CALL gds.beta.pipeline.linkPrediction.train('prediction_graph', {train_config})
    YIELD modelInfo, modelSelectionStats
    RETURN
      modelInfo.bestParameters AS winningModel,
    modelInfo.metrics.AUCPR.train.avg AS avgTrainScore,
    modelInfo.metrics.AUCPR.outerTrain AS outerTrainScore,
    modelInfo.metrics.AUCPR.test AS testScore,
    [candidate IN modelSelectionStats.modelCandidates | candidate.metrics.AUCPR.validation.avg] AS validationScores
'''

graph.run(train_pipeline)

winningModel,avgTrainScore,outerTrainScore,testScore,validationScores
"{maxDepth: 2147483647, criterion: 'GINI', minSplitSize: 2, minLeafSize: 1, numberOfSamplesRatio: 0.07, methodName: 'RandomForest', numberOfDecisionTrees: 100}",0.9304246460698484,0.9323961513568296,0.9162893145893124,[0.9149495645454436]


In [104]:

#predict
predict_config_mutate = '''
{
modelName: 'lp_model',
mutateRelationshipType: 'HAS_KEYWORD_EXHAUSTIVE_PREDICTED',
topN: 40,
threshold: 0.5
}
'''

predict_config_stream = '''
{
modelName: 'lp_model',
sampleRate: 1,
topN: 1000,
threshold: .5
}
''' 

#use stream to see predicted pairings probilities, mutate to change the prediction graph
predict_stream = f'''
    CALL gds.beta.pipeline.linkPrediction.predict.stream('prediction_graph', {predict_config_stream})
    YIELD node1, node2, probability
    WHERE (gds.util.asNode(node1).longName IS NOT NULL OR gds.util.asNode(node2).longName IS NOT NULL)
    AND
    (gds.util.asNode(node1).name IS NOT NULL OR gds.util.asNode(node2).name IS NOT NULL)
    
    RETURN 
    CASE 
        WHEN gds.util.asNode(node1).shortName IS NOT NULL then gds.util.asNode(node1).shortName
        ELSE gds.util.asNode(node2).shortName
    END AS DATASET_SHORT,
    
    CASE
        WHEN gds.util.asNode(node1).name IS NOT NULL then gds.util.asNode(node1).name
        ELSE gds.util.asNode(node2).name
    END AS KEYWORD,

    probability,

    CASE 
        WHEN gds.util.asNode(node1).longName IS NOT NULL then gds.util.asNode(node1).longName
        ELSE gds.util.asNode(node2).longName
    END AS DATASET_LONG,

    CASE 
        WHEN gds.util.asNode(node1).longName IS NOT NULL then node1
        ELSE node2
    END AS DATASET_NODE_ID,

    CASE
        WHEN gds.util.asNode(node1).name IS NOT NULL then node1
        ELSE node2
    END AS KEYWORD_NODE_ID
'''
###WITH STREAM COPY PASTE BELOW DIRECTLY INTO NEO4j DESKTOP, VSCODE DISPLAY ISSUE???

'''
CALL gds.beta.pipeline.linkPrediction.predict.stream('prediction_graph', {
modelName: 'lp_model',
sampleRate: 1,
topN: 1000,
threshold: .85})
YIELD node1, node2, probability
WHERE (gds.util.asNode(node1).shortName IS NOT NULL OR gds.util.asNode(node2).shortName IS NOT NULL)
AND
(gds.util.asNode(node1).name IS NOT NULL OR gds.util.asNode(node2).name IS NOT NULL)

RETURN 
    CASE 
        WHEN gds.util.asNode(node1).shortName IS NOT NULL then gds.util.asNode(node1).shortName
        ELSE gds.util.asNode(node2).shortName
    END AS DATASET_SHORT,

    CASE
        WHEN gds.util.asNode(node1).name IS NOT NULL then gds.util.asNode(node1).name
        ELSE gds.util.asNode(node2).name
    END AS KEYWORD,

    probability,

    CASE 
        WHEN gds.util.asNode(node1).longName IS NOT NULL then gds.util.asNode(node1).longName
        ELSE gds.util.asNode(node2).longName
    END AS DATASET_LONG,

    CASE 
        WHEN gds.util.asNode(node1).shortName IS NOT NULL then node1
        ELSE node2
    END AS DATASET_NODE_ID,

    CASE
        WHEN gds.util.asNode(node1).name IS NOT NULL then node1
        ELSE node2
    END AS KEYWORD_NODE_ID
'''

predict_mutate = f'''
    CALL gds.beta.pipeline.linkPrediction.predict.mutate('prediction_graph', {predict_config_mutate})
'''

graph.run(predict_stream)

DATASET_SHORT,KEYWORD,probability,DATASET_LONG,DATASET_NODE_ID,KEYWORD_NODE_ID
GPM_3PRPSMT1SAPHIR_DAY_CLIM,surface temperature,1.0,GPM SAPHIR on MT1 (PRPS) Climate-based Radiometer Precipitation Profiling L3 1 day 0.25 x 0.25 degree V06,152,1671
M2T1NXINT,sea surface temperature,1.0,"MERRA-2 tavg1_2d_int_Nx: 2d,1-Hourly,Time-Averaged,Single-Level,Assimilation,Vertically Integrated Diagnostics V5.12.4",767,1696
M2T1NXINT,ocean temperature,1.0,"MERRA-2 tavg1_2d_int_Nx: 2d,1-Hourly,Time-Averaged,Single-Level,Assimilation,Vertically Integrated Diagnostics V5.12.4",767,1695


In [105]:
#rerun algorithms on graph and stream if needed
#fastrp
mutate_fastrp_config = '''
{
    mutateProperty: 'fastRP', 
    embeddingDimension: 56, 
    randomSeed: 42
}
'''

mutate_fastrp = f'''
    CALL gds.fastRP.mutate('prediction_graph', {mutate_fastrp_config})
'''

#degree
mutate_degree_config = '''
{
    mutateProperty: 'degree'
}
'''

mutate_degree = f'''
    CALL gds.degree.mutate('prediction_graph', {mutate_degree_config})
'''


#scaled degree
mutate_scaled_config = '''
{
    nodeProperties: ['degree'],
    mutateProperty: 'scaledDegree', 
    scaler: 'Mean'
}
'''

mutate_scaled = f'''
    CALL gds.scaleProperties.mutate('prediction_graph', {mutate_scaled_config})
'''


#article rank
mutate_article_rank_config = '''
{
    mutateProperty: 'articleRank'
}
'''

mutate_article_rank = f'''
    CALL gds.articleRank.mutate('prediction_graph', {mutate_article_rank_config})
'''

#closeness 
mutate_closeness_config = '''
{
    mutateProperty: 'closeness'
}
'''

mutate_closeness = f'''
    CALL gds.betweenness.mutate('prediction_graph', {mutate_closeness_config})
'''

stream_node_properties = f'''
    CALL gds.graph.nodeProperties.stream('prediction_graph', ['fastRP', 'degree', 'scaledDegree', 'articleRank', 'closeness'])
    YIELD nodeId, nodeProperty, propertyValue
    RETURN nodeId AS ID, gds.util.asNode(nodeId).shortName AS datasetName, gds.util.asNode(nodeId).name AS keywordName, nodeProperty, propertyValue
'''
##copy and paste above cypher and run directly in neo4j desktop to avoid jupyter notebook cutting off output


#must comment all below when wanting to run predictions in neo4j desktop
#graph.run(mutate_fastrp)
#graph.run(mutate_degree)
#graph.run(mutate_scaled)
#graph.run(mutate_article_rank)
#graph.run(mutate_closeness)
#graph.run(stream_node_properties)
