In [765]:
import os
from py2neo import Graph, Node, Relationship
from graphdatascience import GraphDataScience

# Get Neo4j client
graph = Graph("bolt://localhost:7690", auth=("neo4j", "exoplanet"))

In [766]:
#create an unknown set of stars to model on. 
#rand value is percent of database as unknowns
create_unknowns = f'''
    MATCH (N)
    WITH N
    WHERE rand() < 0.2
    REMOVE N:star
    SET N:unknownStar
'''

graph.run(create_unknowns)

In [767]:
##drop functions for pipeline, prediction graph
drop_pipeline = f'''
    CALL gds.beta.pipeline.drop('pipe', False)
'''
drop_prediction_graph = f'''
    CALL gds.graph.drop('prediction_graph', False)
'''
drop_model = f'''
    CALL gds.beta.model.drop('nr_model', False)
'''

graph.run(drop_pipeline)
graph.run(drop_prediction_graph)
graph.run(drop_model)

modelInfo,trainConfig,graphSchema,loaded,stored,creationTime,shared
"{modelName: 'nr_model', featureProperties: ['num_planets', 'effective_temperature', 'radius', 'mass', 'metallicity', 'luminosity', 'surface_gravity', 'age', 'density', 'rotational_velocity', 'rotational_period'], modelType: 'NodeRegression', metrics: {MEAN_SQUARED_ERROR: {test: 0.7164596679819795, validation: {min: 0.682950776099251, max: 0.7750458866201602, avg: 0.7165011697143114}, outerTrain: 0.7163053633102167, train: {min: 0.6868734734628394, max: 0.7331744252604133, avg: 0.7164029222783265}}}, pipeline: {featureProperties: [{feature: 'num_planets'}, {feature: 'effective_temperature'}, {feature: 'radius'}, {feature: 'mass'}, {feature: 'metallicity'}, {feature: 'luminosity'}, {feature: 'surface_gravity'}, {feature: 'age'}, {feature: 'density'}, {feature: 'rotational_velocity'}, {feature: 'rotational_period'}], nodePropertySteps: []}, bestParameters: {minEpochs: 1, maxEpochs: 100, patience: 7, tolerance: 0.4434069566039899, learningRate: 0.0003404910155852413, batchSize: 200, penalty: 69.0, methodName: 'LinearRegression'}, nodePropertySteps: []}","{targetProperty: 'num_planets', randomSeed: 42, jobId: '5a228f81-c8c5-441b-ba3f-0fa42c92a7e6', graphName: 'prediction_graph', sudo: false, storeModelToDisk: false, modelName: 'nr_model', logProgress: true, metrics: ['MEAN_SQUARED_ERROR'], pipeline: 'pipe', concurrency: 4, relationshipTypes: ['*'], targetNodeLabels: ['star']}","{graphProperties: {}, nodes: {star: {mass: 'Float (DefaultValue(NaN), PERSISTENT)', num_planets: 'Float (DefaultValue(NaN), PERSISTENT)', rotational_period: 'Float (DefaultValue(NaN), PERSISTENT)', density: 'Float (DefaultValue(NaN), PERSISTENT)', age: 'Float (DefaultValue(NaN), PERSISTENT)', effective_temperature: 'Float (DefaultValue(NaN), PERSISTENT)', rotational_velocity: 'Float (DefaultValue(NaN), PERSISTENT)', radius: 'Float (DefaultValue(NaN), PERSISTENT)', surface_gravity: 'Float (DefaultValue(NaN), PERSISTENT)', luminosity: 'Float (DefaultValue(NaN), PERSISTENT)', metallicity: 'Float (DefaultValue(NaN), PERSISTENT)'}}, relationships: {__ALL__: {}}}",True,False,datetime('2024-01-31T12:44:26.395303000-05:00'),False


In [768]:
#blank map
star_properties = '''
{
    star: {properties: ['num_planets', 'effective_temperature', 'radius',
    'mass', 'metallicity', 'luminosity', 'surface_gravity', 'age',
    'density', 'rotational_velocity', 'rotational_period']},
    unknownStar: {properties: ['num_planets', 'effective_temperature', 'radius',
    'mass', 'metallicity', 'luminosity', 'surface_gravity', 'age',
    'density', 'rotational_velocity', 'rotational_period']}
}
'''
#create prediction graph
create_prediction_graph = f'''
    CALL gds.graph.project(
    'prediction_graph',
    {star_properties},
    '*'
    )
'''

graph.run(create_prediction_graph)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{unknownStar: {label: 'unknownStar', properties: {mass: {property: 'mass', defaultValue: null}, num_planets: {property: 'num_planets', defaultValue: null}, rotational_period: {property: 'rotational_period', defaultValue: null}, density: {property: 'density', defaultValue: null}, age: {property: 'age', defaultValue: null}, effective_temperature: {property: 'effective_temperature', defaultValue: null}, rotational_velocity: {property: 'rotational_velocity', defaultValue: null}, radius: {property: 'radius', defaultValue: null}, surface_gravity: {property: 'surface_gravity', defaultValue: null}, luminosity: {property: 'luminosity', defaultValue: null}, metallicity: {property: 'metallicity', defaultValue: null}}}, star: {label: 'star', properties: {mass: {property: 'mass', defaultValue: null}, num_planets: {property: 'num_planets', defaultValue: null}, rotational_period: {property: 'rotational_period', defaultValue: null}, density: {property: 'density', defaultValue: null}, age: {property: 'age', defaultValue: null}, effective_temperature: {property: 'effective_temperature', defaultValue: null}, rotational_velocity: {property: 'rotational_velocity', defaultValue: null}, radius: {property: 'radius', defaultValue: null}, surface_gravity: {property: 'surface_gravity', defaultValue: null}, luminosity: {property: 'luminosity', defaultValue: null}, metallicity: {property: 'metallicity', defaultValue: null}}}}","{__ALL__: {aggregation: 'DEFAULT', orientation: 'NATURAL', indexInverse: false, properties: {}, type: '*'}}",prediction_graph,4147,0,14


In [769]:
#create/config pipeline
create_pipeline = f'''
    CALL gds.alpha.pipeline.nodeRegression.create('pipe')
'''

graph.run(create_pipeline)

name,nodePropertySteps,featureProperties,splitConfig,autoTuningConfig,parameterSpace
pipe,[],[],"{testFraction: 0.3, validationFolds: 3}",{maxTrials: 10},"{LinearRegression: [], RandomForest: []}"


In [770]:
#split config 
split_config = '''
{
    testFraction: 0.4,
    validationFolds: 3
}
'''

splits = f'''
    CALL gds.alpha.pipeline.nodeRegression.configureSplit('pipe', {split_config})
'''

graph.run(splits)

name,nodePropertySteps,featureProperties,splitConfig,autoTuningConfig,parameterSpace
pipe,[],[],"{testFraction: 0.4, validationFolds: 3}",{maxTrials: 10},"{LinearRegression: [], RandomForest: []}"


In [771]:
#add features
add_features = f'''
    CALL gds.alpha.pipeline.nodeRegression.selectFeatures('pipe',
    ['num_planets', 'effective_temperature', 'radius',
    'mass', 'metallicity', 'luminosity', 'surface_gravity', 'age',
    'density', 'rotational_velocity', 'rotational_period'])
'''

graph.run(add_features)

name,nodePropertySteps,featureProperties,splitConfig,autoTuningConfig,parameterSpace
pipe,[],"['num_planets', 'effective_temperature', 'radius', 'mass', 'metallicity', 'luminosity', 'surface_gravity', 'age', 'density', 'rotational_velocity', 'rotational_period']","{testFraction: 0.4, validationFolds: 3}",{maxTrials: 10},"{LinearRegression: [], RandomForest: []}"


In [772]:
#add model candidates


linear_config = '''
{
    batchSize: {range: [1, 300]},
    learningRate: {range: [.0001, .01]},
    patience: {range: [1, 10]},
    tolerance: {range: [.001, 3]},
    penalty: {range: [0, 100]}
}
'''


linear_regression = f'''
    CALL gds.alpha.pipeline.nodeRegression.addLinearRegression('pipe', {linear_config})
'''

graph.run(linear_regression)

forest_config = '''
{

}
'''

random_forest = f'''
    CALL gds.alpha.pipeline.nodeRegression.addRandomForest('pipe',{forest_config})
'''

#graph.run(random_forest)


In [773]:
#training the pipeline
train_config = '''
{
    pipeline: 'pipe',
    targetNodeLabels: ['star'],
    modelName: 'nr_model',
    targetProperty: 'num_planets',
    randomSeed: 42,
    metrics: ['MEAN_SQUARED_ERROR', MEAN_ABSOLUTE_ERROR]
}
'''

train_pipeline = f'''
    CALL gds.alpha.pipeline.nodeRegression.train('prediction_graph', {train_config})
    YIELD modelInfo
    RETURN
    modelInfo.bestParameters as winningModel,
    modelInfo.metrics.MEAN_SQUARED_ERROR.train.avg AS avgTrainScore,
    modelInfo.metrics.MEAN_SQUARED_ERROR.outerTrain AS outerTrainScore,
    modelInfo.metrics.MEAN_SQUARED_ERROR.test AS testScore,
    modelInfo.metrics.MEAN_ABSOLUTE_ERROR.test as absoluteTestScore

'''

graph.run(train_pipeline)

winningModel,avgTrainScore,outerTrainScore,testScore,absoluteTestScore
"{minEpochs: 1, maxEpochs: 100, patience: 7, tolerance: 0.4434069566039899, learningRate: 0.0003404910155852413, batchSize: 200, penalty: 69.0, methodName: 'LinearRegression'}",0.6370290564198267,0.6373486730946374,0.8813055614850053,


In [774]:
#predict
predict_config = '''
{
    modelName: 'nr_model',
    targetNodeLabels: ['unknownStar']
}
'''

#streams stars with a number designating the amount predicted planets
predict_stream = f'''
    CALL gds.alpha.pipeline.nodeRegression.predict.stream('prediction_graph', {predict_config})
    YIELD nodeId, predictedValue
    WITH gds.util.asNode(nodeId) as node, floor(predictedValue) as predicted_num_planets
    RETURN ID(node) AS node_ID, node.id AS star, predicted_num_planets
    ORDER BY predicted_num_planets DESC
'''

#copy paste below if inserting straight to neo4j desktop
#CALL gds.alpha.pipeline.nodeRegression.predict.stream('prediction_graph', {modelName: 'nr_model', targetNodeLabels: ['unknownStar']}) YIELD nodeId, predictedValue WITH gds.util.asNode(nodeId) as node, round(predictedValue) as predicted_num_planets RETURN ID(node) AS node_ID, node.id AS star, predicted_num_planets ORDER BY predicted_num_planets DESC

#returns a correctness score based on how many it modeled correctly
predict_correctness = f'''
    CALL gds.alpha.pipeline.nodeRegression.predict.stream('prediction_graph', {predict_config})
    YIELD nodeId, predictedValue
    WITH gds.util.asNode(nodeId) as node, floor(predictedValue) as predicted_num_planets
    WITH node, predicted_num_planets, node.num_planets as actual_num_planets

    WITH 
        ID(node) as node_ID,
        node.id AS star,
        predicted_num_planets,
        actual_num_planets,
        CASE WHEN predicted_num_planets = actual_num_planets THEN 1 ELSE 0 END AS is_correct

    ORDER BY predicted_num_planets DESC
    WITH
        SUM(actual_num_planets) AS total_nodes,
        SUM(predicted_num_planets) AS correct_predictions

    RETURN
        correct_predictions AS num_planets_predicted,
        total_nodes AS actual_num_planets,
        (correct_predictions - total_nodes) / total_nodes * 100.0 AS error
'''

graph.run(predict_stream)
#graph.run(predict_correctness)

node_ID,star,predicted_num_planets
3513,NSVS 14256825,7.0
3170,Kepler-757,1.0
3179,Kepler-765,1.0


In [775]:
#Reset nodes to defaults
reset_unknowns = f'''
    MATCH (N:unknownStar)
    REMOVE N:unknownStar
    SET N:star
'''

graph.run(reset_unknowns)