## 

In [40]:
import pandas as pd
from neo4j import GraphDatabase
import os
from dotenv import load_dotenv

In [42]:
load_dotenv()

True

In [45]:
bolt_url = os.getenv('bolt_url')
user = os.getenv('user')
password = os.getenv('password')
driver = GraphDatabase.driver(bolt_url,auth=(user,password))

In [4]:
def run_query(query):
    with driver.session() as session:
        result = session.run(query)
        # result contains a dictionary with list of values as a value
        return pd.DataFrame([r.values() for r in result],columns=result.keys())

In [7]:
run_query("""
CALL apoc.schema.assert({}, {})
""")
# Run this to drop all constraints

# To drop specific constraints, use DROP CONSTRAINT ON (c:City) ASSERT c.id IS UNIQUE

Unnamed: 0,label,key,keys,unique,action
0,City,id,[id],True,DROPPED


In [8]:
run_query("""
CREATE CONSTRAINT IF NOT EXISTS ON (c:City) ASSERT c.id IS UNIQUE;
""")

In [9]:
run_query("""
CALL apoc.periodic.iterate('
  CALL apoc.load.json("https://nomadlist.com/graph.json")
  YIELD value
  WITH value, [x in keys(value) WHERE x <> "README" | x] AS keys
  UNWIND keys AS source_city
  WITH source_city, value
  RETURN source_city, value
','
  MERGE (s:City{name:source_city})
  WITH value[source_city] as destinations, s
  WHERE destinations <> []
  WITH destinations, keys(destinations) as destination_cities, s
  UNWIND destination_cities AS destination_city
  MERGE (t:City{name:destination_city})
  MERGE (s)-[r:TRAVEL_TO]->(t)
  SET r.weight = destinations[destination_city]', 
  {batchSize:10})
""")

Unnamed: 0,batches,total,timeTaken,committedOperations,failedOperations,failedBatches,retries,errorMessages,batch,operations,wasTerminated,failedParams,updateStatistics
0,34,332,25,332,0,0,0,{},"{'total': 34, 'committed': 34, 'failed': 0, 'e...","{'total': 332, 'committed': 332, 'failed': 0, ...",False,{},"{'nodesDeleted': 0, 'labelsAdded': 332, 'relat..."


In [10]:
run_query("""
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/nomad/nomad_cities_location.csv" as row
MATCH (c:City)
WHERE c.name = row.city
SET c.location = point({latitude:toFloat(row.latitude), longitude:toFloat(row.longitude)})
""")

# The entire graph<br>
![](./entire-graph.svg)

#  Exploratory Graph Analysis

In [12]:
run_query("""
MATCH (s:City)-[r:TRAVEL_TO]->(t:City)
WITH r,distance(s.location, t.location) / 1000 AS distanceInKm
SET r.distance = distanceInKm
""")

## A sub graph from the entire graph 
###  Query used was 
#### MATCH p=()-[r:TRAVEL_TO]->()
#### RETURN p LIMIT 25 <br>
![](./sub-graph.svg)

In [13]:
# Total sum of weight property
run_query("""
MATCH p=()-[r:TRAVEL_TO]->()
RETURN sum(r.weight) as all_travels
""")

Unnamed: 0,all_travels
0,32709


In [14]:
# Identifying popular destinations
run_query("""
MATCH (c:City)<-[r:TRAVEL_TO]-()
RETURN c.name as city, sum(r.weight) as travels
ORDER BY travels DESC
LIMIT 10
""")

Unnamed: 0,city,travels
0,bangkok-thailand,1814
1,new-york-city-ny-united-states,1591
2,chiang-mai-thailand,1210
3,san-francisco-ca-united-states,1149
4,berlin-germany,1078
5,singapore-singapore,1077
6,los-angeles-ca-united-states,979
7,lisbon-portugal,934
8,barcelona-spain,906
9,mexico-city-mexico,886


In [16]:
# how far do people travel on an average
run_query("""
MATCH ()-[r:TRAVEL_TO]->()
RETURN sum(CASE WHEN r.distance < 500 THEN r.weight END) 
          / toFloat(sum(r.weight)) AS within_500,
sum(CASE WHEN 500 < r.distance < 1000 THEN r.weight END) 
          / toFloat(sum(r.weight)) AS within_1000,
sum(CASE WHEN 1000 < r.distance < 2000 THEN r.weight END) 
          / toFloat(sum(r.weight)) AS within_2000,
sum(CASE WHEN 2000 < r.distance < 3000  THEN r.weight END) 
          / toFloat(sum(r.weight)) AS within_3000,
sum(CASE WHEN 3000 < r.distance < 4000  THEN r.weight END) 
          / toFloat(sum(r.weight)) AS within_4000,
sum(CASE WHEN 4000 < r.distance  THEN r.weight END) 
          / toFloat(sum(r.weight)) AS rest
""")

Unnamed: 0,within_500,within_1000,within_2000,within_3000,within_4000,rest
0,0.273411,0.190376,0.183099,0.103121,0.057232,0.101195


## Graph Data Science

In [4]:
run_query("""
CALL gds.graph.project('nomad', 'City', 'TRAVEL_TO', {relationshipProperties:'weight'});
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'City': {'label': 'City', 'properties': {}}}","{'TRAVEL_TO': {'orientation': 'NATURAL', 'aggr...",nomad,332,2554,218


In [5]:
# Weakly connected components
wcc = run_query("""
CALL gds.wcc.stats('nomad')
YIELD componentCount, componentDistribution;
""")
wcc
# What we can infer from here is that 
# 1. There are 26 weakly connected/ or disconnected components
# 2. There is a super component with 307 members
# 3. There are 25 components 

Unnamed: 0,componentCount,componentDistribution
0,26,"{'p99': 307, 'min': 1, 'max': 307, 'mean': 12...."


In [7]:
print(wcc['componentDistribution'])

0    {'p99': 307, 'min': 1, 'max': 307, 'mean': 12....
Name: componentDistribution, dtype: object


In [8]:
# Cities that do not have any connection with the outer world
run_query('''
MATCH (c:City)
WHERE NOT (c)--()
RETURN c.name as city
LIMIT 10;
''')

Unnamed: 0,city
0,hubli-india
1,eau-claire-wi-united-states
2,boumerdas-algeria
3,lubumbashi-dr-congo
4,kananga-dr-congo
5,port-said-egypt
6,mbuji-mayi-dr-congo
7,duhok-kurdistan
8,sulaymaniyah-kurdistan
9,qom-iran


### Betweeness centrality

In [10]:
bc = run_query('''
CALL gds.betweenness.stream('nomad')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS city, score
ORDER BY score DESC
''')
bc[:10]

Unnamed: 0,city,score
0,new-york-city-ny-united-states,8472.207287
1,dubai-united-arab-emirates,7407.355557
2,lisbon-portugal,6524.510935
3,cape-town-south-africa,5824.587856
4,berlin-germany,4618.986531
5,valencia-spain,4307.860141
6,barcelona-spain,4185.178915
7,bangkok-thailand,4043.790178
8,cairo-egypt,3627.885578
9,los-angeles-ca-united-states,3596.666354


In [15]:
# Add a property of betweeness centrality to all nodes
# label_names = bc['city']
# scores = bc['score']

# for city, score in zip(label_names,scores):
#     query = '''
#     MATCH (n {name:' '''+city+''' '})
#     SET n.betweenness_cent_score='''+str(score)
#     run_query(query)
# print("Betweeness centrality added as property") , DID NOT WORK

Betweeness centrality added as property


In [16]:
# Add PageRank as property
query = '''
CALL gds.pageRank.write('nomad', {relationshipWeightProperty:'weight', writeProperty:'pagerank'});
'''
pr = run_query(query)
pr

Unnamed: 0,writeMillis,nodePropertiesWritten,ranIterations,didConverge,centralityDistribution,postProcessingMillis,preProcessingMillis,computeMillis,configuration
0,165,332,20,False,"{'p99': 8.755492210388184, 'min': 0.1499996185...",691,1,497,"{'maxIterations': 20, 'writeConcurrency': 4, '..."


In [17]:
# Add louvain modularity as property
query = '''
CALL gds.louvain.write('nomad', {relationshipWeightProperty:'weight', writeProperty:'louvain'});
'''
lm = run_query(query)
lm

Unnamed: 0,writeMillis,nodePropertiesWritten,modularity,modularities,ranLevels,communityCount,communityDistribution,postProcessingMillis,preProcessingMillis,computeMillis,configuration
0,288,332,0.674092,"[0.6037740918402317, 0.6740923843121237]",2,34,"{'p99': 74, 'min': 1, 'max': 74, 'mean': 9.764...",116,0,4055,"{'maxIterations': 10, 'writeConcurrency': 4, '..."


#### Louvain and PageRank
![](Louvain_pagerank.png)

In [20]:
# Trying to write betweeness centrality as property
query = '''
CALL gds.betweenness.write('nomad',{writeProperty:'betweness'});
'''
run_query(query)

Unnamed: 0,nodePropertiesWritten,writeMillis,centralityDistribution,postProcessingMillis,preProcessingMillis,computeMillis,configuration
0,332,83,"{'p99': 5824.593734741211, 'min': 0.0, 'max': ...",448,0,141,"{'writeConcurrency': 4, 'writeProperty': 'betw..."


### Bloom visualization based on Louvain and Pagerank
1. Minimum size 1x
![](./bloom-visualisation-louvain.png)

2. Minimum size 0.5x
![](./bloom-visualisation.png) 

In [22]:
# <> means not equal in cypher

In [20]:
# Define a pipeline
run_query("""
CALL gds.beta.pipeline.linkPrediction.create('lp-pipeline');
""")

Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,parameterSpace
0,lp-pipeline,[],[],"{'negativeSamplingRatio': 1.0, 'testFraction':...","{'RandomForest': [], 'LogisticRegression': []}"


In [22]:
# FastRP embedding
query = '''
CALL gds.beta.pipeline.linkPrediction.addNodeProperty(
  'lp-pipeline', 
  'fastRP', {
    mutateProperty: 'embedding',
    featureProperties:['pagerank'],
    embeddingDimension: 128,
    randomSeed: 42
});
'''
frp = run_query(query)
frp

Unnamed: 0,name,nodePropertySteps,featureSteps,splitConfig,parameterSpace
0,lp-pipeline,"[{'name': 'gds.fastRP.mutate', 'config': {'ran...",[],"{'negativeSamplingRatio': 1.0, 'testFraction':...","{'RandomForest': [], 'LogisticRegression': []}"


In [23]:
run_query("""
CALL gds.beta.pipeline.linkPrediction.addFeature('lp-pipeline', 'cosine', {
  nodeProperties: ['embedding']
}) YIELD featureSteps;
""")

Unnamed: 0,featureSteps
0,"[{'name': 'COSINE', 'config': {'nodeProperties..."


In [24]:
run_query("""
CALL gds.beta.pipeline.linkPrediction.configureSplit(
 'lp-pipeline', {  
   testFraction: 0.3,
   trainFraction: 0.6,
   validationFolds: 7})
YIELD splitConfig;
""")

Unnamed: 0,splitConfig
0,"{'negativeSamplingRatio': 1.0, 'testFraction':..."


In [25]:
run_query("""
CALL gds.beta.pipeline.linkPrediction.addLogisticRegression(
  'lp-pipeline',  
    {tolerance: 0.001, maxEpochs: 500})
YIELD parameterSpace;
""")

Unnamed: 0,parameterSpace
0,"{'RandomForest': [], 'LogisticRegression': [{'..."


In [26]:
# Projecting relationships as undirected
run_query("""
CALL gds.graph.project('lp-graph', 
  'City', 
  {TRAVEL_TO:{orientation:'UNDIRECTED'}}, 
  {nodeProperties:['pagerank']});
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'City': {'label': 'City', 'properties': {'pag...","{'TRAVEL_TO': {'orientation': 'UNDIRECTED', 'a...",lp-graph,332,5108,193


In [27]:
run_query("""
CALL gds.beta.pipeline.linkPrediction.train('lp-graph', 
  {pipeline: 'lp-pipeline',
   modelName: 'lp-model',
   randomSeed: 42})
YIELD modelInfo
RETURN  modelInfo.bestParameters AS winningModel,  modelInfo.metrics.AUCPR.outerTrain AS trainGraphScore,  modelInfo.metrics.AUCPR.test AS testGraphScore;
""")

Unnamed: 0,winningModel,trainGraphScore,testGraphScore
0,"{'maxEpochs': 500, 'minEpochs': 1, 'penalty': ...",0.786953,0.83135


In [28]:
run_query("""
CALL gds.beta.pipeline.linkPrediction.predict.mutate('lp-graph', 
  {modelName: 'lp-model',  
   mutateRelationshipType: 'TRAVEL_PREDICTED',
   topN: 20,
   threshold: 0.45})
YIELD relationshipsWritten;
""")

Unnamed: 0,relationshipsWritten
0,40


In [32]:
probable = run_query("""
CALL gds.graph.streamRelationshipProperty('lp-graph', 
  'probability', 
  ['TRAVEL_PREDICTED'])
YIELD  sourceNodeId, targetNodeId, propertyValue
WHERE sourceNodeId < targetNodeId
WITH  gds.util.asNode(sourceNodeId).name as city1, gds.util.asNode(targetNodeId).name as city2, propertyValue as probability
ORDER BY probability DESC
RETURN city1, city2, probability
LIMIT 10;
""")
probable

Unnamed: 0,city1,city2,probability
0,krabi-thailand,luang-prabang-laos,0.500981
1,ko-pha-ngan-thailand,luang-prabang-laos,0.50098
2,siem-reap-cambodia,yangon-myanmar,0.50098
3,ho-chi-minh-city-vietnam,yangon-myanmar,0.500978
4,luang-prabang-laos,yangon-myanmar,0.500978
5,krabi-thailand,penang-malaysia,0.500978
6,yangon-myanmar,hoi-an-vietnam,0.500977
7,luang-prabang-laos,hoi-an-vietnam,0.500977
8,adelaide-australia,byron-bay-australia,0.500977
9,krabi-thailand,yangon-myanmar,0.500977


In [33]:
len(probable)

10

In [None]:
# Now we will create the TRAVEL_PREDICTED edges between the predicted cities with probability as property of the relationship

In [39]:
for i,rec in probable.iterrows():
    print(rec)
    city1 = rec['city1']
    city2 = rec['city2']
    prob = rec['probability']
    qu = '''
    MERGE (c1:City {name:'%s'})
    MERGE (c2:City {name:'%s'})
    MERGE (c1)-[r:TRAVEL_PREDICTED {probability:'%s'}]->(c2)
    '''
    args = (city1,city2,prob)
    run_query(qu % args)

city1              krabi-thailand
city2          luang-prabang-laos
probability              0.500981
Name: 0, dtype: object
city1          ko-pha-ngan-thailand
city2            luang-prabang-laos
probability                 0.50098
Name: 1, dtype: object
city1          siem-reap-cambodia
city2              yangon-myanmar
probability               0.50098
Name: 2, dtype: object
city1          ho-chi-minh-city-vietnam
city2                    yangon-myanmar
probability                    0.500978
Name: 3, dtype: object
city1          luang-prabang-laos
city2              yangon-myanmar
probability              0.500978
Name: 4, dtype: object
city1           krabi-thailand
city2          penang-malaysia
probability           0.500978
Name: 5, dtype: object
city1          yangon-myanmar
city2          hoi-an-vietnam
probability          0.500977
Name: 6, dtype: object
city1          luang-prabang-laos
city2              hoi-an-vietnam
probability              0.500977
Name: 7, dtype: obje

#### Predicted travel edges
![](./predicted-travel.svg)

#### To check procedures available, run or use the cells below

In [8]:
procedures = run_query("CALL gds.list();")

In [10]:
procedures

Unnamed: 0,name,description,signature,type
0,gds.allShortestPaths.delta.mutate,The Delta Stepping shortest path algorithm com...,gds.allShortestPaths.delta.mutate(graphName ::...,procedure
1,gds.allShortestPaths.delta.mutate.estimate,Returns an estimation of the memory consumptio...,gds.allShortestPaths.delta.mutate.estimate(gra...,procedure
2,gds.allShortestPaths.delta.stream,The Delta Stepping shortest path algorithm com...,gds.allShortestPaths.delta.stream(graphName ::...,procedure
3,gds.allShortestPaths.delta.stream.estimate,Returns an estimation of the memory consumptio...,gds.allShortestPaths.delta.stream.estimate(gra...,procedure
4,gds.allShortestPaths.delta.write,The Delta Stepping shortest path algorithm com...,gds.allShortestPaths.delta.write(graphName :: ...,procedure
...,...,...,...,...
295,gds.util.infinity,RETURN gds.util.infinity() - Return infinity a...,gds.util.infinity() :: (FLOAT?),function
296,gds.util.isFinite,RETURN gds.util.isFinite(value) - Return true ...,gds.util.isFinite(value :: NUMBER?) :: (BOOLEAN?),function
297,gds.util.isInfinite,RETURN gds.util.isInfinite(value) - Return tru...,gds.util.isInfinite(value :: NUMBER?) :: (BOOL...,function
298,gds.util.nodeProperty,Returns a node property value from a named in-...,"gds.util.nodeProperty(graphName :: STRING?, no...",function


In [18]:
for i in procedures['name']:
    if i.startswith('gds.beta.pipeline'):
        print(i)

gds.beta.pipeline.drop
gds.beta.pipeline.exists
gds.beta.pipeline.linkPrediction.addFeature
gds.beta.pipeline.linkPrediction.addLogisticRegression
gds.beta.pipeline.linkPrediction.addNodeProperty
gds.beta.pipeline.linkPrediction.configureSplit
gds.beta.pipeline.linkPrediction.create
gds.beta.pipeline.linkPrediction.predict.mutate
gds.beta.pipeline.linkPrediction.predict.mutate.estimate
gds.beta.pipeline.linkPrediction.predict.stream
gds.beta.pipeline.linkPrediction.predict.stream.estimate
gds.beta.pipeline.linkPrediction.train
gds.beta.pipeline.linkPrediction.train.estimate
gds.beta.pipeline.list
gds.beta.pipeline.nodeClassification.addLogisticRegression
gds.beta.pipeline.nodeClassification.addNodeProperty
gds.beta.pipeline.nodeClassification.configureSplit
gds.beta.pipeline.nodeClassification.create
gds.beta.pipeline.nodeClassification.predict.mutate
gds.beta.pipeline.nodeClassification.predict.mutate.estimate
gds.beta.pipeline.nodeClassification.predict.stream
gds.beta.pipeline.nodeC