In [1]:
import pandas as pd
import numpy as np

from graphframes import *
from pyspark import SparkContext

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from py2neo import Graph
ip = "192.168.0.103"
graph = Graph("bolt://%s:7687"%ip, password="123456")

##### Degree Table

In [3]:
query = """
CALL algo.pageRank.stream('Person', 'KNOWS', {iterations:20, dampingFactor:0.85})
YIELD nodeId, score
RETURN algo.getNodeById(nodeId).name AS name, score as centrality
ORDER BY centrality DESC
"""

df_rank = graph.run(query).to_data_frame()
df_rank = spark.createDataFrame(df_rank.values.tolist(), list(df_rank.columns.values))
df_rank.createOrReplaceTempView('df_rank')
df_rank.show(5)

+-------+------------------+
|   name|        centrality|
+-------+------------------+
|  Jayce| 2.287159536732361|
|  Honor|2.2093509583268314|
|  Eskil|1.9679772497620436|
|   Ylia|1.8108202247880398|
|Sanders|1.5414855016628275|
+-------+------------------+
only showing top 5 rows



##### Component Table

In [4]:
query = """
CALL algo.unionFind.stream("Person", "KNOWS")
YIELD nodeId, setId
RETURN algo.getNodeById(nodeId).name as name, setId as component
"""

graph.run(query).to_data_frame()

df_tmp = graph.run(query).to_data_frame()
df_tmp = spark.createDataFrame(df_tmp.values.tolist(), list(df_tmp.columns.values))
df_tmp.createOrReplaceTempView('df_tmp')
df_tmp.show(5)

+--------+---------+
|    name|component|
+--------+---------+
|  Lucius|       18|
|   Randy|       38|
|    Cruz|        2|
|Felicity|        3|
|Anderson|       18|
+--------+---------+
only showing top 5 rows



##### Size Table (for filtering)

In [5]:
sql = """
select component, count(*) as sz
from df_tmp
group by component
order by sz desc
"""

df_sz = spark.sql(sql)
df_sz.createOrReplaceTempView('df_sz')
df_sz.show(5)

+---------+---+
|component| sz|
+---------+---+
|       60| 39|
|       18| 24|
|       38| 16|
|       30|  9|
|       11|  2|
+---------+---+
only showing top 5 rows



#### Mark Core


<img src="assets/50_closeness_center.svg" alt="Drawing" style="width: 800px;"/>

In [6]:
sql = """
with numbered as (
select
  a.name, a.component, b.centrality
  ,row_number() over (partition by component order by centrality desc, a.name) as rn
from df_tmp as a
join df_rank as b
on a.name = b.name
)
select a.name, a.component, a.centrality, b.sz
from numbered a
join df_sz as b
on a.component = b.component
where rn = 1
  and b.sz > 2
"""
df_core = spark.sql(sql).toPandas()
df_core.head()

Unnamed: 0,name,component,centrality,sz
0,Boston,18,0.921808,24
1,Archer,38,0.952334,16
2,Shiloh,30,0.449625,9
3,Jayce,60,2.28716,39


In [7]:
# Unmark Core
cql = """
MATCH (n)
REMOVE n:Core
return n
"""
graph.run(cql)

<py2neo.database.Cursor at 0x7ff80f2bf2b0>

In [8]:
cql = """
MATCH (n)
WHERE n.name IN [%s]
SET n:Core
return n
"""%", ".join("\"%s\""%core for core in df_core.name.values)
print(cql)
graph.run(cql)


MATCH (n)
WHERE n.name IN ["Boston", "Archer", "Shiloh", "Jayce"]
SET n:Core
return n



<py2neo.database.Cursor at 0x7ff80f2bffd0>