In [1]:
import pandas as pd
import numpy as np

from graphframes import *
from pyspark import SparkContext

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from py2neo import Graph
ip = "192.168.0.103"
graph = Graph("bolt://%s:7687"%ip, password="123456")

##### Degree Table

In [3]:
query = """
CALL algo.betweenness.stream("Person", "KNOWS")
YIELD nodeId, centrality
RETURN algo.getNodeById(nodeId).name as name, centrality
ORDER BY centrality DESC
"""

df_rank = graph.run(query).to_data_frame()
df_rank = spark.createDataFrame(df_rank.values.tolist(), list(df_rank.columns.values))
df_rank.createOrReplaceTempView('df_rank')
df_rank.show(5)

+-------+----------+
|   name|centrality|
+-------+----------+
|  Estes|      22.5|
|  Steph|      19.0|
|Antonia|      19.0|
|Melanie|      19.0|
| Briley|      16.0|
+-------+----------+
only showing top 5 rows



##### Component Table

In [4]:
query = """
CALL algo.unionFind.stream("Person", "KNOWS")
YIELD nodeId, setId
RETURN algo.getNodeById(nodeId).name as name, setId as component
"""

graph.run(query).to_data_frame()

df_tmp = graph.run(query).to_data_frame()
df_tmp = spark.createDataFrame(df_tmp.values.tolist(), list(df_tmp.columns.values))
df_tmp.createOrReplaceTempView('df_tmp')
df_tmp.show(5)

+-------+---------+
|   name|component|
+-------+---------+
|  Kyson|        0|
| Xander|        1|
|Gabriel|        2|
|  Edith|        0|
| Emelia|        0|
+-------+---------+
only showing top 5 rows



##### Size Table (for filtering)

In [5]:
sql = """
select component, count(*) as sz
from df_tmp
group by component
order by sz desc
"""

df_sz = spark.sql(sql)
df_sz.createOrReplaceTempView('df_sz')
df_sz.show(5)

+---------+---+
|component| sz|
+---------+---+
|        9| 14|
|       14| 14|
|        0|  7|
|        1|  3|
|        7|  2|
+---------+---+
only showing top 5 rows



#### Mark Core


<img src="assets/50_degree_center.svg" alt="Drawing" style="width: 800px;"/>

In [6]:
sql = """
with numbered as (
select
  a.name, a.component, b.centrality
  ,row_number() over (partition by component order by centrality desc, a.name) as rn
from df_tmp as a
join df_rank as b
on a.name = b.name
)
select a.name, a.component, a.centrality, b.sz
from numbered a
join df_sz as b
on a.component = b.component
where rn = 1
  and b.sz > 2
"""
df_core = spark.sql(sql).toPandas()
df_core.head()

Unnamed: 0,name,component,centrality,sz
0,Edith,0,2.0,7
1,Melanie,9,19.0,14
2,Xander,1,1.0,3
3,Estes,14,22.5,14


In [None]:
# Unmark Core
cql = """
MATCH (n)
REMOVE n:Core
return n
"""
graph.run(cql)

In [7]:
cql = """
MATCH (n)
WHERE n.name IN [%s]
SET n:Core
return n
"""%", ".join("\"%s\""%core for core in df_core.name.values)
print(cql)
graph.run(cql)


MATCH (n)
WHERE n.name IN ["Edith", "Melanie", "Xander", "Estes"]
SET n:Core
return n

