In [1]:
import pandas as pd
import numpy as np

from graphframes import *
from pyspark import SparkContext

from pyspark.sql import SparkSession
# Shared Spark session used by every cell below (reuses an already-running
# session if there is one, otherwise starts a new one).
spark = SparkSession.builder.getOrCreate()

In [8]:
import os

from py2neo import Graph

# Neo4j connection settings are read from the environment so that real
# credentials never have to be committed with the notebook.
#   NEO4J_HOST     - host name / IP of the Neo4j server
#   NEO4J_PASSWORD - password for the default user
# The fallbacks reproduce the previous hard-coded values so the notebook keeps
# working on the original development setup.
# NOTE(review): drop the password fallback once NEO4J_PASSWORD is set everywhere.
ip = os.environ.get("NEO4J_HOST", "192.168.0.101")
graph = Graph(
    "bolt://%s:7687" % ip,
    password=os.environ.get("NEO4J_PASSWORD", "123456"),
)

In [9]:
# Load the day-0 vertex and edge lists (first CSV row is the header) and wrap
# them in a GraphFrame.
v, e = (
    spark.read.csv(csv_path, header=True)
    for csv_path in (
        "data/community_evolution/day_0_v.csv",
        "data/community_evolution/day_0_e.csv",
    )
)
g = GraphFrame(v, e)

##### Degree Table

In [19]:
# Per-node count of matched (p)--(r) neighbours, highest count first.
cql = "match (p)--(r) with p, count(r) as degree return p.name as name, degree order by degree desc"

# Fetch the result as a pandas frame, then rebuild it as a Spark DataFrame from
# plain Python rows plus the column names.
degree_pdf = graph.run(cql).to_data_frame()
degree_rows = degree_pdf.values.tolist()
degree_columns = list(degree_pdf.columns.values)
df_degree = spark.createDataFrame(degree_rows, degree_columns)

# The view is registered as 'df_rank'; the "Mark Core" SQL joins against that name.
df_degree.createOrReplaceTempView('df_rank')
df_degree.show(5)

+-------+------+
|   name|degree|
+-------+------+
| Gerald|     9|
| Rainey|     7|
|  Helga|     6|
|Barbara|     6|
|   Trey|     6|
+-------+------+
only showing top 5 rows



##### Component Table

In [20]:
# Union-find over Person nodes connected by KNOWS relationships: each returned
# row pairs a node's name with the id of the set (component) it belongs to.
query = """
CALL algo.unionFind.stream("Person", "KNOWS")
YIELD nodeId, setId
RETURN algo.getNodeById(nodeId).name as name, setId as component
"""

# Run the procedure exactly once. The previous version executed the same query
# a second time and threw the first result away.
component_pdf = graph.run(query).to_data_frame()

# Rebuild as a Spark DataFrame and expose it to Spark SQL as 'df_tmp', the view
# name the size and core-selection queries read from.
df_tmp = spark.createDataFrame(
    component_pdf.values.tolist(),
    list(component_pdf.columns.values),
)
df_tmp.createOrReplaceTempView('df_tmp')
df_tmp.show(5)

+--------+---------+
|    name|component|
+--------+---------+
|     Roy|        0|
|  Philip|        1|
|    Joey|        1|
|Isabella|        1|
|  Howard|        1|
+--------+---------+
only showing top 5 rows



##### Size Table (for filtering)

In [21]:
# Member count per component, largest component first.
sql = """
select component, count(*) as sz
from df_tmp
group by component
order by sz desc
"""

# Keep the Spark result queryable as the 'df_sz' view (the core-selection query
# joins on it), and bind a pandas copy to the Python name df_sz for display.
component_sizes = spark.sql(sql)
component_sizes.createOrReplaceTempView('df_sz')
df_sz = component_sizes.toPandas()
df_sz.head()

Unnamed: 0,component,sz
0,31,39
1,1,20
2,16,12
3,35,8
4,43,5


#### Mark Core


<img src="assets/50_degree_center.svg" alt="Drawing" style="width: 800px;"/>

In [24]:
# Pick one "core" member per component:
#   1. join each member's component (df_tmp) with its degree (df_rank);
#   2. number the members inside each component by degree, highest first,
#      using the name as a deterministic tie-break;
#   3. keep the member numbered 1, and only for components whose size
#      (from df_sz) is greater than 2.
sql = """
with numbered as (
select
  a.name, a.component, b.degree
  ,row_number() over (partition by component order by degree desc, a.name) as rn
from df_tmp as a
join df_rank as b
on a.name = b.name
)
select a.name, a.component, a.degree, b.sz
from numbered a
join df_sz as b
on a.component = b.component
where rn = 1
  and b.sz > 2
"""

# Evaluate in Spark, then collect to pandas; later cells read the names from it.
core_candidates = spark.sql(sql)
df_core = core_candidates.toPandas()
df_core.head()

Unnamed: 0,name,component,degree,sz
0,Joanna,43,4,5
1,Gerald,31,9,39
2,Helga,1,6,20
3,Cornelia,35,3,8
4,Florence,16,3,12


In [25]:
# Unmark Core: strip the :Core label left over from a previous run so that the
# marking step starts from a clean graph.
# Matching on (n:Core) restricts the work to nodes that actually carry the
# label; the earlier pattern MATCH (n) visited and returned every node in the
# graph. The set of nodes that end up without the label is the same.
cql = """
MATCH (n:Core)
REMOVE n:Core
return n
"""
graph.run(cql)

<py2neo.database.Cursor at 0x7f16ceccd0b8>

In [26]:
# Mark Core: add the :Core label to every node whose name was selected as a
# component core.
# The names are sent as the $names query parameter instead of being spliced
# into the Cypher text, so names containing quotes or backslashes cannot break
# (or inject into) the statement. An empty list simply matches nothing.
cql = """
MATCH (n)
WHERE n.name IN $names
SET n:Core
return n
"""
# Plain Python list of str, which is what the driver expects for a list parameter.
core_names = df_core["name"].tolist()

# Show both the statement and the parameter value for inspection.
print(cql)
print(core_names)
graph.run(cql, names=core_names)


MATCH (n)
WHERE n.name IN ["Joanna", "Gerald", "Helga", "Cornelia", "Florence"]
SET n:Core
return n



<py2neo.database.Cursor at 0x7f16cdec85f8>