In [1]:
import pandas as pd
import numpy as np

from graphframes import *
from pyspark import SparkContext

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from py2neo import Graph
ip = "192.168.0.101"
graph = Graph("bolt://%s:7687"%ip, password="123456")

In [3]:
v = spark.read.csv("data/community_evolution/day_0_v.csv", header=True)
e = spark.read.csv("data/community_evolution/day_0_e.csv", header=True)
g = GraphFrame(v, e)

##### Degree Table

In [4]:
total_degree = g.degrees
in_degree = g.inDegrees
out_degree = g.outDegrees

df_degree = (total_degree.join(in_degree, "id", how="left")
 .join(out_degree, "id", how="left")
 .join(v, "id", how="left")
 .fillna(0)
 .sort("degree", ascending=False))

df_degree.createOrReplaceTempView('df_degree')
df_degree = df_degree.toPandas()
df_degree.head(5)

Unnamed: 0,id,degree,inDegree,outDegree,name
0,94,9,6,3,Gerald
1,99,7,5,2,Rainey
2,71,6,2,4,Lenny
3,31,6,2,4,Barbara
4,95,6,3,3,Trey


In [5]:
df_degree.shape

(84, 5)

##### Component Table

In [6]:
sc = spark.sparkContext
sc.setCheckpointDir("checkpoint")

df_cc = g.connectedComponents()
df_cc.createOrReplaceTempView('df_tmp')

df_cc.show()

+---+--------+------------+
| id|    name|   component|
+---+--------+------------+
| 51|Channing| 25769803776|
|  7|  Kilian| 25769803776|
| 15| Gabriel| 25769803776|
| 54|  Joanna| 60129542145|
| 11|    Oren| 25769803776|
| 29|Geoffrey|120259084288|
| 69|Arabella|120259084289|
| 42| Bristol| 25769803776|
| 73|Cheyenne|137438953472|
| 87|   Julie|120259084289|
| 64| Katelyn|120259084289|
|  3|Isabella| 25769803776|
| 30|   Donna|163208757248|
| 34|  Zahara|120259084289|
| 59|   Coryn| 25769803776|
|  8|   Helen|223338299392|
| 22|Florence|249108103168|
| 28|  Wilson|249108103169|
| 85|   Idaia|120259084289|
| 16|  Winona|249108103168|
+---+--------+------------+
only showing top 20 rows



##### Size Table (for filtering)

In [7]:
sql = """
select component, count(*) as sz
from df_tmp
group by component
order by sz desc
"""

df_sz = spark.sql(sql)
df_sz.createOrReplaceTempView('df_sz')
df_sz = df_sz.toPandas()
df_sz.head()

Unnamed: 0,component,sz
0,120259084289,39
1,25769803776,20
2,249108103168,12
3,163208757248,8
4,60129542145,5


#### Mark Core


<img src="assets/50_degree_center.svg" alt="Drawing" style="width: 800px;"/>

In [8]:
sql = """
with numbered as (
select
  a.name, a.component, b.degree
  ,row_number() over (partition by component order by degree desc, a.name) as rn
from df_tmp as a
join df_degree as b
on a.id = b.id
)
select * 
from numbered a
join df_sz as b
on a.component = b.component
where rn = 1
  and b.sz > 2
"""
df_core = spark.sql(sql).toPandas()
df_core.head()

Unnamed: 0,name,component,degree,rn,component.1,sz
0,Cornelia,163208757248,3,1,163208757248,8
1,Gerald,120259084289,9,1,120259084289,39
2,Florence,249108103168,3,1,249108103168,12
3,Helga,25769803776,6,1,25769803776,20
4,Joanna,60129542145,4,1,60129542145,5


In [9]:
# Unmark Core
cql = """
MATCH (n)
REMOVE n:Core
return n
"""
graph.run(cql)

<py2neo.database.Cursor at 0x7f6c5cb5d2e8>

In [10]:
cql = """
MATCH (n)
WHERE n.name IN [%s]
SET n:Core
return n
"""%", ".join("\"%s\""%core for core in df_core.name.values)
print(cql)
graph.run(cql)


MATCH (n)
WHERE n.name IN ["Cornelia", "Gerald", "Florence", "Helga", "Joanna"]
SET n:Core
return n



<py2neo.database.Cursor at 0x7f6c5d37c9b0>