## Run this in pyspark shell
pyspark --packages graphframes:graphframes:0.8.0-spark2.4-s_2.11

In [1]:
from common import Common
from pyspark import *
from pyspark.sql import *

In [2]:
## Do not run this multiple times
# Location of the GraphFrames jar that is shipped to the SparkContext below.
GRAPHFRAMES_JAR = '/home/ec2-user/jars/graphframes-0.8.0-spark2.4-s_2.11.jar'

# Obtain the SparkContext / SparkSession pair from the project helper.
common = Common()
sc, spark = common.get_spark_sql()

# Register the jar via addPyFile; the `graphframes` import in a later cell
# presumably relies on this having run first -- verify if reordering cells.
sc.addPyFile(GRAPHFRAMES_JAR)


<SparkContext master=local[*] appName=myapp>
2.4.5


## If running from the pyspark shell, skip everything above this point

In [3]:
# Import only the name this notebook uses instead of a wildcard import, so it
# is clear where `GraphFrame` comes from and the namespace stays clean.
from graphframes import GraphFrame

# Sample data file locations (not referenced by the GraphFrames cells shown here).
svm_data = '/home/ec2-user/data/sample_svm_data.txt'
libsvm_data = '/home/ec2-user/data/sample_libsvm_data.txt'

## GraphFrames

In [9]:
# Create a Vertex DataFrame with an "id" column plus two attribute columns
vertex_columns = ["id", "name", "age"]
vertex_rows = [
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
]
v = spark.createDataFrame(vertex_rows, vertex_columns)

# Create an Edge DataFrame with "src" and "dst" columns; each row is a
# directed edge labelled with its relationship type
edge_columns = ["src", "dst", "relationship"]
edge_rows = [
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
  ("c", "a", "follow"),
]
e = spark.createDataFrame(edge_rows, edge_columns)



In [10]:
# Print the vertex DataFrame (id, name, age) to confirm its contents.
v.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
+---+-------+---+



In [11]:
# Print the edge DataFrame (src, dst, relationship) to confirm its contents.
e.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
|  c|  a|      follow|
+---+---+------------+



In [8]:
# Build the GraphFrame from the vertex DataFrame `v` and the edge DataFrame `e`
g = GraphFrame(v=v, e=e)


In [None]:
# Query: Get in-degree of each vertex.
g.inDegrees.show()

# The expected output is kept as comments rather than a bare string literal,
# which would be echoed as the cell's value.
# Expected output:
# +---+--------+
# | id|inDegree|
# +---+--------+
# |  c|       1|
# |  b|       2|
# |  a|       1|
# +---+--------+


In [None]:
# Query: Count the number of "follow" connections in the graph.
follow_count = g.edges.filter("relationship = 'follow'").count()

# Query: Count the number of "friend" connections in the graph.
friend_count = g.edges.filter("relationship = 'friend'").count()

# Print both counts explicitly: a notebook cell only displays its last
# expression, so bare `.count()` calls followed by other statements are
# silently discarded.
print("follow:", follow_count)  # expected: 3
print("friend:", friend_count)  # expected: 1

## pagerank

In [None]:
# Run PageRank algorithm, and show results.
# `pageRank` returns a new GraphFrame; its vertices carry a "pagerank" column,
# which the following cell selects.
results = g.pageRank(resetProbability=0.01, maxIter=5)

# Keep `results` as the last expression so the notebook displays its repr.
results

# Expected output:
# GraphFrame(v:[id: string, name: string ... 2 more fields], e:[src: string, dst: string ... 2 more fields])


In [None]:
# Show the PageRank score computed for each vertex.
results.vertices.select("id", "pagerank").show()

# The expected output is kept as comments rather than a bare string literal,
# which would be echoed as the cell's value.
# Expected output:
# +---+------------------+
# | id|          pagerank|
# +---+------------------+
# |  b|1.1286509937624998|
# |  a|   0.6262995037375|
# |  c|      1.2450495025|
# +---+------------------+

## triangle counting

In [None]:
# Count the triangles each vertex participates in; the result is a DataFrame
# with a "count" column alongside the vertex columns.
g.triangleCount().show()

# The expected output is kept as comments rather than a bare string literal,
# which would be echoed as the cell's value.
# Expected output:
# +-----+---+-------+---+
# |count| id|   name|age|
# +-----+---+-------+---+
# |    1|  c|Charlie| 30|
# |    1|  b|    Bob| 36|
# |    1|  a|  Alice| 34|
# +-----+---+-------+---+