In [1]:
# Import GraphFrame by name instead of through a wildcard, so the origin of
# the name used when building the graph is explicit.
from graphframes import GraphFrame
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session with the builder's default settings.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Vertex and edge tables: the first row of each CSV supplies the column names.
# No schema is given and inferSchema is not enabled, so columns load as strings.
v = spark.read.option("header", True).csv("data/nodes.csv")
e = spark.read.option("header", True).csv("data/edges.csv")

# Build the graph from the vertex table (keyed by `id`) and the edge table
# (`src` -> `dst`).
g = GraphFrame(v, e)

In [4]:
# Bring every vertex row back to the driver as a list of Row objects and show
# it as the cell output. collect() materializes the whole table locally, which
# is fine for a sample graph this small.
g.vertices.collect()

[Row(id='a'), Row(id='b'), Row(id='c')]

In [5]:
# Bring every edge row (src, dst, relationship) back to the driver and show it
# as the cell output. collect() materializes the whole table locally.
g.edges.collect()

[Row(src='a', dst='b', relationship='FOLLOWS'),
 Row(src='a', dst='c', relationship='FOLLOWS'),
 Row(src='b', dst='c', relationship='FOLLOWS')]

In [6]:
# Per-vertex degree tables provided by the GraphFrame.
total_degree = g.degrees
in_degree = g.inDegrees
out_degree = g.outDegrees

# Start from the total-degree table and left-join the in/out counts on `id`.
# A vertex with no incoming (or no outgoing) edges has no row in the matching
# table, so the join leaves a null there; those nulls are replaced with 0
# before the rows are listed from highest to lowest in-degree.
(
    total_degree
    .join(in_degree, on="id", how="left")
    .join(out_degree, on="id", how="left")
    .na.fill(0)
    .orderBy("inDegree", ascending=False)
    .show()
)

+---+------+--------+---------+
| id|degree|inDegree|outDegree|
+---+------+--------+---------+
|  c|     2|       2|        0|
|  b|     2|       1|        1|
|  a|     2|       0|        2|
+---+------+--------+---------+



In [7]:
# PageRank run to convergence: `tol` is the convergence tolerance and
# `resetProbability` is the chance of jumping to a random vertex at each step.
results = g.pageRank(tol=0.01, resetProbability=0.15)

# List the vertices from highest to lowest score.
(
    results.vertices
    .orderBy("pagerank", ascending=False)
    .show()
)

+---+------------------+
| id|          pagerank|
+---+------------------+
|  c|1.5626080513707088|
|  b|0.8446530007409238|
|  a|0.5927389478883676|
+---+------------------+



In [8]:
# Inspect the edges of the PageRank result graph: each row carries the original
# edge columns plus a `weight` column added by the algorithm.
results.edges.collect()

[Row(src='b', dst='c', relationship='FOLLOWS', weight=1.0),
 Row(src='a', dst='c', relationship='FOLLOWS', weight=0.5),
 Row(src='a', dst='b', relationship='FOLLOWS', weight=0.5)]

In [9]:
# Inspect the vertices of the PageRank result graph: each row carries the
# vertex `id` plus its computed `pagerank` score.
results.vertices.collect()

[Row(id='a', pagerank=0.5927389478883676),
 Row(id='b', pagerank=0.8446530007409238),
 Row(id='c', pagerank=1.5626080513707088)]

In [10]:
# PageRank again, this time for a fixed 20 iterations (`maxIter`) rather than
# until a convergence tolerance is met.
# NOTE(review): this rebinds `results`, replacing the tolerance-based run.
results = g.pageRank(maxIter=20, resetProbability=0.15)

# List the vertices from highest to lowest score.
(
    results.vertices
    .orderBy("pagerank", ascending=False)
    .show()
)

+---+------------------+
| id|          pagerank|
+---+------------------+
|  c|1.5626080513707088|
|  b|0.8446530007409236|
|  a|0.5927389478883675|
+---+------------------+

