In [1]:
from graphframes import *
from pyspark import SparkContext

In [2]:
from pyspark.sql import SparkSession
# Reuse the already-active Spark session if there is one; otherwise build a new
# session with default settings. Every later cell reads data through `spark`.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Vertex table: the first CSV line is treated as the header row
# (the collected output shows columns `id` and `name`).
v = spark.read.option("header", True).csv("data/social-nodes.csv")

# Edge table: same header handling
# (the collected output shows columns `src`, `dst`, `relationship`).
e = spark.read.option("header", True).csv("data/social-relationships.csv")

# Combine both tables into the graph object that the remaining cells query.
g = GraphFrame(v, e)

In [4]:
# Sanity check: bring every vertex row back to the driver and display it.
# The graph is tiny (nine people), so collecting the full table is fine here.
g.vertices.collect()

[Row(id='a', name='Alice'),
 Row(id='b', name='Bridget'),
 Row(id='c', name='Charles'),
 Row(id='d', name='Doug'),
 Row(id='m', name='Mark'),
 Row(id='i', name='Michael'),
 Row(id='v', name='David'),
 Row(id='y', name='Amy'),
 Row(id='j', name='James')]

In [5]:
# Sanity check: display every edge row. Each row is a directed
# src -> dst relationship (all of them are 'FOLLOWS' in this dataset).
g.edges.collect()

[Row(src='a', dst='b', relationship='FOLLOWS'),
 Row(src='a', dst='c', relationship='FOLLOWS'),
 Row(src='m', dst='d', relationship='FOLLOWS'),
 Row(src='b', dst='i', relationship='FOLLOWS'),
 Row(src='d', dst='m', relationship='FOLLOWS'),
 Row(src='i', dst='a', relationship='FOLLOWS'),
 Row(src='a', dst='i', relationship='FOLLOWS'),
 Row(src='b', dst='a', relationship='FOLLOWS'),
 Row(src='i', dst='b', relationship='FOLLOWS'),
 Row(src='c', dst='d', relationship='FOLLOWS'),
 Row(src='b', dst='d', relationship='FOLLOWS'),
 Row(src='i', dst='d', relationship='FOLLOWS'),
 Row(src='a', dst='d', relationship='FOLLOWS'),
 Row(src='m', dst='a', relationship='FOLLOWS'),
 Row(src='v', dst='y', relationship='FOLLOWS'),
 Row(src='j', dst='v', relationship='FOLLOWS')]

In [6]:
# Per-vertex degree tables exposed by the GraphFrame.
total_degree, in_degree, out_degree = g.degrees, g.inDegrees, g.outDegrees

# Build one table with all three counts per vertex. Left joins keep every
# vertex present in `total_degree`; a vertex that is missing from the in- or
# out-degree table gets a null there, which is replaced by 0 (see `j` and `y`
# in the output). Rows are shown with the most-followed vertex first.
(
    total_degree
    .join(in_degree, on="id", how="left")
    .join(out_degree, on="id", how="left")
    .na.fill(0)
    .orderBy("inDegree", ascending=False)
    .show()
)

+---+------+--------+---------+
| id|degree|inDegree|outDegree|
+---+------+--------+---------+
|  d|     6|       5|        1|
|  a|     7|       3|        4|
|  i|     5|       2|        3|
|  b|     5|       2|        3|
|  v|     2|       1|        1|
|  m|     3|       1|        2|
|  y|     1|       1|        0|
|  c|     2|       1|        1|
|  j|     1|       0|        1|
+---+------+--------+---------+



#### PageRank: iterate until convergence (tolerance-based)

In [7]:
# Tolerance-based PageRank: no iteration count is given, only `tol`,
# so the run stops based on the 0.01 tolerance.
results = g.pageRank(tol=0.01, resetProbability=0.15)

# Show the vertices ranked from highest to lowest PageRank score.
results.vertices.orderBy("pagerank", ascending=False).show()

+---+-------+-------------------+
| id|   name|           pagerank|
+---+-------+-------------------+
|  d|   Doug| 2.2233188859989745|
|  m|   Mark|  2.090451188336932|
|  a|  Alice| 1.5056291439101062|
|  i|Michael|  0.733738785109624|
|  b|Bridget|  0.733738785109624|
|  y|    Amy|  0.559446807245026|
|  c|Charles| 0.5338811076334145|
|  v|  David|0.40232326274180685|
|  j|  James|0.21747203391449021|
+---+-------+-------------------+



In [8]:
# Inspect the edges of the PageRank result graph. Besides the original edge
# columns, each row carries a `weight` column; in the output below the weight
# equals 1 / (out-degree of src), e.g. 0.25 for each of a's four outgoing edges.
results.edges.collect()

[Row(src='b', dst='a', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='a', dst='b', relationship='FOLLOWS', weight=0.25),
 Row(src='b', dst='i', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='i', dst='d', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='j', dst='v', relationship='FOLLOWS', weight=1.0),
 Row(src='c', dst='d', relationship='FOLLOWS', weight=1.0),
 Row(src='d', dst='m', relationship='FOLLOWS', weight=1.0),
 Row(src='a', dst='c', relationship='FOLLOWS', weight=0.25),
 Row(src='i', dst='b', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='a', dst='d', relationship='FOLLOWS', weight=0.25),
 Row(src='b', dst='d', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='m', dst='d', relationship='FOLLOWS', weight=0.5),
 Row(src='v', dst='y', relationship='FOLLOWS', weight=1.0),
 Row(src='i', dst='a', relationship='FOLLOWS', weight=0.3333333333333333),
 Row(src='a', dst='i', relationship='FOLLOWS', weight=0.25),
 Row(src='m', dst='a', relationship='FOLLOWS', weight=0.5)]

In [9]:
# Inspect the vertices of the PageRank result graph: the original vertex
# columns plus a `pagerank` score column.
# NOTE: `results` is reassigned by the later PageRank cells, so this cell shows
# whichever run executed most recently (the tolerance-based one when the
# notebook is run top to bottom).
results.vertices.collect()

[Row(id='y', name='Amy', pagerank=0.559446807245026),
 Row(id='m', name='Mark', pagerank=2.090451188336932),
 Row(id='a', name='Alice', pagerank=1.5056291439101062),
 Row(id='j', name='James', pagerank=0.21747203391449021),
 Row(id='d', name='Doug', pagerank=2.2233188859989745),
 Row(id='i', name='Michael', pagerank=0.733738785109624),
 Row(id='v', name='David', pagerank=0.40232326274180685),
 Row(id='b', name='Bridget', pagerank=0.733738785109624),
 Row(id='c', name='Charles', pagerank=0.5338811076334145)]

#### PageRank: fixed number of iterations (`maxIter`)

In [10]:
# Fixed-iteration PageRank: run exactly two iterations instead of iterating
# to a tolerance. With so few iterations the ordering differs from the
# tolerance-based run above (Mark ranks ahead of Doug in the output below).
results = g.pageRank(maxIter=2, resetProbability=0.15)

# Show the vertices ranked from highest to lowest PageRank score.
results.vertices.orderBy("pagerank", ascending=False).show()

+---+-------+-------------------+
| id|   name|           pagerank|
+---+-------+-------------------+
|  m|   Mark| 2.4519522046449005|
|  d|   Doug| 1.8075143049478286|
|  y|    Amy| 1.2117132278694043|
|  a|  Alice| 1.1401884887243352|
|  i|Michael| 0.6974503534163581|
|  b|Bridget| 0.6974503534163581|
|  c|Charles| 0.4757236620666441|
|  v|  David|0.33625042073375966|
|  j|  James|0.18175698418041064|
+---+-------+-------------------+



#### Personalized PageRank (follow recommendations for one user)

In [11]:
# Personalized PageRank: rank every vertex relative to a single source vertex
# and use the ranking as "who should `me` follow next" suggestions.
me = "d"
results = g.pageRank(resetProbability=0.15, maxIter=20, sourceId=me)
people_to_follow = results.vertices.sort("pagerank", ascending=False)

# Ids that `me` already follows. The filter compares the `src` column against
# the Python value directly instead of interpolating `me` into a SQL string,
# so ids containing quotes (or other SQL syntax) cannot break or alter the
# predicate. Rows are collected straight from Spark; no pandas round-trip.
already_follows = [
    row.dst
    for row in g.edges.filter(g.edges.src == me).select("dst").collect()
]

# Never recommend the user themself or anyone they already follow.
people_to_exclude = already_follows + [me]
people_to_follow.filter(~people_to_follow.id.isin(people_to_exclude)).show()

+---+-------+-------------------+
| id|   name|           pagerank|
+---+-------+-------------------+
|  a|  Alice| 0.1650183746272782|
|  i|Michael|  0.048842467744892|
|  b|Bridget|  0.048842467744892|
|  c|Charles|0.03497796119878669|
|  y|    Amy|                0.0|
|  j|  James|                0.0|
|  v|  David|                0.0|
+---+-------+-------------------+

