In [None]:
!pip install pyspark

In [7]:
from pyspark import SparkConf, SparkContext

In [8]:
conf = SparkConf().setMaster("local").setAppName("PageRank")
sc = SparkContext.getOrCreate(conf=conf)

In [10]:
rdd = sc.textFile("/content/drive/MyDrive/link_data.txt")
print(rdd.collect())

['0 2', '2 0', '1 2', '1 3', '3 2']


In [11]:
linksRDD = rdd.map(lambda x: tuple(x.split(" "))).map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x+y)
print(linksRDD.collect())

[('0', ['2']), ('2', ['0']), ('1', ['2', '3']), ('3', ['2'])]


In [12]:
ranksRDD = linksRDD.map(lambda x: (x[0], 1.0))
print(ranksRDD.collect())

[('0', 1.0), ('2', 1.0), ('1', 1.0), ('3', 1.0)]


In [13]:
def computeContribs(node_rank):
    _, (links, rank) = node_rank
    nb_links = len(links)
    for link in links:
        yield link, rank / nb_links

In [14]:
from operator import add

for iteration in range(10):
    # compute contributions of each node where it links to
    contribs = linksRDD.join(ranksRDD).flatMap(computeContribs)

    # use a full outer join to make sure, that not well connected nodes aren't dropped
    contribs = linksRDD.fullOuterJoin(contribs).mapValues(lambda x : x[1] or 0.0)

    # Sum up all contributions per link
    ranksRDD = contribs.reduceByKey(add)

    # Re-calculate ranks
    ranksRDD = ranksRDD.mapValues(lambda rank: rank * 0.85 + 0.15)

In [15]:
# Collects all ranks
for (link, rank) in sorted(ranksRDD.collect()):
    print("%s has rank: %s." % (link, rank / rdd.count()))

0 has rank: 0.36844926958806634.
1 has rank: 0.03.
2 has rank: 0.35880073041193344.
3 has rank: 0.042749999999999996.
