In [1]:
#!pip install pyspark

# COS 598 Homework 3 - Gregory Roberts

In [2]:
from pyspark import SparkConf, SparkContext
import pyspark.sql as psql
from pyspark.sql import SQLContext
from operator import add

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Task 2
-----

In [3]:
conf = SparkConf().setMaster("local").setAppName("PageRank")
sc = SparkContext.getOrCreate(conf=conf)
sqlc = SQLContext.getOrCreate(sc)
sqlContext = SQLContext(sc)

In [4]:
# Read file into Spark RDD
rdd = sc.textFile("/content/drive/MyDrive/link_data.txt")
# Print RDD
print(rdd.collect())

['0 2', '2 0', '1 2', '1 3', '3 2']


In [5]:
# Reduce to X instances and map Y's to X
linksRDD = rdd.map(lambda x: tuple(x.split(" "))).map(lambda x: (x[0], [x[1]])).reduceByKey(lambda x, y: x+y)
# Print RDD
print(linksRDD.collect())

[('0', ['2']), ('2', ['0']), ('1', ['2', '3']), ('3', ['2'])]


In [6]:
# 1) Initialize the page rank of every node as 1.

ranksRDD = linksRDD.map(lambda x: (x[0], 1.0))
# Print RDD
print(ranksRDD.collect())

[('0', 1.0), ('2', 1.0), ('1', 1.0), ('3', 1.0)]


In [7]:
# 2) During each iteration, let vertex v contribute rank(v)/|neighbors(v)| to its neighboring vertices, 
#    where rank(v) is the current page rank of vertex v and |neighbors(v)| denotes the number of vertices 
#    to which vertex v links (i.e., the number of outgoing edges of vertex v).

# This function takes elements from the joined datasets (linksRDD and ranksRDD)
# and computes the contribution to each outgoing link based on the current rank.
def computeContribs(node_rank):
    # Separates links and ranks
    _, (links, rank) = node_rank
    # Counts number of links
    nb_links = len(links)
    # Loop through links
    for link in links:
        # For each link divide rank by number of links
        yield link, rank / nb_links

In [8]:
# 4) Repeat steps 2 and 3 k times (you may set k = 10).

for iteration in range(10):
    # Compute contributions of each node where it links to
    contribs = linksRDD.join(ranksRDD).flatMap(computeContribs)
    # Use a full outer join to make sure, that not well connected nodes aren't dropped
    contribs = linksRDD.fullOuterJoin(contribs).mapValues(lambda x : x[1] or 0.0)
    # Sum up all contributions per link
    ranksRDD = contribs.reduceByKey(add)
    # Re-calculate ranks
    # 3) Set each vertex’s rank to 0.15 + 0.85 × (contributions from all vertices that have 
    #    edges pointing to the current vertex), where the contributions are computed in step 2.
    ranksRDD = ranksRDD.mapValues(lambda rank: rank * 0.85 + 0.15)

In [9]:
# 5) In the end, divide the page rank of every vertex by the total number of 
#    vertices we have in the input graph.

# Get the count of the original vertices
N = rdd.count()
# Divide each rank by the count
ranksRDD2 = ranksRDD.mapValues(lambda rank: rank / N)

In [10]:
# Your implementation should output a text file, where each line contains a vertex and its 
# page rank. With the test input text file shown above, your output text file should look like:
#    0 <page rank of 0>
#    1 <page rank of 1>
#    2 <page rank of 2>
#    3 <page rank of 3>

# Loop through RDD by sorted link and rank
for (link, rank) in sorted(ranksRDD2.collect()):
    # Print sorted link and rank
    print("%s <%s>" % (link, rank))

0 <0.36844926958806634>
1 <0.03>
2 <0.35880073041193344>
3 <0.042749999999999996>


# Task 3
-----

In [11]:
# Create dataframe from RDD
df = sqlContext.createDataFrame(ranksRDD2)
# Rename columns to a readable format
df = df.withColumnsRenamed({'_1': 'link', '_2': 'rank'})
# Create a temporary view to use in the SQL queries
df.createOrReplaceTempView("ranksRDD2")
# Show entire Spark SQL dataframe
print(df.show())

+----+--------------------+
|link|                rank|
+----+--------------------+
|   2| 0.35880073041193344|
|   1|                0.03|
|   3|0.042749999999999996|
|   0| 0.36844926958806634|
+----+--------------------+

None


In [12]:
# (b) Write a Spark SQL query to find the page rank of vertex 2.
#     Print it to the screen.
sqlc.sql("SELECT rank as vertex_2 FROM ranksRDD2 where link = '2'").show()

+-------------------+
|           vertex_2|
+-------------------+
|0.35880073041193344|
+-------------------+



In [13]:
# (c) Write a Spark SQL query to find the vertex with the largest page rank. 
#     Print both the vertex ID and its page rank.
sqlc.sql("SELECT link, rank FROM ranksRDD2 where rank = (select max(rank) from ranksRDD2)").show()

+----+-------------------+
|link|               rank|
+----+-------------------+
|   0|0.36844926958806634|
+----+-------------------+



In [14]:
# (d) Suppose that there is another input text file, where each line contains the meaning of a vertex. 
#     For example, suppose that we have another input text file which looks like:
#
#           0 Adam
#           1 Lisa
#           2 Bert
#           3 Ralph
#
#     where each vertex corresponds to a person. (You may think of the example graph in Task 1 as the 
#     “follows” graph in a social network, e.g., the edge from 0 to 2 means that Adam follows Bert – 
#     in this case, the page rank of a vertex measures the social influence of a person.) Create a 
#     Spark data frame from the input text file shown above.

In [15]:
# Read file into Spark RDD
rdd2 = sc.textFile("/content/drive/MyDrive/link_person.txt")
# Print RDD
print(rdd2.collect())

['0 Adam', '1 Lisa', '2 Bert', '3 Ralph']


In [16]:
# Split the RDD values
personRDD = rdd2.map(lambda x: tuple(x.split(" ")))
# Print RDD
print(personRDD.collect())

[('0', 'Adam'), ('1', 'Lisa'), ('2', 'Bert'), ('3', 'Ralph')]


In [17]:
# Create dataframe from RDD
df2 = sqlContext.createDataFrame(personRDD)
# Rename columns to a readable format
df2 = df2.withColumnsRenamed({'_1': 'link', '_2': 'person'})
# Create a temporary view to use in the SQL queries
df2.createOrReplaceTempView("personRDD")
# Show entire Spark SQL dataframe
print(df2.show())

+----+------+
|link|person|
+----+------+
|   0|  Adam|
|   1|  Lisa|
|   2|  Bert|
|   3| Ralph|
+----+------+

None


In [18]:
#(e) Write a Spark SQL query to join the data frame containing the page rank information and the 
#    data frame containing the meaning of each vertex (in this case, the name of the person each 
#    vertex corresponds to). Save the query result in CSV format.
df_res = sqlc.sql("SELECT r.*, p.person FROM ranksRDD2 r join personRDD p on r.link = p.link")

# Write query results to CSV file
df_res.repartition(1).write.csv("/content/drive/MyDrive/hw3_task3.csv", mode="overwrite")

In [19]:
# Print query results to screen
print(df_res.show())

+----+--------------------+------+
|link|                rank|person|
+----+--------------------+------+
|   0| 0.36844926958806634|  Adam|
|   1|                0.03|  Lisa|
|   2| 0.35880073041193344|  Bert|
|   3|0.042749999999999996| Ralph|
+----+--------------------+------+

None
