In [1]:
%env PYTHONHASHSEED 3
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

from math import sqrt
import pyspark
from pyspark.sql import *

env: PYTHONHASHSEED=3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark import SparkContext, SparkConf
import os

directory = "/content/drive/My Drive/twitter" # Google Drive directory that is scanned below for "*.edges" files.

# Create (or reuse) a local Spark session for this notebook; `SparkSession` is
# in scope through the earlier `from pyspark.sql import *`.
spark = SparkSession.builder.master("local[*]").appName('Twitter Analysis').config(
    "spark.executor.memory", "1g").config("spark.ui.port", "4050"
        ).getOrCreate()
sc = spark.sparkContext

# Mount Google Drive so that the /content/drive paths used in this cell resolve.
# This has to happen before os.listdir(directory) runs further down.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): this path lives under "twitter_analysis", not under `directory` ("twitter") - confirm that is intended.
raw_edges = sc.textFile('/content/drive/My Drive/twitter_analysis/edges_rdd.txt') # Pre-processed text file containing all our twitter graph edges, one line per record.

files = [a_file for a_file in os.listdir(directory) if a_file.endswith(".edges")] # Names of the entries in `directory` that end with ".edges" (not every file).
ego_users = [int(os.path.splitext(file)[0]) for file in files] # File names without extension, parsed as ints: the ego user ids used to filter the PageRank output.

Mounted at /content/drive


In [3]:
def get_sources_and_destinations(raw_edges_file):
  """Parse raw edge lines into an adjacency RDD of (source, [destinations]).

  Each line is split on ','. The source id is the text following 'r=' in the
  first field; the destination id is the text between 'd=' and ')' in the
  second field. Surrounding single quotes are stripped and both ids are
  converted to int. Edges are then grouped by source.
  """
  def parse_edge(line):
    # Pull the two quoted ids out of one textual edge record.
    fields = line.split(',')
    source_text = fields[0].split('r=')[1].strip("'")
    destination_text = fields[1].split('d=')[1].split(')')[0].strip("'")
    return (int(source_text), int(destination_text))

  def to_adjacency(grouped):
    # groupByKey yields an iterable per key; materialise it as a list.
    return (grouped[0], list(grouped[1]))

  parsed_edges = raw_edges_file.map(parse_edge)
  grouped_edges = parsed_edges.groupByKey()
  return grouped_edges.map(to_adjacency)

In [4]:
def get_col_trans_matrix(graph_rdd):
  """Turn (source, destinations) records into (source, {destination: weight}).

  Duplicate destinations are collapsed first; every remaining destination of
  a source receives the same weight, 1 / (number of distinct destinations).
  """
  def uniform_weights(destinations):
    unique_targets = set(destinations)
    return {target: 1/len(unique_targets) for target in unique_targets}

  def weigh_column(entry):
    return (entry[0], uniform_weights(entry[1]))

  return graph_rdd.map(weigh_column)

In [5]:
def col_to_row_matrix(col_trans_matrix):
    """Re-key a column-oriented sparse matrix by row.

    Takes records of the form (column, {row: value}) and returns records of
    the form (row, iterable of (column, value)), sorted by row key.
    """
    def emit_entries(column):
        # One (row, (column, value)) record per non-zero cell of this column.
        column_id = column[0]
        cells = column[1]
        for row in cells:
            yield (row, (column_id, cells[row]))

    entries = col_trans_matrix.flatMap(emit_entries)
    grouped_by_row = entries.groupByKey()
    return grouped_by_row.sortByKey()

In [6]:
def row_multiply(row, R):
    """Dot product of one sparse matrix row with the vector R.

    `row` iterates over (column, value) pairs and `R` maps column -> number.
    Columns that are missing from `R` contribute nothing. Returns 0 when no
    column matches.
    """
    total = 0
    for entry in row:
        column, value = entry
        if column not in R:
            continue
        total = total + value * R[column]
    return total

In [7]:
def produce_main_input(file):
  """Build the row-oriented transition matrix from a raw edge-line RDD.

  Chains the three preparation steps: parse and group the edges, weight each
  source's destinations, then re-key the matrix by row.
  """
  adjacency_rdd = get_sources_and_destinations(file)
  column_matrix = get_col_trans_matrix(adjacency_rdd)
  row_matrix = col_to_row_matrix(column_matrix)
  return row_matrix
# Build the row-oriented transition matrix from the raw edge lines.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the notebook; later cells refer to it by this name.
input = produce_main_input(raw_edges)
input.persist() # Marks the RDD for caching so repeated actions on it do not recompute the whole pipeline.

PythonRDD[16] at RDD at PythonRDD.scala:53

In [8]:
def page_rank_main(input, iterations=90, convergence_threshold=0.001, damping_factor=0.85):
    """Run power-iteration PageRank over a row-oriented transition matrix.

    Every round computes, for each row ``i``::

        R[i] = damping_factor * sum_j(M[i][j] * R[j]) + (1 - damping_factor) / N

    where N is the number of records in ``input``.

    Args:
        input: RDD of ``(node, entries)`` records with one record per node,
            where ``entries`` iterates over ``(column, weight)`` pairs (the
            shape produced by ``col_to_row_matrix``). Columns that have no
            record of their own are ignored by ``row_multiply``.
        iterations: Maximum number of update rounds. With 0 the uniform
            starting vector is returned.
        convergence_threshold: Stop early once the summed absolute change of
            all ranks between two consecutive rounds drops below this value.
        damping_factor: Weight of the link-following term; the remaining
            ``1 - damping_factor`` is spread evenly over the N rows.

    Returns:
        RDD of ``(node, rank)`` pairs holding the final rank vector, in no
        particular order. Sorting and filtering are left to the caller.

    Side effects:
        Calls ``unpersist()`` on ``input`` before returning.
    """
    graph_rows = input
    # Use the context that owns the RDD instead of relying on a global `sc`.
    context = graph_rows.context
    N = graph_rows.count()
    if N == 0:
        # Nothing to rank; also avoids dividing by zero below.
        graph_rows.unpersist()
        return context.parallelize([])

    teleport = (1 - damping_factor) / N
    R = graph_rows.map(lambda x: (x[0], 1/N)).collectAsMap()
    for _ in range(iterations):
        vecR = context.broadcast(R)
        try:
            # Damping is applied to every row here. Doing it inside a
            # reduceByKey merge function has no effect, because that function
            # only runs when a key occurs more than once and each row key is
            # unique.
            updated_R = graph_rows.map(
                lambda kv: (kv[0], damping_factor * row_multiply(kv[1], vecR.value) + teleport)
            ).collectAsMap()
        finally:
            # The round's result is already on the driver; release the
            # executor-side copies of this round's rank vector.
            vecR.unpersist()
        # Check if values have converged
        delta = sum(abs(updated_R[node] - R[node]) for node in updated_R)
        R = updated_R
        if delta < convergence_threshold:
            break # Stops the loop, preventing unnecessary iterations
    graph_rows.unpersist()
    # Hand the final ranks back as an RDD so callers can keep chaining
    # transformations such as filter/sortBy.
    return context.parallelize(list(R.items()))

In [9]:
# Rank every node, keep only the ego users, and order them by descending rank.
page_ranks = page_rank_main(input)
# Build the lookup once as a frozenset: testing membership against the
# `ego_users` list would scan the whole list for every ranked node.
ego_user_ids = frozenset(ego_users)
output = page_ranks.filter(lambda x: x[0] in ego_user_ids).sortBy(lambda x: -x[1])

In [10]:
# Bring the first 20 records of `output` (ordered by descending rank) back to the driver as a list.
influential_users = output.take(20)