In [1]:
# Pin PYTHONHASHSEED in the kernel's environment.
# NOTE(review): presumably so Python processes started by Spark hash keys consistently — confirm.
%env PYTHONHASHSEED 3
# Install a headless Java 8 JDK quietly; all apt-get output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Install PySpark quietly (no version is pinned here).
!pip install -q pyspark

from math import sqrt
import pyspark
from pyspark.sql import *

env: PYTHONHASHSEED=3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
from pyspark import SparkContext, SparkConf

# Build (or reuse) the Spark session for this notebook, one builder option per line.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Twitter Analysis')
    .config("spark.executor.memory", "1g")
    .config("spark.ui.port", "4050")
    .getOrCreate()
)
sc = spark.sparkContext

# Mount Google Drive so the edge file below can be read from it.
from google.colab import drive
drive.mount('/content/drive')

# Pre-processed text file holding every edge of the Twitter graph, loaded as an RDD of lines.
raw_edges = sc.textFile('/content/drive/My Drive/twitter_analysis/edges_rdd.txt')

Mounted at /content/drive


In [3]:
def get_sources_and_destinations(edges):
  """Convert raw edge lines into an adjacency-list RDD.

  The parser keeps the text before the first comma of each line, takes the
  part enclosed in single quotes, and reads its first two whitespace-separated
  tokens as integers (source, destination).

  Args:
    edges: RDD of text lines, one graph edge per line.

  Returns:
    RDD of (source, [destination, ...]) pairs.
  """
  def parse_edge(line):
    first_field = line.split(',')[0]
    quoted = first_field.split("'")[1]
    tokens = quoted.split()
    return (int(tokens[0]), int(tokens[1]))

  pairs = edges.map(parse_edge)
  grouped = pairs.groupByKey()
  return grouped.map(lambda kv: (kv[0], list(kv[1])))

In [None]:
def get_col_trans_matrix(graph_rdd):
  """Build the column-oriented transition matrix from an adjacency-list RDD.

  Args:
    graph_rdd: RDD of (source, destinations) pairs.

  Returns:
    RDD of (source, {destination: weight}) pairs, where every distinct
    destination of a source gets weight 1 / (number of distinct destinations).
  """
  def uniform_weights(destinations):
    unique = set(destinations)  # repeated edges count once
    # Nothing is evaluated for an empty set, so no division by zero occurs.
    return {node: 1/len(unique) for node in unique}

  return graph_rdd.map(lambda entry: (entry[0], uniform_weights(entry[1])))

In [None]:
def col_to_row_matrix(col_trans_matrix):
    """Re-index a column-oriented sparse matrix by row.

    Args:
        col_trans_matrix: RDD of (column, {row: value}) pairs.

    Returns:
        RDD of (row, iterable of (column, value)) pairs, sorted by row key.
    """
    def explode(column):
        # One (row, (column, value)) record per non-zero entry of the column.
        source, weights = column[0], column[1]
        for row in weights:
            yield (row, (source, weights[row]))

    entries = col_trans_matrix.flatMap(explode)
    return entries.groupByKey().sortByKey()

In [None]:
def row_multiply(row, R):
    """Multiply one sparse matrix row by the rank vector.

    Args:
        row: iterable of (column, value) pairs for the row's non-zero entries.
        R: mapping from column id to that column's current rank.

    Returns:
        The sum of value * R[column] over the entries whose column is in R;
        0 when no entry matches.
    """
    # Columns missing from R (nodes with no rank entry) contribute nothing.
    # The per-entry debugging print of the whole row has been removed: it
    # produced one full-row dump per entry on every call.
    return sum(value * R[column] for column, value in row if column in R)

In [None]:
def produce_main_input(file):
  """Run the full preprocessing pipeline on the raw edge lines.

  Args:
    file: RDD of raw edge lines, passed straight to
      get_sources_and_destinations.

  Returns:
    Whatever col_to_row_matrix returns for the column transition matrix
    built from those edges.
  """
  adjacency = get_sources_and_destinations(file)
  column_matrix = get_col_trans_matrix(adjacency)
  return col_to_row_matrix(column_matrix)
# Build the row-oriented matrix RDD from the edge lines loaded earlier.
# NOTE(review): the name `input` shadows Python's builtin `input`; other cells refer to this variable by that name.
input = produce_main_input(raw_edges)

In [None]:
# This implementation has been modified from the one produced in lab 4 to add a convergence threshold and to account for potential dead ends and spider traps (since I can't
# guarantee whether the input graph is cyclical or not). NOTE(review): no damping/teleport term is visible in the code below, so confirm how spider traps are meant to be handled.
# Originally, I was aiming for a convergence threshold of 0.0001, but to reduce the runtime of my algorithm I loosened it; the default in the signature below is 0.01.
def page_rank_main(input, iterations=100, convergence_threshold=0.01):
  """Iterate the rank vector against the row matrix and return the top ten.

  Args:
    input: RDD of (row_id, iterable of (column_id, value)) pairs, i.e. the
      row-oriented transition matrix.
    iterations: maximum number of multiplication passes.
    convergence_threshold: iteration stops early once the summed absolute
      change of all ranks between two passes drops below this value.

  Returns:
    A list of at most ten (node_id, rank) tuples, highest rank first. An
    empty matrix yields an empty list.

  Notes:
    The rank vector is keyed by the node ids that actually occur in the
    matrix (row keys and column ids), so ids need not be 0..N-1. A node that
    never appears as a row key has no entry after a pass and is treated as
    rank 0. No damping/teleport term is applied.
  """
  graph_rows = input
  # Collect every node id that occurs either as a row key or as a column id.
  nodes = graph_rows.flatMap(
      lambda kv: [kv[0]] + [column for column, _ in kv[1]]).distinct().collect()
  N = len(nodes)
  if N == 0:
    return []  # nothing to rank; also avoids 1/0 below
  R = {node: 1/N for node in nodes}
  for _ in range(iterations):
    previous_R = R
    vecR = sc.broadcast(previous_R)
    row_results = graph_rows.map(lambda x: (x[0], row_multiply(x[1], vecR.value)))
    R = row_results.collectAsMap()
    # The pass has been fully evaluated, so executor copies can be released.
    vecR.unpersist()
    # Check if values have converged (a missing entry counts as rank 0).
    delta = sum(abs(R.get(node, 0) - previous_R.get(node, 0)) for node in nodes)
    if delta < convergence_threshold:
      break # Stops the loop, preventing unnecessary iterations
  print("R:", sorted(R.items()))
  # R already holds the latest ranks on the driver, so rank locally instead of
  # launching another Spark job (this also works when iterations == 0).
  return sorted(R.items(), key=lambda kv: -kv[1])[:10]

In [None]:
# page_rank_main already returns a plain Python list holding the ten best
# (node, rank) pairs, so no RDD-style `.take` is applied to its result.
page_rank_main(input)

Py4JJavaError: ignored

TypeError: ignored

In [None]:
# Pull every element of the row-matrix RDD back to the driver as a Python list.
# NOTE(review): `input2` is not referenced by any other visible cell — confirm it is still needed.
input2 = input.collect()

In [None]:
# Attempts to set the notebook's IOPub data-rate limit to 10,000,000.
# NOTE(review): NotebookApp is a notebook-server setting; confirm that a %config call issued from the kernel actually takes effect.
%config NotebookApp.iopub_data_rate_limit=10000000

Twitter Analysis

In this study, we aim to determine the relationship between influential Twitter users and the features of their profiles.

In order to determine influential users, we will implement a PageRank algorithm.