In [1]:
import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.functions.{col}
import org.apache.spark.sql._
import spark.implicits._

Intitializing Scala interpreter ...

Spark Web UI available at http://10.0.0.232:4040
SparkContext available as 'sc' (version = 3.3.2, master = local[*], app id = local-1679778653942)
SparkSession available as 'spark'


import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql._
import spark.implicits._


In [2]:
def load_data() = {
    val graph = Seq(Row("1","2"),Row("1","3"),Row("2","3"),Row("2","1"))
    val graphDF = spark.createDataFrame(
        sc.parallelize(graph), 
        StructType(
            List(
                StructField("follower", StringType), 
                StructField("followee", StringType)
            )
        )
    ).as("social_graph")
    graphDF.cache()
    
    graphDF
}

val graphDF = load_data()
graphDF.show()

+--------+--------+
|follower|followee|
+--------+--------+
|       1|       2|
|       1|       3|
|       2|       3|
|       2|       1|
+--------+--------+



load_data: ()org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
graphDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [follower: string, followee: string]


In [3]:
def get_users(graphDF: DataFrame) = {
    graphDF.select(col("followee"))
        .union(graphDF.select(col("follower")))
        .withColumnRenamed("followee","user_id").distinct.as("users")
}


val users = get_users(graphDF)
val user_count = sc.broadcast(users.count())
users.show()

+-------+
|user_id|
+-------+
|      2|
|      3|
|      1|
+-------+



get_users: (graphDF: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
users: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string]
user_count: org.apache.spark.broadcast.Broadcast[Long] = Broadcast(6)


In [4]:
def initialize_ranks(users: DataFrame) = {
    users.select(col("user_id"), lit(1.0/user_count.value).as("rank_value")).as("rank")
}

val rank = initialize_ranks(users)
rank.show()

+-------+------------------+
|user_id|        rank_value|
+-------+------------------+
|      2|0.3333333333333333|
|      3|0.3333333333333333|
|      1|0.3333333333333333|
+-------+------------------+



initialize_ranks: (users: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
rank: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double]


In [5]:
def get_followers_per_user(social_graph: DataFrame) = {
    social_graph.groupBy("followee").agg(collect_list("follower").as("followers")).as("followers")
}

val followers = get_followers_per_user(graphDF)
followers.show()

+--------+---------+
|followee|followers|
+--------+---------+
|       2|      [1]|
|       3|   [1, 2]|
|       1|      [2]|
+--------+---------+



get_followers_per_user: (social_graph: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
followers: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [followee: string, followers: array<string>]


In [6]:
def get_following_per_user(social_graph: DataFrame) = {
    social_graph.groupBy("follower").agg(collect_list("followee").as("following")).as("following")
}
val following = get_following_per_user(graphDF)
following.show()

+--------+---------+
|follower|following|
+--------+---------+
|       1|   [2, 3]|
|       2|   [3, 1]|
+--------+---------+



get_following_per_user: (social_graph: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
following: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [follower: string, following: array<string>]


In [8]:
var enhanced_rank = rank
    .join(followers, col("followers.followee") === col("rank.user_id"), "left")
    .select(col("user_id").as("user_id"), col("rank_value"), col("followers")).as("rank_followers")
    .join(following, col("following.follower") === col("rank_followers.user_id"), "left")
    .select(col("user_id"), col("rank_value"), col("followers"), col("following")).as("rank_followers_following")
    .as("rank")
enhanced_rank.show()

+-------+------------------+---------+---------+
|user_id|        rank_value|followers|following|
+-------+------------------+---------+---------+
|      2|0.3333333333333333|      [1]|   [3, 1]|
|      3|0.3333333333333333|   [1, 2]|     null|
|      1|0.3333333333333333|      [2]|   [2, 3]|
+-------+------------------+---------+---------+



enhanced_rank: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double ... 2 more fields]


In [10]:
// val non_dangling_users = following_followers_with_rank.filter(col("following").isNotNull)
// val dangling_users = following_followers_with_rank.filter(col("following").isNull)

non_dangling_users: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double ... 2 more fields]
dangling_users: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double ... 2 more fields]


In [9]:
def get_contributions(rank: DataFrame) = {
    rank.withColumn("contributions", col("rank_value") / when(col("following").isNotNull, size(col("following"))).otherwise(1))
}

val contributions = get_contributions(enhanced_rank)
contributions.show()

+-------+------------------+---------+---------+-------------------+
|user_id|        rank_value|followers|following|      contributions|
+-------+------------------+---------+---------+-------------------+
|      2|0.3333333333333333|      [1]|   [3, 1]|0.16666666666666666|
|      3|0.3333333333333333|   [1, 2]|     null| 0.3333333333333333|
|      1|0.3333333333333333|      [2]|   [2, 3]|0.16666666666666666|
+-------+------------------+---------+---------+-------------------+



get_contributions: (rank: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
contributions: org.apache.spark.sql.DataFrame = [user_id: string, rank_value: double ... 3 more fields]


In [10]:
def explode_and_sum_contributions(contributions: DataFrame) = {
    val exploded_contribution = contributions.select(col("user_id"),col("rank_value"), col("followers"), explode_outer(col("following")).as("contribute_to"), col("contributions")).as("exploded_contributions")
    exploded_contribution.groupBy("contribute_to").agg(sum("contributions").alias("contributions")).as("summed_contributions")
}

val summed_contributions = explode_and_sum_contributions(contributions)
val total_dangling_bonus = summed_contributions.select("contributions").where(col("contribute_to").isNull).first.getDouble(0)
summed_contributions.show()

+-------------+-------------------+
|contribute_to|      contributions|
+-------------+-------------------+
|            3| 0.3333333333333333|
|         null| 0.3333333333333333|
|            1|0.16666666666666666|
|            2|0.16666666666666666|
+-------------+-------------------+



explode_and_sum_contributions: (contributions: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
summed_contributions: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [contribute_to: string, contributions: double]
total_dangling_bonus: Double = 0.3333333333333333


In [11]:
def calculate_rank(contributions: DataFrame, total_dangling_bonus: Double) = {
    val new_ranks_without_dangling = contributions
        .drop("contributions").join(summed_contributions, col("contribute_to") === col("user_id"), "left")
        .drop("contribute_to", "rank_value")
        .withColumnRenamed("contributions", "rank_value")
    new_ranks_without_dangling
        .withColumn("rank_value", col("rank_value") + (total_dangling_bonus/user_count.value))
        .withColumn("rank_value", (col("rank_value") * 0.85) + (0.15 / user_count.value))
}
val new_ranks = calculate_rank(contributions, total_dangling_bonus)
new_ranks.show()

+-------+---------+---------+-------------------+
|user_id|followers|following|         rank_value|
+-------+---------+---------+-------------------+
|      2|      [1]|   [3, 1]| 0.2861111111111111|
|      3|   [1, 2]|     null|0.42777777777777776|
|      1|      [2]|   [2, 3]| 0.2861111111111111|
+-------+---------+---------+-------------------+



calculate_rank: (contributions: org.apache.spark.sql.DataFrame, total_dangling_bonus: Double)org.apache.spark.sql.DataFrame
new_ranks: org.apache.spark.sql.DataFrame = [user_id: string, followers: array<string> ... 2 more fields]


In [12]:
var number_of_iterations = 10

while(number_of_iterations > 0) {
    val contributions = get_contributions(enhanced_rank)
    val summed_contributions = explode_and_sum_contributions(contributions)
    val total_dangling_bonus = summed_contributions.select("contributions").where(col("contribute_to").isNull).first.getDouble(0)
    
    val new_ranks = calculate_rank(contributions, total_dangling_bonus)
    enhanced_rank = new_ranks
    
    print(enhanced_rank.show())
    number_of_iterations -= 1
}

+-------+---------+---------+-------------------+
|user_id|followers|following|         rank_value|
+-------+---------+---------+-------------------+
|      2|      [1]|   [3, 1]| 0.2861111111111111|
|      3|   [1, 2]|     null|0.42777777777777776|
|      1|      [2]|   [2, 3]| 0.2861111111111111|
+-------+---------+---------+-------------------+

()+-------+---------+---------+-------------------+
|user_id|followers|following|         rank_value|
+-------+---------+---------+-------------------+
|      2|      [1]|   [3, 1]|0.31287037037037035|
|      3|   [1, 2]|     null|0.45453703703703696|
|      1|      [2]|   [2, 3]|0.31287037037037035|
+-------+---------+---------+-------------------+

()+-------+---------+---------+-------------------+
|user_id|followers|following|         rank_value|
+-------+---------+---------+-------------------+
|      2|      [1]|   [3, 1]|0.32045216049382713|
|      3|   [1, 2]|     null| 0.4621188271604938|
|      1|      [2]|   [2, 3]|0.3204521604938

number_of_iterations: Int = 0


In [13]:
enhanced_rank.show()

+-------+---------+---------+------------------+
|user_id|followers|following|        rank_value|
+-------+---------+---------+------------------+
|      2|      [1]|   [3, 1]|0.3234491730275153|
|      3|   [1, 2]|     null|0.4651158396941819|
|      1|      [2]|   [2, 3]|0.3234491730275153|
+-------+---------+---------+------------------+

