In [1]:
import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql._
import spark.implicits._

Intitializing Scala interpreter ...

Spark Web UI available at http://10.0.0.232:4040
SparkContext available as 'sc' (version = 3.3.2, master = local[*], app id = local-1679803924657)
SparkSession available as 'spark'


import org.apache.spark.sql.types.{StructType, StructField, StringType}
import org.apache.spark.sql.functions.{col, lit}
import org.apache.spark.sql._
import spark.implicits._


In [2]:
def load_data() = {
    val graph = Seq(Row("0","2"),Row("1","0"),Row("1","2"),Row("1","3"), Row("2", "3"))
    val graphDF = spark.createDataFrame(
        sc.parallelize(graph), 
        StructType(
            List(
                StructField("follower", StringType), 
                StructField("followee", StringType)
            )
        )
    ).as("social_graph")
    graphDF.cache()
    
    graphDF
}

val graphDF = load_data()

load_data: ()org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
graphDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [follower: string, followee: string]


In [3]:
def get_users(graphDF: DataFrame) = {
    graphDF.select(col("followee"))
        .union(graphDF.select(col("follower")))
        .withColumnRenamed("followee","user_id").distinct.as("users")
}


val users = get_users(graphDF)
val user_count = sc.broadcast(users.count())

get_users: (graphDF: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
users: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string]
user_count: org.apache.spark.broadcast.Broadcast[Long] = Broadcast(3)


In [4]:
def initialize_ranks(users: DataFrame) = {
    users.select(col("user_id"), lit(1.0/user_count.value).as("rank_value")).as("rank")
}

val rank = initialize_ranks(users)

initialize_ranks: (users: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
rank: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double]


In [5]:
def get_followers_per_user(social_graph: DataFrame) = {
    social_graph.groupBy("followee").agg(collect_list("follower").as("followers")).as("followers")
}

val followers = get_followers_per_user(graphDF)

get_followers_per_user: (social_graph: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
followers: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [followee: string, followers: array<string>]


In [6]:
def get_following_per_user(social_graph: DataFrame) = {
    social_graph.groupBy("follower").agg(collect_list("followee").as("following")).as("following")
}
val following = get_following_per_user(graphDF)

get_following_per_user: (social_graph: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
following: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [follower: string, following: array<string>]


In [7]:
var enhanced_rank = rank
    .join(followers, col("followers.followee") === col("rank.user_id"), "left")
    .select(col("user_id").as("user_id"), col("rank_value"), col("followers")).as("rank_followers")
    .join(following, col("following.follower") === col("rank_followers.user_id"), "left")
    .select(col("user_id"), col("rank_value"), col("followers"), col("following")).as("rank_followers_following")
    .as("rank")

enhanced_rank: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: string, rank_value: double ... 2 more fields]


In [8]:
// val non_dangling_users = following_followers_with_rank.filter(col("following").isNotNull)
// val dangling_users = following_followers_with_rank.filter(col("following").isNull)

In [9]:
def get_contributions(rank: DataFrame) = {
    rank.withColumn("contributions", col("rank_value") / when(col("following").isNotNull, size(col("following"))).otherwise(1))
}

val contributions = get_contributions(enhanced_rank)

get_contributions: (rank: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
contributions: org.apache.spark.sql.DataFrame = [user_id: string, rank_value: double ... 3 more fields]


In [10]:
def explode_and_sum_contributions(contributions: DataFrame) = {
    val exploded_contribution = contributions.select(col("user_id"),col("rank_value"), col("followers"), explode(col("following")).as("contribute_to"), col("contributions")).as("exploded_contributions")
    exploded_contribution.groupBy("contribute_to").agg(sum("contributions").alias("contributions")).as("summed_contributions")
}

val summed_contributions = explode_and_sum_contributions(contributions)
// val total_dangling_bonus = summed_contributions.select("contributions").where(col("contribute_to").isNull).first.getDouble(0)

explode_and_sum_contributions: (contributions: org.apache.spark.sql.DataFrame)org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]
summed_contributions: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [contribute_to: string, contributions: double]


In [11]:
// def calculate_rank(contributions: DataFrame) = {
//     val new_ranks_without_dangling = contributions
//         .drop("contributions").join(summed_contributions, col("contribute_to") === col("user_id"), "left")
//         .drop("contribute_to")        
//         .select(col("user_id"), col("followers"), col("rank_value"), col("following"), col("contributions").as("rank_contributions"))
//     val coalesce_ranking = new_ranks_without_dangling
//             .withColumn("new_ranks", when(col("following").isNotNull, coalesce(col("rank_contributions"), lit(0))).otherwise(col("rank_value")))
//     val dangling_user_contribution = coalesce_ranking.filter(col("following").isNull).select("rank_contributions").agg(sum("rank_contributions")).first.getDouble(0)
//     coalesce_ranking.withColumn("computed_rank", col("new_ranks") + dangling_user_contribution / user_count.value)
//             .drop("rank_contributions", "new_ranks")
//             .withColumn("rank_value", (col("computed_rank") * 0.85) + (0.15 / user_count.value))
//     coalesce_ranking
// }

def calculate_rank(contributions: DataFrame, summed_contributions: DataFrame) = {
    val remainder = 1.0 - summed_contributions.agg(sum("contributions"))
    val new_ranks = contributions
        .drop("contributions")
        .join(summed_contributions, col("contribute_to") === col("user_id"), "left").drop("contribute_to")
        .select(col("user_id"), col("followers"), col("rank_value"), col("following"), col("contributions").as("rank_contributions"))
        .withColumn("final_rank", (when(col("rank_contributions").isNotNull, col("rank_contributions")).otherwise(lit(0))) + remainder / user_count.value)
        .withColumn("rank_value", (col("final_rank") * 0.85) + (0.15 / user_count.value))
        .select(col("user_id"), col("followers"), col("following"), col("rank_value"))
    new_ranks
}
val new_ranks = calculate_rank(contributions, summed_contributions)

calculate_rank: (contributions: org.apache.spark.sql.DataFrame, summed_contributions: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame
new_ranks: org.apache.spark.sql.DataFrame = [user_id: string, followers: array<string> ... 2 more fields]


In [12]:
var number_of_iterations = 10

while(number_of_iterations > 0) {
    val contributions = get_contributions(enhanced_rank)
    val summed_contributions = explode_and_sum_contributions(contributions)
    val new_ranks = calculate_rank(contributions, summed_contributions)
    enhanced_rank = new_ranks
    number_of_iterations -= 1
}

23/03/26 00:12:56 WARN DAGScheduler: Broadcasting large task binary with size 1018.0 KiB
23/03/26 00:12:57 WARN DAGScheduler: Broadcasting large task binary with size 1004.3 KiB
23/03/26 00:13:25 ERROR Utils: uncaught error in thread spark-listener-group-appStatus, stopping SparkContext
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3537)
	at java.base/java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:241)
	at java.base/java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:586)
	at java.base/java.lang.StringBuilder.append(StringBuilder.java:179)
	at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:213)
	at scala.collection.TraversableOnce$appender$1.apply(TraversableOnce.scala:418)
	at scala.collection.TraversableOnce$appender$1.apply(TraversableOnce.scala:410)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at

java.lang.IllegalStateException:  Cannot call methods on a stopped SparkContext.

In [None]:
enhanced_rank.show()