In [8]:
# Cluster endpoints shared by every task in this notebook.
MASTER_URI: str = "spark://spark-master:7077"
HDFS_NAMENODE: str = "hdfs://namenode:9000"

# Project name and the HDFS directories derived from it:
#   <namenode>/input/<project>   and   <namenode>/output/<project>
PROJECT_NAME: str = "web_graphs"
INPUT_DIR: str = "/".join((HDFS_NAMENODE, "input", PROJECT_NAME))
OUTPUT_DIR: str = "/".join((HDFS_NAMENODE, "output", PROJECT_NAME))

In [9]:
from pyspark.sql import SparkSession


def spark_session() -> SparkSession:
    """Create (or reuse) the SparkSession used by this notebook.

    The session is named after ``PROJECT_NAME`` (capitalized), connects to the
    Spark master at ``MASTER_URI`` and uses ``HDFS_NAMENODE`` as the default
    Hadoop filesystem.

    Returns:
        The SparkSession returned by ``getOrCreate()``. Its version is printed
        as a side effect.
    """
    spark = (
        # ``capitalize`` has to be *called*: handing the bound method to
        # ``appName`` makes Spark stringify it, which registers an app name
        # like "<built-in method capitalize of str object at 0x...>".
        SparkSession.builder.appName(PROJECT_NAME.capitalize())
        .master(MASTER_URI)
        .config("spark.driver.memory", "4g")
        .config("spark.hadoop.fs.defaultFS", HDFS_NAMENODE)
        # Have the HDFS client connect to datanodes by hostname.
        .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
        .getOrCreate()
    )

    print(f"Connected to Spark {spark.version}")

    return spark

In [10]:
# Full HDFS path of the web graph edge list that load_web_data() reads.
WEB_DATA_PATH: str = INPUT_DIR + "/web-Google.txt"

In [11]:
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType


def load_web_data(spark: SparkSession) -> DataFrame:
    """Load the web graph edge list from ``WEB_DATA_PATH`` as a DataFrame.

    The file is read as plain text, lines starting with ``#`` are dropped,
    and each remaining line is split on a tab into two columns that are
    cast to integers.

    Args:
        spark: Session used to read the file.

    Returns:
        A cached DataFrame with integer columns ``FromNodeID`` and ``ToNodeID``.
    """
    lines = spark.read.text(WEB_DATA_PATH)

    # Keep only data rows; "#"-prefixed lines are filtered out.
    data_lines = lines.filter(~col("value").startswith("#"))

    # First tab-separated field is the source node, second is the target.
    parsed = data_lines.selectExpr(
        "split(value, '\\t')[0] as FromNodeID", "split(value, '\\t')[1] as ToNodeID"
    )

    # Both id columns come out of split() as strings; convert them in turn.
    for column_name in ("FromNodeID", "ToNodeID"):
        parsed = parsed.withColumn(column_name, col(column_name).cast(IntegerType()))

    return parsed.cache()

# Task 1

Given the Google web graph dataset, output every web page together with its number of outlinks (its out-degree), sorted in descending order of out-degree.

## Output Format
A sorted list of pages with their out-degrees.

Each line contains: `<NodeID>, <out-degree>`

In [13]:
%%time

from pyspark.sql.functions import count

spark = spark_session()

# Everything that uses the session runs inside try/finally so the session
# is stopped even when a step fails part-way through the cell.
try:
    edges_df = load_web_data(spark)

    # Out-degree of a page = number of edges that start at it.
    result_df = (
        edges_df.groupBy("FromNodeID")
        .agg(count("ToNodeID").alias("OutDegree"))
        # Pages with the same out-degree are ordered by node id so the
        # ranking is reproducible from one run to the next.
        .orderBy(col("OutDegree").desc(), col("FromNodeID").asc())
    )
    # The result is consumed twice below (show() and the CSV write).
    result_df.cache()

    result_df.show(n=10)

    output_path = f"{OUTPUT_DIR}/node_out_degree"
    print(f"Writing results to '{output_path}'\n")
    # coalesce(1) reduces the result to a single partition before writing.
    result_df.coalesce(1).write.mode("overwrite").csv(
        output_path, header=True, sep=","
    )

    result_df.unpersist()
finally:
    spark.stop()

Connected to Spark 3.5.0
+----------+---------+
|FromNodeID|OutDegree|
+----------+---------+
|    506742|      456|
|    203748|      372|
|    305229|      372|
|    768091|      330|
|    808643|      277|
|    412410|      268|
|    600479|      265|
|    376428|      258|
|    156950|      257|
|    885728|      256|
+----------+---------+
only showing top 10 rows

Writing results to 'hdfs://namenode:9000/output/web_graphs/node_out_degree'

CPU times: user 35.6 ms, sys: 16.3 ms, total: 51.8 ms
Wall time: 5.99 s
