In [1]:
# --- Cluster endpoints -------------------------------------------------------
# HDFS namenode URI; used as the prefix of every input/output path below.
HDFS_NAMENODE: str = "hdfs://namenode:9000"
# Spark master URL the notebook's session connects to.
MASTER_URI: str = "spark://spark-master:7077"

# --- Project layout on HDFS --------------------------------------------------
# Project identifier; also the sub-directory name under /input and /output.
PROJECT_NAME: str = "web_graphs"
INPUT_DIR: str = f"{HDFS_NAMENODE}/input/{PROJECT_NAME}"
OUTPUT_DIR: str = f"{HDFS_NAMENODE}/output/{PROJECT_NAME}"

In [None]:
from pyspark.sql import SparkSession


def spark_session() -> SparkSession:
    """Create (or reuse) the SparkSession used by this notebook.

    The session is named after ``PROJECT_NAME``, attaches to the master at
    ``MASTER_URI`` and uses ``HDFS_NAMENODE`` as its default Hadoop
    filesystem. The Spark version is printed once the session is available.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    spark = (
        # ``capitalize`` has to be called: passing the bound method itself
        # would hand Spark a method object instead of the string "Web_graphs".
        SparkSession.builder.appName(PROJECT_NAME.capitalize())
        .master(MASTER_URI)
        .config("spark.driver.memory", "4g")
        .config("spark.hadoop.fs.defaultFS", HDFS_NAMENODE)
        # Have the HDFS client address datanodes by hostname rather than by
        # the IP address reported by the namenode.
        .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
        .getOrCreate()
    )

    print(f"Connected to Spark {spark.version}")

    return spark

In [3]:
# Location of the web graph edge-list file under INPUT_DIR; read by load_web_data.
WEB_DATA_PATH = f"{INPUT_DIR}/web-Google.txt"

In [4]:
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

def load_web_data(spark: SparkSession) -> DataFrame:
    """Read the edge list at ``WEB_DATA_PATH`` into a cached DataFrame.

    Lines starting with ``#`` are discarded. Every remaining line is split on
    the ``\\t`` pattern; the first two fields become the columns ``from_node``
    and ``to_node``, both cast to ``IntegerType``.

    Args:
        spark: Session used to read the text file.

    Returns:
        A DataFrame with integer columns ``from_node`` and ``to_node``,
        marked for caching.
    """
    lines = spark.read.text(WEB_DATA_PATH)

    # Drop the '#'-prefixed lines before parsing.
    data_lines = lines.filter(~col("value").startswith("#"))

    string_edges = data_lines.selectExpr(
        "split(value, '\\t')[0] as from_node",
        "split(value, '\\t')[1] as to_node",
    )

    # Cast both endpoints from string to integer, source column first.
    typed_edges = string_edges
    for column_name in ("from_node", "to_node"):
        typed_edges = typed_edges.withColumn(
            column_name, col(column_name).cast(IntegerType())
        )

    return typed_edges.cache()

# Task 1

Given the Google web graph dataset, output the list of web pages together with their number of outlinks (out-degree), sorted in descending order of out-degree.

## Output Format
A sorted list of pages with their out-degrees.

Each line contains: `<NodeID>, <out-degree>`

In [5]:
%%time

spark = spark_session()

edges_df = load_web_data(spark)

edges_df.show()

spark.stop()

Spark Version: 3.5.0
Spark Master: spark://spark-master:7077
+---------+-------+
|from_node|to_node|
+---------+-------+
|        0|  11342|
|        0| 824020|
|        0| 867923|
|        0| 891835|
|    11342|      0|
|    11342|  27469|
|    11342|  38716|
|    11342| 309564|
|    11342| 322178|
|    11342| 387543|
|    11342| 427436|
|    11342| 538214|
|    11342| 638706|
|    11342| 645018|
|    11342| 835220|
|    11342| 856657|
|    11342| 867923|
|    11342| 891835|
|   824020|      0|
|   824020|  91807|
+---------+-------+
only showing top 20 rows

CPU times: user 22.2 ms, sys: 30.9 ms, total: 53.1 ms
Wall time: 8.64 s
