In [2]:
# Spark master URL that sessions created by this notebook connect to.
MASTER_URI = "spark://spark-master:7077"

# Project identifier; also the per-project sub-directory name on HDFS.
PROJECT_NAME: str = "recommender_system"

# HDFS namenode URI and the per-project input/output directories beneath it.
HDFS_NAMENODE: str = "hdfs://namenode:9000"
INPUT_DIR: str = f"{HDFS_NAMENODE}/input/{PROJECT_NAME}"
OUTPUT_DIR: str = f"{HDFS_NAMENODE}/output/{PROJECT_NAME}"

In [3]:
# Schemas
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
)

def _schema_of(*columns):
    """Build a StructType in which every field is nullable.

    Args:
        *columns: ``(name, data_type)`` pairs, given in column order.

    Returns:
        A StructType holding one nullable StructField per pair.
    """
    return StructType(
        [StructField(name, data_type, True) for name, data_type in columns]
    )


# Columns of a user record.
userSchema = _schema_of(
    ("UserID", IntegerType()),
    ("Gender", StringType()),
    ("Age", IntegerType()),
    ("Occupation", StringType()),
    ("Zip_code", StringType()),
)

# Columns of a movie record.
movieSchema = _schema_of(
    ("MovieID", IntegerType()),
    ("Title", StringType()),
    ("Genres", StringType()),
)

# Columns of a rating record (applied to ratings.dat in Task 1).
# Timestamp is kept as a plain string; no date parsing is done here.
ratingSchema = _schema_of(
    ("UserID", IntegerType()),
    ("MovieID", IntegerType()),
    ("Rating", IntegerType()),
    ("Timestamp", StringType()),
)


In [4]:
from pyspark.sql import SparkSession


def spark_session() -> SparkSession:
    """Create (or reuse) the Spark session used by this notebook.

    The session is named after ``PROJECT_NAME`` (capitalised), connects to
    the master at ``MASTER_URI`` and uses ``HDFS_NAMENODE`` as the default
    Hadoop filesystem.

    Returns:
        The SparkSession returned by ``getOrCreate()``; an already-active
        session is reused instead of creating a new one.
    """
    spark = (
        # capitalize must be *called*: passing the bound method itself would
        # make the application name the method's repr, not the project name.
        SparkSession.builder.appName(PROJECT_NAME.capitalize())
        .master(MASTER_URI)
        .config("spark.hadoop.fs.defaultFS", HDFS_NAMENODE)
        # Have the HDFS client address datanodes by hostname.
        .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
        .getOrCreate()
    )
    print(f"Connected to Spark {spark.version}")
    return spark

## Task 1
List the top-rated movies by all users. 

#### Output format
A list of `<movie, score>` pairs,
sorted in descending order of average rating score.

In [11]:
%%time

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = spark_session()

# Release the session even when the read, aggregation or write fails, so a
# failed run does not leave an application holding cluster resources.
try:
    # ratings.dat uses "::" as its field separator.
    df = spark.read.option("sep", "::").csv(f"{INPUT_DIR}/ratings.dat", schema=ratingSchema)

    print(f"Loaded {df.count()} ratings")

    # Mean rating per movie, highest average first. Movies with equal
    # averages are returned in no guaranteed order.
    sorted_df = df.groupBy("MovieId").avg("Rating").orderBy("avg(Rating)", ascending=False)

    sorted_df.show(n=10)

    print(f"Writing results to '{OUTPUT_DIR}/average_movie_ratings'\n")
    sorted_df.write.mode("overwrite").csv(f"{OUTPUT_DIR}/average_movie_ratings", header=True, sep=",")
finally:
    spark.stop()

Connected to Spark 3.5.0
Loaded 1000209 ratings
+-------+-----------+
|MovieId|avg(Rating)|
+-------+-----------+
|    989|        5.0|
|   3382|        5.0|
|   3233|        5.0|
|   3656|        5.0|
|   3172|        5.0|
|    787|        5.0|
|   3280|        5.0|
|   1830|        5.0|
|   3881|        5.0|
|   3607|        5.0|
+-------+-----------+
only showing top 10 rows

Writing results to 'hdfs://namenode:9000/output/recommender_system/average_movie_ratings'

CPU times: user 13.9 ms, sys: 22.7 ms, total: 36.6 ms
Wall time: 5.62 s


## Task 2
List the top-rated movies grouped by gender, by age group, and by occupation, respectively. 

#### Output format
Three sorted lists of triples: `<movie, gender, score>`, `<movie, age group, score>`, and `<movie, occupation, score>`.
Each list is sorted in descending order of average rating score, computed per gender, per age group, and per occupation, respectively.

## Task 3 