In [2]:
# Cluster endpoints and the project identifier every HDFS path is derived from.
PROJECT_NAME: str = "recommender_system"
HDFS_NAMENODE: str = "hdfs://namenode:9000"
MASTER_URI: str = "spark://spark-master:7077"

# Per-project input and output roots on HDFS.
INPUT_DIR: str = HDFS_NAMENODE + "/input/" + PROJECT_NAME
OUTPUT_DIR: str = HDFS_NAMENODE + "/output/" + PROJECT_NAME

In [3]:
# Schemas
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
)

# Column layout applied when reading the users dataset; every column is nullable.
userSchema = (
    StructType()
    .add("UserID", IntegerType(), True)
    .add("Gender", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Occupation", StringType(), True)
    .add("Zip_code", StringType(), True)
)

# Column layout applied when reading the movies dataset; every column is nullable.
movieSchema = (
    StructType()
    .add("MovieID", IntegerType(), True)
    .add("Title", StringType(), True)
    .add("Genres", StringType(), True)
)

# Column layout applied when reading the ratings dataset; every column is nullable.
# Timestamp is kept as a string, exactly as it appears in the raw file.
ratingSchema = (
    StructType()
    .add("UserID", IntegerType(), True)
    .add("MovieID", IntegerType(), True)
    .add("Rating", IntegerType(), True)
    .add("Timestamp", StringType(), True)
)


In [4]:
from pyspark.sql import SparkSession


def spark_session() -> SparkSession:
    """Return a SparkSession bound to the cluster at MASTER_URI.

    The session is named after PROJECT_NAME (first letter upper-cased) and
    uses HDFS_NAMENODE as its default filesystem. ``getOrCreate`` hands back
    an already-active session when one exists instead of building a new one.

    Returns:
        The active (or newly created) SparkSession.
    """
    spark = (
        # capitalize() must be *called*; passing the bound method itself would
        # make the application name the method's string representation.
        SparkSession.builder.appName(PROJECT_NAME.capitalize())
        .master(MASTER_URI)
        .config("spark.hadoop.fs.defaultFS", HDFS_NAMENODE)
        # Let the HDFS client connect to datanodes by hostname.
        .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
        .getOrCreate()
    )
    print(f"Connected to Spark {spark.version}")
    return spark

In [None]:
%%time


# Session used by convert_dat_to_parquet below; it is stopped at the end of this cell.
spark = spark_session()


def convert_dat_to_parquet(file_name: str, schema: StructType):
    """Re-encode one "::"-delimited ``.dat`` file under INPUT_DIR as Parquet.

    Reads ``{INPUT_DIR}/{file_name}.dat`` with the given schema through the
    module-level ``spark`` session and writes the result to
    ``{INPUT_DIR}/{file_name}_parquet``, replacing any existing output.

    Args:
        file_name: Dataset name without extension (e.g. ``"ratings"``).
        schema: Column layout to apply while parsing the raw file.
    """
    source = f"{INPUT_DIR}/{file_name}.dat"
    target = f"{INPUT_DIR}/{file_name}_parquet"

    print(f"Processing: {source}")

    # The raw files separate fields with a two-character "::" delimiter.
    reader = spark.read.option("sep", "::")
    reader.csv(source, schema=schema).write.mode("overwrite").parquet(target)

    print(f"Successfully converted to Parquet at: {target}")
    print("-" * 30)


print("Starting data conversion to Parquet...")
print("-" * 30)

# Convert each raw dataset with its matching schema.
for dataset_name, dataset_schema in (
    ("ratings", ratingSchema),
    ("users", userSchema),
    ("movies", movieSchema),
):
    convert_dat_to_parquet(dataset_name, dataset_schema)

print("All files converted.")

spark.stop()

Connected to Spark 3.5.0
Starting data conversion to Parquet...
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/ratings.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/ratings_parquet
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/users.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/users_parquet
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/movies.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/movies_parquet
------------------------------
All files converted.
CPU times: user 10.8 ms, sys: 7.47 ms, total: 18.3 ms
Wall time: 3.71 s


In [66]:
# Locations of the Parquet copies of each dataset under INPUT_DIR.
RATINGS_FILE = INPUT_DIR + "/ratings_parquet"
USERS_FILE = INPUT_DIR + "/users_parquet"
MOVIES_FILE = INPUT_DIR + "/movies_parquet"

## Task 1
List the top-rated movies by all users. 

#### Output format
A list of `<movie, score>` pairs,
sorted in descending order of average rating score.

In [67]:
%%time

from pyspark.sql.functions import col, avg

spark = spark_session()

# Average rating per movie across all users.
df = spark.read.parquet(RATINGS_FILE)
agg_df = df.groupBy("MovieId").agg(avg("Rating").alias("AvgRating"))

# Highest average first, exposed under the <movie, score> column names.
# Deliberately not cached: the result is consumed by a single write, so
# cache() would only occupy executor memory without saving any recomputation.
sorted_df = agg_df.orderBy(col("AvgRating").desc()).select(
    col("MovieId").alias("movie"), col("AvgRating").alias("score")
)

print(f"Writing results to '{OUTPUT_DIR}/average_movie_ratings'\n")
# coalesce(1) collapses the result to one partition before it is written.
sorted_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_movie_ratings", header=True, sep=","
)

spark.stop()

Connected to Spark 3.5.0
Writing results to 'hdfs://namenode:9000/output/recommender_system/average_movie_ratings'

CPU times: user 18.3 ms, sys: 9.51 ms, total: 27.8 ms
Wall time: 5.59 s


## Task 2
List the top-rated movies grouped by gender, by age group, and by occupation, respectively. 

#### Output format
Three sorted lists of `<movie, gender, score>`, `<movie, age group, score>`, and `<movie, occupation, score>` triples,
each sorted in descending order of average rating score, grouped by gender, by age group, and by occupation respectively.

In [68]:
%%time

from pyspark.sql.functions import broadcast, col, avg

spark = spark_session()

ratings_df = spark.read.parquet(RATINGS_FILE)
user_df = spark.read.parquet(USERS_FILE)

# Attach each rating's user attributes; the users table is broadcast for the join.
ratings_users_df = ratings_df.join(broadcast(user_df), on="UserId")
# Keep only the columns the per-category aggregations need.
ratings_users_df = ratings_users_df.select(
    "MovieId", "Rating", "Gender", "Age", "Occupation"
)


def top_rated_movies_grouped_by(category: str):
    """Write the average rating of every movie within each value of ``category``.

    Aggregates the module-level ``ratings_users_df`` by ``(category, MovieID)``,
    orders the rows by average rating (highest first) and writes them as a
    single-partition CSV with header to
    ``{OUTPUT_DIR}/top_movies_by_<category lower-cased>``, replacing any
    existing output.

    Args:
        category: Name of the user column to group by (e.g. ``"Gender"``).
    """
    print(f"Listing top rated movies by {category}...")

    label = category.lower()
    mean_rating = avg(col("Rating")).alias("AvgRating")

    ranked = (
        ratings_users_df.groupBy(category, "MovieID")
        .agg(mean_rating)
        .orderBy(col("AvgRating").desc())
        .select(
            col("MovieID").alias("movie"),
            col(category).alias(label),
            col("AvgRating").alias("score"),
        )
    )

    destination = f"{OUTPUT_DIR}/top_movies_by_{label}"
    ranked.coalesce(1).write.mode("overwrite").csv(destination, header=True, sep=",")


# One output list per user attribute.
for user_attribute in ("Gender", "Age", "Occupation"):
    top_rated_movies_grouped_by(user_attribute)

spark.stop()

Connected to Spark 3.5.0
Listing top rated movies by Gender...
Listing top rated movies by Age...
Listing top rated movies by Occupation...
CPU times: user 33.2 ms, sys: 16.2 ms, total: 49.4 ms
Wall time: 8.11 s


## Task 3
List the average rating score of each user for all movies, and grouped by genre, respectively. 


#### Output format
Two sorted lists: `<user, score>` pairs and `<user, genre, score>` triples.


In [85]:
%%time

# Imported here so this cell runs on its own instead of relying on names
# that another cell happened to import.
from pyspark.sql.functions import avg, broadcast, col, explode_outer, split

spark = spark_session()

ratings_df = spark.read.parquet(RATINGS_FILE)
movies_df = spark.read.parquet(MOVIES_FILE)

# List 1: <user, score> - each user's average over all the movies they rated.
user_agg = ratings_df.groupBy("UserId").agg(avg(col("Rating")).alias("AvgRating"))
sorted_user_df = user_agg.orderBy(col("AvgRating").desc())

sorted_user_df = sorted_user_df.select(
    col("UserId").alias("user"), col("AvgRating").alias("score")
)

# List 2: <user, genre, score> - each user's average per individual genre.
# NOTE(review): assumes Genres may hold several "|"-separated genre names per
# movie - confirm against the raw movies file. Splitting yields one row per
# genre so the grouping is by genre rather than by genre combination.
# explode_outer leaves single-genre and null values unchanged.
movie_genres_df = movies_df.select(
    col("MovieId"),
    explode_outer(split(col("Genres"), r"\|")).alias("Genres"),
)
ratings_movies_df = ratings_df.join(broadcast(movie_genres_df), on="MovieId").select(
    ["MovieId", "Genres", "UserId", "Rating"]
)
ratings_movies_agg = ratings_movies_df.groupBy("UserId", "Genres").agg(
    avg(col("Rating")).alias("AvgRating")
)
sorted_ratings_movies_df = ratings_movies_agg.orderBy(col("AvgRating").desc())
sorted_ratings_movies_df = sorted_ratings_movies_df.select(
    col("UserId").alias("user"),
    col("Genres").alias("genre"),
    col("AvgRating").alias("score"),
)

print("Computing average rating score of each user for all movies...")
sorted_user_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_rating_by_user", header=True, sep=","
)
print("Computing average rating score of each user for all movies by genre...")
sorted_ratings_movies_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_rating_by_user_and_genre", header=True, sep=","
)

spark.stop()

Connected to Spark 3.5.0
Computing average rating score of each user for all movies...
Computing average rating score of each user for all movies by genre...
CPU times: user 31.9 ms, sys: 13.5 ms, total: 45.4 ms
Wall time: 5.32 s
