In [5]:
PROJECT_NAME: str = "recommender_system"
HDFS_NAMENODE: str = "hdfs://namenode:9000"
INPUT_DIR: str = f"{HDFS_NAMENODE}/input/{PROJECT_NAME}"
OUTPUT_DIR: str = f"{HDFS_NAMENODE}/output/{PROJECT_NAME}"

MASTER_URI = "spark://spark-master:7077"

In [6]:
# Schemas
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
)

userSchema = StructType(
    [
        StructField("UserID", IntegerType(), True),
        StructField("Gender", StringType(), True),
        StructField("Age", IntegerType(), True),
        StructField("Occupation", StringType(), True),
        StructField("Zip_code", StringType(), True),
    ]
)

movieSchema = StructType(
    [
        StructField("MovieID", IntegerType(), True),
        StructField("Title", StringType(), True),
        StructField("Genres", StringType(), True),
    ]
)

ratingSchema = StructType(
    [
        StructField("UserID", IntegerType(), True),
        StructField("MovieID", IntegerType(), True),
        StructField("Rating", IntegerType(), True),
        StructField("Timestamp", StringType(), True),
    ]
)


In [7]:
from pyspark.sql import SparkSession


def spark_session() -> SparkSession:
    spark = (
        SparkSession.builder.appName(PROJECT_NAME.capitalize)
        .master(MASTER_URI)
        .config("spark.driver.memory", "4g")
        .config("spark.hadoop.fs.defaultFS", HDFS_NAMENODE)
        .config("spark.hadoop.dfs.client.use.datanode.hostname", "true")
        .getOrCreate()
    )
    print(f"Connected to Spark {spark.version}")
    return spark

In [4]:
%%time


spark = spark_session()


def convert_dat_to_parquet(file_name: str, schema: StructType):
    input_path = f"{INPUT_DIR}/{file_name}.dat"
    output_path = f"{INPUT_DIR}/{file_name}_parquet"

    print(f"Processing: {input_path}")

    df = spark.read.option("sep", "::").csv(input_path, schema=schema)
    df.write.mode("overwrite").parquet(output_path)

    print(f"Successfully converted to Parquet at: {output_path}")
    print("-" * 30)


print("Starting data conversion to Parquet...")
print("-" * 30)

convert_dat_to_parquet("ratings", ratingSchema)
convert_dat_to_parquet("users", userSchema)
convert_dat_to_parquet("movies", movieSchema)

print("All files converted.")

spark.stop()

Connected to Spark 3.5.0
Starting data conversion to Parquet...
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/ratings.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/ratings_parquet
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/users.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/users_parquet
------------------------------
Processing: hdfs://namenode:9000/input/recommender_system/movies.dat
Successfully converted to Parquet at: hdfs://namenode:9000/input/recommender_system/movies_parquet
------------------------------
All files converted.
CPU times: user 39.9 ms, sys: 18.6 ms, total: 58.5 ms
Wall time: 9.48 s


In [8]:
RATINGS_FILE = f"{INPUT_DIR}/ratings_parquet"
USERS_FILE = f"{INPUT_DIR}/users_parquet"
MOVIES_FILE = f"{INPUT_DIR}/movies_parquet"

## Task 1
List the top-rated movies by all users. 

#### Output format
A list of <movie, score> pairs
sorted in descending order of ‘average’ rating score

In [None]:
%%time

from pyspark.sql.functions import col, avg

spark = spark_session()

df = spark.read.parquet(RATINGS_FILE)

agg_df = df.groupBy("MovieId").agg(avg("Rating").alias("AvgRating"))

sorted_df = agg_df.orderBy(col("AvgRating").desc())

sorted_df.cache()

sorted_df = sorted_df.select(
    col("MovieId").alias("movie"), col("AvgRating").alias("score")
)
print(f"Writing results to '{OUTPUT_DIR}/average_movie_ratings'\n")
sorted_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_movie_ratings", header=True, sep=","
)

sorted_df.unpersist()

spark.stop()

## Task 2
List the top-rated movies grouped by gender, by age group, and by occupation, respectively. 

#### Output format
3 sorted lists: <movie, gender, score> pairs, <movie, age group, score> pairs, <movie, occupation, score>
sorted in descending order of ‘average’ rating score grouped by gender, by age group, and by occupation 

In [None]:
%%time

from pyspark.sql.functions import broadcast, col, avg

spark = spark_session()

ratings_df = spark.read.parquet(RATINGS_FILE)
user_df = spark.read.parquet(USERS_FILE)

ratings_users_df = ratings_df.join(broadcast(user_df), on="UserId").select(
    ["MovieId", "Rating", "Gender", "Age", "Occupation"]
)


def top_rated_movies_grouped_by(category: str):
    print(f"Listing top rated movies by {category}...")
    agg_df = ratings_users_df.groupBy(category, "MovieID").agg(
        avg(col("Rating")).alias("AvgRating")
    )
    sorted_df = agg_df.orderBy(col("AvgRating").desc())
    sorted_df = sorted_df.select(
        col("MovieID").alias("movie"),
        col(category).alias(category.lower()),
        col("AvgRating").alias("score"),
    )

    sorted_df.coalesce(1).write.mode("overwrite").csv(
        f"{OUTPUT_DIR}/top_movies_by_{category.lower()}", header=True, sep=","
    )


top_rated_movies_grouped_by("Gender")
top_rated_movies_grouped_by("Age")
top_rated_movies_grouped_by("Occupation")

spark.stop()

## Task 3
List the average rating score of each user for all movies, and grouped by genre, respectively. 


#### Output format
two sorted lists: <user, score> pairs, <user, genre, score> pairs


In [None]:
%%time

spark = spark_session()

ratings_df = spark.read.parquet(RATINGS_FILE)
movies_df = spark.read.parquet(MOVIES_FILE)

user_agg = ratings_df.groupBy("UserId").agg(avg(col("Rating")).alias("AvgRating"))
sorted_user_df = user_agg.orderBy(col("AvgRating").desc())

sorted_user_df = sorted_user_df.select(
    col("UserId").alias("user"), col("AvgRating").alias("score")
)

ratings_movies_df = ratings_df.join(broadcast(movies_df), on="MovieId").select(
    ["MovieId", "Genres", "UserId", "Rating"]
)
ratings_movies_agg = ratings_movies_df.groupBy("UserId", "Genres").agg(
    avg(col("Rating")).alias("AvgRating")
)
sorted_ratings_movies_df = ratings_movies_agg.orderBy(col("AvgRating").desc())
sorted_ratings_movies_df = sorted_ratings_movies_df.select(
    col("UserId").alias("user"),
    col("Genres").alias("genre"),
    col("AvgRating").alias("score"),
)

print("Computing average rating score of each user for all movies...")
sorted_user_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_rating_by_user", header=True, sep=","
)
print("Computing average rating score of each user for all movies by genre...")
sorted_ratings_movies_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/average_rating_by_user_and_genre", header=True, sep=","
)

spark.stop()

# Utility functions

A list of utility functions to compute similarity and build utility matrix for next tasks 

In [9]:
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, avg, broadcast


def mean_normalized_ratings(for_column: str) -> DataFrame:
    ratings_df: DataFrame = spark.read.parquet(RATINGS_FILE).select(
        ["UserId", "MovieId", "Rating"]
    )

    avg_rating_df = ratings_df.groupBy(for_column).agg(
        avg(col("Rating")).alias("AvgRating")
    )
    ratings_df = ratings_df.join(broadcast(avg_rating_df), for_column)
    ratings_df = (
        ratings_df.withColumn("NormalizedRating", col("Rating") - col("AvgRating"))
        .select(["UserId", "MovieId", "NormalizedRating"])
        .fillna(0)
    )

    return ratings_df


# Task 4

Given any user, please list the top-’similar’ users based on the cosine similarity of previous ratings each user has given. (sorted in descending order of ‘user’ similarity score)

### Output Format

A list of <user, score> pairs
sorted in descending order of ‘user’ similarity score

In [14]:
%%time

from pyspark.sql.functions import sqrt, pow, sum

spark = spark_session()

def user_similarities_ratings_based(target_user_id: str) -> DataFrame:
    ratings_df = mean_normalized_ratings(for_column="UserId")

    target_user_df = ratings_df.where(f"UserId == {target_user_id}").select(
        col("UserId").alias("TargetUserId"),
        col("NormalizedRating").alias("TargetUserNormalizedRating"),
        col("MovieId"),
    )
    common_ratings_df = ratings_df.join(broadcast(target_user_df), on="MovieId")


    dot_df = common_ratings_df.groupBy("UserId").agg(
        sqrt(sum(pow(col("NormalizedRating"), 2))).alias("Norm"),
        sum(col("NormalizedRating") * col("TargetUserNormalizedRating")).alias(
            "DotProduct"
        ),
    )

    target_norm_val = dot_df.where(f"UserId == {target_user_id}").first()["Norm"]

    result_df = (
        dot_df.filter(col("UserId") != target_user_id)
        .withColumn("Similarity", col("DotProduct") / (col("Norm") * target_norm_val))
        .select(["UserId", "Similarity"])
    ).orderBy(col("Similarity").desc())
    return result_df

## Target user to compute similarity against.
target_user_id = "2"
similarity_df = user_similarities_ratings_based(target_user_id)

similarity_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/similar_users_to_user_{target_user_id}", header=True, sep=","
)

spark.stop()

Connected to Spark 3.5.0
CPU times: user 38.6 ms, sys: 7.82 ms, total: 46.4 ms
Wall time: 7.17 s


# Task 5

Given any movie, please list the top-’similar’ movies based on the cosine similarity of previous ratings each movie received. (sorted in descending order of ‘item’ similarity score)

### Output Format

a list of <movie, score> pairs
sorted in descending order of ‘movie’ similarity score

In [15]:
%%time

spark = spark_session()

def movie_similarities_ratings_based(target_movie_id: str) -> DataFrame:
    ratings_df = mean_normalized_ratings(for_column="MovieId")

    target_movie_df = ratings_df.where(f"MovieId == {target_movie_id}").select(
        col("MovieId").alias("TargetMovieId"),
        col("NormalizedRating").alias("TargetUserNormalizedRating"),
        col("UserId"),
    )
    common_ratings_df = ratings_df.join(broadcast(target_movie_df), on="UserId")


    dot_df = common_ratings_df.groupBy("MovieId").agg(
        sqrt(sum(pow(col("NormalizedRating"), 2))).alias("Norm"),
        sum(col("NormalizedRating") * col("TargetUserNormalizedRating")).alias(
            "DotProduct"
        ),
    )

    target_norm_val = dot_df.where(f"MovieId == {target_movie_id}").first()["Norm"]

    result_df = (
        dot_df.filter(col("MovieId") != target_movie_id)
        .withColumn("Similarity", col("DotProduct") / (col("Norm") * target_norm_val))
        .select(["MovieId", "Similarity"])
    ).orderBy(col("Similarity").desc())
    return result_df

## Target movie to compute similarity against.
target_movie_id = "2"
similarity_df = movie_similarities_ratings_based(target_movie_id)

similarity_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/similar_movites_to_movie_{target_movie_id}", header=True, sep=","
)

spark.stop()

Connected to Spark 3.5.0
CPU times: user 39.2 ms, sys: 11.4 ms, total: 50.6 ms
Wall time: 5.54 s


# Task 6

Implement a recommender system that recommends top-k similar movies for a given user based on collaborative filtering: item-based, and user-based. (sorted in descending order of similarity score)
- (a) For item-based collaborative filtering: estimated by similar items
- (b) For user-based collaborative filtering: estimated by similar users

### Output format
two lists of <movie, score> pairs: item-based, user-based
sorted in descending order of similarity score

In [16]:
%%time

from pyspark.sql.functions import col, sum, sqrt, pow, abs, broadcast

spark = spark_session()

norm_ratings_df = mean_normalized_ratings("MovieId")
norm_ratings_df.cache()


def recommend_user_based(target_user_id: str, k: int = 10):
    print("Generating User-Based Recommendations...")

    sim_users = user_similarities_ratings_based(str(target_user_id))

    candidates = norm_ratings_df.join(sim_users, on="UserId")

    target_user_movies = (
        norm_ratings_df.filter(col("UserId") == target_user_id)
        .select("MovieId")
        .distinct()
    )
    candidates = candidates.join(target_user_movies, on="MovieId", how="left_anti")

    recs = candidates.groupBy("MovieId").agg(
        (
            sum(col("NormalizedRating") * col("Similarity"))
            / sum(abs(col("Similarity")))
        ).alias("Score")
    )

    return recs.orderBy(col("Score").desc()).limit(k)


def recommend_item_based(target_user_id: str, k: int = 10):
    print("Generating Item-Based Recommendations...")

    user_movies_df = norm_ratings_df.filter(col("UserId") == target_user_id).select(
        col("MovieId").alias("MyMovieId"), col("NormalizedRating").alias("MyRating")
    )

    my_movie_ids = [row.MyMovieId for row in user_movies_df.collect()]

    my_movies_ratings = norm_ratings_df.filter(
        col("MovieId").isin(my_movie_ids)
    ).select(
        col("UserId"),
        col("MovieId").alias("MyMovieId"),
        col("NormalizedRating").alias("MyNormRating"),
    )

    pairs = my_movies_ratings.join(norm_ratings_df, on="UserId")

    movie_norms = norm_ratings_df.groupBy("MovieId").agg(
        sqrt(sum(pow(col("NormalizedRating"), 2))).alias("Norm")
    )

    dot_products = pairs.groupBy("MyMovieId", "MovieId").agg(
        sum(col("MyNormRating") * col("NormalizedRating")).alias("DotProduct")
    ).alias("dp")

    sims = (
        dot_products.join(
            movie_norms.alias("m1"), col("dp.MyMovieId") == col("m1.MovieId")
        )
        .join(movie_norms.alias("m2"), col("dp.MovieId") == col("m2.MovieId"))
        .select(
            col("dp.MyMovieId"),
            col("dp.MovieId"),
            (col("dp.DotProduct") / (col("m1.Norm") * col("m2.Norm"))).alias("Similarity"),
        )
    )

    preds = sims.join(user_movies_df, on="MyMovieId")

    results = preds.groupBy("MovieId").agg(
        (sum(col("Similarity") * col("MyRating")) / sum(abs(col("Similarity")))).alias(
            "Score"
        )
    )

    results = results.filter(~col("MovieId").isin(my_movie_ids))

    return results.orderBy(col("Score").desc()).limit(k)


target_user_id = "2"

ub_recs_df = recommend_user_based(target_user_id)
print("Top-10 User-Based Recommendations:")
ub_recs_df.show()
ub_recs_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/recs_user_based_user_{target_user_id}", header=True, sep=","
)

ib_recs_df = recommend_item_based(target_user_id)
print("Top-10 Item-Based Recommendations:")
ib_recs_df.show()
ib_recs_df.coalesce(1).write.mode("overwrite").csv(
    f"{OUTPUT_DIR}/recs_item_based_user_{target_user_id}", header=True, sep=","
)

spark.stop()

Connected to Spark 3.5.0
Generating User-Based Recommendations...
Top-10 User-Based Recommendations:
+-------+------------------+
|MovieId|             Score|
+-------+------------------+
|    572|               2.0|
|   1360|               1.5|
|   3236|1.4999999999999998|
|   1420| 1.267555051490446|
|   1659|1.1856731362702553|
|    108| 1.113359305400627|
|   2869|1.0887100905109315|
|   2258|               1.0|
|   1868|               1.0|
|    690|               1.0|
+-------+------------------+

Generating Item-Based Recommendations...
Top-10 Item-Based Recommendations:
+-------+-------------------+
|MovieId|              Score|
+-------+-------------------+
|    889| 0.5025702416233744|
|    404|0.40691151886043936|
|    972| 0.3580488208820493|
|   2258| 0.3219903536607946|
|   2510|0.27824864917431674|
|   1062|0.26865034769346263|
|   1509|0.26616891733204046|
|   2591| 0.2657844540978354|
|     53| 0.2624776760223743|
|    753| 0.2541647952122626|
+-------+-----------------

# Bonus

Compute the similarities between users based on their gender, age, occupation.
Compute the similarities between movies based on their genres.