# BIG DATA ASSIGNMENT WEEK 09
## Collaborative Filtering
- Rafi Akbar Rafsanjani
- 05111942000004

## Install & Initialization

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824028 sha256=badccb944eb4b84c54d2eda2bd2552994f216141a1134cb35cd2411d6a111966
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row, SparkSession

In [3]:
# Create (or reuse) the local Spark session used by every cell below
spark = (
    SparkSession.builder
    .master("local")
    .appName("MovieLens")
    .getOrCreate()
)

In [5]:
# Load the ratings file as an RDD of single-column rows (one row per text line),
# then break every line into its "::"-delimited fields
lines = spark.read.text("/content/sample_movielens_ratings.txt").rdd
parts = lines.map(
    lambda record: record.value.split("::")
)

In [6]:
# Turn each list of split fields into a typed Row:
# userId, movieId and timestamp as int, rating as float
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))

# Build a DataFrame from the Rows (column names come from the Row fields)
ratings = spark.createDataFrame(ratingsRDD)

# Split the data into training (80%) and testing (20%) sets.
# A fixed seed keeps the split, and therefore every metric computed from it,
# repeatable across kernel restarts.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

## Build Recommendation Model Using ALS

In [7]:
# Hyperparameter grid: candidate iteration counts and regularization strengths
max_iters = [5, 10, 20]
reg_params = [0.1, 0.5, 1.0]

# RMSE per hyperparameter combination; starts empty and is filled during tuning
results = dict()

In [8]:
# Grid search: fit and score one ALS model per (maxIter, regParam) combination.
# NOTE(review): the combinations are scored on `test`, which is also the set used
# to pick the winner, so the best RMSE is an optimistic estimate; a separate
# validation split would give an unbiased one.

# The evaluator does not depend on the hyperparameters, so build it once
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

for max_iter in max_iters:
    for reg_param in reg_params:
        # Build the recommendation model using ALS on the training data.
        # Cold start strategy is 'drop' to ensure we don't get NaN evaluation metrics;
        # the seed is fixed so repeated runs fit the same model.
        als = ALS(maxIter=max_iter, regParam=reg_param, userCol="userId", itemCol="movieId", ratingCol="rating",
                  coldStartStrategy="drop", seed=42)
        model = als.fit(training)

        # Evaluate the model by computing the RMSE on the test data
        predictions = model.transform(test)
        rmse = evaluator.evaluate(predictions)

        # Save the RMSE result in the dictionary, keyed by the combination tried
        results[(max_iter, reg_param)] = rmse
        print(f"Root-mean-square error for maxIter={max_iter}, regParam={reg_param} = {rmse}")

Root-mean-square error for maxIter=5, regParam=0.1 = 1.0459830613833163
Root-mean-square error for maxIter=5, regParam=0.5 = 1.2894025106664904
Root-mean-square error for maxIter=5, regParam=1.0 = 1.5713764674497686
Root-mean-square error for maxIter=10, regParam=0.1 = 0.9601964615335945
Root-mean-square error for maxIter=10, regParam=0.5 = 1.281481369209705
Root-mean-square error for maxIter=10, regParam=1.0 = 1.5713748150735862
Root-mean-square error for maxIter=20, regParam=0.1 = 0.9500120213865103
Root-mean-square error for maxIter=20, regParam=0.5 = 1.2815220299506749
Root-mean-square error for maxIter=20, regParam=1.0 = 1.5713748199752815


In [9]:
# Pick the (maxIter, regParam) entry whose recorded RMSE is smallest,
# unpacking the winning key and its score in a single pass
best_params, best_rmse = min(results.items(), key=lambda entry: entry[1])
print(f"\nBest hyperparameters: maxIter={best_params[0]}, regParam={best_params[1]} with RMSE={best_rmse}")


Best hyperparameters: maxIter=20, regParam=0.1 with RMSE=0.9500120213865103


## Generate Movie Recommendations

In [10]:
# Train the final model on the training data with the best hyperparameters.
# best_params is the (maxIter, regParam) tuple chosen during tuning.
# The seed is fixed so the fitted factors, and the recommendations derived from
# them, are the same on every run.
best_als = ALS(maxIter=best_params[0], regParam=best_params[1], userCol="userId", itemCol="movieId", ratingCol="rating",
               coldStartStrategy="drop", seed=42)
best_model = best_als.fit(training)

## Print Results and Show Output

In [11]:
# For every user known to the model, compute the 10 highest-scoring movies
# and preview the first 20 rows
userRecs = best_model.recommendForAllUsers(numItems=10)
userRecs.show(20)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{22, 3.8107607},...|
|    10|[{92, 3.1808662},...|
|     0|[{92, 2.7941234},...|
|     1|[{62, 3.167192}, ...|
|    21|[{29, 4.292471}, ...|
|    11|[{30, 4.5209475},...|
|    12|[{46, 5.1609254},...|
|    22|[{74, 4.385069}, ...|
|     2|[{93, 4.29524}, {...|
|    13|[{52, 3.3166378},...|
|     3|[{51, 4.147035}, ...|
|    23|[{46, 4.992766}, ...|
|     4|[{74, 3.4080062},...|
|    24|[{52, 4.58324}, {...|
|    14|[{29, 4.6909695},...|
|     5|[{46, 4.699138}, ...|
|    15|[{46, 4.2682977},...|
|    25|[{46, 3.1945887},...|
|    26|[{32, 5.0091343},...|
|     6|[{25, 4.006712}, ...|
+------+--------------------+
only showing top 20 rows



In [12]:
# For every movie known to the model, compute the 10 highest-scoring users
# and preview the first 20 rows
movieRecs = best_model.recommendForAllItems(numUsers=10)
movieRecs.show(20)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 3.8052187},...|
|     40|[{28, 3.5157113},...|
|     10|[{17, 3.0356047},...|
|     50|[{23, 3.6350877},...|
|     80|[{26, 3.299038}, ...|
|     70|[{4, 3.0145392}, ...|
|     60|[{8, 2.3175797}, ...|
|     90|[{24, 4.35687}, {...|
|     30|[{11, 4.5209475},...|
|      0|[{12, 1.1367803},...|
|     31|[{7, 2.4439905}, ...|
|     81|[{28, 4.1144366},...|
|     91|[{12, 2.6076496},...|
|      1|[{12, 2.828721}, ...|
|     41|[{4, 3.1798055}, ...|
|     61|[{6, 2.1949468}, ...|
|     51|[{26, 4.839849}, ...|
|     21|[{26, 2.8554924},...|
|     11|[{18, 3.2296104},...|
|     71|[{25, 2.9940975},...|
+-------+--------------------+
only showing top 20 rows



In [13]:
# Take three distinct user ids from the ratings and get 10 movie
# recommendations for each of them
users = (
    ratings
    .select(best_als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = best_model.recommendForUserSubset(users, numItems=10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{32, 5.0091343},...|
|    19|[{90, 3.2949443},...|
|    29|[{46, 4.328896}, ...|
+------+--------------------+



In [14]:
# Take three distinct movie ids from the ratings and get 10 user
# recommendations for each of them
movies = (
    ratings
    .select(best_als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = best_model.recommendForItemSubset(movies, numUsers=10)
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 3.817163}, ...|
|     26|[{15, 2.2117326},...|
|     29|[{14, 4.6909695},...|
+-------+--------------------+



## Summary

To summarize the process: the MovieLens ratings were loaded, parsed, and split into training (80%) and test (20%) sets. Hyperparameter tuning was then performed by fitting an ALS model for every combination of maxIter and regParam values and recording each combination's RMSE on the test set in a dictionary. The combination with the lowest RMSE was selected as the best hyperparameters and displayed. Lastly, a model was trained with those best hyperparameters and used to produce recommendations: the top 10 movies for each user, the top 10 users for each movie, and the same top-10 lists for a selected subset of three users and three movies.



