# Movie Recommendation System

## Data Preprocessing

In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.sql import functions as F
import random
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, ArrayType
from pyspark.sql.functions import udf, col, sum, explode

In [None]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("ALS Recommendation System") \
    .master("spark://cm009:33793") \
    .getOrCreate()

In [4]:
data = spark.read.csv("ml-latest/ratings-large.csv", header=True, schema="userId INT, movieId INT, rating FLOAT")
data.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|    110|   4.0|
|     1|    158|   4.0|
|     1|    260|   4.5|
|     1|    356|   5.0|
|     1|    381|   3.5|
|     1|    596|   4.0|
|     1|   1036|   5.0|
|     1|   1049|   3.0|
|     1|   1066|   4.0|
|     1|   1196|   3.5|
|     1|   1200|   3.5|
|     1|   1210|   4.5|
|     1|   1214|   4.0|
|     1|   1291|   5.0|
|     1|   1293|   2.0|
|     1|   1376|   3.0|
|     1|   1396|   3.0|
|     1|   1537|   4.0|
|     1|   1909|   3.0|
+------+-------+------+
only showing top 20 rows



                                                                                

In [15]:
# Group by movieId and count the number of ratings for each movie
movie_ratings_count = data.groupBy("movieId").agg(F.count("movieId").alias("rating_count"))

# Filter out movies with less than 1000 ratings
popular_movies = movie_ratings_count.filter(col("rating_count") >= 50000)

popular_movies.count()

# Join the original DataFrame with popular_movies on movieId to retain all user ID information
filtered_data = data.join(popular_movies, "movieId", "inner")

# Show the resulting DataFrame
filtered_data.show()



+-------+------+------+------------+
|movieId|userId|rating|rating_count|
+-------+------+------+------------+
|    858| 51022|   5.0|       75004|
|    858| 51031|   3.5|       75004|
|    858| 51032|   5.0|       75004|
|    858| 51036|   4.5|       75004|
|    858| 51041|   2.0|       75004|
|    858| 51043|   4.0|       75004|
|    858| 51044|   4.0|       75004|
|    858| 51047|   4.5|       75004|
|    858| 51049|   4.5|       75004|
|    858| 51053|   4.0|       75004|
|    858| 51059|   5.0|       75004|
|    858| 51062|   4.5|       75004|
|    858| 51068|   3.0|       75004|
|    858| 51073|   5.0|       75004|
|    858| 51078|   2.0|       75004|
|    858| 51081|   4.5|       75004|
|    858| 51089|   4.0|       75004|
|    858| 51097|   5.0|       75004|
|    858| 51098|   5.0|       75004|
|    858| 51099|   4.5|       75004|
+-------+------+------+------------+
only showing top 20 rows



                                                                                

In [14]:
filtered_data.count()

                                                                                

7470434

In [17]:
train_ratio = 0.7
validation_ratio = 0.1
test_ratio = 0.2

In [18]:
# partitioning *one* user
def partition_ratings(user_id, ratings_list):
    total_ratings = len(ratings_list)
    train_count = int(total_ratings * train_ratio)
    validation_count = int(total_ratings * validation_ratio)
    test_count = total_ratings - train_count - validation_count

    # Generate random indices for each set
    indices = list(range(total_ratings))
    random.seed(42)
    random.shuffle(indices)
    train_indices = indices[:train_count]
    validation_indices = indices[train_count:train_count + validation_count]
    test_indices = indices[train_count + validation_count:]

    # Partition the ratings based on the selected indices
    train_set = [(user_id, movie_id, rating) for (movie_id, rating) in [ratings_list[i] for i in train_indices]]
    validation_set = [(user_id, movie_id, rating) for (movie_id, rating) in [ratings_list[i] for i in validation_indices]]
    test_set = [(user_id, movie_id, rating) for (movie_id, rating) in [ratings_list[i] for i in test_indices]]
    print(len(train_set))
    print(len(validation_set))
    print(len(test_set))

    return train_set, validation_set, test_set

In [19]:
# potential optimization: filter out rows that are not in your split's indices - need to read table three times though - need some more custom
# `filter` method that allows branching

def train_val_test_split(data):
    grouped_ratings = data.groupby('userId').agg(F.collect_list(F.struct('movieId', 'rating')).alias('ratings'))

    schema = StructType([
    StructField("train", ArrayType(StructType([
        StructField("userId", IntegerType(), False),
        StructField("movieId", IntegerType(), False),
        StructField("rating", FloatType(), False)
    ])), False),
    StructField("validation", ArrayType(StructType([
        StructField("userId", IntegerType(), False),
        StructField("movieId", IntegerType(), False),
        StructField("rating", FloatType(), False)
    ])), False),
    StructField("test", ArrayType(StructType([
        StructField("userId", IntegerType(), False),
        StructField("movieId", IntegerType(), False),
        StructField("rating", FloatType(), False)
    ])), False)
    ])

    # UDF to apply partition_ratings function to each row of the DataFrame
    partition_udf = udf(lambda user_id, ratings_list: partition_ratings(user_id, ratings_list), schema)

    # This will be cleaned by garbage collector after the function returns
    partitioned_ratings_df = grouped_ratings.withColumn('partitioned_data', partition_udf(col('userId'), col('ratings')))

    # Extract the three partitions into separate columns
    train_data = partitioned_ratings_df.selectExpr("partitioned_data['train'] AS train_data").withColumn("exploded", explode('train_data')).drop('train_data').select(col('exploded.userId').alias('userId'),
                                     col('exploded.movieId').alias('movieId'),
                                     col('exploded.rating').alias('rating'))

    test_data = partitioned_ratings_df.selectExpr("partitioned_data['test'] AS test_data").withColumn("exploded", explode('test_data')).drop('test_data').select(col('exploded.userId').alias('userId'),
                                     col('exploded.movieId').alias('movieId'),
                                     col('exploded.rating').alias('rating'))

    validation_data = partitioned_ratings_df.selectExpr("partitioned_data['validation'] AS validation_data").withColumn("exploded", explode('validation_data')).drop('validation_data').select(col('exploded.userId').alias('userId'),
                                     col('exploded.movieId').alias('movieId'),
                                     col('exploded.rating').alias('rating'))

    return train_data, test_data, validation_data


In [21]:
train_data, test_data, validation_data = train_val_test_split(filtered_data)

In [22]:
num_rows = train_data.count()
print("Length of train_data:", num_rows)

# first_row = train_data.select('train_data').first()
# num_elements = len(first_row[0])
# print("Length of train_data[0]", num_elements)



Length of train_data: 2083738


                                                                                

In [23]:
train_data.show(10)



+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   148|      1|   3.0|
|   148|    110|   3.0|
|   148|     50|   5.0|
|   148|    527|   1.0|
|   148|    592|   2.0|
|   148|    589|   3.0|
|   148|    590|   1.0|
|   148|    593|   2.0|
|   148|    318|   3.0|
|   148|     47|   1.0|
+------+-------+------+
only showing top 10 rows



                                                                                

## Popularity Based Model

In [24]:
from pyspark.sql.functions import count, desc

In [25]:
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [26]:
popularity = data.groupBy('movieId').agg(sum('rating').alias('total_ratings_sum')).sort(desc('total_ratings_sum'))

In [27]:
popularity.show()



+-------+-----------------+
|movieId|total_ratings_sum|
+-------+-----------------+
|    318|         540156.0|
|    356|         462069.0|
|    296|         455881.0|
|   2571|         445420.5|
|    593|         422507.5|
|    260|         397789.5|
|   2959|         365174.5|
|    527|         357340.5|
|   1196|         330266.0|
|   4993|         327700.5|
|    858|         324512.5|
|     50|         311097.5|
|   7153|         310368.0|
|   1198|         308595.0|
|    480|         306284.0|
|   1210|         305656.5|
|    110|         301766.5|
|   5952|         300707.0|
|      1|         299072.0|
|   2858|         286713.5|
+-------+-----------------+
only showing top 20 rows



                                                                                

In [28]:
popular_100 = popularity.limit(100)

## Latent Factor Model

In [29]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

Model Selection: With limited data (using the smaller dataset), let's select the best performing hyperparameters

In [30]:
# Define ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop") #maxIter = 10, rank = 10

# # Define parameter grid
# param_grid = ParamGridBuilder() \
#     .addGrid(als.maxIter, [5]) \
#     .addGrid(als.rank, [15]) \
#     .build()

In [31]:
rank_list = [10, 20, 30]
reg_param_list = [0.1, 0.01]
max_iter_list = [10, 20]

In [None]:
rmse = []
for rank in rank_list:
    for reg_param in reg_param_list:
        for max_iter in max_iter_list:
            print(f"evaluating model: rank {rank}, reg_param {reg_param}, max_iter {max_iter}")
            als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
            als.setRank(rank)
            als.setRegParam(reg_param)
            als.setMaxIter(max_iter)
            model = als.fit(train_data)
            evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
            predictions=model.transform(validation_data)
            loss=evaluator.evaluate(predictions)
            rmse.append((rank, reg_param, max_iter, loss))

evaluating model: rank 10, reg_param 0.1, max_iter 10
evaluating model: rank 10, reg_param 0.1, max_iter 20
evaluating model: rank 10, reg_param 0.01, max_iter 10
evaluating model: rank 10, reg_param 0.01, max_iter 20
evaluating model: rank 20, reg_param 0.1, max_iter 10
evaluating model: rank 20, reg_param 0.1, max_iter 20
evaluating model: rank 20, reg_param 0.01, max_iter 10
evaluating model: rank 20, reg_param 0.01, max_iter 20
evaluating model: rank 30, reg_param 0.1, max_iter 10
evaluating model: rank 30, reg_param 0.1, max_iter 20
evaluating model: rank 30, reg_param 0.01, max_iter 10
evaluating model: rank 30, reg_param 0.01, max_iter 20


In [None]:
print(rmse)

[(10, 0.1, 10, 0.8960124165831306), (10, 0.1, 20, 0.891636139154729), (10, 0.01, 10, 1.1626709986100443), (10, 0.01, 20, 1.189762887715596), (20, 0.1, 10, 0.8933265392660743), (20, 0.1, 20, 0.8911999626018452), (20, 0.01, 10, 1.2594200867313885), (20, 0.01, 20, 1.2818195524256082), (30, 0.1, 10, 0.8896305173338086), (30, 0.1, 20, 0.8889671471587874), (30, 0.01, 10, 1.3464718282456434), (30, 0.01, 20, 1.33212001073361)]


Overfitting has been clearly displayed in many of the hyperparamter combinations.

In [21]:
rank_list = [5, 10, 20]
reg_param_list = [0.05, 0.1, 0.5]
max_iter_list = [5, 10]

In [None]:
rmse = []
for rank in rank_list:
  for reg_param in reg_param_list:
    for max_iter in max_iter_list:
      print(f"evaluating model: rank {rank}, reg_param {reg_param}, max_iter {max_iter}")
      als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
      als.setRank(rank)
      als.setRegParam(reg_param)
      als.setMaxIter(max_iter)
      model = als.fit(train_data)
      evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
      predictions=model.transform(validation_data)
      loss=evaluator.evaluate(predictions)
      print(f"rmse = {loss}")
      rmse.append((rank, reg_param, max_iter, loss))

evaluating model: rank 5, reg_param 0.05, max_iter 5
rmse = 0.9381609181376621
evaluating model: rank 5, reg_param 0.05, max_iter 10
rmse = 0.9442073093559297
evaluating model: rank 5, reg_param 0.1, max_iter 5
rmse = 0.8969699690226526
evaluating model: rank 5, reg_param 0.1, max_iter 10
rmse = 0.8947458079251303
evaluating model: rank 5, reg_param 0.5, max_iter 5
rmse = 0.9984674030247629
evaluating model: rank 5, reg_param 0.5, max_iter 10
rmse = 1.0041226002238426
evaluating model: rank 10, reg_param 0.05, max_iter 5
rmse = 0.9712526068508573
evaluating model: rank 10, reg_param 0.05, max_iter 10
rmse = 0.9685195032402788
evaluating model: rank 10, reg_param 0.1, max_iter 5
rmse = 0.9022475756782403
evaluating model: rank 10, reg_param 0.1, max_iter 10
rmse = 0.8960124165831306
evaluating model: rank 10, reg_param 0.5, max_iter 5
rmse = 1.0074279530094896
evaluating model: rank 10, reg_param 0.5, max_iter 10
rmse = 1.004480227145437
evaluating model: rank 20, reg_param 0.05, max_it

Evaluating all parameter combinations, we choose one that is low-loss and computationally frugal: `rank=10 or 20, reg_param=0.1, max_iter=10`.

## Model Evaluation

In [32]:
rank = 10
max_iter = 10
reg_param = 0.1

# Train ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
als.setRank(rank)
als.setRegParam(reg_param)
als.setMaxIter(max_iter)
als_model = als.fit(train_data)

                                                                                

In [33]:
# Evaluate ALS model on test data
als_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
als_predictions = als_model.transform(test_data)
als_rmse = als_evaluator.evaluate(als_predictions)
print(f"ALS RMSE on test data: {als_rmse}")



ALS RMSE on test data: 0.6285165153565172


                                                                                

In [155]:
# Evaluate ALS model on validation data
als_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
als_predictions = als_model.transform(validation_data)
als_rmse = als_evaluator.evaluate(als_predictions)
print(f"ALS RMSE on validation data: {als_rmse}")



ALS RMSE on validation data: 0.810838107029413


                                                                                

In [37]:
als_predictions.show()



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 95216|    148|   3.0| 2.6570385|
|297640|    148|   1.0|  2.188863|
|250352|    148|   3.0| 2.7228742|
|120043|    148|   3.5| 2.2446926|
|182465|    148|   1.0| 2.4053843|
|178823|    148|   3.0| 2.9604418|
|144866|    148|   1.0|  2.693301|
| 80238|    148|   2.0| 2.8222027|
|142220|    148|   3.0| 3.0233731|
|238584|    148|   3.0| 2.8368232|
|124271|    148|   1.0| 2.5842838|
| 25497|    148|   2.0| 2.3792486|
|184999|    148|   3.0| 3.2300584|
|275563|    148|   2.0| 2.4938354|
|279270|    148|   4.0| 2.8768306|
|180406|    148|   3.0| 2.6834228|
|170125|    148|   4.0| 3.3659523|
|316509|    148|   3.0| 1.9560167|
|316701|    148|   3.0| 2.9937894|
|190375|    148|   3.0| 3.2058291|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

In [34]:
user_matrix = als_model.userFactors.withColumnRenamed("features", "user_features")
item_matrix = als_model.itemFactors.withColumnRenamed("features", "item_features")

In [35]:
user_rows = user_matrix.collect()
item_rows = item_matrix.collect()

In [36]:
len(user_rows)

243038

In [37]:
user_dict = {}
item_dict = {}

# Iterate over each row in user_rows
for row in user_rows:
    # Extract the user ID and user features
    user_id = row.id
    user_features = row.user_features
    # Store them in the dictionary
    user_dict[user_id] = user_features

for row in item_rows:
    # Extract the user ID and user features
    item_id = row.id
    item_features = row.item_features
    # Store them in the dictionary
    item_dict[item_id] = item_features

In [38]:
print(type(user_dict[1]))

<class 'list'>


In [39]:
# predict the RMSE loss for popularity based model

# need to get <u_i, v_j> for each R_ij in the test dataset
import builtins

def dot(userId, movieId):
    user_vector = user_dict[userId]
    item_vector = item_dict[movieId]
    dot_product = builtins.sum(u * v for u, v in zip(user_vector, item_vector))
    
    return dot_product

In [40]:
test_data.show()



+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   148|    588|   3.0|
|   148|    780|   3.0|
|   148|     32|   2.0|
|   148|    318|   3.0|
|   148|    858|   5.0|
|   148|    527|   1.0|
|   148|     47|   1.0|
|   463|    296|   4.5|
|   463|      1|   4.0|
|   463|    527|   4.5|
|   463|    318|   5.0|
|   463|   4993|   4.0|
|   471|    527|   5.0|
|   471|    480|   3.0|
|   471|  79132|   5.0|
|   496|     50|   3.0|
|   496|    527|   4.0|
|   833|   2959|   4.5|
|   833|  58559|   4.5|
|   833|     47|   4.5|
+------+-------+------+
only showing top 20 rows



                                                                                

In [41]:
import math

Loss = 0
for row in test_data.collect():
    userId = row.userId
    movieId = row.movieId
    actualRating = row.rating
    # if a user has only watch one movie, it's very likely that their data is missing from the val/test set. This is not a spliting issue but a data scarcity issue.
    if userId not in user_dict or movieId not in item_dict:
        continue  # Return None to indicate that the dot product couldn't be calculated
    predictedRating = dot(userId, movieId)
    squaredError = (actualRating - predictedRating)**2
    Loss += squaredError
MSE = Loss/test_data.count()
test_RMSE = math.sqrt(MSE)

                                                                                

In [42]:
print(f"popular RMSE on test data is:{test_RMSE}")

popular RMSE on test data is:0.6198334305509302


In [43]:
Loss = 0
for row in validation_data.collect():
    userId = row.userId
    movieId = row.movieId
    actualRating = row.rating
    # if a user has only watch one movie, it's very likely that their data is missing from the val/test set. This is not a spliting issue but a data scarcity issue.
    if userId not in user_dict or movieId not in item_dict:
        continue  # Return None to indicate that the dot product couldn't be calculated
    predictedRating = dot(userId, movieId)
    squaredError = (actualRating - predictedRating)**2
    Loss += squaredError
MSE = Loss/validation_data.count()
val_RMSE = math.sqrt(MSE)

                                                                                

In [44]:
print(f"validation RMSE is:{val_RMSE}")

validation RMSE is:0.6062775661060448


## Bipartite Eval

In [45]:
predictions = sorted(als_model.transform(test_data).collect(), key=lambda r: r[0])

                                                                                

In [46]:
predictions[:10]

[Row(userId=1, movieId=4993, rating=4.0, prediction=4.245413303375244),
 Row(userId=1, movieId=1196, rating=3.5, prediction=3.963167428970337),
 Row(userId=1, movieId=260, rating=4.5, prediction=4.072638511657715),
 Row(userId=1, movieId=2028, rating=5.0, prediction=4.833735942840576),
 Row(userId=2, movieId=588, rating=4.0, prediction=4.0913801193237305),
 Row(userId=2, movieId=593, rating=5.0, prediction=4.707738399505615),
 Row(userId=2, movieId=1, rating=5.0, prediction=4.328616619110107),
 Row(userId=2, movieId=318, rating=5.0, prediction=5.000772953033447),
 Row(userId=2, movieId=110, rating=5.0, prediction=4.321163177490234),
 Row(userId=3, movieId=296, rating=5.0, prediction=4.7898173332214355)]

In [67]:
recommendations = als_model.recommendForAllUsers(numItems=100)
recommendations.show(200)



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[{1197, 3.826783}...|
|   463|[{1196, 4.5965676...|
|   471|[{858, 4.9972434}...|
|   496|[{457, 3.9838488}...|
|   833|[{318, 4.5789957}...|
|  1088|[{296, 4.2954226}...|
|  1238|[{318, 4.116899},...|
|  1342|[{58559, 4.053484...|
|  1645|[{318, 4.9192066}...|
|  1829|[{296, 4.8683267}...|
|  1959|[{6539, 3.512825}...|
|  2122|[{2959, 4.5689454...|
|  2142|[{318, 4.5706997}...|
|  2366|[{4993, 4.295436}...|
|  2659|[{260, 4.6977572}...|
|  2866|[{858, 4.95711}, ...|
|  3175|[{318, 4.6478224}...|
|  3749|[{58559, 4.078947...|
|  3794|[{318, 4.9330587}...|
|  3918|[{260, 3.811087},...|
|  4101|[{480, 3.5143492}...|
|  4900|[{590, 4.470261},...|
|  5156|[{858, 3.452961},...|
|  5518|[{1197, 4.237961}...|
|  5803|[{296, 4.9420238}...|
|  6336|[{2571, 4.553399}...|
|  6357|[{318, 4.428268},...|
|  6466|[{318, 5.091797},...|
|  6620|[{364, 2.6377027}...|
|  7240|[{2959, 3.4168527...|
|  7253|[{

                                                                                

In [72]:
cleaned_df = recommendations.select('userId', explode('recommendations').alias('recommendation')) \
               .select('userId', 'recommendation.movieId').groupBy('userId').agg(collect_list('movieId').alias('movieId'))
cleaned_df.show()



+------+--------------------+
|userId|             movieId|
+------+--------------------+
|   148|[1197, 2571, 5855...|
|   463|[1196, 260, 58559...|
|   471|[858, 58559, 296,...|
|   496|[457, 318, 1197, ...|
|   833|[318, 4993, 7153,...|
|  1088|[296, 2959, 858, ...|
|  1238|[318, 593, 858, 3...|
|  1342|[58559, 1210, 257...|
|  1645|[318, 58559, 7153...|
|  1829|[296, 608, 4226, ...|
|  1959|[6539, 79132, 276...|
|  2122|[2959, 296, 2571,...|
|  2142|[318, 58559, 2959...|
|  2366|[4993, 7153, 5952...|
|  2659|[260, 1196, 1210,...|
|  2866|[858, 318, 50, 29...|
|  3175|[318, 608, 593, 5...|
|  3749|[58559, 2571, 422...|
|  3794|[318, 2571, 296, ...|
|  3918|[260, 1196, 1198,...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [73]:
cleaned_df.where(cleaned_df.userId == 148).select("movieId").collect()

                                                                                

[Row(movieId=[1197, 2571, 58559, 1196, 260, 1210, 1198, 50, 79132, 4226, 296, 1, 2858, 589, 3578, 1704, 32, 4306, 608, 858, 2959, 2028, 6539, 4993, 150, 780, 7153, 5952, 110, 527, 592, 318, 1270, 2762, 380, 457, 588, 47, 480, 364, 593, 356, 1721, 590])]

In [74]:
from pyspark.sql.functions import collect_list

# Group by userId and collect movieIds into a list
ground_truth_train = train_data.groupBy("userId").agg(collect_list("movieId").alias("items"))
ground_truth_validation = validation_data.groupBy("userId").agg(collect_list("movieId").alias("items"))
ground_truth_test = test_data.groupBy("userId").agg(collect_list("movieId").alias("items"))

# Show the ground truth data
ground_truth_test.show()



+------+--------------------+
|userId|               items|
+------+--------------------+
|   148|[2858, 260, 457, ...|
|   463|[1, 318, 2959, 85...|
|   471|  [2959, 79132, 260]|
|   496|          [110, 150]|
|   833|[1196, 2959, 110,...|
|  1088|[592, 364, 858, 1...|
|  1238|           [380, 47]|
|  1342|[58559, 1721, 595...|
|  1645|[1196, 1, 2959, 4...|
|  1829|     [457, 296, 480]|
|  1959|[58559, 6539, 499...|
|  2122|          [47, 2571]|
|  2142|[592, 589, 457, 1...|
|  2366|[2028, 2959, 110,...|
|  2659|         [2959, 260]|
|  2866|    [527, 2959, 593]|
|  3175|    [2959, 110, 588]|
|  3749|[593, 364, 858, 1...|
|  3794|        [2571, 2959]|
|  3918|[589, 1198, 1196,...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [50]:
ground_truth_train.where(ground_truth_train.userId == 148).select("items").collect()

                                                                                

[Row(items=[592, 527, 1197, 364, 608, 780, 1270, 1196, 2858, 356, 480, 318, 50, 150, 2028, 32, 296, 1198, 858, 590, 1704, 593, 1])]

In [77]:
sdf = predictionAndLabels_validation = cleaned_df.join(ground_truth_validation, cleaned_df.userId == ground_truth_validation.userId, "inner") \
     .select(cleaned_df.movieId, ground_truth_validation.items)
sdf.show()



+--------------------+--------------------+
|             movieId|               items|
+--------------------+--------------------+
|[1197, 2571, 5855...|   [527, 2028, 2762]|
|[1196, 260, 58559...|         [858, 2858]|
|[858, 58559, 296,...|              [1270]|
|[457, 318, 1197, ...|               [592]|
|[318, 4993, 7153,...|              [2571]|
|[296, 2959, 858, ...|[1704, 1197, 457,...|
|[58559, 1210, 257...|              [2959]|
|[318, 58559, 7153...|        [110, 58559]|
|[6539, 79132, 276...|              [1198]|
|[318, 58559, 2959...|   [608, 1721, 2959]|
|[4993, 7153, 5952...|              [2571]|
|[318, 608, 593, 5...|               [380]|
|[58559, 2571, 422...|  [1197, 7153, 2571]|
|[260, 1196, 1198,...|         [2959, 296]|
|[1197, 296, 608, ...|   [4226, 2762, 858]|
|[296, 2571, 318, ...|             [79132]|
|[2571, 1196, 260,...|                [47]|
|[318, 7153, 4993,...|          [364, 110]|
|[364, 4993, 588, ...|             [79132]|
|[2959, 58559, 257...|         [

                                                                                

In [79]:
predictionAndLabels_validation = cleaned_df.join(ground_truth_validation, cleaned_df.userId == ground_truth_validation.userId, "inner") \
     .select(cleaned_df.movieId, ground_truth_validation.items) \
     .rdd

predictionAndLabels_test =  cleaned_df.join(ground_truth_test, cleaned_df.userId == ground_truth_test.userId, "inner") \
     .select(cleaned_df.movieId, ground_truth_test.items) \
     .rdd

predictionAndLabels_train =  cleaned_df.join(ground_truth_train, cleaned_df.userId == ground_truth_train.userId, "inner") \
     .select(cleaned_df.movieId, ground_truth_train.items) \
     .rdd

In [52]:
# from pyspark.mllib.evaluation import RankingMetrics

# # Create a RankingMetrics object
# metrics_train = RankingMetrics(predictionAndLabels_train)

# # Calculate Mean Average Precision (MAP) for validation and test sets
# map_train = metrics_train.meanAveragePrecisionAt(100)

# print("Mean Average Precision (MAP) at 100 for train set:", map_train)



Mean Average Precision (MAP) at 100 for train set: 0.0


                                                                                

In [80]:
from pyspark.mllib.evaluation import RankingMetrics

# Create a RankingMetrics object
metrics_validation = RankingMetrics(predictionAndLabels_validation)
metrics_test = RankingMetrics(predictionAndLabels_test)

# Calculate Mean Average Precision (MAP) for validation and test sets
map_validation = metrics_validation.meanAveragePrecisionAt(100)
map_test = metrics_test.meanAveragePrecisionAt(100)

# map_validation = metrics_validation.ndcgAt(100)
# map_test = metrics_test.ndcgAt(100)


print("Mean Average Precision (MAP) for validation set:", map_validation)
print("Mean Average Precision (MAP) for test set:", map_test)



Mean Average Precision (MAP) for validation set: 0.15688806723860668
Mean Average Precision (MAP) for test set: 0.29139083007708294


                                                                                

In [82]:
popular100Ids = popular_100.select('movieId')

# Collect the movieIds into a list
popular100Ids = [row['movieId'] for row in popular100Ids.collect()]

# Display the list of movieIds
print(popular100Ids)




[318, 356, 296, 2571, 593, 260, 2959, 527, 1196, 4993, 858, 50, 7153, 1198, 480, 1210, 110, 5952, 1, 2858, 589, 58559, 79132, 47, 1270, 2028, 608, 457, 150, 2762, 3578, 32, 4226, 1704, 4306, 780, 1197, 1193, 588, 364, 1221, 590, 1136, 541, 1291, 6539, 592, 1240, 1214, 7361, 1265, 1036, 4973, 1213, 4886, 1089, 380, 6377, 6874, 1580, 293, 1682, 1721, 377, 4995, 33794, 2329, 60069, 109487, 595, 3147, 8961, 1097, 648, 1258, 4963, 1200, 1527, 111, 5989, 68954, 165, 1206, 5418, 48516, 1732, 3793, 5618, 5445, 733, 2997, 2716, 924, 778, 7438, 59315, 912, 68157, 1617, 4878]


                                                                                

In [124]:
movie_ids_to_replicate = [[[318, 356, 296, 2858, 260, 457]]] * 271351  # Adjust 20 to match the length of ground_truth DataFrame
movie_ids_replicated = spark.createDataFrame(movie_ids_to_replicate, ["items"])

In [154]:
user_ids_list = [row['userId'] for row in ground_truth_validation.select('userId').collect()]

                                                                                

In [155]:
user_ids_list

[148,
 463,
 471,
 496,
 833,
 1088,
 1342,
 1645,
 1959,
 2142,
 2366,
 3175,
 3749,
 3918,
 5518,
 5803,
 6336,
 6466,
 6620,
 7240,
 7253,
 7340,
 7754,
 7982,
 7993,
 8389,
 9465,
 9852,
 10362,
 11033,
 11317,
 11458,
 12027,
 13285,
 15727,
 15957,
 17753,
 18024,
 18498,
 18654,
 19984,
 20135,
 20683,
 20924,
 21220,
 24347,
 25462,
 25591,
 26755,
 27760,
 28088,
 28577,
 28664,
 29054,
 29194,
 29894,
 30903,
 31035,
 31236,
 31261,
 31367,
 31528,
 32460,
 32539,
 32592,
 32855,
 33375,
 33412,
 34239,
 34759,
 35351,
 35361,
 36525,
 36538,
 37146,
 37251,
 37489,
 38153,
 38311,
 38422,
 39285,
 39432,
 40011,
 40107,
 40383,
 42834,
 43103,
 43714,
 43852,
 45307,
 46266,
 46943,
 46952,
 47217,
 47283,
 48254,
 48398,
 49331,
 49717,
 49855,
 50353,
 51123,
 53565,
 53963,
 55155,
 55869,
 56054,
 56110,
 57370,
 57380,
 57693,
 57984,
 58797,
 58835,
 59384,
 59990,
 61051,
 62680,
 62985,
 63271,
 63574,
 63645,
 63964,
 64628,
 66010,
 67294,
 67753,
 67861,
 68090,
 

In [122]:
movie_ids_df = spark.createDataFrame([(popular100Ids,)], ['popular100Ids'])
movie_ids_df.show()

+--------------------+
|       popular100Ids|
+--------------------+
|[318, 356, 296, 2...|
+--------------------+



In [142]:
type(user_ids_list)

list

In [156]:
import pandas as pd

data = [{'userId': user_id, 'items': popular100Ids} for user_id in user_ids_list]

# Create a DataFrame from the list of dictionaries
pandas_df = pd.DataFrame(data)

In [157]:
popular_df = spark.createDataFrame(pandas_df)

In [158]:
popular_df.show()

+------+--------------------+
|userId|               items|
+------+--------------------+
|   148|[318, 356, 296, 2...|
|   463|[318, 356, 296, 2...|
|   471|[318, 356, 296, 2...|
|   496|[318, 356, 296, 2...|
|   833|[318, 356, 296, 2...|
|  1088|[318, 356, 296, 2...|
|  1342|[318, 356, 296, 2...|
|  1645|[318, 356, 296, 2...|
|  1959|[318, 356, 296, 2...|
|  2142|[318, 356, 296, 2...|
|  2366|[318, 356, 296, 2...|
|  3175|[318, 356, 296, 2...|
|  3749|[318, 356, 296, 2...|
|  3918|[318, 356, 296, 2...|
|  5518|[318, 356, 296, 2...|
|  5803|[318, 356, 296, 2...|
|  6336|[318, 356, 296, 2...|
|  6466|[318, 356, 296, 2...|
|  6620|[318, 356, 296, 2...|
|  7240|[318, 356, 296, 2...|
+------+--------------------+
only showing top 20 rows



In [159]:
# predictionAndLabels_popularity_validation = popular_df.join(ground_truth_validation, popular_df.userId == ground_truth_validation.userId, "inner") \
#      .select(popular_df.items, ground_truth_validation.items) \
#      .rdd

predictionAndLabels_popularity_test =  popular_df.join(ground_truth_test, popular_df.userId == ground_truth_test.userId, "inner") \
     .select(popular_df.items, ground_truth_test.items) \
     .rdd

In [160]:
# Create a RankingMetrics object
# metrics_validation_popular = RankingMetrics(predictionAndLabels_popularity_validation)
metrics_test_popular = RankingMetrics(predictionAndLabels_popularity_test)

# Calculate Mean Average Precision (MAP) for validation and test sets
# map_validation = metrics_validation_popular.meanAveragePrecisionAt(100)
map_test = metrics_test_popular.meanAveragePrecisionAt(100)

# map_validation = metrics_validation.ndcgAt(100)
# map_test = metrics_test.ndcgAt(100)


# print("Mean Average Precision (MAP) for validation set:", map_validation)
print("Mean Average Precision (MAP) for test set:", map_test)



Mean Average Precision (MAP) for test set: 0.23233645525437063


                                                                                