In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

def load_and_preprocess_data_balanced():
    # Load data
    movies = pd.read_csv('data/movies.csv')
    ratings = pd.read_csv('data/ratings.csv')
    # Merge datasets on movieId
    merged_data = pd.merge(ratings, movies, on='movieId')
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for user_id in merged_data['userId'].unique():
        user_data = merged_data[merged_data['userId'] == user_id]
        # Adjust split ratio if number of ratings is odd
        # Favor training set by allocating the extra review to it
        split_ratio = 0.5 if len(user_data) % 2 == 0 else (len(user_data) // 2 + 1) / len(user_data)
        user_train, user_test = train_test_split(user_data, test_size=1-split_ratio, shuffle=True)
        train_data = pd.concat([train_data, user_train])
        test_data = pd.concat([test_data, user_test])
    return merged_data, train_data, test_data

In [6]:
data1, train_data_balanced, test_data_balanced = load_and_preprocess_data_balanced()

In [7]:
train_data_balanced

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
12495,1,2528,3.0,964982328,Logan's Run (1976),Action|Adventure|Sci-Fi
1365,1,231,5.0,964981179,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
15190,1,3176,1.0,964983504,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
3892,1,553,5.0,964984153,Tombstone (1993),Action|Drama|Western
4310,1,593,4.0,964983793,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
...,...,...,...,...,...,...
42879,578,81845,5.0,1300996688,"King's Speech, The (2010)",Drama
91071,578,78316,4.0,1300991370,Letters to Juliet (2010),Drama|Romance
99955,578,56389,4.0,1300996756,My Blueberry Nights (2007),Drama|Romance
42739,578,73017,5.0,1300996791,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller


In [8]:
train_data_balanced.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,50539.0,50539.0,50539.0,50539.0
mean,326.100754,19421.092048,3.50839,1206035000.0
std,182.606379,35486.749046,1.043505,216210600.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1198.0,3.0,1019125000.0
50%,325.0,2990.0,3.5,1186088000.0
75%,477.0,8138.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537675000.0


In [9]:
test_data_balanced

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
12236,1,2414,3.0,964982513,Young Sherlock Holmes (1985),Action|Adventure|Children|Fantasy|Mystery|Thri...
12101,1,2387,5.0,964983123,Very Bad Things (1998),Comedy|Crime
12642,1,2571,5.0,964981888,"Matrix, The (1999)",Action|Sci-Fi|Thriller
11243,1,2105,4.0,964981725,Tron (1982),Action|Adventure|Sci-Fi
5764,1,1024,5.0,964982876,"Three Caballeros, The (1945)",Animation|Children|Musical
...,...,...,...,...,...,...
86401,578,5092,3.5,1300990600,Big Fat Liar (2002),Children|Comedy
94654,578,71823,4.5,1300996741,"New York, I Love You (2009)",Drama|Romance
85986,578,3155,4.5,1300989860,Anna and the King (1999),Drama|Romance
98345,578,52668,4.0,1300991167,In the Land of Women (2007),Comedy|Drama|Romance


# Sparsity of the data

In [112]:
def calculate_sparsity(dataframe, user_col='userId', item_col='movieId'):
    # Count the total number of possible interactions (num_users * num_items)
    num_users = dataframe[user_col].nunique()
    num_items = dataframe[item_col].nunique()
    total_possible_interactions = num_users * num_items

    # Count the number of actual interactions
    num_actual_interactions = dataframe.shape[0]

    # Calculate sparsity
    sparsity = 1 - (num_actual_interactions / total_possible_interactions)
    return sparsity

# Assuming train_data_balanced and test_data_balanced are your train and test datasets
train_sparsity = calculate_sparsity(train_data_balanced)
test_sparsity = calculate_sparsity(test_data_balanced)

print(f"Train dataset sparsity: {train_sparsity:.4f}")
print(f"Test dataset sparsity: {test_sparsity:.4f}")


Train dataset sparsity: 0.9890
Test dataset sparsity: 0.9891


In [10]:
# Verify the split for a few users
for user_id in train_data_balanced['userId'].unique()[:5]:  # Check for the first 5 users
    train_reviews = train_data_balanced[train_data_balanced['userId'] == user_id]
    test_reviews = test_data_balanced[test_data_balanced['userId'] == user_id]
    diff = abs(len(train_reviews) - len(test_reviews))
    assert diff <= 1, f"User {user_id} does not have an equal or nearly equal split"
    print(f"User {user_id}: Train reviews = {len(train_reviews)}, Test reviews = {len(test_reviews)}")

User 1: Train reviews = 116, Test reviews = 116
User 5: Train reviews = 22, Test reviews = 22
User 7: Train reviews = 76, Test reviews = 76
User 15: Train reviews = 68, Test reviews = 67
User 17: Train reviews = 53, Test reviews = 52


In [11]:
!pip install pyspark



In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

In [13]:
# Initialize Spark Session
spark = SparkSession.builder.appName("MovieRecommender").getOrCreate()

# Convert the training DataFrame to a Spark DataFrame
train_data_spark = spark.createDataFrame(train_data_balanced)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/04 07:43:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [111]:
# check sparsity of train_data_spark


                                                                                

In [14]:
test_data_spark = spark.createDataFrame(test_data_balanced)

In [15]:
# ALS model parameters
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)

In [16]:
# Train the ALS model
model = als.fit(train_data_spark)

24/02/04 07:43:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [17]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")

In [18]:
# Make predictions and print the RMSE of the ALS model
predictions=model.transform(test_data_spark)
rmse=evaluator.evaluate(predictions)
print("New RMSE: ", evaluator.evaluate(model.transform(test_data_spark)))

New RMSE:  0.9157571614881965


In [19]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

In [20]:
# Now we try to improve the performance of the original model using cross validation and solve the cold-start problem.
# we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
model1 = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, coldStartStrategy="drop")

In [21]:
#For Parameter tuning of the ALS model we use ParamGridBuilder function
#We tune two parameters
#1. The Regularization parameter ranging from 0.1, 0.01, 0.001, 0.0001
#2. The rank for matrix factorization
paramGrid = ParamGridBuilder() \
    .addGrid(model1.regParam, [0.1, 0.05, 0.01, 0.001]) \
    .addGrid(model1.rank, [5, 10, 20, 30]) \
    .build()

In [22]:
#Defining a cross-validator object
#Setting up CV and adding parameters. We will be performing a 5 fold CV
crossvalidation = CrossValidator(estimator = model1,
                     estimatorParamMaps = paramGrid,
                     evaluator = evaluator,
                     numFolds=5)

In [23]:
# Run cross-validation, and choose the best set of parameters.
Best_model = crossvalidation.fit(train_data_spark).bestModel

                                                                                

In [24]:
#The Best_model
print(type(Best_model))
#Complete the code below to extract the ALS model parameters
print("**Best Model**")
#Rank
print("Rank: ", Best_model._java_obj.parent().getRank())
#MaxIter
print("MaxIter: ", Best_model._java_obj.parent().getMaxIter())
#RegParam
print("RegParam: ", Best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
Rank:  30
MaxIter:  10
RegParam:  0.1


In [25]:
# Calculate the RMSE on test data using the best set of parameters obtained after cross validation
print("Best RMSE value is: ", evaluator.evaluate(Best_model.transform(test_data_spark)))

Best RMSE value is:  0.9191989489219747


In [26]:
pred = Best_model.transform(test_data_spark)
pred.show(10)

+------+-------+------+----------+--------------------+--------------------+----------+
|userId|movieId|rating| timestamp|               title|              genres|prediction|
+------+-------+------+----------+--------------------+--------------------+----------+
|    27|   2142|   3.0| 962685569|American Tail: Fi...|Adventure|Animati...| 1.8742591|
|    44|    833|   2.0| 869252237|High School High ...|              Comedy| 2.3530664|
|    91|   1645|   3.0|1112712216|The Devil's Advoc...|Drama|Mystery|Thr...| 3.1277187|
|    91|    471|   1.0|1112713817|Hudsucker Proxy, ...|              Comedy| 3.1096866|
|    93|   1591|   4.0| 942946677|        Spawn (1997)|Action|Adventure|...|  3.285619|
|   132|  44022|   2.0|1161799379|Ice Age 2: The Me...|Adventure|Animati...| 2.8572009|
|    57|    471|   3.0| 969753604|Hudsucker Proxy, ...|              Comedy| 3.6794982|
|    96|   3175|   5.0| 964774067| Galaxy Quest (1999)|Adventure|Comedy|...| 3.9004605|
|    19|   1342|   2.0| 96570495

In [32]:
pred.show()

                                                                                

+------+-------+------+----------+--------------------+--------------------+----------+
|userId|movieId|rating| timestamp|               title|              genres|prediction|
+------+-------+------+----------+--------------------+--------------------+----------+
|    27|   2142|   3.0| 962685569|American Tail: Fi...|Adventure|Animati...| 1.8742591|
|    44|    833|   2.0| 869252237|High School High ...|              Comedy| 2.3530664|
|    91|   1645|   3.0|1112712216|The Devil's Advoc...|Drama|Mystery|Thr...| 3.1277187|
|    91|    471|   1.0|1112713817|Hudsucker Proxy, ...|              Comedy| 3.1096866|
|    93|   1591|   4.0| 942946677|        Spawn (1997)|Action|Adventure|...|  3.285619|
|   132|  44022|   2.0|1161799379|Ice Age 2: The Me...|Adventure|Animati...| 2.8572009|
|    57|    471|   3.0| 969753604|Hudsucker Proxy, ...|              Comedy| 3.6794982|
|    96|   3175|   5.0| 964774067| Galaxy Quest (1999)|Adventure|Comedy|...| 3.9004605|
|    19|   1342|   2.0| 96570495

In [117]:
pred_pandas = pred.toPandas()

pred_pandas

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,prediction
0,27,2142,3.0,962685569,"American Tail: Fievel Goes West, An (1991)",Adventure|Animation|Children|Musical|Western,1.874259
1,44,833,2.0,869252237,High School High (1996),Comedy,2.353066
2,91,1645,3.0,1112712216,The Devil's Advocate (1997),Drama|Mystery|Thriller,3.127719
3,91,471,1.0,1112713817,"Hudsucker Proxy, The (1994)",Comedy,3.109687
4,93,1591,4.0,942946677,Spawn (1997),Action|Adventure|Sci-Fi|Thriller,3.285619
...,...,...,...,...,...,...,...
47375,143,2706,4.5,1444768580,American Pie (1999),Comedy|Romance,2.301104
47376,95,3053,3.0,1043340086,"Messenger: The Story of Joan of Arc, The (1999)",Drama|War,4.161676
47377,506,56367,4.0,1424487733,Juno (2007),Comedy|Drama|Romance,2.698774
47378,311,2706,0.5,1057854230,American Pie (1999),Comedy|Romance,1.912456


In [74]:
# Cap predictions at 5
pred_pandas['prediction'] = pred_pandas['prediction'].apply(lambda x: min(x, 5.0))


In [75]:
pred_pandas.to_excel('predictions.xlsx', index=False)
train_data_balanced.to_excel('train_data_balanced.xlsx', index=False)

## 1. Movies from the training data set for that user and how that user rated them.

In [76]:
train_data_balanced

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
12495,1,2528,3.0,964982328,Logan's Run (1976),Action|Adventure|Sci-Fi
1365,1,231,5.0,964981179,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
15190,1,3176,1.0,964983504,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
3892,1,553,5.0,964984153,Tombstone (1993),Action|Drama|Western
4310,1,593,4.0,964983793,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
...,...,...,...,...,...,...
42879,578,81845,5.0,1300996688,"King's Speech, The (2010)",Drama
91071,578,78316,4.0,1300991370,Letters to Juliet (2010),Drama|Romance
99955,578,56389,4.0,1300996756,My Blueberry Nights (2007),Drama|Romance
42739,578,73017,5.0,1300996791,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller


In [77]:
train_data_balanced[train_data_balanced['userId'] == 1].sort_values(by='rating', ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
15656,1,3578,5.0,964980668,Gladiator (2000),Action|Adventure|Drama
15985,1,3703,5.0,964981909,"Road Warrior, The (Mad Max 2) (1981)",Action|Adventure|Sci-Fi|Thriller
12247,1,2427,5.0,964982242,"Thin Red Line, The (1998)",Action|Drama|War
10404,1,1927,5.0,964981497,All Quiet on the Western Front (1930),Action|Drama|War
14854,1,3033,5.0,964983762,Spaceballs (1987),Comedy|Sci-Fi
...,...,...,...,...,...,...
5742,1,1009,3.0,964981775,Escape to Witch Mountain (1975),Adventure|Children|Fantasy
5830,1,1030,3.0,964982903,Pete's Dragon (1977),Adventure|Animation|Children|Musical
15254,1,3247,3.0,964983108,Sister Act (1992),Comedy|Crime
12495,1,2528,3.0,964982328,Logan's Run (1976),Action|Adventure|Sci-Fi


# 2.The top 5 movie recommendations and predicted rating for that user

In [79]:
pred_pandas[pred_pandas['userId'] == 1].sort_values(by='prediction', ascending=False).head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,prediction
18784,1,940,5.0,964982176,"Adventures of Robin Hood, The (1938)",Action|Adventure|Romance,5.0
3369,1,1208,4.0,964983250,Apocalypse Now (1979),Action|Drama|War,4.960655
8419,1,2571,5.0,964981888,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.940359
20109,1,1080,5.0,964981327,Monty Python's Life of Brian (1979),Comedy,4.934304
2835,1,1210,5.0,964980499,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.887311


## 3. From the movies that the user did rate in the test data set show statistics on how well the model predicted the users rating of the movie. Show sum error statistics, false positives, false negatives, accuracy statistics etc. 

In [81]:
pred_pandas[pred_pandas['userId'] == 1].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,prediction
18784,1,940,5.0,964982176,"Adventures of Robin Hood, The (1938)",Action|Adventure|Romance,5.000000
3369,1,1208,4.0,964983250,Apocalypse Now (1979),Action|Drama|War,4.960655
8419,1,2571,5.0,964981888,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.940359
20109,1,1080,5.0,964981327,Monty Python's Life of Brian (1979),Comedy,4.934304
2835,1,1210,5.0,964980499,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,4.887311
...,...,...,...,...,...,...,...
9073,1,3440,4.0,964981799,Teenage Mutant Ninja Turtles III (1993),Action|Adventure|Children|Comedy|Fantasy,2.171772
22072,1,1445,3.0,964984112,McHale's Navy (1997),Comedy|War,2.089249
16162,1,2899,5.0,964982703,Gulliver's Travels (1939),Adventure|Animation|Children,1.734506
17805,1,2389,2.0,964983094,Psycho (1998),Crime|Horror|Thriller,1.447344


In [106]:
# change pred_pandas column prediction name to ALS_predicted

pred_pandas.rename(columns={'prediction': 'ALS_predicted'}, inplace=True)

pred_pandas

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,ALS_predicted
0,27,2142,3.0,962685569,"American Tail: Fievel Goes West, An (1991)",Adventure|Animation|Children|Musical|Western,1.874259
1,44,833,2.0,869252237,High School High (1996),Comedy,2.353066
2,91,1645,3.0,1112712216,The Devil's Advocate (1997),Drama|Mystery|Thriller,3.127719
3,91,471,1.0,1112713817,"Hudsucker Proxy, The (1994)",Comedy,3.109687
4,93,1591,4.0,942946677,Spawn (1997),Action|Adventure|Sci-Fi|Thriller,3.285619
...,...,...,...,...,...,...,...
47375,143,2706,4.5,1444768580,American Pie (1999),Comedy|Romance,2.301104
47376,95,3053,3.0,1043340086,"Messenger: The Story of Joan of Arc, The (1999)",Drama|War,4.161676
47377,506,56367,4.0,1424487733,Juno (2007),Comedy|Drama|Romance,2.698774
47378,311,2706,0.5,1057854230,American Pie (1999),Comedy|Romance,1.912456


In [119]:
train_data_balanced[train_data_balanced['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
12495,1,2528,3.0,964982328,Logan's Run (1976),Action|Adventure|Sci-Fi
1365,1,231,5.0,964981179,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
15190,1,3176,1.0,964983504,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller
3892,1,553,5.0,964984153,Tombstone (1993),Action|Drama|Western
4310,1,593,4.0,964983793,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
...,...,...,...,...,...,...
11157,1,2090,5.0,964982838,"Rescuers, The (1977)",Adventure|Animation|Children|Crime|Drama
8294,1,1224,5.0,964984018,Henry V (1989),Action|Drama|Romance|War
12519,1,2529,5.0,964982242,Planet of the Apes (1968),Action|Drama|Sci-Fi
11623,1,2193,4.0,964981710,Willow (1988),Action|Adventure|Fantasy


In [116]:
import pandas as pd
import numpy as np

user_id = 1  # Example user ID

# 1. Movies from the training dataset for the user and their ratings
print("Movies rated by User ID {} in the training data:".format(user_id))
user_train_ratings = train_data_balanced[train_data_balanced['userId'] == user_id]
print(user_train_ratings[['movieId', 'title', 'rating']].sort_values(by='rating', ascending=False).head())

# 2. The top 5 movie recommendations and predicted rating for that user
print("\nTop 5 movie recommendations for User ID {}:".format(user_id))
user_recommendations = pred_pandas[pred_pandas['userId'] == user_id].nlargest(5, 'ALS_predicted')
print(user_recommendations[['movieId', 'title', 'ALS_predicted']])

# 3. Evaluation: Compare actual and predicted ratings for movies the user has rated
user_actual_vs_predicted = pred_pandas[pred_pandas['userId'] == user_id]
user_actual_vs_predicted['error'] = np.abs(user_actual_vs_predicted['rating'] - user_actual_vs_predicted['ALS_predicted'])

# Error statistics
sum_error = user_actual_vs_predicted['error'].sum()
mean_absolute_error = user_actual_vs_predicted['error'].mean()
rmse = np.sqrt((user_actual_vs_predicted['error']**2).mean())

print("\nEvaluation Metrics for User ID {}:".format(user_id))
print("Sum of Errors: {:.2f}".format(sum_error))
print("Mean Absolute Error (MAE): {:.2f}".format(mean_absolute_error))
print("Root Mean Square Error (RMSE): {:.2f}".format(rmse))

# Since false positives and false negatives don't directly apply to regression tasks,
# we can consider setting a threshold for what we consider an acceptable prediction error,
# and then calculate metrics that could resemble "false" predictions based on that threshold.

significant_error_threshold = 1.5  #  threshold for significant prediction error
significant_errors = user_actual_vs_predicted[user_actual_vs_predicted['error'] > significant_error_threshold]
false_positives = significant_errors[significant_errors['ALS_predicted'] > significant_errors['rating']].shape[0]
false_negatives = significant_errors[significant_errors['ALS_predicted'] < significant_errors['rating']].shape[0]
# Calculate True Positives (TP): Predictions within the significant error threshold
true_positives = user_actual_vs_predicted[user_actual_vs_predicted['error'] <= significant_error_threshold].shape[0]

# Note: In this context, we're only considering predictions close to actual ratings as "accurate."
total_predictions = user_actual_vs_predicted.shape[0]
accuracy = true_positives / total_predictions if total_predictions else 0


print("Number of Significant Errors (>|{}|): {}".format(significant_error_threshold, significant_errors.shape[0]))
print("False Positives (Prediction > Actual by >|{}|): {}".format(significant_error_threshold, false_positives))
print("False Negatives (Prediction < Actual by >|{}|): {}".format(significant_error_threshold, false_negatives))
print("True Positives (Prediction within |{}| of Actual): {}".format(significant_error_threshold, true_positives))

print("Accuracy (% of Predictions Within Error Threshold): {:.2%}".format(accuracy))


Movies rated by User ID 1 in the training data:
       movieId                                  title  rating
15656     3578                       Gladiator (2000)     5.0
15985     3703   Road Warrior, The (Mad Max 2) (1981)     5.0
12247     2427              Thin Red Line, The (1998)     5.0
10404     1927  All Quiet on the Western Front (1930)     5.0
14854     3033                      Spaceballs (1987)     5.0

Top 5 movie recommendations for User ID 1:
       movieId                                              title  \
18784      940               Adventures of Robin Hood, The (1938)   
3369      1208                              Apocalypse Now (1979)   
8419      2571                                 Matrix, The (1999)   
20109     1080                Monty Python's Life of Brian (1979)   
2835      1210  Star Wars: Episode VI - Return of the Jedi (1983)   

       ALS_predicted  
18784       5.000000  
3369        4.960655  
8419        4.940359  
20109       4.934304  
2835  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_actual_vs_predicted['error'] = np.abs(user_actual_vs_predicted['rating'] - user_actual_vs_predicted['ALS_predicted'])


In [118]:
# Assuming `Best_model` is your trained ALS model
model_path = "als_model/"
Best_model.write().overwrite().save(model_path)


24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/02/04 15:13:27 WARN MemoryManager: Total allocation exceeds 95.00%

# KNN

In [101]:
# Step 1: Preprocess Data

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Step 1: Prepare User-Item Matrix from train_data_balanced
# Convert train_data_balanced to a user-item matrix for KNN training
user_item_matrix = train_data_balanced.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)


# Convert to sparse matrix
sparse_user_item_matrix = csr_matrix(user_item_matrix.values)


In [102]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,190183,190207,190209,190219,190221,193567,193571,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
608,2.5,2.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0.0,0,0,0,0,0,0.0,0.0,0


In [104]:
# Step 2: Build KNN Model

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(sparse_user_item_matrix)


In [85]:
# Step 3: Generate Recommendations

def get_recommendations(user_id, user_item_matrix, n_recommendations=5):
    # Find the index of the user_id in the user_item_matrix
    user_index = user_id - 1  # Adjusting for 0-based index
    
    # Find neighbors
    distances, indices = model_knn.kneighbors(user_item_matrix.iloc[user_index:user_index+1], n_neighbors=20)
    
    # Flatten the indices and distances arrays
    indices = indices.flatten()
    distances = distances.flatten()
    
    # Aggregate the ratings of the neighbors for each movie
    agg_ratings = pd.Series(dtype=float)
    for i, user_neighbor_index in enumerate(indices):
        if distances[i] > 0:  # Ignore itself (distance 0)
            user_ratings = user_item_matrix.iloc[user_neighbor_index]
            agg_ratings = agg_ratings.add(user_ratings, fill_value=0)
    
    # Filter out movies the user has already rated
    rated_movies = user_item_matrix.iloc[user_index] > 0
    recommendations = agg_ratings[~rated_movies].nlargest(n_recommendations)
    
    # Get movie titles
    movie_titles = movies.set_index('movieId').loc[recommendations.index]['title']
    return movie_titles

# Example: Get recommendations for user ID 1
print(get_recommendations(1, user_item_matrix))


movieId
589     Terminator 2: Judgment Day (1991)
1200                        Aliens (1986)
2762              Sixth Sense, The (1999)
858                 Godfather, The (1972)
1387                          Jaws (1975)
Name: title, dtype: object


In [90]:
def get_recommendations_with_estimated_ratings(user_id, user_item_matrix, n_recommendations=5):
    # Get the user index
    user_index = user_id - 1  # Adjusting for 0-based indexing
    
    # Find the 20 nearest neighbors
    distances, indices = model_knn.kneighbors(user_item_matrix.iloc[user_index:user_index+1], n_neighbors=20)
    
    # Flatten the arrays
    indices = indices.flatten()
    distances = distances.flatten()
    
    # Inverse of distances to use as similarity scores
    similarities = 1 - distances
    
    # Dict to hold potential recommendations and their weighted ratings
    recommendations = {}
    
    for i, user_neighbor_index in enumerate(indices[1:]):  # skip the first one because it's the user itself
        neighbor_ratings = user_item_matrix.iloc[user_neighbor_index]
        for movieId, rating in neighbor_ratings.items():
            if movieId not in user_item_matrix.columns[user_item_matrix.iloc[user_index] > 0]:  # User hasn't rated
                if movieId not in recommendations:
                    recommendations[movieId] = [rating * similarities[i], similarities[i]]
                else:
                    recommendations[movieId][0] += rating * similarities[i]
                    recommendations[movieId][1] += similarities[i]
    
    # Calculate the weighted average to estimate ratings
    estimated_ratings = {movieId: total / sim_sum for movieId, (total, sim_sum) in recommendations.items()}
    
    # Sort by estimated rating
    recommended_movies = sorted(estimated_ratings.items(), key=lambda x: x[1], reverse=True)[:n_recommendations]
    
    # Get movie titles and return results
    movie_titles = movies.set_index('movieId').loc[[movie[0] for movie in recommended_movies]]
    movie_titles['estimated_rating'] = [movie[1] for movie in recommended_movies]
    return movie_titles[['title', 'estimated_rating']]

# Example usage
user_id = 1
print(get_recommendations_with_estimated_ratings(user_id, user_item_matrix))


                                     title  estimated_rating
movieId                                                     
589      Terminator 2: Judgment Day (1991)          3.935203
1200                         Aliens (1986)          3.926345
1610      Hunt for Red October, The (1990)          3.499134
1036                       Die Hard (1988)          3.485055
2762               Sixth Sense, The (1999)          3.385030


In [105]:
def predict_ratings_knn(user_id, movie_ids, user_item_matrix, model_knn):
    """
    Predict ratings for specific movie_ids for a given user.
    """
    user_index = user_id - 1  # Adjusting for 0-based indexing
    predictions = {}

    # Get distances and indices of k-nearest neighbors
    distances, indices = model_knn.kneighbors(user_item_matrix.iloc[user_index:user_index+1], n_neighbors=20)
    indices = indices.flatten()
    distances = distances.flatten()
    similarities = 1 - distances  # Convert distances to similarities

    # Iterate through each movie_id to predict its rating
    for movie_id in movie_ids:
        weighted_sum = 0
        similarity_sum = 0
        for i, neighbor_index in enumerate(indices[1:]):  # Skip the first one (it's the user itself)
            neighbor_similarity = similarities[i]
            # Check if the neighbor has rated the movie
            if movie_id in user_item_matrix.columns:
                neighbor_rating = user_item_matrix.iloc[neighbor_index][movie_id]
                if neighbor_rating > 0:  # Neighbor has rated the movie
                    weighted_sum += neighbor_rating * neighbor_similarity
                    similarity_sum += neighbor_similarity
        
        # Calculate the predicted rating
        predicted_rating = weighted_sum / similarity_sum if similarity_sum != 0 else 0
        predictions[movie_id] = predicted_rating

    return predictions


In [109]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def calculate_rmse(actual_ratings, predicted_ratings):
    mse = mean_squared_error(actual_ratings, predicted_ratings)
    return sqrt(mse)

# Assuming `pred_pandas` contains ALS predictions and actual ratings
# Let's add a column for KNN predictions
pred_pandas['KNN_predicted'] = 0.0
index = 0
for index, row in pred_pandas.iterrows():
    # show how many rows have been processed with regards to total rows
    if index % 100 == 0:
        print(f"Processed {index} rows out of {len(pred_pandas)}")

    user_id = row['userId']
    movie_id = row['movieId']
    # Predict rating using KNN
    knn_predictions = predict_ratings_knn(user_id, [movie_id], user_item_matrix, model_knn)
    pred_pandas.at[index, 'KNN_predicted'] = knn_predictions.get(movie_id, 0)

# Calculate residuals for ALS and KNN
pred_pandas['ALS_residual'] = (pred_pandas['rating'] - pred_pandas['ALS_predicted']).abs()
pred_pandas['KNN_residual'] = (pred_pandas['rating'] - pred_pandas['KNN_predicted']).abs()

# Calculate RMSE for ALS and KNN
als_rmse = calculate_rmse(pred_pandas['rating'], pred_pandas['ALS_predicted'])
knn_rmse = calculate_rmse(pred_pandas['rating'], pred_pandas['KNN_predicted'])

print(f"ALS RMSE: {als_rmse}")
print(f"KNN RMSE: {knn_rmse}")

# Compare RMSE to decide the better model
better_model = "ALS" if als_rmse < knn_rmse else "KNN"
print(f"Better model: {better_model}")


Processed 0 rows out of 47380
Processed 100 rows out of 47380
Processed 200 rows out of 47380
Processed 300 rows out of 47380
Processed 400 rows out of 47380
Processed 500 rows out of 47380
Processed 600 rows out of 47380
Processed 700 rows out of 47380
Processed 800 rows out of 47380
Processed 900 rows out of 47380
Processed 1000 rows out of 47380
Processed 1100 rows out of 47380
Processed 1200 rows out of 47380
Processed 1300 rows out of 47380
Processed 1400 rows out of 47380
Processed 1500 rows out of 47380
Processed 1600 rows out of 47380
Processed 1700 rows out of 47380
Processed 1800 rows out of 47380
Processed 1900 rows out of 47380
Processed 2000 rows out of 47380
Processed 2100 rows out of 47380
Processed 2200 rows out of 47380
Processed 2300 rows out of 47380
Processed 2400 rows out of 47380
Processed 2500 rows out of 47380
Processed 2600 rows out of 47380
Processed 2700 rows out of 47380
Processed 2800 rows out of 47380
Processed 2900 rows out of 47380
Processed 3000 rows ou