In [156]:
# This seed is used in all models
SEED = 42

### Initializing Spark

In [135]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# Load data
data_path = "../data/interim/"
ratings = spark.read.parquet(data_path + "ratings_train.parquet", sep='\t', inferSchema=True)
ratings = ratings.withColumnRenamed("_c0", "user_id") \
                 .withColumnRenamed("_c1", "movie_id") \
                 .withColumnRenamed("_c2", "rating")

test = spark.read.parquet(data_path + "ratings_test.parquet", sep='\t', inferSchema=True)
test = test.withColumnRenamed("_c0", "user_id") \
                 .withColumnRenamed("_c1", "movie_id") \
                 .withColumnRenamed("_c2", "rating")

### Creating ALS and forming a param grid for grid search

In [136]:
als = ALS(
    maxIter=50,
    userCol="user_id",
    itemCol="movie_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=SEED
)

# I've chosen the following hyperparameters because they influence the result the most.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.rank, [10, 20, 30, 35, 40, 50])
    .addGrid(als.regParam, [0.01, 0.025, 0.05, 0.1, 0.14, 0.15])
    .build()
)

In [138]:
rmse_evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

crossval = CrossValidator(
    estimator=als, estimatorParamMaps=paramGrid, evaluator=rmse_evaluator, numFolds=3
)


cvModel = crossval.fit(ratings)

bestModel = cvModel.bestModel

# Print best rank and regParam
print("Best Rank:", bestModel._java_obj.parent().getRank())
print("Best regParam:", bestModel._java_obj.parent().getRegParam())
# Best Rank: 50
# Best regParam: 0.14

23/12/04 00:53:48 WARN CacheManager: Asked to cache already cached data.
23/12/04 00:53:48 WARN CacheManager: Asked to cache already cached data.


Best Rank: 50
Best regParam: 0.14


### Evaluating the acquired hyperparameters

In [139]:
mae_evaluator = RegressionEvaluator(
    metricName="mae", labelCol="rating", predictionCol="prediction"
)

predictions = bestModel.transform(test)

rmse = rmse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)
print("RMSE = " + str(rmse))
print("MAE = " + str(mae))
# RMSE = 0.9241108498697321
# MAE = 0.7382249872480782

RMSE = 0.9241108498697321
MAE = 0.7382249872480782


### Importing the data for the Random Forest approach

In [140]:
import pandas as pd

# Load user data
data_path = "../data/interim/"

ratings_df_train = pd.read_parquet(data_path + "ratings_train.parquet")
ratings_df_test = pd.read_parquet(data_path + "ratings_test.parquet")
users_df = pd.read_parquet(data_path + "users.parquet")

combined_df_train = pd.merge(ratings_df_train, users_df, on="user_id")
combined_df_test = pd.merge(ratings_df_test, users_df, on="user_id")

labels_train = combined_df_train["rating"]
labels_test = combined_df_test["rating"]
combined_df_train.drop(["rating"], axis=1, inplace=True)
combined_df_test.drop(["rating"], axis=1, inplace=True)

### Initializing the model and param grid for grid search

In [111]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [112]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=SEED),
                           param_grid=param_grid,
                           cv=3,  # Number of folds for cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=2)

# Fit the grid search to the data
grid_search.fit(combined_df_train, labels_train)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  13.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  14.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  14.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.9s
[CV] END m



[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  11.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  10.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  18.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_

### Evaluating best parameters

In [141]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

best_rf = grid_search.best_estimator_

# Re-evaluate with the best parameters
rf_predictions = best_rf.predict(combined_df_test)

# Calculate accuracy, RMSE, and MAE
accuracy = accuracy_score(labels_test, rf_predictions)
rmse = mean_squared_error(labels_test, rf_predictions, squared=False)
mae = mean_absolute_error(labels_test, rf_predictions)

print(f'Best Model RMSE: {rmse}')
print(f'Best Model MAE: {mae}')
print(f'Best Model Accuracy: {accuracy}')

Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Model RMSE: 1.16586877477699
Best Model MAE: 0.83625
Best Model Accuracy: 0.37475


### Saving the models for later inference

In [150]:
import joblib

joblib.dump(best_rf, "./random_forest.joblib")
bestModel.write().save("als")

### Exporting the result data to pandas

In [142]:
als_predictions_selected = predictions.select('user_id', 'movie_id', 'prediction')
als_predictions_pandas = als_predictions_selected.toPandas()

als_predictions_pandas.head()
rf_predictions_df = pd.DataFrame({
    'user_id': ratings_df_test['user_id'],
    'movie_id': ratings_df_test['movie_id'],
    'prediction_rf': rf_predictions  # Your NumPy array
})
combined_predictions = pd.merge(als_predictions_pandas, rf_predictions_df, on=['user_id', 'movie_id'])
final_evaluation_df = pd.merge(combined_predictions, ratings_df_test, on=['user_id', 'movie_id'])

### Running Linear Regression to find the best parameters for combining two recommendation approaches

In [155]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Your features are the predictions from ALS and Random Forest
X = final_evaluation_df[["prediction", "prediction_rf"]]

# The target variable is the actual rating
y = final_evaluation_df["rating"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Initialize and train the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set
lr_predictions = lr_model.predict(X_test)

# Extracting coefficients
coefficients = lr_model.coef_
als_weight = coefficients[0]
rf_weight = coefficients[1]

print(f"Weight for ALS predictions: {als_weight}")
print(f"Weight for Random Forest predictions: {rf_weight}")

Weight for ALS predictions: 1.005058707886241
Weight for Random Forest predictions: 0.029824356446759744


### Combining and printing the final results

In [153]:
combined_predictions['final_prediction'] = (combined_predictions['prediction'] * als_weight + combined_predictions['prediction_rf'] * rf_weight)

In [154]:
final_evaluation_df = pd.merge(combined_predictions, ratings_df_test, on=['user_id', 'movie_id'])

# Calculate RMSE and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error

final_rmse = mean_squared_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'], squared=False)
final_mae = mean_absolute_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'])

print(f'Final Combined RMSE: {final_rmse}')
print(f'Final Combined MAE: {final_mae}')

Final Combined RMSE: 0.9108300704330717
Final Combined MAE: 0.7186688314985313


In [168]:
threshold = 3
binary_actual = (final_evaluation_df['rating'] >= threshold)
binary_predicted = (final_evaluation_df['final_prediction'] >= threshold)

In [169]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(binary_actual, binary_predicted)
recall = recall_score(binary_actual, binary_predicted)
f1 = f1_score(binary_actual, binary_predicted)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')

Precision: 0.9076346068272837
Recall: 0.8705035971223022
F1: 0.8886814178570318


23/12/03 12:17:13 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 50399328 ms exceeds timeout 120000 ms
23/12/03 12:17:13 WARN SparkContext: Killing executors is not supported by current scheduler.
