In [105]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import Row

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# Load data
data_path = "../data/interim/ratings_train.parquet"  # Replace with your path to the u.data file
ratings = spark.read.parquet(data_path, sep='\t', inferSchema=True)
ratings = ratings.withColumnRenamed("_c0", "user_id") \
                 .withColumnRenamed("_c1", "movie_id") \
                 .withColumnRenamed("_c2", "rating")

In [128]:
# Split the data into training and test sets
# (training, test) = ratings.randomSplit([0.8, 0.2])
test = spark.read.parquet("../data/interim/ratings_test.parquet", sep='\t', inferSchema=True)
test = test.withColumnRenamed("_c0", "user_id") \
                 .withColumnRenamed("_c1", "movie_id") \
                 .withColumnRenamed("_c2", "rating")

# Build the recommendation model using ALS
als = ALS(userCol="user_id", itemCol="movie_id", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)


paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [30, 35, 40, 50, 55, 56, 57, 58, 60]) \
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.14, 0.15, 0.16, 0.17]) \
    .build()
# Fit the model to the training data
# model = als.fit(ratings)

In [None]:
rmse_evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)

crossval = CrossValidator(
    estimator=als, estimatorParamMaps=paramGrid, evaluator=rmse_evaluator, numFolds=3
)


cvModel = crossval.fit(ratings)

bestModel = cvModel.bestModel

# Print best rank and regParam
print("Best Rank:", bestModel._java_obj.parent().getRank())
print("Best regParam:", bestModel._java_obj.parent().getRegParam())
# Best Rank: 30
# Best regParam: 0.14

In [131]:
mae_evaluator = RegressionEvaluator(
    metricName="mae", labelCol="rating", predictionCol="prediction"
)

predictions = bestModel.transform(test)

rmse = rmse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)
print("RMSE = " + str(rmse))
print("MAE = " + str(mae))
# RMSE = 0.9301872687539
# MAE = 0.7443731769976111
# Final Combined RMSE: 0.9135125781679562
# Final Combined MAE: 0.7203104881281703

RMSE = 0.9301872687539
MAE = 0.7443731769976111


In [109]:
import pandas as pd

# Load user data
data_path = "../data/interim/"

ratings_df_train = pd.read_parquet(data_path + "ratings_train.parquet")
ratings_df_test = pd.read_parquet(data_path + "ratings_test.parquet")
users_df = pd.read_parquet(data_path + "users.parquet")
movies_df = pd.read_parquet(data_path + "movies.parquet")

combined_df_train = pd.merge(ratings_df_train, users_df, on="user_id")
combined_df_test = pd.merge(ratings_df_test, users_df, on="user_id")

labels_train = combined_df_train["rating"]
labels_test = combined_df_test["rating"]
combined_df_train.drop(["rating"], axis=1, inplace=True)
combined_df_test.drop(["rating"], axis=1, inplace=True)

In [111]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [112]:
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # Number of folds for cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=2)

# Fit the grid search to the data
grid_search.fit(combined_df_train, labels_train)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  13.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  14.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  14.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.9s
[CV] END m



[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  11.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  10.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  18.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=  15.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_

In [113]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

best_rf = grid_search.best_estimator_

# Re-evaluate with the best parameters
rf_predictions = best_rf.predict(combined_df_test)

# Calculate accuracy, RMSE, and MAE
accuracy = accuracy_score(labels_test, rf_predictions)
rmse = mean_squared_error(labels_test, rf_predictions, squared=False)
mae = mean_absolute_error(labels_test, rf_predictions)

print(f'Best Model RMSE: {rmse}')
print(f'Best Model MAE: {mae}')
print(f'Best Model Accuracy: {accuracy}')

Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Model RMSE: 1.16586877477699
Best Model MAE: 0.83625
Best Model Accuracy: 0.37475


In [114]:
als_predictions_selected = predictions.select('user_id', 'movie_id', 'prediction')
als_predictions_pandas = als_predictions_selected.toPandas()

als_predictions_pandas.head()
rf_predictions_df = pd.DataFrame({
    'user_id': ratings_df_test['user_id'],
    'movie_id': ratings_df_test['movie_id'],
    'prediction_rf': rf_predictions  # Your NumPy array
})

In [115]:
rf_predictions.shape
# combined_predictions = pd.merge(als_predictions_pandas, rf_predictions_df, on=['user_id', 'movie_id'], how='inner')
combined_predictions = pd.merge(als_predictions_pandas, rf_predictions_df, on=['user_id', 'movie_id'])

combined_predictions['final_prediction'] = (combined_predictions['prediction'] + combined_predictions['prediction_rf']) / 2

In [116]:
final_evaluation_df = pd.merge(combined_predictions, ratings_df_test, on=['user_id', 'movie_id'])

# Calculate RMSE and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error

final_rmse = mean_squared_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'], squared=False)
final_mae = mean_absolute_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'])

print(f'Final Combined RMSE: {final_rmse}')
print(f'Final Combined MAE: {final_mae}')

Final Combined RMSE: 0.9727515745117686
Final Combined MAE: 0.7648656593675975


In [117]:
# Your features are the predictions from ALS and Random Forest
X = final_evaluation_df[['prediction', 'prediction_rf']]

# The target variable is the actual rating
y = final_evaluation_df['rating']

final_evaluation_df

Unnamed: 0,user_id,movie_id,prediction,prediction_rf,final_prediction,rating
0,251,148,3.175927,4,3.587964,2
1,255,833,2.100503,3,2.550251,4
2,321,496,3.866192,4,3.933096,4
3,108,471,3.262417,4,3.631209,2
4,101,471,3.239963,4,3.619981,3
...,...,...,...,...,...,...
19963,215,89,4.091563,4,4.045782,4
19964,144,89,3.925414,4,3.962707,3
19965,18,89,4.064257,4,4.032129,3
19966,138,517,3.918762,4,3.959381,4


In [118]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set
lr_predictions = lr_model.predict(X_test)

# Evaluate the model
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)
lr_mae = mean_absolute_error(y_test, lr_predictions)

print(f'Linear Regression RMSE: {lr_rmse}')
print(f'Linear Regression MAE: {lr_mae}')


Linear Regression RMSE: 0.9140223780461284
Linear Regression MAE: 0.7166844255903876


In [119]:
# Assuming lr_model is your trained linear regression model
coefficients = lr_model.coef_
als_weight = coefficients[0]
rf_weight = coefficients[1]

print(f"Weight for ALS predictions: {als_weight}")
print(f"Weight for Random Forest predictions: {rf_weight}")


Weight for ALS predictions: 1.0037993441518225
Weight for Random Forest predictions: 0.03725468142652957


In [120]:
combined_predictions['final_prediction'] = (combined_predictions['prediction'] * als_weight + combined_predictions['prediction_rf'] * rf_weight)

In [121]:
final_evaluation_df = pd.merge(combined_predictions, ratings_df_test, on=['user_id', 'movie_id'])

# Calculate RMSE and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error

final_rmse = mean_squared_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'], squared=False)
final_mae = mean_absolute_error(final_evaluation_df['rating'], final_evaluation_df['final_prediction'])

print(f'Final Combined RMSE: {final_rmse}')
print(f'Final Combined MAE: {final_mae}')

Final Combined RMSE: 0.9135125781679562
Final Combined MAE: 0.7203104881281703
