In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# ALS Model Configuration
als = ALS(rank=57, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

# Evaluator Configuration
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Path to the MovieLens 100k dataset
data_path = "../../data/raw/ml-100k/"  # Replace with your path to the ml-100k folder


In [6]:
# Iterate through each of the five folds
for i in range(1, 6):
    # Load training and test data for the current fold
    training = spark.read.csv(f"{data_path}u{i}.base", sep='\t', inferSchema=True).toDF("userId", "movieId", "rating", "timestamp")
    test = spark.read.csv(f"{data_path}u{i}.test", sep='\t', inferSchema=True).toDF("userId", "movieId", "rating", "timestamp")

    # Fit the model
    model = als.fit(training)

    # Evaluate the model
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print(f"Fold {i}, RMSE: {rmse}")

23/12/02 16:59:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/02 16:59:21 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Fold 1, RMSE: 0.9275722381661471
Fold 2, RMSE: 0.9179055593880304
Fold 3, RMSE: 0.913817514369452
Fold 4, RMSE: 0.9172329510479836
Fold 5, RMSE: 0.917554774118897


In [None]:
# Stop the Spark session
spark.stop()