In [3]:
import pandas as pd

In [4]:
PATH = "../data/raw/ml-100k/"

ratings_columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    PATH + "u.data", sep="\t", names=ratings_columns, encoding="latin-1"
)

In [5]:
movie_columns = [
    "movie_id",
    "title",
    "release_date",
    "video_release_date",
    "IMDb_URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
movies = pd.read_csv(PATH + "u.item", sep="|", names=movie_columns, encoding="latin-1")

In [7]:
user_columns = ["user_id", "age", "gender", "occupation", "zip_code"]
users = pd.read_csv(PATH + "u.user", sep="|", names=user_columns, encoding="latin-1")

In [12]:
movie_ratings = pd.merge(ratings, movies[['movie_id', 'title']], on='movie_id')

users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import Row

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieLensALS").getOrCreate()

# Load data
data_path = "../data/raw/ml-100k/u.data"  # Replace with your path to the u.data file
ratings = spark.read.csv(data_path, sep='\t', inferSchema=True)
ratings = ratings.withColumnRenamed("_c0", "userId") \
                 .withColumnRenamed("_c1", "movieId") \
                 .withColumnRenamed("_c2", "rating")

In [18]:
# Split the data into training and test sets
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS
# als = ALS(maxIter=15, regParam=0.01, rank=5, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30]) \
    .addGrid(als.regParam, [0.01, 0.05, 0.1]) \
    .build()
# Fit the model to the training data
# model = als.fit(training)

In [19]:
# Evaluate the model by computing the RMSE on the test data
# predictions = model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="rating", predictionCol="prediction"
)


crossval = CrossValidator(
    estimator=als, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3
)

cvModel = crossval.fit(ratings)

bestModel = cvModel.bestModel

# Print best rank and regParam
print("Best Rank:", bestModel._java_obj.parent().getRank())
print("Best regParam:", bestModel._java_obj.parent().getRegParam())


Best Rank: 30
Best regParam: 0.1


In [20]:
# Apply the best model to the test data
predictions = bestModel.transform(test)  # Make sure you have a test_data set

# Evaluate best model
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.7323407740355592


In [None]:
spark.stop()