In [0]:
# Show what files ship with the MovieLens-1M sample before loading any of them.
ml_1m_dir = '/databricks-datasets/cs110x/ml-1m/data-001/'
display(dbutils.fs.ls(ml_1m_dir))

In [0]:
%sh
# Print the first 500 lines of the ml-1m movies file to inspect its raw layout
# (field separator, presence of a header row) before defining a schema.
head -n 500 /databricks-datasets/cs110x/ml-1m/data-001/movies.dat

In [0]:
%sh
# Print the first 500 lines of the ml-1m ratings file to inspect its raw layout
# (field separator, number of fields per line) before defining a schema.
head -n 500 /databricks-datasets/cs110x/ml-1m/data-001/ratings.dat

In [0]:
# Show what files ship with the larger MovieLens-20M sample.
ml_20m_dir = '/databricks-datasets/cs110x/ml-20m/data-001/'
display(dbutils.fs.ls(ml_20m_dir))

In [0]:
%sh
# Print the first 500 lines of the ml-20m ratings CSV to inspect its raw layout.
head -n 500 /databricks-datasets/cs110x/ml-20m/data-001/ratings.csv

In [0]:
from pyspark.sql.types import *

# Explicit schemas for the two '::'-delimited MovieLens files loaded below.
# Each (column name, Spark type) pair becomes one nullable StructField,
# in file column order.
movies_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('movieId', IntegerType()),
        ('title', StringType()),
        ('genres', StringType()),
    )
])

# NOTE: the rating column is named 'ratings' (plural); the ALS and evaluator
# cells refer to it by that exact name.
ratings_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('ratings', FloatType()),
    )
])

In [0]:
# Load the MovieLens-1M movie catalogue into `df_movies` using `movies_schema`.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/movies.dat"
file_type = "csv"

# CSV options
first_row_is_header = "false"
delimiter = "::"  # multi-character separators require Spark 3.0 or later

# `inferSchema` is intentionally not set: an explicit schema is supplied below,
# and Spark skips inference whenever a user schema is present, so requesting
# it would only mislead the reader about where the column types come from.
df_movies = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(movies_schema)
    .load(file_location)
)

display(df_movies)

In [0]:
# Load the MovieLens-1M ratings into `df_ratings` using `ratings_schema`.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/ratings.dat"
file_type = "csv"

# CSV options
first_row_is_header = "false"
delimiter = "::"  # multi-character separators require Spark 3.0 or later

# `inferSchema` is intentionally not set: an explicit schema is supplied below,
# and Spark skips inference whenever a user schema is present, so requesting
# it would only mislead the reader about where the column types come from.
#
# NOTE(review): `ratings_schema` declares three columns. If the raw lines carry
# more fields than that (e.g. a trailing timestamp -- confirm against the
# `head` output), the reader's default PERMISSIVE mode drops the extra tokens.
df_ratings = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(ratings_schema)
    .load(file_location)
)

display(df_ratings)

In [0]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed makes the split -- and therefore the model and the RMSE reported
# further down -- reproducible across "Run All" executions. The weights are
# sampling probabilities, so the two counts are only approximately 80/20.
(df_ratings_train, df_ratings_test) = df_ratings.randomSplit([0.8, 0.2], seed=42)
print(df_ratings_train.count())
print(df_ratings_test.count())

In [0]:
from pyspark.ml.recommendation import ALS

# Configure and train the ALS collaborative-filtering model on the training
# split. Column names must match `ratings_schema`.
# A fixed seed pins the random initialisation of the latent factors so that
# repeated runs on the same training data produce the same model.
als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=6,
    userCol='userId',
    itemCol='movieId',
    ratingCol='ratings',
    seed=42,
)

model = als.fit(df_ratings_train)

In [0]:
from pyspark.sql.functions import isnan

# Score the held-out ratings. ALS emits NaN for users or movies that did not
# appear in the training split (cold start); those rows must be removed or the
# RMSE computed downstream would itself be NaN.
# `isnan` states that intent explicitly. The previous `!= float('nan')`
# comparison only worked because Spark SQL, unlike Python, treats NaN as equal
# to NaN.
df_predicted_ratings = (
    model.transform(df_ratings_test)
    .filter(~isnan('prediction'))
)
display(df_predicted_ratings)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Measure prediction quality on the held-out set as root-mean-square error
# between the true 'ratings' column and the model's 'prediction' column.
regr_eval = RegressionEvaluator(
    metricName='rmse',
    labelCol='ratings',
    predictionCol='prediction',
)
error = regr_eval.evaluate(df_predicted_ratings)
print(f'RMSE: {error}')