In [0]:
# Show which files ship with the MovieLens 1M sample dataset.
ml_1m_listing = dbutils.fs.ls('/databricks-datasets/cs110x/ml-1m/data-001/')
display(ml_1m_listing)

In [0]:
%sh
# Print the first 500 lines of the raw movies file to inspect its layout.
head -n 500 /databricks-datasets/cs110x/ml-1m/data-001/movies.dat

In [0]:
%sh
# Print the first 500 lines of the raw ratings file to inspect its layout.
head -n 500 /databricks-datasets/cs110x/ml-1m/data-001/ratings.dat

In [0]:
# Show which files ship with the MovieLens 20M sample dataset.
ml_20m_listing = dbutils.fs.ls('/databricks-datasets/cs110x/ml-20m/data-001/')
display(ml_20m_listing)

In [0]:
%sh
# Print the first 500 lines of the 20M ratings CSV to inspect its layout.
head -n 500 /databricks-datasets/cs110x/ml-20m/data-001/ratings.csv

In [0]:
from pyspark.sql.types import *

# Explicit column layout applied when reading the movies file.
movies_schema = (
    StructType()
    .add('movieId', IntegerType())
    .add('title', StringType())
    .add('genres', StringType())
)

# Explicit column layout applied when reading the ratings file.
ratings_schema = (
    StructType()
    .add('userId', IntegerType())
    .add('movieId', IntegerType())
    .add('ratings', FloatType())
)

In [0]:
# Load the "::"-delimited MovieLens 1M movies file into a DataFrame.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/movies.dat"
file_type = "csv"

# CSV options
# Column types come from the explicit movies_schema passed below, so schema
# inference is switched off rather than requested and then overridden.
infer_schema = "false"
first_row_is_header = "false"
delimiter = "::"

# The applied options are for CSV files. For other file types, these will be ignored.
df_movies = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(movies_schema)
    .load(file_location)
)

display(df_movies)

In [0]:
# Load the "::"-delimited MovieLens 1M ratings file into a DataFrame.
file_location = "/databricks-datasets/cs110x/ml-1m/data-001/ratings.dat"
file_type = "csv"

# CSV options
# Column types come from the explicit ratings_schema passed below, so schema
# inference is switched off rather than requested and then overridden.
infer_schema = "false"
first_row_is_header = "false"
delimiter = "::"

# The applied options are for CSV files. For other file types, these will be ignored.
df_ratings = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .schema(ratings_schema)
    .load(file_location)
)

display(df_ratings)

In [0]:
# Hold out roughly 20% of the ratings for evaluation. The fixed seed keeps the
# split, and therefore the metrics computed from it, stable across re-runs.
(df_ratings_train, df_ratings_test) = df_ratings.randomSplit([0.8, 0.2], seed=42)
print(df_ratings_train.count())
print(df_ratings_test.count())

In [0]:
from pyspark.ml.recommendation import ALS

# Configure the ALS collaborative-filtering estimator. The seed pins the
# random initialisation of the factors so repeated fits on the same data
# give the same model.
als = ALS(
    maxIter=10,
    regParam=0.1,
    rank=6,
    userCol='userId',
    itemCol='movieId',
    ratingCol='ratings',
    seed=42,
)

# Fit the factor model on the training split only.
model = als.fit(df_ratings_train)

In [0]:
from pyspark.sql.functions import isnan

# Score the held-out ratings, then drop rows the model could not score.
# ALS emits NaN for users or movies it never saw during training; isnan()
# states that intent directly instead of relying on Spark's non-IEEE rule
# that NaN compares equal to NaN.
df_predicted_ratings = model.transform(df_ratings_test)
df_predicted_ratings = df_predicted_ratings.filter(~isnan('prediction'))
display(df_predicted_ratings)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-square error between the actual 'ratings' column and the
# model's 'prediction' column on the scored test rows.
regr_eval = (
    RegressionEvaluator()
    .setMetricName('rmse')
    .setLabelCol('ratings')
    .setPredictionCol('prediction')
)
error = regr_eval.evaluate(df_predicted_ratings)
print(f'RMSE: {error}')

In [0]:
# Summary statistics of the actual ratings in the scored test rows.
df_predicted_ratings.describe('ratings').show()

In [0]:
# Summary statistics of the predicted ratings, for comparison with the actual ones.
df_predicted_ratings.describe('prediction').show()

In [0]:
# Summary statistics over the distinct user ids present in the test split.
df_ratings_test.select('userId').dropDuplicates().describe().show()


In [0]:
my_user_id = 0

# (movie ID, your rating) pairs for the custom user. Each pair is expanded
# below into a (my_user_id, movie ID, your rating) tuple.
# For example, to give the movie "Star Wars: Episode IV - A New Hope (1977)"
# a five rating, you would add the following line:
#   (260, 5),
_my_movie_ratings = [
    (318, 3),   # Shawshank redemption
    (908, 4),   # North by Northwest (1959)
    (858, 5),   # Godfather, The (1972)
    (2019, 4),  # Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
    (912, 4),   # Casablanca (1942)
    (1250, 5),  # Bridge on the River Kwai, The (1957)
    (2324, 5),  # Life Is Beautiful (La Vita è bella) (1997)
    (1233, 5),  # Boat, The (Das Boot) (1981)
    (593, 4),   # Silence of the Lambs, The (1991)
    (1262, 4),  # Great Escape, The (1963)
]

my_rated_movies = [
    (my_user_id, movie_id, rating) for movie_id, rating in _my_movie_ratings
]

In [0]:
# Build the custom user's ratings with exactly the column names and types of
# ratings_schema (int, int, float). Letting Spark infer the types from plain
# Python ints would make every column a bigint and leave the later union
# with the training data to rely on implicit type widening.
df_custom_ratings = spark.createDataFrame(
    [
        (int(user_id), int(movie_id), float(rating))
        for user_id, movie_id, rating in my_rated_movies
    ],
    schema=ratings_schema,
)
df_custom_ratings.show()

In [0]:
print(f'Training set count: {df_ratings_train.count()}')
# Append the custom user's ratings to the training data. Columns are matched
# by name so a differing column order can never silently swap userId,
# movieId and ratings, which a positional union would allow.
df_all_ratings = df_ratings_train.unionByName(df_custom_ratings)
print(f'All ratings count: {df_all_ratings.count()}')


In [0]:
# Refit the same ALS estimator on the training data plus the custom user's
# ratings, so the resulting model can score movies for that user.
custom_model = als.fit(dataset=df_all_ratings)


# TODO:
1. Build a DataFrame of the movies in `df_movies` that the custom user has not rated yet.
2. Add a `userId` column set to 0 (the custom user's id) to that DataFrame of unrated movies.
3. Use this DataFrame to run a prediction with `custom_model` (`transform`).
4. Remove the rows whose prediction is NaN (`float('nan')`) from the prediction results.
5. Sort the results by predicted rating, highest first, and show them.


In [0]:
# Keep only the movies with no matching movieId in the custom user's
# ratings (left anti join), i.e. the movies that user has not rated.
unrated_movies = df_movies.join(df_custom_ratings, on='movieId', how='leftanti')

total_movie_count = df_movies.count()
print(f'size df_movies: {total_movie_count}')
unrated_movie_count = unrated_movies.count()
print(f'size unrated_movies: {unrated_movie_count}')

unrated_movies.show()

In [0]:
from pyspark.sql.functions import lit

# Attach the custom user's id as a constant 'userId' column so every unrated
# movie becomes a (movie, user) pair the model can score.
user_id_column = lit(my_user_id).alias('userId')
unrated_movies_with_user = unrated_movies.select('*', user_id_column)
unrated_movies_with_user.show()

In [0]:
# Score every unrated movie for the custom user; the model adds a
# 'prediction' column to the input rows.
custom_predictions = custom_model.transform(dataset=unrated_movies_with_user)
display(custom_predictions)

In [0]:
from pyspark.sql.functions import desc, isnan

# Drop movies the model could not score (NaN prediction). isnan() states that
# intent directly instead of relying on Spark's non-IEEE rule that NaN
# compares equal to NaN.
custom_predictions = custom_predictions.filter(~isnan('prediction'))
custom_predictions.select('prediction').describe().show()

# Recommendations for the custom user, best predicted rating first.
display(custom_predictions.sort(desc('prediction')))