In [152]:
#importing the required pyspark library 
from pyspark.sql import SparkSession 
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import avg, min, max
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 
  
#Setup Spark Session 
spark = SparkSession.builder.appName('Recommender').getOrCreate() 
spark

In [153]:
ratings = spark.read.csv('ratings.csv', 
                      inferSchema=True,header=True) 
  
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [154]:
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  98.30% empty.


In [155]:
# Min num ratings for movies
print("Movie with the fewest ratings: ")
ratings.groupBy("movieId").count().select(min("count")).show()

# Avg num ratings per movie
print("Avg num ratings per movie: ")
ratings.groupBy("movieId").count().select(avg("count")).show()

Movie with the fewest ratings: 
+----------+
|min(count)|
+----------+
|         1|
+----------+

Avg num ratings per movie: 
+------------------+
|        avg(count)|
+------------------+
|10.369806663924312|
+------------------+



In [156]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [157]:
(training_data, test_data) = ratings.randomSplit([0.8, 0.2], seed=42)

In [158]:
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", rank =10, maxIter =15, regParam =.1,
          coldStartStrategy="drop", nonnegative =True, implicitPrefs = False)

In [159]:
model = als.fit(training_data)

In [160]:
test_predictions = model.transform(test_data)

In [161]:
test_predictions.show(10)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   463|   1088|   3.5|1145460096| 3.5548759|
|   580|   3175|   2.5|1167792674| 3.5527954|
|   580|  44022|   3.5|1167792560| 3.3059227|
|   362|   1645|   5.0|1530641485|  3.737258|
|   597|   1959|   4.0| 941640006|  4.143024|
|   155|   3175|   4.0| 961861723| 3.8306065|
|   368|   2122|   2.0| 971277319| 2.2190294|
|   115|   1645|   4.0| 957648208|  3.344508|
|   115|   3175|   4.0| 965425216| 3.4966578|
|    28|   1645|   2.5|1242033151| 2.5583792|
+------+-------+------+----------+----------+
only showing top 10 rows



In [162]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

print(evaluator.getMetricName())
print(evaluator.getLabelCol())
print(evaluator.getPredictionCol())

rmse
rating
prediction


In [163]:
RMSE = evaluator.evaluate(test_predictions)
print (RMSE)

0.8721631381144733


In [164]:
n = 5
ALS_recommendations = model.recommendForAllUsers(n)

In [165]:
ALS_recommendations.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{177593, 5.75963...|
|     2|[{32892, 4.922302...|
|     3|[{3837, 5.0004992...|
|     4|[{2300, 4.974401}...|
|     5|[{6380, 4.8630085...|
|     6|[{185029, 4.97384...|
|     7|[{1949, 5.0702825...|
|     8|[{306, 5.039128},...|
|     9|[{53123, 5.074069...|
|    10|[{112804, 5.72640...|
|    11|[{3099, 5.842291}...|
|    12|[{6732, 5.85811},...|
|    13|[{5075, 5.7361903...|
|    14|[{49932, 5.495044...|
|    15|[{27611, 5.620723...|
|    16|[{25771, 4.520450...|
|    17|[{96004, 5.040982...|
|    18|[{96004, 4.848904...|
|    19|[{96004, 4.288895...|
|    20|[{177593, 5.60006...|
+------+--------------------+
only showing top 20 rows



In [166]:
ALS_recommendations.createOrReplaceTempView("ALS_recs_temp")

clean_recs = spark.sql ("SELECT userId, movieIds_and_ratings.movieId AS movieId, movieIds_and_ratings.rating AS prediction FROM ALS_recs_temp LATERAL VIEW explode(recommendations) exploded_table AS movieIds_and_ratings")

exploded_recs = spark.sql ("SELECT userId, explode(recommendations) AS MovieRec FROM ALS_recs_temp")

exploded_recs.show()


+------+-------------------+
|userId|           MovieRec|
+------+-------------------+
|     1| {177593, 5.759636}|
|     1|    {3925, 5.68001}|
|     1| {96004, 5.5498986}|
|     1|  {3379, 5.5498986}|
|     1| {171495, 5.510626}|
|     2| {32892, 4.9223027}|
|     2|{131724, 4.8925333}|
|     2|  {92643, 4.839089}|
|     2|{136469, 4.7513666}|
|     2|  {53123, 4.715069}|
|     3|  {3837, 5.0004992}|
|     3|    {6835, 4.89545}|
|     3|    {5746, 4.89545}|
|     3|   {5181, 4.846535}|
|     3|  {4518, 4.7833548}|
|     4|   {2300, 4.974401}|
|     4|{158872, 4.9174886}|
|     4|   {3851, 4.906932}|
|     4| {174053, 4.891874}|
|     4|  {2384, 4.8675675}|
+------+-------------------+
only showing top 20 rows



In [167]:
clean_recs.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|     1| 177593|  5.759636|
|     1|   3925|   5.68001|
|     1|  96004| 5.5498986|
|     1|   3379| 5.5498986|
|     1| 171495|  5.510626|
|     2|  32892| 4.9223027|
|     2| 131724| 4.8925333|
|     2|  92643|  4.839089|
|     2| 136469| 4.7513666|
|     2|  53123|  4.715069|
|     3|   3837| 5.0004992|
|     3|   6835|   4.89545|
|     3|   5746|   4.89545|
|     3|   5181|  4.846535|
|     3|   4518| 4.7833548|
|     4|   2300|  4.974401|
|     4| 158872| 4.9174886|
|     4|   3851|  4.906932|
|     4| 174053|  4.891874|
|     4|   2384| 4.8675675|
+------+-------+----------+
only showing top 20 rows



In [168]:
movie_info = spark.read.csv('movies.csv', 
                      inferSchema=True,header=True) 
  
movie_info.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [169]:
clean_recs = clean_recs.join(movie_info, ["movieId"], "left")
clean_recs.show()

+-------+------+----------+--------------------+--------------------+
|movieId|userId|prediction|               title|              genres|
+-------+------+----------+--------------------+--------------------+
| 177593|     1|  5.759636|Three Billboards ...|         Crime|Drama|
|   3925|     1|   5.68001|Stranger Than Par...|        Comedy|Drama|
|  96004|     1| 5.5498986|Dragon Ball Z: Th...|Action|Adventure|...|
|   3379|     1| 5.5498986| On the Beach (1959)|               Drama|
| 171495|     1|  5.510626|              Cosmos|  (no genres listed)|
|  32892|     2| 4.9223027|Ivan's Childhood ...|           Drama|War|
| 131724|     2| 4.8925333|The Jinx: The Lif...|         Documentary|
|  92643|     2|  4.839089|Monsieur Lazhar (...|Children|Comedy|D...|
| 136469|     2| 4.7513666|Larry David: Curb...|              Comedy|
|  53123|     2|  4.715069|         Once (2006)|Drama|Musical|Rom...|
|   3837|     3| 5.0004992|  Phantasm II (1988)|Action|Fantasy|Ho...|
|   6835|     3|   4

In [170]:
clean_recs.join(ratings, ["userId", "movieId"], "left").filter(ratings['rating'].isNull()).show()

+------+-------+----------+--------------------+--------------------+------+---------+
|userId|movieId|prediction|               title|              genres|rating|timestamp|
+------+-------+----------+--------------------+--------------------+------+---------+
|     1| 177593|  5.759636|Three Billboards ...|         Crime|Drama|  NULL|     NULL|
|     1|   3925|   5.68001|Stranger Than Par...|        Comedy|Drama|  NULL|     NULL|
|     1|  96004| 5.5498986|Dragon Ball Z: Th...|Action|Adventure|...|  NULL|     NULL|
|     1|   3379| 5.5498986| On the Beach (1959)|               Drama|  NULL|     NULL|
|     1| 171495|  5.510626|              Cosmos|  (no genres listed)|  NULL|     NULL|
|     2|  32892| 4.9223027|Ivan's Childhood ...|           Drama|War|  NULL|     NULL|
|     2|  92643|  4.839089|Monsieur Lazhar (...|Children|Comedy|D...|  NULL|     NULL|
|     2| 136469| 4.7513666|Larry David: Curb...|              Comedy|  NULL|     NULL|
|     2|  53123|  4.715069|         Once (2

In [171]:
#param_grid = ParamGridBuilder().addGrid(als.rank, [5, 40]).addGrid(als.maxIter, [5, 100]).addGrid(als.regParam, [.05, .1]).build()


In [172]:
#cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

In [173]:
#model = cv.fit(training_data)