In [1]:
# Computational and Visualisation Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pyspark Packages
from pyspark.sql import functions as F
from pyspark.sql.functions import col, desc
from pyspark.sql.types import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
user_data = spark.read.option('sep', '\t').csv('/mnt/ml-100k/u.data')
user_data = user_data.select(col('_c0').cast('int').alias('user_id'), col('_c1').cast('int').alias('item_id'), \
                             col('_c2').cast('int').alias('rating'), col('_c3').alias('timestamp').cast('bigint'))
user_data = user_data \
            .withColumn('date_f', F.to_timestamp(F.from_unixtime(col('timestamp'), 'dd-MM-yyyy HH:mm:ss'), 'dd-MM-yyyy HH:mm:ss'))\
            .withColumn('date_s', F.to_date(F.from_unixtime(col('timestamp'), 'yyyyMMdd'), 'yyyyMMdd'))

# Splitting the data into training and testing set
train, test = user_data.randomSplit([.8, .2])

display(user_data.sample(False, 0.1), 100)

user_id,item_id,rating,timestamp,date_f,date_s
22,377,1,878887116,1997-11-07T07:18:36.000+0000,1997-11-07
253,465,5,891628467,1998-04-03T18:34:27.000+0000,1998-04-03
305,451,3,886324817,1998-02-01T09:20:17.000+0000,1998-02-01
119,392,4,886176814,1998-01-30T16:13:34.000+0000,1998-01-30
167,486,4,892738452,1998-04-16T14:54:12.000+0000,1998-04-16
102,768,2,883748450,1998-01-02T13:40:50.000+0000,1998-01-02
276,564,3,874791805,1997-09-20T21:43:25.000+0000,1997-09-20
178,332,3,882823437,1997-12-22T20:43:57.000+0000,1997-12-22
57,304,5,883698581,1998-01-01T23:49:41.000+0000,1998-01-01
225,237,5,879539643,1997-11-14T20:34:03.000+0000,1997-11-14


In [3]:
display (user_data.describe())

summary,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528851.48862
stddev,266.61442012750905,330.79835632558473,1.1256735991443214,5343856.189502848
min,1.0,1.0,1.0,874724710.0
max,943.0,1682.0,5.0,893286638.0


In [4]:
display(user_data.groupBy('rating').agg(F.count(F.lit(1)).alias('Total Ratings')))

rating,Total Ratings
1,6110
3,27145
5,21201
4,34174
2,11370


In [5]:
display(user_data.groupBy('date_s').agg(F.count(F.lit(1)).alias('Total Ratings')))

date_s,Total Ratings
1998-01-04,850
1998-02-07,262
1998-01-31,270
1997-09-24,566
1997-11-18,1247
1997-10-09,279
1998-02-10,71
1998-03-18,38
1997-11-30,909
1998-03-08,329


In [6]:
display(user_data.groupBy('item_id').agg(F.count(F.lit(1)).alias('Count of Recommendation')).sort(desc('Count of Recommendation')).limit(40))

item_id,Count of Recommendation
50,583
258,509
100,508
181,507
294,485
286,481
288,478
1,452
300,431
121,429


In [7]:
display(user_data.groupBy('user_id').agg(F.count(F.lit(1)).alias('Count of Recommendation')).sort(desc('Count of Recommendation')).limit(40))

user_id,Count of Recommendation
405,737
655,685
13,636
450,540
276,518
416,493
537,490
303,484
234,480
393,448


In [8]:
# ALS Model Hyperparameter values were separately computed for the best model
movie_recommender_inst = ALS(maxIter=28, regParam=0.1, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop", implicitPrefs=True)
movie_recommender_model = movie_recommender_inst.fit(train)

computed_predictions = movie_recommender_model.transform(test)
reg_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse_model = reg_evaluator.evaluate(computed_predictions)

print ("Computed Root-mean-square error = ", rmse_model)

In [9]:
display (computed_predictions, 250)

user_id,item_id,rating,timestamp,date_f,date_s,prediction
44,148,4,878346946,1997-11-01T01:15:46.000+0000,1997-11-01,0.39935064
606,148,3,878150506,1997-10-29T18:41:46.000+0000,1997-10-29,0.6836442
236,148,4,890117028,1998-03-17T06:43:48.000+0000,1998-03-17,0.43128693
601,148,3,876348140,1997-10-08T22:02:20.000+0000,1997-10-08,0.3346575
618,148,3,891309670,1998-03-31T02:01:10.000+0000,1998-03-31,0.5149034
363,148,3,891497439,1998-04-02T06:10:39.000+0000,1998-04-02,0.43109655
274,148,2,878946133,1997-11-07T23:42:13.000+0000,1997-11-07,0.48952678
54,148,3,880937490,1997-12-01T00:51:30.000+0000,1997-12-01,0.5613334
430,148,2,877226047,1997-10-19T01:54:07.000+0000,1997-10-19,0.15182751
891,148,5,891639793,1998-04-03T21:43:13.000+0000,1998-04-03,0.5233692


In [10]:
display (computed_predictions.describe())

summary,user_id,item_id,rating,timestamp,prediction
count,20073.0,20073.0,20073.0,20073.0,20073.0
mean,458.83460369650777,422.625915408758,3.5211478104917053,883532271.1609625,0.5765657851046389
stddev,265.80266728467063,326.88040949609666,1.1304261717313815,5329016.648097086,0.2505990037601157
min,1.0,1.0,1.0,874724781.0,-0.24810411
max,943.0,1664.0,5.0,893286638.0,1.3496845


In [11]:
computed_user_recommendations = movie_recommender_model.recommendForAllUsers(10)
display (computed_user_recommendations.limit(200))

user_id,recommendations
471,"List(List(418, 0.5861153), List(596, 0.5642026), List(432, 0.5603745), List(588, 0.551357), List(419, 0.53181976), List(501, 0.52384865), List(404, 0.5114288), List(225, 0.5040019), List(420, 0.5032704), List(969, 0.49917987))"
463,"List(List(237, 1.0133508), List(275, 1.0048653), List(111, 0.9843215), List(257, 0.9789947), List(276, 0.966491), List(15, 0.9641205), List(1, 0.96163666), List(25, 0.9555187), List(283, 0.9490139), List(100, 0.943228))"
833,"List(List(23, 1.0784113), List(183, 1.0718707), List(7, 1.0671804), List(201, 1.0508645), List(200, 1.0425217), List(185, 1.0370771), List(182, 1.0268977), List(176, 1.0237503), List(89, 1.0122939), List(436, 1.0103908))"
496,"List(List(168, 0.92025363), List(172, 0.896725), List(174, 0.88862145), List(173, 0.87249553), List(204, 0.8617085), List(202, 0.813236), List(210, 0.8126842), List(588, 0.80947185), List(423, 0.808095), List(28, 0.80788016))"
148,"List(List(168, 0.8571946), List(174, 0.82194316), List(50, 0.80566937), List(173, 0.8006542), List(98, 0.7956149), List(172, 0.7898079), List(181, 0.7751622), List(496, 0.7652716), List(204, 0.7648579), List(1, 0.7554063))"
540,"List(List(117, 0.9282352), List(237, 0.9271201), List(257, 0.9036972), List(121, 0.8961524), List(100, 0.8802126), List(1, 0.8721001), List(50, 0.86366004), List(222, 0.8585881), List(405, 0.8511642), List(15, 0.8500441))"
392,"List(List(286, 1.2234964), List(258, 1.1191676), List(302, 1.1082768), List(269, 1.1042851), List(300, 1.0000604), List(313, 0.9936656), List(288, 0.98166263), List(294, 0.928094), List(289, 0.9111995), List(127, 0.907258))"
243,"List(List(275, 0.918426), List(14, 0.84015906), List(283, 0.8309057), List(70, 0.79909563), List(285, 0.78950995), List(86, 0.7831383), List(237, 0.7600634), List(582, 0.7519068), List(137, 0.7505461), List(9, 0.7353947))"
623,"List(List(50, 0.8047396), List(181, 0.74537534), List(1, 0.72121346), List(100, 0.7168293), List(127, 0.6480753), List(275, 0.6072871), List(222, 0.60542536), List(121, 0.59938765), List(258, 0.5934537), List(98, 0.5903377))"
737,"List(List(56, 0.74159634), List(98, 0.7254819), List(168, 0.67598987), List(174, 0.65685135), List(12, 0.63046706), List(357, 0.62956786), List(64, 0.6228447), List(172, 0.6194699), List(183, 0.6148868), List(173, 0.60842943))"


In [12]:
computed_movie_recommendations = movie_recommender_model.recommendForAllItems(15)
display(computed_movie_recommendations.limit(200))

item_id,recommendations
471,"List(List(256, 1.2032576), List(374, 1.1697769), List(533, 1.1024731), List(178, 1.096342), List(907, 1.0795313), List(825, 1.0569783), List(416, 1.0375804), List(332, 1.02325), List(207, 0.9986015), List(141, 0.98530537), List(593, 0.9770696), List(450, 0.9737439), List(938, 0.9727412), List(393, 0.97186565), List(378, 0.9588188))"
1591,"List(List(782, 0.3046544), List(894, 0.27811083), List(592, 0.24715033), List(655, 0.2418258), List(828, 0.2112891), List(851, 0.21121082), List(489, 0.19965848), List(863, 0.19507228), List(440, 0.189401), List(883, 0.18613176), List(519, 0.18373631), List(181, 0.1779449), List(587, 0.17324598), List(585, 0.16868995), List(279, 0.16549855))"
1342,"List(List(590, 0.035811104), List(79, 0.03374487), List(460, 0.033668652), List(470, 0.032027397), List(430, 0.031639654), List(735, 0.031505484), List(581, 0.03128522), List(733, 0.03116815), List(906, 0.031021563), List(937, 0.030572155), List(473, 0.030091733), List(227, 0.029982142), List(422, 0.029707512), List(558, 0.029690562), List(569, 0.029408852))"
463,"List(List(655, 1.0550791), List(537, 0.89831835), List(883, 0.8966881), List(234, 0.87045515), List(450, 0.8573339), List(90, 0.8571583), List(354, 0.8562028), List(707, 0.84192324), List(18, 0.8304713), List(13, 0.8071458), List(299, 0.80327386), List(334, 0.79155743), List(474, 0.78621614), List(151, 0.7585504), List(308, 0.750708))"
833,"List(List(851, 0.9094337), List(130, 0.8659557), List(181, 0.8558738), List(880, 0.76041293), List(592, 0.75509286), List(145, 0.7302488), List(276, 0.689836), List(393, 0.68775094), List(314, 0.678535), List(332, 0.6748595), List(374, 0.6618821), List(291, 0.6500797), List(435, 0.64390504), List(682, 0.6361094), List(294, 0.6275126))"
1645,"List(List(655, 0.18537773), List(894, 0.1231916), List(537, 0.11846579), List(13, 0.113224), List(450, 0.10382836), List(707, 0.102429055), List(90, 0.10116946), List(181, 0.10052642), List(592, 0.10038659), List(234, 0.09796715), List(201, 0.09754339), List(416, 0.09641771), List(334, 0.09146878), List(883, 0.086844034), List(405, 0.08652875))"
496,"List(List(716, 1.0531023), List(747, 1.0054698), List(406, 0.9976321), List(298, 0.99045587), List(60, 0.9874122), List(151, 0.9838831), List(18, 0.9834018), List(437, 0.98247063), List(815, 0.97710896), List(848, 0.9751622), List(234, 0.9610843), List(389, 0.95795816), List(840, 0.95760554), List(7, 0.95426005), List(312, 0.95365477))"
148,"List(List(374, 1.017583), List(256, 1.0093861), List(181, 0.98729444), List(332, 0.97221726), List(825, 0.95098436), List(145, 0.9260326), List(393, 0.91853774), List(851, 0.9089465), List(416, 0.90474796), List(880, 0.86192966), List(178, 0.8542996), List(130, 0.84967107), List(141, 0.8437496), List(551, 0.83693486), List(682, 0.8320486))"
1088,"List(List(279, 0.39854693), List(782, 0.3721474), List(181, 0.35501847), List(851, 0.3502507), List(130, 0.3430755), List(592, 0.33336335), List(276, 0.3120616), List(286, 0.30172086), List(435, 0.2974439), List(294, 0.28897893), List(880, 0.28012398), List(393, 0.27994806), List(894, 0.27074003), List(472, 0.2667407), List(145, 0.26515114))"
1238,"List(List(782, 0.27252293), List(489, 0.25532266), List(587, 0.22902669), List(863, 0.22879049), List(851, 0.225322), List(519, 0.21314994), List(592, 0.21246624), List(451, 0.20879006), List(894, 0.19995792), List(655, 0.19888556), List(724, 0.19461837), List(721, 0.18750423), List(181, 0.1865735), List(752, 0.17899089), List(116, 0.1679709))"


Dataset has been procured from *https://grouplens.org/datasets/movielens/*

The ALS Classifier indicated least RMSE for 28 iterations and 0.1 regParam value. Overall the classifier offers excellent approach to evaluate, build, and productionalize production-grade collaborative filtering based recommendation systems. The published notebook is available at - https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/3173713035751393/3658598530030623/2308983777460038/latest.html