In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommender').getOrCreate()
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd

In [2]:
data_cols = ['userid','movieid','rating','timestamp']
item_cols = ['movieid','movietitle','release date',
'video release date','IMDb URL','unknown','Action',
'Adventure','Animation','Childrens','Comedy','Crime',
'Documentary','Drama','Fantasy','Film-Noir','Horror',
'Musical','Mystery','Romance ','Sci-Fi','Thriller',
'War' ,'Western']
user_cols = ['userid','age','gender','occupation',
'zip code']

In [6]:
users = pd.read_csv('/home/bella/ml-100k/u.user', sep='|',
names=user_cols, encoding='latin-1')

In [7]:
item = pd.read_csv('/home/bella/ml-100k/u.item', sep='|',
names=item_cols, encoding='latin-1')
data = pd.read_csv('/home/bella/ml-100k/u.data', sep='\t',
names=data_cols, encoding='latin-1')

In [11]:
dataset = pd.merge(pd.merge(item, data),users)
df = dataset[['userid','movieid','rating','timestamp']]
dataframe =spark.createDataFrame(df)
#ratings = dataframe.rdd


In [14]:
dataframe.printSchema()

root
 |-- userid: long (nullable = true)
 |-- movieid: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- timestamp: long (nullable = true)



In [15]:
dataframe.show()

+------+-------+------+---------+
|userid|movieid|rating|timestamp|
+------+-------+------+---------+
|   308|      1|     4|887736532|
|   308|      4|     5|887737890|
|   308|      5|     4|887739608|
|   308|      7|     4|887738847|
|   308|      8|     5|887736696|
|   308|      9|     4|887737194|
|   308|     11|     5|887737837|
|   308|     12|     5|887737243|
|   308|     15|     3|887739426|
|   308|     17|     4|887739056|
|   308|     19|     3|887737383|
|   308|     21|     3|887740729|
|   308|     22|     4|887737647|
|   308|     23|     5|887737293|
|   308|     24|     4|887738057|
|   308|     25|     4|887740649|
|   308|     28|     3|887737036|
|   308|     30|     4|887738933|
|   308|     31|     3|887739472|
|   308|     32|     5|887737432|
+------+-------+------+---------+
only showing top 20 rows



In [17]:
dataframe.describe().show()

+-------+----------------+-----------------+------------------+-----------------+
|summary|          userid|          movieid|            rating|        timestamp|
+-------+----------------+-----------------+------------------+-----------------+
|  count|          100000|           100000|            100000|           100000|
|   mean|       462.48475|        425.53013|           3.52986|8.8352885148862E8|
| stddev|266.614420127509|330.7983563255838|1.1256735991443205|5343856.189502763|
|    min|               1|                1|                 1|        874724710|
|    max|             943|             1682|                 5|        893286638|
+-------+----------------+-----------------+------------------+-----------------+



In [19]:
training, test = dataframe.randomSplit([0.8,0.2])

In [21]:
als = ALS(maxIter=5, regParam=0.01, userCol='userid', itemCol='movieid', ratingCol='rating')

model = als.fit(training)

predictions = model.transform(test)

In [22]:
predictions.describe().show()

+-------+------------------+-----------------+------------------+-------------------+----------+
|summary|            userid|          movieid|            rating|          timestamp|prediction|
+-------+------------------+-----------------+------------------+-------------------+----------+
|  count|             20011|            20011|             20011|              20011|     20011|
|   mean|460.72769976512916|425.5467992604068|3.5167158063065314|8.835421477338464E8|       NaN|
| stddev|266.87561590489815| 330.979649347182|1.1328796140938848|   5342409.57114664|       NaN|
|    min|                 1|                1|                 1|          874728396| -6.011937|
|    max|               943|             1678|                 5|          893286638|       NaN|
+-------+------------------+-----------------+------------------+-------------------+----------+



In [23]:
predictions = predictions.na.drop()
predictions.describe().show()

+-------+------------------+------------------+------------------+-------------------+------------------+
|summary|            userid|           movieid|            rating|          timestamp|        prediction|
+-------+------------------+------------------+------------------+-------------------+------------------+
|  count|             19970|             19970|             19970|              19970|             19970|
|   mean|460.70265398097143|423.44591887831746|3.5185277916875313|8.835357784156234E8|  3.50928863528006|
| stddev| 266.9115251341792| 327.8187458891906|1.1319754223510472|  5341020.139388894|0.9481777149212037|
|    min|                 1|                 1|                 1|          874728396|         -6.011937|
|    max|               943|              1658|                 5|          893286638|          8.301017|
+-------+------------------+------------------+------------------+-------------------+------------------+



In [24]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.07282610421


In [25]:
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [32]:
# Generate top 10 movie recommendations for a specified set of users
users = dataframe.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies
movies = dataframe.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
   
#userRecs.show()
#movieRecs.show()
userSubsetRecs.show()
movieSubSetRecs.show()

+------+--------------------+
|userid|     recommendations|
+------+--------------------+
|    26|[[1512, 5.1629896...|
|   474|[[557, 6.125899],...|
|    29|[[904, 6.043762],...|
+------+--------------------+

+-------+--------------------+
|movieid|     recommendations|
+-------+--------------------+
|     26|[[434, 7.8860817]...|
|    474|[[842, 7.1664853]...|
|     29|[[914, 5.6008306]...|
+-------+--------------------+



In [33]:
userSubsetRecs.printSchema()

root
 |-- userid: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieid: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

