In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [2]:
spark = SparkSession.builder.master("local").getOrCreate()

In [3]:
movie_ratings = spark.read.json('./data/ratings.json')
movie_ratings

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

In [4]:
sorted_ratings = movie_ratings.orderBy('timestamp')

In [5]:
df = sorted_ratings.toPandas()
df

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040
5,213,5,956678856.0,6040
6,3111,5,956678856.0,6040
7,573,4,956678856.0,6040
8,3505,4,956678856.0,6040
9,1734,2,956678881.0,6040


In [6]:
from pyspark.sql.functions import percent_rank
from pyspark.sql import Window

df = sorted_ratings.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("timestamp")))

In [7]:
train_df = df.where("rank <= .9").drop("rank")
train_df.count()

647956

In [8]:
test_df = df.where("rank > .9").drop("rank")
test_df.count()

71993

In [9]:
train_df.persist()
test_df.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

In [10]:
als = ALS(maxIter=10,rank=10, regParam=0.1, userCol="user_id", 
          itemCol="movie_id", ratingCol="rating", coldStartStrategy='drop')

# fit the ALS model to the training set
model = als.fit(train_df)

In [11]:
predictions = model.transform(test_df)
predictions.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint, prediction: float]

In [12]:
predictions.orderBy('prediction', ascending=False).show(10)

+--------+------+------------+-------+----------+
|movie_id|rating|   timestamp|user_id|prediction|
+--------+------+------------+-------+----------+
|     260|     3| 9.7516759E8|   1343|  5.235715|
|    1198|     4|9.75167516E8|   1343| 5.2333574|
|     858|     5|9.75166908E8|   1343|  5.163885|
|     745|     5|9.75106081E8|   1917| 5.1593847|
|    1148|     5|9.75106081E8|   1917|  5.106634|
|    1136|     5|9.75107837E8|   1917| 5.0818186|
|    2858|     5|9.75161494E8|   3203|  5.074533|
|     923|     5|9.75737182E8|   1386| 5.0729256|
|     912|     5|9.75167344E8|   1343| 5.0675645|
|     904|     4|9.75316667E8|   1386|   5.06066|
+--------+------+------------+-------+----------+
only showing top 10 rows



In [13]:
recs = model.recommendForAllUsers(numItems=10)

In [14]:
recs.persist().show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|   1580|[[3906, 4.3775463...|
|   4900|[[572, 5.78599], ...|
|   5300|[[3906, 5.6259575...|
|   1591|[[572, 5.933296],...|
|   4101|[[3867, 5.3719015...|
|   1342|[[572, 4.782131],...|
|   2122|[[572, 4.903619],...|
|   2142|[[572, 5.0096035]...|
|   5803|[[3867, 4.994311]...|
|   3794|[[2775, 4.4228964...|
|   1645|[[2342, 6.069063]...|
|   3175|[[3906, 5.6749587...|
|   4935|[[3906, 5.2834725...|
|   2366|[[572, 4.6744156]...|
|   2866|[[572, 4.471676],...|
|   5156|[[557, 6.2248206]...|
|   3997|[[2197, 4.875693]...|
|   1088|[[572, 5.1001797]...|
|   1238|[[557, 5.2119913]...|
|   3918|[[3906, 4.907593]...|
+-------+--------------------+
only showing top 20 rows



In [15]:
re = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = re.evaluate(model.transform(test_df))
print("Root Mean-squared error = " + str(rmse))

Root Mean-squared error = 0.9008231715659234


In [16]:
# import pickle

# pickle.dump(model2, open('als_model.pkl', 'wb'))

# als_model = pickle.load(open('als_model.pkl', 'rb'))

In [17]:
test_ratings = spark.read.json('./data/requests.json')
test_df = test_ratings.toPandas()

In [18]:
test_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [19]:
als_final = ALS(maxIter=10,rank=10, regParam=0.1, userCol="user_id", 
          itemCol="movie_id", ratingCol="rating")

# fit the ALS model to the training set
model_als = als_final.fit(train_df)

In [20]:
predictions_als = model_als.transform(test_ratings)
predictions_als.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint, prediction: float]

In [21]:
predictions_als.show(40)

+--------+------+-------------+-------+----------+
|movie_id|rating|    timestamp|user_id|prediction|
+--------+------+-------------+-------+----------+
|     148|   NaN| 9.77959026E8|     53|       NaN|
|     148|   NaN| 9.76559602E8|   4169|   2.99546|
|     148|   NaN| 9.89024856E8|   5333|  2.333869|
|     148|   NaN| 9.77005381E8|   4387| 2.0601199|
|     148|   NaN| 9.66907208E8|   3539| 2.6047268|
|     148|   NaN| 9.76266538E8|    840|       NaN|
|     148|   NaN| 9.76841639E8|    216|       NaN|
|     148|   NaN| 9.76191154E8|    482|       NaN|
|     148|   NaN|1.029283935E9|    752|       NaN|
|     148|   NaN|1.026978024E9|    424|       NaN|
|     148|   NaN| 9.74150193E8|   2456| 2.4470506|
|     148|   NaN|  9.7014489E8|   3053| 2.5009599|
|     463|   NaN| 9.80596453E8|    970|       NaN|
|     463|   NaN| 9.76560887E8|   4169|  2.381458|
|     463|   NaN| 9.78242788E8|     26|       NaN|
|     463|   NaN| 9.76395651E8|    319|       NaN|
|     463|   NaN| 9.76907712E8|

In [22]:
als_preds_df = predictions_als.toPandas()

In [23]:
als_preds_df.head(10)

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959000.0,53,
1,148,,976559600.0,4169,2.99546
2,148,,989024900.0,5333,2.333869
3,148,,977005400.0,4387,2.06012
4,148,,966907200.0,3539,2.604727
5,148,,976266500.0,840,
6,148,,976841600.0,216,
7,148,,976191200.0,482,
8,148,,1029284000.0,752,
9,148,,1026978000.0,424,


In [24]:
als_preds_df['prediction'].isna()[3]

False

In [25]:
recs = model_als.recommendForAllUsers(numItems=10)
recs_als = recs.toPandas()
recs_als.head()

Unnamed: 0,user_id,recommendations
0,1580,"[(3906, 4.377546310424805), (557, 4.2864441871..."
1,4900,"[(572, 5.785990238189697), (811, 5.58649063110..."
2,5300,"[(3906, 5.625957489013672), (557, 5.4169192314..."
3,1591,"[(572, 5.933296203613281), (3906, 5.6931056976..."
4,4101,"[(3867, 5.371901512145996), (3523, 5.113660812..."


In [26]:
movie_id = recs_als.loc[recs_als['user_id'] == 1580]['recommendations'][0][0]['movie_id']
movie_id

3906

In [27]:
def get_top_n_movies(user_id, pred_df, n):
    "Get the top n movies from the highest predicted values"
    
    user = pred_df.loc[pred_df['user_id']==user_id]
    sorted_user = user.sort_values('prediction', ascending=False)
    
    movies = []
    count = pred_df.loc[pred_df['user_id'] == user_id]['movie_id'].count()
    
    if n > count:
        n = count
        
    for i in range(0,n):
        movie = sorted_user.iloc[i]['movie_id']
        movies.append(movie.astype('int'))
        
    return movies


In [28]:
get_top_n_movies(673, als_preds_df, 10)

[804, 3105, 1212, 3936, 300, 3481, 3010, 2236, 3654, 2725]

In [29]:
get_top_n_movies(1980, als_preds_df, 10)

[2019, 3307, 923, 2775, 1247, 910, 2964, 1945, 913, 3897]

In [30]:
def combined_pred(als_pred, cold_start_pred):
    
    pred_final = als_pred.copy()
    
    for i in range(0,len(als_pred)):
        if als_pred['prediction'].isna()[i] == True:
            pred_final.iloc[i, pred_final.columns.get_loc('prediction')] = cold_start_pred['predictions'][i]
        else:
            pred_final.iloc[i, pred_final.columns.get_loc('prediction')] = 0.5 * als_pred['prediction'][i] + 0.5 * cold_start_pred['predictions'][i]
    return pred_final

In [31]:
import pickle

cs_model = pickle.load(open('cs_model.pkl', 'rb'))

In [32]:
cs_model.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,title,year,title_year,budget,revenue,runtime,vote_average,popularity,vote_count,sex,age,occupation,zipcode,predictions
0,2019,0.0,956678777.0,6040,Seven Samurai (The Magnificent Seven) (Shichin...,1954,('Seven Samurai (The Magnificent Seven) (Shich...,0.0,0.0,0.0,0.0,0.0,0.0,0,25,6,11106,4.2
1,759,0.0,956679248.0,6040,Maya Lin: A Strong Clear Vision,1994,"('Maya Lin: A Strong Clear Vision', '1994')",0.0,0.0,0.0,0.0,0.0,0.0,0,25,6,11106,3.6
2,2858,0.0,956679275.0,6040,American Beauty,1999,"('American Beauty', '1999')",15000000.0,356296601.0,122.0,7.9,20.726578,3438.0,0,25,6,11106,4.8
3,246,0.0,956679413.0,6040,Hoop Dreams,1994,"('Hoop Dreams', '1994')",700000.0,7830611.0,171.0,7.7,9.47666,91.0,0,25,6,11106,4.2
4,1617,0.0,956679473.0,6040,L.A. Confidential,1997,"('L.A. Confidential', '1997')",35000000.0,126216940.0,138.0,7.7,10.989442,1340.0,0,25,6,11106,4.0


In [33]:
final_preds = combined_pred(als_preds_df, cs_model)

In [34]:
final_preds.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959026.0,53,4.2
1,148,,976559602.0,4169,3.29773
2,148,,989024856.0,5333,3.566935
3,148,,977005381.0,4387,3.13006
4,148,,966907208.0,3539,3.302363


In [35]:
final_preds.to_json('predictions.json')