In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
# from pyspark.sql.functions import *
import pyspark as ps    # for the pyspark suite
# from pyspark.sql.types import StructType, StructField
# from pyspark.sql.types import IntegerType, StringType, FloatType, DateType, TimestampType
# import pyspark.sql.functions as F
# from pyspark.ml.evaluation import RegressionEvaluator
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
# from pyspark.ml.recommendation import ALS
# from pyspark.sql import Row

# Start (or reuse, via getOrCreate) a Spark session on the local master
# "local[4]" under the application name "df lecture".
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("df lecture")
    .getOrCreate()
)

# Handle to the session's SparkContext (the pre-2.0 entry point).
sc = spark.sparkContext

In [2]:
from lightfm import LightFM
# from lightfm.data import Dataset
# from lightfm.evaluation import precision_at_k
# from lightfm.evaluation import auc_score
# from lightfm.cross_validation import random_train_test_split

## Recommender model comparisons

In [3]:
from pyspark.ml import recommendation
# Load the persisted Spark ALS model from its save path.
mPath = "als_recommender"
persisted_ALS_model = recommendation.ALSModel.load(mPath)

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles written by a trusted run of this project.
# The context managers close each file handle as soon as its object is loaded
# (the original bare open() calls left both handles open).
with open("lightfm-no-features.pkl", "rb") as pickle_in:
    persisted_light_FM = pickle.load(pickle_in)
with open("dataset-lightfm-no-features.pkl", "rb") as pickle_in_dataset:
    persisted_dataset_light_FM = pickle.load(pickle_in_dataset)

# Ratings table that the comparison cells below evaluate against.
validata = pd.read_csv('data/test_ratings_df.csv')

In [14]:
def _mean_rating_of_top_k(scored_movies, rank_by, top_k):
    """Return the mean actual rating of the `top_k` rows ranked highest by `rank_by`."""
    top_rows = scored_movies.sort_values(rank_by, ascending=False).head(top_k)
    # mean() divides by the number of rows actually present, so a user with
    # fewer than `top_k` movies is not deflated by a fixed divisor.
    return top_rows['rating'].mean()


def calc_prediction_scores(holdout_set, max_users=399, top_k=5):
    """Score each ranking strategy by the real ratings of its top-ranked movies.

    For every evaluated user, the user's rows of `holdout_set` are ranked four
    ways -- by the user's own rating, by the movie's `mean` column, by the
    LightFM score and by the ALS prediction -- and the mean *actual* rating of
    the `top_k` highest-ranked rows is recorded for each ranking.

    Relies on the notebook-level objects `spark`, `persisted_ALS_model`,
    `persisted_light_FM` and `persisted_dataset_light_FM`.

    Parameters
    ----------
    holdout_set : pandas.DataFrame
        Must contain the columns 'userId', 'movieId', 'rating' and 'mean'.
    max_users : int or None, default 399
        Number of users (in ascending userId order) to evaluate. The default
        reproduces the original hard-coded `count < 400` cap; None evaluates
        every user.
    top_k : int, default 5
        How many top-ranked movies are averaged per user.

    Returns
    -------
    tuple
        (user_ground_truth, movie_mean_rating, lfm_rating, als_rating,
         mean and std of user_ground_truth, mean and std of movie_mean_rating,
         mean and std of lfm_rating, mean and std of als_rating)
        -- four per-user lists followed by eight floats, in that order.

    Raises
    ------
    KeyError
        If a userId or movieId is absent from the LightFM dataset mappings.

    Notes
    -----
    Scores are averaged over the rows actually available, so users with fewer
    than `top_k` scored movies are no longer divided by a fixed 5. Users with
    no rows left after the ALS merge are skipped instead of being scored 0.
    """
    # Fetch the id -> internal-index lookups once rather than once per movie.
    # Entries 0 and 2 of mapping() are indexed by userId and movieId respectively.
    id_maps = persisted_dataset_light_FM.mapping()
    user_index, item_index = id_maps[0], id_maps[2]

    user_ground_truth, movie_mean_rating, lfm_rating, als_rating = [], [], [], []

    ho_users = holdout_set.groupby('userId').count().reset_index()['userId']
    # Slice up front instead of looping over every user and skipping most of them.
    for user in ho_users.iloc[:max_users]:
        # .loc + .copy() gives an independent frame, so adding a column below
        # does not trigger pandas' SettingWithCopyWarning.
        user_movies = holdout_set.loc[
            holdout_set['userId'] == user, ['movieId', 'rating', 'mean']
        ].copy()  # movieId holds raw movie ids, not LightFM indices
        user_movies['userId'] = user

        lfm_item_ids = [item_index[movie] for movie in user_movies['movieId']]

        # ALS model predictions (adds a 'prediction' column).
        als_pred = persisted_ALS_model.transform(
            spark.createDataFrame(user_movies)
        ).toPandas()

        # LightFM model predictions, one score per entry of lfm_item_ids.
        lfm_scores = persisted_light_FM.predict(
            user_ids=user_index[user],
            item_ids=lfm_item_ids,
            item_features=None,
            user_features=None,
        )

        # Keep movieId + LightFM score, then pull rating/mean/userId and the
        # ALS prediction back in from the ALS output.
        scored = (
            user_movies[['movieId']]
            .assign(lfm_predict=lfm_scores)
            .merge(
                als_pred.rename(columns={'prediction': 'als_predict'}),
                how='inner',
                on='movieId',
            )
        )
        if scored.empty:
            # Nothing to rank for this user; an average would be undefined.
            continue

        user_ground_truth.append(_mean_rating_of_top_k(scored, 'rating', top_k))
        movie_mean_rating.append(_mean_rating_of_top_k(scored, 'mean', top_k))
        lfm_rating.append(_mean_rating_of_top_k(scored, 'lfm_predict', top_k))
        als_rating.append(_mean_rating_of_top_k(scored, 'als_predict', top_k))

    return (
        user_ground_truth, movie_mean_rating, lfm_rating, als_rating,
        np.mean(user_ground_truth), np.std(user_ground_truth),
        np.mean(movie_mean_rating), np.std(movie_mean_rating),
        np.mean(lfm_rating), np.std(lfm_rating),
        np.mean(als_rating), np.std(als_rating),
    )
        

In [15]:
# Unpack the 12 results: four per-user score lists, then a (mean, std) pair
# for each of ground truth, movie-mean ranking, LightFM and ALS.
(
    ground, pop_rtg, lfm_rtg, als_rtg,
    ground_mean, ground_std,
    pop_best, pop_std,
    lfm_best, lfm_std,
    als_best, als_std,
) = calc_prediction_scores(validata)

In [16]:
# Mean per-user score for, in order: ground truth, movie-mean ranking, LightFM, ALS.
print(ground_mean, pop_best, lfm_best, als_best)

4.556140350877193 3.9315789473684206 3.7561403508771924 4.06390977443609


In [17]:
# One row per evaluated user, one column per ranking strategy. The four lists
# come from the same loop, so they have equal length and line up by position.
bench_test = pd.DataFrame(
    {
        'ground': ground,
        'pop_rating': pop_rtg,
        'lfm_rating': lfm_rtg,
        'als_rating': als_rtg,
    }
)

In [19]:
# Preview up to the first 20 per-user score rows.
bench_test.head(20)

Unnamed: 0,ground,pop_rating,lfm_rating,als_rating
0,4.7,4.7,4.1,4.1
1,4.5,3.7,3.9,3.9
2,4.2,3.8,3.8,4.0
3,5.0,4.6,3.2,4.8
4,4.8,4.6,4.6,4.5
5,3.7,2.7,3.1,2.8
6,4.7,4.4,4.2,4.6
7,4.4,3.8,3.4,3.8
8,4.0,2.5,2.4,2.9
9,4.4,3.8,3.9,3.5


In [22]:
# Persist the per-user scores; index=False omits the row-number column.
bench_test.to_csv('data/bench_test.csv', index=False)

### Code used to construct the function above

In [180]:
sample = validata.sample(1)  # Redraw sample (unseeded, so each run picks anew)
sampled_user = sample['userId'].iloc[0]  # userId, not index number
# .loc + .copy() yields an independent frame, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
sampled_user_movies = validata.loc[
    validata['userId'] == sampled_user, ['movieId', 'rating', 'mean']
].copy()  # movie Id numbers, not indices
sampled_user_movies['userId'] = sampled_user

# Translate raw movieIds through entry 2 of the LightFM dataset mapping,
# fetching the mapping once instead of once per movie.
lfm_item_index = persisted_dataset_light_FM.mapping()[2]
movie_samps = [lfm_item_index[movie] for movie in sampled_user_movies['movieId']]

In [181]:
# ALS model predictions: convert the sampled user's rows to a Spark DataFrame,
# let the persisted model add its prediction column, and bring it back to pandas.
df2 = spark.createDataFrame(sampled_user_movies)
ALS_pred = persisted_ALS_model.transform(df2).toPandas()
ALS_pred.head()

Unnamed: 0,movieId,rating,mean,userId,prediction
0,1206,3.5,3.984191,58098,3.741397
1,2607,4.0,3.78223,58098,3.791891
2,2723,2.0,3.063125,58098,2.908971
3,1009,4.0,3.150767,58098,2.973567
4,6322,3.0,3.445218,58098,3.181064


In [182]:
# LightFM model predictions: the user is looked up in entry 0 of the dataset
# mapping, and one score is returned per entry of movie_samps.
prediction = persisted_light_FM.predict(
    user_ids=persisted_dataset_light_FM.mapping()[0][sampled_user],
    item_ids=movie_samps,
    item_features=None,
    user_features=None,
)
sampled_user_movies['lfm_predict'] = prediction
sampled_user_movies.sort_values(by='lfm_predict', ascending=False).head()

Unnamed: 0,movieId,rating,mean,userId,lfm_predict
68627,1270,3.5,3.953876,58098,0.469942
68620,1680,3.5,3.545137,58098,0.11305
68621,2617,2.5,3.205517,58098,0.066442
68634,2700,5.0,3.626362,58098,-0.001425
68632,2687,3.5,3.484784,58098,-0.175512


In [183]:
# Combine the LightFM scores with the ALS output, one row per movieId.
# Selecting only movieId/lfm_predict and renaming the ALS column *before* the
# merge keeps this cell idempotent: re-running it yields the same frame instead
# of a second, duplicate 'als_predict' column.
sampled_user_movies = sampled_user_movies[['movieId', 'lfm_predict']].merge(
    ALS_pred.rename(columns={'prediction': 'als_predict'}),
    how='inner',
    on='movieId',
)

In [184]:
# Preview the merged frame: both models' scores next to the actual rating.
sampled_user_movies.head()

Unnamed: 0,movieId,lfm_predict,rating,mean,userId,als_predict
0,1206,-1.216506,3.5,3.984191,58098,3.741397
1,2607,-1.909662,4.0,3.78223,58098,3.791891
2,2723,-0.224432,2.0,3.063125,58098,2.908971
3,1009,-1.122579,4.0,3.150767,58098,2.973567
4,6322,-1.990797,3.0,3.445218,58098,3.181064


In [185]:
# Mean actual rating of the (up to) five movies the user rated highest.
# .mean() divides by the rows actually present, so a user with fewer than
# five movies is not deflated by a fixed divisor of 5.
user_ground_truth = sampled_user_movies.sort_values('rating', ascending=False).head(5)['rating'].mean()

In [186]:
# Display the score obtained when ranking by the user's own ratings.
user_ground_truth

4.7

In [187]:
# Mean actual rating of the (up to) five movies with the highest 'mean' value.
# .mean() divides by the rows actually present rather than a fixed 5.
movie_mean_rating = sampled_user_movies.sort_values('mean', ascending=False).head(5)['rating'].mean()

In [188]:
# Display the score obtained when ranking by the movies' 'mean' column.
movie_mean_rating

3.9

In [189]:
# Mean actual rating of the (up to) five movies LightFM scores highest.
# .mean() divides by the rows actually present rather than a fixed 5.
lfm_rating = sampled_user_movies.sort_values('lfm_predict', ascending=False).head(5)['rating'].mean()


In [190]:
# Display the score obtained when ranking by the LightFM predictions.
lfm_rating

3.6

In [191]:
# Mean actual rating of the (up to) five movies ALS predicts highest.
# .mean() divides by the rows actually present rather than a fixed 5.
als_rating = sampled_user_movies.sort_values('als_predict', ascending=False).head(5)['rating'].mean()

In [192]:
# Display the score obtained when ranking by the ALS predictions.
als_rating

4.1