In [284]:
# import pyspark as ps
import pandas as pd
import numpy as np
# from pyspark.sql import SparkSession
# from pyspark.ml.evaluation import RegressionEvaluator
from sklearn.metrics import mean_squared_error
# spark = SparkSession.builder.getOrCreate()
import pickle
import boto3
from io import BytesIO
from src.movie_class import *

In [285]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split

In [286]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [287]:
s3 = boto3.client('s3')
s3.list_buckets()

{'ResponseMetadata': {'RequestId': 'B129BF9681107D4C',
  'HostId': 'Ec09VwXkTZaJUg1b+1XwY8AcmH4BqsSdY1pbAGb/Bs+zbzpDid21Xpj1pOPCYSOYSjG4v9/lPi8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Ec09VwXkTZaJUg1b+1XwY8AcmH4BqsSdY1pbAGb/Bs+zbzpDid21Xpj1pOPCYSOYSjG4v9/lPi8=',
   'x-amz-request-id': 'B129BF9681107D4C',
   'date': 'Mon, 21 Sep 2020 19:53:25 GMT',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Buckets': [{'Name': 'galvrjsbucket',
   'CreationDate': datetime.datetime(2020, 7, 14, 13, 43, 7, tzinfo=tzlocal())}],
 'Owner': {'DisplayName': 'rsalvino1',
  'ID': '19687a4a1667b503ff5ef11e2f8b19dfacf2dcbc7d76a1239efdc7bde26e3be4'}}

In [288]:
def pickle_read(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    file = pickle.load(f)
    return file

def read_dataframe_from_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    df = pd.read_csv(f)
    return df

def pickle_write_to_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.put_object(Bucket=bucket, Key=filename)

def pickle_model(model_name, file_name):
        '''Writes model to a pkl file'''
        with open(file_name, 'wb') as f:
            pickle.dump(model_name, f)

### Loading data
- Stored on s3
- Two files, the ratings file and the movie features file

In [289]:
# df = read_dataframe_from_s3('filtered_ratings.csv')

In [290]:
# data = df.sample(frac=0.25, random_state=51)

In [291]:
train_data = pd.read_csv('data/train_ratings_df.csv')
train_data.groupby('movieId').agg(count = ('rating', 'count')).count()
train_data.head()

Unnamed: 0,userId,movieId,rating,count_x,mean,std,stat_score,count_y,_merge
0,40630,933,4.0,4827,3.994821,0.767646,5.14629,43,left_only
1,40630,1035,4.5,15248,3.801515,1.084362,5.428057,43,left_only
2,40630,922,3.5,7368,4.206501,0.872919,5.515879,43,left_only
3,40630,342,2.0,10735,3.507639,1.018708,5.0357,43,left_only
4,40630,2724,2.0,8448,2.862689,1.08866,4.495679,43,left_only


In [292]:
validata = pd.read_csv('data/test_ratings_df.csv')
validata.groupby('movieId').agg(count = ('rating', 'count')).count()
# validata.shape

count    10393
dtype: int64

In [293]:
features = pickle_read('mv_pkl.pkl')
movie_meta = pd.DataFrame.from_records([s.convert_to_dict() for s in features])
# movies_in_validata = validata.groupby('movieId').agg(count = ('rating', 'count'))
train_full = train_data.merge(movie_meta, how='left', left_on='movieId', right_on='movielensId')

In [294]:
train_full.describe()

Unnamed: 0,userId,movieId,rating,count_x,mean,std,stat_score,count_y,movielensId,budget,revenue,tmdb_popularity,tmdb_vote_average,tmdb_vote_count,release_date
count,7232174.0,7232174.0,7232174.0,7232174.0,7232174.0,7232174.0,7232174.0,7232174.0,7076106.0,7076106.0,7076106.0,7076106.0,7076106.0,7076106.0,7076080.0
mean,81241.06,21749.54,3.617184,15515.7,3.622279,0.943781,5.03795,176.9275,18815.01,38980920.0,194847100.0,22.3551,7.185518,4230.196,1993.799
std,46799.09,38751.97,1.008452,16341.79,0.3546737,0.09690264,0.2905155,219.9994,32999.99,48674570.0,270383200.0,14.31891,0.6938721,4772.72,15.20119
min,1.0,1.0,0.5,31.0,2.166667,0.4722703,4.40003,16.0,1.0,0.0,0.0,0.6,0.0,0.0,1896.0
25%,40538.0,1210.0,3.0,3613.0,3.371061,0.8761994,4.83353,51.0,1204.0,4500000.0,20354320.0,12.594,6.7,736.0,1989.0
50%,80982.0,2997.0,4.0,10012.0,3.664388,0.9374866,5.053006,107.0,2921.0,20000000.0,92823550.0,18.411,7.2,2357.0,1997.0
75%,121596.0,8937.0,4.0,22165.0,3.893708,1.000786,5.25331,216.0,8361.0,55000000.0,272742900.0,27.579,7.7,6433.0,2003.0
max,162541.0,207830.0,5.0,81491.0,4.483096,1.742576,6.129151,3695.0,148440.0,380000000.0,2787965000.0,125.689,10.0,27142.0,2019.0


In [295]:
# train_full = train_data.merge(movies_with_features_in_validata, how='inner', left_on='movieId', right_on='movielensId')
# valid_full = test_data.merge(movies_with_features_in_validata, how='inner', left_on='movieId', right_on='movielensId')

In [296]:
train_full.groupby('movieId').agg(count = ('rating', 'count'))

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,16381
2,7698
3,2933
4,759
5,2980
...,...
206208,8
206499,27
206845,16
207405,12


In [297]:
train_full['Adventure'] = (train_full['tmdb_genre_1'] == 'Adventure') | (train_full['tmdb_genre_2'] == 'Adventure') | (train_full['tmdb_genre_3'] == 'Adventure')
train_full['Drama'] =     (train_full['tmdb_genre_1'] == 'Drama') |     (train_full['tmdb_genre_2'] == 'Drama') | (train_full['tmdb_genre_3'] == 'Drama')
train_full['Comedy'] =    (train_full['tmdb_genre_1'] == 'Comedy') |    (train_full['tmdb_genre_2'] == 'Comedy') | (train_full['tmdb_genre_3'] == 'Comedy')
train_full['Action'] =    (train_full['tmdb_genre_1'] == 'Action') |    (train_full['tmdb_genre_2'] == 'Action') | (train_full['tmdb_genre_3'] == 'Action')
train_full['Animation'] = (train_full['tmdb_genre_1'] == 'Animation') | (train_full['tmdb_genre_2'] == 'Animation') | (train_full['tmdb_genre_3'] == 'Animation')
train_full['Science Fiction'] = (train_full['tmdb_genre_1'] == 'Science Fiction') | (train_full['tmdb_genre_2'] == 'Science Fiction') | (train_full['tmdb_genre_3'] == 'Science Fiction')
train_full['Fantasy'] =   (train_full['tmdb_genre_1'] == 'Fantasy') |   (train_full['tmdb_genre_2'] == 'Fantasy') | (train_full['tmdb_genre_3'] == 'Fantasy')
train_full['Crime'] =     (train_full['tmdb_genre_1'] == 'Crime') |     (train_full['tmdb_genre_2'] == 'Crime') | (train_full['tmdb_genre_3'] == 'Crime')
train_full['Mystery'] =   (train_full['tmdb_genre_1'] == 'Mystery') |   (train_full['tmdb_genre_2'] == 'Mystery') | (train_full['tmdb_genre_3'] == 'Mystery')
train_full['Romance'] =   (train_full['tmdb_genre_1'] == 'Romance') |   (train_full['tmdb_genre_2'] == 'Romance') | (train_full['tmdb_genre_3'] == 'Romance')
train_full['Horror'] =    (train_full['tmdb_genre_1'] == 'Horror') |    (train_full['tmdb_genre_2'] == 'Horror') | (train_full['tmdb_genre_3'] == 'Horror')
train_full['Thriller'] =  (train_full['tmdb_genre_1'] == 'Thriller') |  (train_full['tmdb_genre_2'] == 'Thriller') | (train_full['tmdb_genre_3'] == 'Thriller')
train_full['History'] =   (train_full['tmdb_genre_1'] == 'History') |   (train_full['tmdb_genre_2'] == 'History') | (train_full['tmdb_genre_3'] == 'History')
train_full['Documentary'] = (train_full['tmdb_genre_1'] == 'Documentary') | (train_full['tmdb_genre_2'] == 'Documentary') | (train_full['tmdb_genre_3'] == 'Documentary')
train_full['Music'] =     (train_full['tmdb_genre_1'] == 'Music') |     (train_full['tmdb_genre_2'] == 'Music') | (train_full['tmdb_genre_3'] == 'Music')
train_full['War'] =       (train_full['tmdb_genre_1'] == 'War') |       (train_full['tmdb_genre_2'] == 'War') | (train_full['tmdb_genre_3'] == 'War')
train_full['Family'] =    (train_full['tmdb_genre_1'] == 'Family') |    (train_full['tmdb_genre_2'] == 'Family') | (train_full['tmdb_genre_3'] == 'Family')
train_full['Western'] =   (train_full['tmdb_genre_1'] == 'Western') |   (train_full['tmdb_genre_2'] == 'Western') | (train_full['tmdb_genre_3'] == 'Western')
train_full['TV Movie'] =  (train_full['tmdb_genre_1'] == 'TV Movie') |  (train_full['tmdb_genre_2'] == 'TV Movie') | (train_full['tmdb_genre_3'] == 'TV Movie')

train_full['mean'] = train_full['mean']*2

### LightFM recommender model application
1. Instantiate Dataset()
2. Fit dataset using userId, itemId, and features
3. Build sparse matrix using build_interactions method. Note the syntax. It takes in an iterable of tuples.
4. Train, test split if desired using random_train_test_split method on interactions matrix
5. Instantiate model, fit, and then run model.

In [233]:
# Step 1. Step 2
dataset_train = Dataset()
dataset_train.fit(train_full['userId'], train_full['movieId'], train_full[['rating', 'mean', 'std', 'movielens_mean_rating',
       'movielens_std_rating', 'budget', 'revenue', 'tmdb_vote_average', 'tmdb_vote_count',
       'Adventure', 'Drama','Comedy', 'Action', 'Animation', 'Science Fiction', 'Fantasy', 'Crime',
       'Mystery', 'Romance', 'Horror', 'Thriller', 'History', 'Documentary',
       'Music', 'War', 'Family', 'Western', 'TV Movie']])
num_users, num_items = dataset_train.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 91783, num_items 12594.


In [234]:
# Step 3
(interactions, weights) = dataset_train.build_interactions([tuple(i) for i in train_full[['userId','movieId']].values])
print(repr(train))

<91783x8996 sparse matrix of type '<class 'numpy.float32'>'
	with 4375445 stored elements in COOrdinate format>


In [235]:
# Step 4
train, test = random_train_test_split(interactions) #, random_state=1)

In [264]:
#5
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, user_features=None, epochs=2)

f_train_precision = precision_at_k(model, train, k=10).mean()
f_test_precision = precision_at_k(model, test, k=10).mean()

f_train_auc = auc_score(model, train).mean()
f_test_auc = auc_score(model, test).mean()

print('Precision: train %.3f, test %.3f. With features' % (f_train_precision, f_test_precision))
print('AUC: train %.2f, test %.2f. With features' % (f_train_auc, f_test_auc))

Precision: train 0.153, test 0.037. With features
AUC: train 0.95, test 0.95. With features


In [265]:
model.fit(interactions, epochs=2)

<lightfm.lightfm.LightFM at 0x7f260dc6c910>

In [None]:
### Optimize hyperparameters

from skopt import forest_minimize

# Function to pass to forest_minimize that defines the hyperparameters to test
# Borrowed from https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/
def objective(params):
    # unpack
    epochs, learning_rate,\
    no_components, alpha = params
    
    user_alpha = alpha
    item_alpha = alpha
    model = LightFM(loss='warp',
                    random_state=2016,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train, epochs=epochs,
              num_threads=4, verbose=True)
    
    patks = precision_at_k(model, test,
                                              train_interactions=None,
                                              k=10, num_threads=4)
    mapatk = np.mean(patks)
    # Make negative because we want to _minimize_ objective
    out = -mapatk
    # Handle some weird numerical shit going on
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out

# Results from prelim trial 9/19 midday. Epochs = 38 (set range 1,60), learning = 0.04, no_components = 48, alpha = 0 (dont provide range to test)

space = [(1, 60), # epochs
         (10**-3, 1.0, 'log-uniform'), # learning_rate
         (20, 100), # no_components
         (10**-6, 10**-5, 'log-uniform'), # alpha
        ]

res_fm = forest_minimize(objective, space, n_calls=50,
                     random_state=51,
                     verbose=True)

print('Maximimum p@k found: {:6.5f}'.format(-res_fm.fun))
print('Optimal parameters:')
params = ['epochs', 'learning_rate', 'no_components', 'alpha']
for (p, x_) in zip(params, res_fm.x):
    print('{}: {}'.format(p, x_))

In [107]:
pickle_model(model, 'lightfm-with-features.pkl')

### Repeat without features for comparison

In [298]:
# Steps 1-3. build sparse interactions matrix
dataset2 = Dataset()
dataset2.fit(train_full['userId'], train_full['movieId'])
(interactions2, weights2) = dataset2.build_interactions([tuple(i) for i in train_full[['userId','movieId']].values])

In [299]:
# Step 4--
train2, test2 = random_train_test_split(interactions2)
print(f'{repr(train2)} \n {repr(test2)}')

<104964x12594 sparse matrix of type '<class 'numpy.int32'>'
	with 5785739 stored elements in COOrdinate format> 
 <104964x12594 sparse matrix of type '<class 'numpy.int32'>'
	with 1446435 stored elements in COOrdinate format>


In [243]:
### Optimize hyperparameters

from skopt import forest_minimize

# Function to pass to forest_minimize that defines the hyperparameters to test
# Borrowed from https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/
def objective(params):
    # unpack
    epochs, learning_rate,\
    no_components, alpha = params
    
    user_alpha = alpha
    item_alpha = alpha
    model = LightFM(loss='warp',
                    random_state=51,
                    learning_rate=learning_rate,
                    no_components=no_components,
                    user_alpha=user_alpha,
                    item_alpha=item_alpha)
    model.fit(train2, epochs=epochs,
              num_threads=4, verbose=True)
    
    patks = precision_at_k(model, test2,
                                              train_interactions=None,
                                              k=10, num_threads=4)
    mapatk = np.mean(patks)
    # Make negative because we want to _minimize_ objective
    out = -mapatk
    # Handle some weird numerical shit going on
    if np.abs(out + 1) < 0.01 or out < -1.0:
        return 0.0
    else:
        return out

In [244]:
# Results from prelim trial 9/19 midday. Epochs = 38 (set range 1,60), learning = 0.04, no_components = 48, alpha = 0 (dont provide range to test)
space = [(1, 60), # epochs
         (10**-3, 1.0, 'log-uniform'), # learning_rate
         (20, 100), # no_components
         (10**-6, 10**-5, 'log-uniform'), # alpha
        ]

res_fm = forest_minimize(objective, space, n_calls=50,
                     random_state=51,
                     verbose=True)

print('Maximimum p@k found: {:6.5f}'.format(-res_fm.fun))
print('Optimal parameters:')
params = ['epochs', 'learning_rate', 'no_components', 'alpha']
for (p, x_) in zip(params, res_fm.x):
    print('{}: {}'.format(p, x_))

Iteration No: 1 started. Evaluating function at random point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 307.7876
Function value obtained: -0.0327
Current minimum: -0.0327
Iteration No: 2 started. Evaluating function at random point.
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Iteration No: 2 ended. Evaluation done at

In [300]:
# Using the optimal results from the forest minimize search
model = LightFM(learning_rate=0.027, no_components=23, loss='warp')
model.fit(train2, user_features=None, epochs=23)

f_train_precision = precision_at_k(model, train2, k=10).mean()
f_test_precision = precision_at_k(model, test2, k=10).mean()

f_train_auc = auc_score(model, train2).mean()
f_test_auc = auc_score(model, test2).mean()

print('Precision: train %.3f, test %.3f. With features' % (f_train_precision, f_test_precision))
print('AUC: train %.2f, test %.2f. With features' % (f_train_auc, f_test_auc))

Precision: train 0.218, test 0.049. With features
AUC: train 0.97, test 0.96. With features


In [338]:
pickle_model(dataset2, 'dataset-lightfm-no-features.pkl')
pickle_model(model, 'lightfm-no-features.pkl')

### Getting sample info needed for model.predict 
- Needs index numbers in model.predict--use dataset2.mapping() to get internal index

In [334]:
sample = validata.sample(1)

In [335]:
sampled_user = sample['userId'].iloc[0] # userId, not index number
sampled_user_movies = validata[validata['userId'] == sampled_user][['movieId','rating']]#'movie Id numbers, not indices'

In [336]:
movie_samps = []
for movie in sampled_user_movies['movieId']:
    movie_samps.append(dataset2.mapping()[2][movie])

In [337]:
prediction = model.predict(user_ids = dataset2.mapping()[0][sampled_user], item_ids = movie_samps, item_features=None, user_features=None)
sampled_user_movies['predict'] = prediction
sampled_user_movies.sort_values(by='predict',ascending=False)

Unnamed: 0,movieId,rating,predict
61989,2683,4.0,1.553329
61980,780,2.0,1.381363
62009,1527,4.0,1.331024
61984,2916,4.0,1.290805
62000,3994,4.0,0.97474
61985,2115,2.0,0.944186
61998,924,3.0,0.821118
61987,590,5.0,0.644855
61982,1259,5.0,0.609228
61995,2232,4.0,0.350478


In [127]:
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

In [128]:
sample_recommendation(model, test, [3, 25, 450])

TypeError: 'coo_matrix' object is not subscriptable