In [16]:
# import pyspark as ps
import pandas as pd
import numpy as np
# from pyspark.sql import SparkSession
# from pyspark.ml.evaluation import RegressionEvaluator
from sklearn.metrics import mean_squared_error
# spark = SparkSession.builder.getOrCreate()
import pickle
import boto3
from io import BytesIO
from src.movie_class import *

In [36]:
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split

In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
s3 = boto3.client('s3')
s3.list_buckets()

In [39]:
def pickle_read(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    file = pickle.load(f)
    return file

def read_dataframe_from_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    df = pd.read_csv(f)
    return df

def pickle_write_to_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.put_object(Bucket=bucket, Key=filename)

def pickle_model(model_name, file_name):
        '''Writes model to a pkl file'''
        with open(file_name, 'wb') as f:
            pickle.dump(model_name, f)

### Loading data
- Stored on s3
- Two files, the ratings file and the movie features file

In [95]:
df = read_dataframe_from_s3('filtered_ratings.csv')

In [95]:
data = df.sample(frac=0.25, random_state=51)

In [119]:
data.head()

Unnamed: 0,userId,movieId,rating,count,mean,std,stat_score
2050060,57338,1291,4.0,37908,3.985201,0.861053,5.27678
11805774,22522,273,3.0,6796,3.132063,0.950057,4.557148
6853069,134561,48385,4.5,13716,3.362278,1.195487,5.155509
7521388,50839,68157,3.5,23077,4.011397,0.894771,5.353553
1938246,98490,1246,4.5,24954,3.922377,0.863509,5.217641


In [97]:
features = pickle_read('mv_pkl.pkl')
movie_meta = pd.DataFrame.from_records([s.convert_to_dict() for s in features])
movie_full = data.merge(movie_meta, how='left', left_on='movieId', right_on='movielensId')

In [99]:
movie_full['Adventure'] = (movie_full['tmdb_genre_1'] == 'Adventure') | (movie_full['tmdb_genre_2'] == 'Adventure') | (movie_full['tmdb_genre_3'] == 'Adventure')
movie_full['Drama'] =     (movie_full['tmdb_genre_1'] == 'Drama') |     (movie_full['tmdb_genre_2'] == 'Drama') | (movie_full['tmdb_genre_3'] == 'Drama')
movie_full['Comedy'] =    (movie_full['tmdb_genre_1'] == 'Comedy') |    (movie_full['tmdb_genre_2'] == 'Comedy') | (movie_full['tmdb_genre_3'] == 'Comedy')
movie_full['Action'] =    (movie_full['tmdb_genre_1'] == 'Action') |    (movie_full['tmdb_genre_2'] == 'Action') | (movie_full['tmdb_genre_3'] == 'Action')
movie_full['Animation'] = (movie_full['tmdb_genre_1'] == 'Animation') | (movie_full['tmdb_genre_2'] == 'Animation') | (movie_full['tmdb_genre_3'] == 'Animation')
movie_full['Science Fiction'] = (movie_full['tmdb_genre_1'] == 'Science Fiction') | (movie_full['tmdb_genre_2'] == 'Science Fiction') | (movie_full['tmdb_genre_3'] == 'Science Fiction')
movie_full['Fantasy'] =   (movie_full['tmdb_genre_1'] == 'Fantasy') |   (movie_full['tmdb_genre_2'] == 'Fantasy') | (movie_full['tmdb_genre_3'] == 'Fantasy')
movie_full['Crime'] =     (movie_full['tmdb_genre_1'] == 'Crime') |     (movie_full['tmdb_genre_2'] == 'Crime') | (movie_full['tmdb_genre_3'] == 'Crime')
movie_full['Mystery'] =   (movie_full['tmdb_genre_1'] == 'Mystery') |   (movie_full['tmdb_genre_2'] == 'Mystery') | (movie_full['tmdb_genre_3'] == 'Mystery')
movie_full['Romance'] =   (movie_full['tmdb_genre_1'] == 'Romance') |   (movie_full['tmdb_genre_2'] == 'Romance') | (movie_full['tmdb_genre_3'] == 'Romance')
movie_full['Horror'] =    (movie_full['tmdb_genre_1'] == 'Horror') |    (movie_full['tmdb_genre_2'] == 'Horror') | (movie_full['tmdb_genre_3'] == 'Horror')
movie_full['Thriller'] =  (movie_full['tmdb_genre_1'] == 'Thriller') |  (movie_full['tmdb_genre_2'] == 'Thriller') | (movie_full['tmdb_genre_3'] == 'Thriller')
movie_full['History'] =   (movie_full['tmdb_genre_1'] == 'History') |   (movie_full['tmdb_genre_2'] == 'History') | (movie_full['tmdb_genre_3'] == 'History')
movie_full['Documentary'] = (movie_full['tmdb_genre_1'] == 'Documentary') | (movie_full['tmdb_genre_2'] == 'Documentary') | (movie_full['tmdb_genre_3'] == 'Documentary')
movie_full['Music'] =     (movie_full['tmdb_genre_1'] == 'Music') |     (movie_full['tmdb_genre_2'] == 'Music') | (movie_full['tmdb_genre_3'] == 'Music')
movie_full['War'] =       (movie_full['tmdb_genre_1'] == 'War') |       (movie_full['tmdb_genre_2'] == 'War') | (movie_full['tmdb_genre_3'] == 'War')
movie_full['Family'] =    (movie_full['tmdb_genre_1'] == 'Family') |    (movie_full['tmdb_genre_2'] == 'Family') | (movie_full['tmdb_genre_3'] == 'Family')
movie_full['Western'] =   (movie_full['tmdb_genre_1'] == 'Western') |   (movie_full['tmdb_genre_2'] == 'Western') | (movie_full['tmdb_genre_3'] == 'Western')
movie_full['TV Movie'] =  (movie_full['tmdb_genre_1'] == 'TV Movie') |  (movie_full['tmdb_genre_2'] == 'TV Movie') | (movie_full['tmdb_genre_3'] == 'TV Movie')

movie_full['mean'] = movie_full['mean']*2

### LightFM recommender model application
1. Instantiate Dataset()
2. Fit dataset using userId, itemId, and features
3. Build sparse matrix using build_interactions method. Note the syntax. It takes in an iterable of tuples.
4. Train, test split if desired using random_train_test_split method on interactions matrix
5. Instantiate model, fit, and then run model.

In [102]:
# Step 1. Step 2
dataset = Dataset()
dataset.fit(movie_full['userId'], movie_full['movieId'], movie_full[['rating', 'mean', 'std', 'movielens_mean_rating',
       'movielens_std_rating', 'budget', 'revenue', 'tmdb_vote_average', 'tmdb_vote_count',
       'Adventure', 'Drama','Comedy', 'Action', 'Animation', 'Science Fiction', 'Fantasy', 'Crime',
       'Mystery', 'Romance', 'Horror', 'Thriller', 'History', 'Documentary',
       'Music', 'War', 'Family', 'Western', 'TV Movie']])
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 162448, num_items 17619.


In [103]:
# Step 3
(interactions, weights) = dataset.build_interactions([tuple(i) for i in movie_full[['userId','movieId']].values])
print(repr(interactions))

<162448x17619 sparse matrix of type '<class 'numpy.int32'>'
	with 5706522 stored elements in COOrdinate format>


In [104]:
# Step 4
train, test = random_train_test_split(interactions) #, random_state=1)

In [122]:
train

<162448x17619 sparse matrix of type '<class 'numpy.float32'>'
	with 4565217 stored elements in COOrdinate format>

In [106]:
#5
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train, epochs=10)

f_train_precision = precision_at_k(model, train, k=10).mean()
f_test_precision = precision_at_k(model, test, k=10).mean()

f_train_auc = auc_score(model, train).mean()
f_test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f. With features' % (f_train_precision, f_test_precision))
print('AUC: train %.2f, test %.2f. With features' % (f_train_auc, f_test_auc))

Precision: train 0.12, test 0.03. With features
AUC: train 0.98, test 0.97. With features


In [107]:
pickle_model(model, 'lightfm-with-features.pkl')

In [127]:
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

        scores = model.predict(user_id, np.arange(n_items))
        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")

        for x in known_positives[:3]:
            print("        %s" % x)

        print("     Recommended:")

        for x in top_items[:3]:
            print("        %s" % x)

In [128]:
sample_recommendation(model, test, [3, 25, 450])

TypeError: 'coo_matrix' object is not subscriptable

In [126]:
model.predict('57338', ['1291'])

array([-2.88501358])

### Repeat without features for comparison

In [113]:
# Step 1. Step 2
dataset2 = Dataset()
dataset2.fit(movie_full['userId'], movie_full['movieId'])
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 162448, num_items 17619.


In [114]:
# Step 3
(interactions2, weights2) = dataset2.build_interactions([tuple(i) for i in movie_full[['userId','movieId']].values])
print(repr(interactions))

<162448x17619 sparse matrix of type '<class 'numpy.int32'>'
	with 5706522 stored elements in COOrdinate format>


In [116]:
# Step 4
train2, test2 = random_train_test_split(interactions2) #, random_state=1)

In [117]:
#5--version without features
model_none = LightFM(learning_rate=0.05, loss='warp')
model_none.fit(train2, epochs=10)

train_precision2 = precision_at_k(model_none, train2, k=10).mean()
test_precision2 = precision_at_k(model_none, test2, k=10).mean()

train_auc2 = auc_score(model_none, train2).mean()
test_auc2 = auc_score(model_none, test2).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision2, test_precision2))
print('AUC: train %.2f, test %.2f.' % (train_auc2, test_auc2))

Precision: train 0.11, test 0.03.
AUC: train 0.98, test 0.97.


In [118]:
pickle_model(model, 'lightfm-no-features.pkl')