In [130]:
import re
import numpy as np
import pandas as pd
import pickle
import boto3
from io import BytesIO

In [131]:
from lightfm import LightFM
from lightfm.data import Dataset

from pyspark.sql.functions import *
import pyspark as ps    # for the pyspark suite
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType, TimestampType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# spark = ps.sql.SparkSession.builder \
#             .master("local[4]") \
#             .getOrCreate()

spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext  # for the pre-2.0 sparkContext

#             .appName("df lecture") \


In [132]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data preparation

In [133]:
s3 = boto3.client('s3')
# s3.list_buckets()

In [134]:
def read_dataframe_from_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    df = pd.read_csv(f)
    return df


In [135]:
def pickle_write_to_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.put_object(Bucket=bucket, Key=filename)

In [159]:
prod_data = read_dataframe_from_s3('prod_data.csv')

In [160]:
prod_data = prod_data.drop('Unnamed: 0', axis=1)

In [161]:
# prod_data = pd.read_csv('data/train_ratings_df.csv')[['userId','movieId', 'rating']]
# prod_data = prod_data.sample(frac = 0.264) # Same size dataset as that which models were trained on. 55 sec until prediction
# prod_data = prod_data.sample(frac = 0.2) # 18 seconds until prediction

In [162]:
prod_data

Unnamed: 0,userId,movieId,rating
0,106234,1356,4.0
1,161277,920,5.0
2,65887,1370,2.0
3,75235,49530,4.0
4,127206,102445,3.0
...,...,...,...
380488,157621,1608,5.0
380489,128243,3448,4.0
380490,86927,1197,5.0
380491,86587,590,3.0


In [163]:
rated_movies = {933: 4.5,
                1035: 4,
                922: 2,
                342: 5,
                2724: 1
               }
new_user_data = pd.DataFrame.from_dict(rated_movies, orient='index').reset_index()
new_user_data = new_user_data.rename(columns={'index': "movieId", 0:"rating"})
new_user_data['userId'] = 500000

In [164]:
new_user_for_pred = prod_data.groupby('movieId').agg(count=('rating','count')).reset_index()
new_user_for_pred['userId'] = 500000
new_user_for_pred = new_user_for_pred.sample(frac = 0.5)

prod_plus_new = prod_data.append(new_user_data)

### ALS model and predictions

In [165]:
prod_plus_new_spark = spark.createDataFrame(prod_plus_new)

In [166]:
als = ALS(rank=20,
          maxIter=20,
          regParam=0.1,
          alpha=2,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating")

ALS_fit_model = als.fit(prod_plus_new_spark)

In [167]:
new_user_full_spark = spark.createDataFrame(new_user_for_pred)
new_user_full_spark.show(2)

+-------+-----+------+
|movieId|count|userId|
+-------+-----+------+
| 170787|    2|500000|
|   1036|  655|500000|
+-------+-----+------+
only showing top 2 rows



In [168]:
als_predictions = ALS_fit_model.transform(new_user_full_spark)

In [169]:
als_predictions = als_predictions.orderBy('prediction', ascending=False).take(24)

In [170]:
als_predictions

[Row(movieId=54193, count=2, userId=500000, prediction=4.643538951873779),
 Row(movieId=31705, count=1, userId=500000, prediction=4.630481243133545),
 Row(movieId=60943, count=3, userId=500000, prediction=4.487199783325195),
 Row(movieId=1809, count=21, userId=500000, prediction=4.416341781616211),
 Row(movieId=4927, count=5, userId=500000, prediction=4.316404342651367),
 Row(movieId=2594, count=52, userId=500000, prediction=4.306849956512451),
 Row(movieId=1189, count=48, userId=500000, prediction=4.303802013397217),
 Row(movieId=119966, count=1, userId=500000, prediction=4.300716400146484),
 Row(movieId=1695, count=1, userId=500000, prediction=4.271899700164795),
 Row(movieId=356, count=1270, userId=500000, prediction=4.258736610412598),
 Row(movieId=2131, count=9, userId=500000, prediction=4.241642951965332),
 Row(movieId=102194, count=40, userId=500000, prediction=4.236963748931885),
 Row(movieId=82667, count=17, userId=500000, prediction=4.224095344543457),
 Row(movieId=105477, co

### LightFM model and predictions

In [171]:
dataset2 = Dataset()
dataset2.fit(prod_plus_new['userId'], prod_plus_new['movieId'])
(interactions2, weights2) = dataset2.build_interactions([tuple(i) for i in prod_plus_new[['userId','movieId']].values])

In [172]:
model = LightFM(learning_rate=0.027, no_components=23, loss='warp')
model.fit(interactions2, user_features=None, epochs=56)

<lightfm.lightfm.LightFM at 0x7f58e19c5940>

In [173]:
n_items = len(new_user_for_pred)
n_items

5422

In [174]:
prediction = model.predict(user_ids = dataset2.mapping()[0][new_user_for_pred['userId'].iloc[0]], item_ids =np.arange(len(new_user_for_pred)), item_features=None, user_features=None)

In [175]:
prediction

array([ 0.14952183, -0.544936  , -0.8986882 , ..., -0.49516761,
       -1.58757365, -2.13752365])

In [176]:
lfm_movies = pd.DataFrame.from_dict(dataset2.mapping()[0], orient='index').reset_index()
lfm_movies  = lfm_movies.rename(columns={'index': "movieId", 0:"to_drop"})
lfm_movies = lfm_movies.drop('to_drop', axis=1).iloc[:len(prediction)]

In [177]:
lfm_movies['prediction'] = prediction

In [178]:
lfm_movies.sort_values('prediction', ascending=False).head(10)

Unnamed: 0,movieId,prediction
383,72222,2.276495
480,145603,2.232405
201,33839,2.104612
124,20835,2.089013
138,123747,2.055774
1053,33500,2.03829
239,71186,1.993008
630,125762,1.990573
546,46695,1.910234
537,70616,1.834759
