In [96]:
import re
import numpy as np
import pandas as pd
import pickle
import boto3
from io import BytesIO

In [97]:
from lightfm import LightFM
from lightfm.data import Dataset

from pyspark.sql.functions import *
import pyspark as ps    # for the pyspark suite
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import IntegerType, StringType, FloatType, DateType, TimestampType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row


spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext  # for the pre-2.0 sparkContext

In [98]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Data preparation

In [99]:
s3 = boto3.client('s3')
# s3.list_buckets()

In [100]:
def read_s3_csv_to_df(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.get_object(Bucket=bucket, Key=filename)
    data = obj['Body'].read()
    f = BytesIO(data)
    df = pd.read_csv(f)
    return df


In [101]:
def pickle_write_to_s3(filename, bucket='galvrjsbucket'):
    s3 = boto3.client('s3')
    obj = s3.put_object(Bucket=bucket, Key=filename)

In [102]:
prod_data = read_s3_csv_to_df('prod_data.csv')

In [103]:
prod_data = prod_data.drop('Unnamed: 0', axis=1)

In [104]:
# prod_data = pd.read_csv('data/train_ratings_df.csv')[['userId','movieId', 'rating']]
# prod_data = prod_data.sample(frac = 0.264) # Same size dataset as that which models were trained on. 55 sec until prediction
# prod_data = prod_data.sample(frac = 0.2) # 18 seconds until prediction

In [105]:
prod_data

Unnamed: 0,userId,movieId,rating
0,106234,1356,4.0
1,161277,920,5.0
2,65887,1370,2.0
3,75235,49530,4.0
4,127206,102445,3.0
...,...,...,...
380488,157621,1608,5.0
380489,128243,3448,4.0
380490,86927,1197,5.0
380491,86587,590,3.0


In [106]:
rated_movies = {933: 4.5,
                1035: 4,
                922: 2,
                342: 5,
                2724: 1
               }
new_user_data = pd.DataFrame.from_dict(rated_movies, orient='index').reset_index()
new_user_data = new_user_data.rename(columns={'index': "movieId", 0:"rating"})
new_user_data['userId'] = 500000

In [107]:
new_user_for_pred = prod_data.groupby('movieId').agg(count=('rating','count')).reset_index()
new_user_for_pred['userId'] = 500000
new_user_for_pred = new_user_for_pred.sample(frac = 0.5)

prod_plus_new = prod_data.append(new_user_data)

### ALS model and predictions

In [108]:
prod_plus_new_spark = spark.createDataFrame(prod_plus_new)

In [109]:
als = ALS(rank=20,
          maxIter=20,
          regParam=0.1,
          alpha=2,
          userCol="userId",
          itemCol="movieId",
          ratingCol="rating")

ALS_fit_model = als.fit(prod_plus_new_spark)

In [110]:
new_user_full_spark = spark.createDataFrame(new_user_for_pred)
new_user_full_spark.show(2)

+-------+-----+------+
|movieId|count|userId|
+-------+-----+------+
|   4225|   42|500000|
|  89904|   46|500000|
+-------+-----+------+
only showing top 2 rows



In [111]:
als_predictions = ALS_fit_model.transform(new_user_full_spark)

In [112]:
als_predictions2 = als_predictions.orderBy('prediction', ascending=False).select('movieId').take(50)

In [113]:
als_predictions2 = [als_predictions2[i].movieId for i in range(len(als_predictions2))]

In [114]:
als_predictions2

[4768,
 3808,
 26147,
 858,
 3260,
 67957,
 7234,
 342,
 50066,
 1966,
 1949,
 664,
 67999,
 4769,
 1265,
 5319,
 726,
 84242,
 3467,
 2764,
 933,
 1207,
 1270,
 6918,
 60487,
 73,
 598,
 7486,
 1199,
 1397,
 5177,
 4381,
 2730,
 5230,
 33779,
 262,
 3147,
 104908,
 170959,
 1963,
 170777,
 4470,
 108977,
 131724,
 108981,
 1298,
 7158,
 187717,
 30856,
 26425]

### LightFM model and predictions

In [115]:
dataset2 = Dataset()
dataset2.fit(prod_plus_new['userId'], prod_plus_new['movieId'])
(interactions2, weights2) = dataset2.build_interactions([tuple(i) for i in prod_plus_new[['userId','movieId']].values])

In [116]:
model = LightFM(learning_rate=0.027, no_components=23, loss='warp')
model.fit(interactions2, user_features=None, epochs=56)

<lightfm.lightfm.LightFM at 0x7f18d48b9b80>

In [117]:
prediction = model.predict(user_ids = dataset2.mapping()[0][new_user_for_pred['userId'].iloc[0]], item_ids =np.arange(len(new_user_for_pred)), item_features=None, user_features=None)

In [118]:
prediction

array([ 0.35609326,  0.97828329,  1.5400387 , ..., -0.63569278,
       -1.55680573, -1.89555526])

In [119]:
lfm_movies = pd.DataFrame.from_dict(dataset2.mapping()[0], orient='index').reset_index()
lfm_movies  = lfm_movies.rename(columns={'index': "movieId", 0:"to_drop"})
lfm_movies = lfm_movies.drop('to_drop', axis=1).iloc[:len(prediction)]

In [120]:
lfm_movies['prediction'] = prediction

In [121]:
lfm_movies = lfm_movies.sort_values('prediction', ascending=False)

In [122]:
lfm_movies

Unnamed: 0,movieId,prediction
412,40208,2.548931
495,33114,2.398996
702,143725,2.376488
154,31278,2.346927
74,80063,2.110063
...,...,...
3863,96992,-2.760962
2697,160743,-2.769882
2583,146762,-2.783595
3851,130459,-2.915465
