In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [2]:
spark = SparkSession.builder.master("local").getOrCreate()

In [3]:
movie_ratings = spark.read.json('./data/ratings.json')
movie_ratings = movie_ratings.toPandas()

In [4]:
movie_ratings.shape

(719949, 4)

In [5]:
import pandas as pd

In [6]:
movies_meta = pd.read_csv('features.csv')

In [8]:
movies_meta.drop('category', axis=1, inplace=True)
movies_meta.head()

Unnamed: 0,id,title,year,title_year,budget,revenue,runtime,vote_average,popularity,vote_count
0,1,Toy Story,1995,"('Toy Story', '1995')",30000000.0,373554033.0,81.0,7.7,21.946943,5415.0
1,2,Jumanji,1995,"('Jumanji', '1995')",65000000.0,262797249.0,104.0,6.9,17.015539,2413.0
2,3,Grumpier Old Men,1995,"('Grumpier Old Men', '1995')",0.0,0.0,101.0,6.5,11.7129,92.0
3,4,Waiting to Exhale,1995,"('Waiting to Exhale', '1995')",,,,,,
4,5,Father of the Bride Part II,1995,"('Father of the Bride Part II', '1995')",0.0,76578911.0,106.0,5.7,8.387519,173.0


In [9]:
users_meta = pd.read_csv('users.csv')

In [10]:
users_meta.head()

Unnamed: 0,ID,sex,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [11]:
movies = pd.merge(movie_ratings, movies_meta, how='left', left_on='movie_id', right_on='id')

In [43]:
movies_and_users = pd.merge(movies, users_meta, how='left', left_on='user_id', right_on='ID')

In [44]:
movies_and_users.drop(['id','ID'], axis=1, inplace=True)

In [45]:
movies_and_users['Percentile_rank']=movies_and_users.timestamp.rank(pct=True)
movies_and_users.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,title,year,title_year,budget,revenue,runtime,vote_average,popularity,vote_count,sex,age,occupation,zipcode,Percentile_rank
0,858,4,956678732.0,6040,"Godfather, The",1972,"('Godfather, The', '1972')",,,,,,,M,25,6,11106,1e-06
1,2384,4,956678754.0,6040,Babe: Pig in the City,1998,"('Babe: Pig in the City', '1998')",90000000.0,69131860.0,92.0,5.3,6.229629,312.0,M,25,6,11106,3e-06
2,593,5,956678754.0,6040,"Silence of the Lambs, The",1991,"('Silence of the Lambs, The', '1991')",,,,,,,M,25,6,11106,3e-06
3,1961,4,956678777.0,6040,Rain Man,1988,"('Rain Man', '1988')",25000000.0,412800000.0,133.0,7.6,11.267467,1762.0,M,25,6,11106,6e-06
4,1419,3,956678856.0,6040,Walkabout,1971,"('Walkabout', '1971')",0.0,0.0,95.0,7.4,10.177086,111.0,M,25,6,11106,1e-05


In [52]:
movies_and_users = movies_and_users.fillna(0)

In [None]:
movies_and_users.head()

In [60]:
movies_and_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 720674 entries, 0 to 720673
Data columns (total 18 columns):
movie_id           720674 non-null int64
rating             720674 non-null int64
timestamp          720674 non-null float64
user_id            720674 non-null int64
title              720674 non-null object
year               720674 non-null int64
title_year         720674 non-null object
budget             720674 non-null float64
revenue            720674 non-null float64
runtime            720674 non-null float64
vote_average       720674 non-null float64
popularity         720674 non-null float64
vote_count         720674 non-null float64
sex                720674 non-null int64
age                720674 non-null int64
occupation         720674 non-null int64
zipcode            5 non-null object
Percentile_rank    720674 non-null float64
dtypes: float64(8), int64(7), object(3)
memory usage: 104.5+ MB


In [54]:
movies_and_users['sex'] = movies_and_users['sex'].map({'M': 0, 'F': 1})
movies_and_users['zipcode'] = movies_and_users['zipcode'][0:5]

In [61]:
train_df = movies_and_users.loc[movies_and_users["Percentile_rank"] <= .9]
train_df.shape

X_train = train_df.drop(['rating', 'title', 'title_year', 'zipcode', 'Percentile_rank', 'timestamp'], axis=1)
y_train = train_df['rating']

In [62]:
test_df = movies_and_users.loc[movies_and_users["Percentile_rank"] > .9]
test_df.shape

X_test = test_df.drop(['rating', 'title', 'title_year', 'zipcode', 'Percentile_rank', 'timestamp'], axis=1)
y_test = test_df['rating']

In [57]:
from sklearn.ensemble import RandomForestRegressor

In [63]:
rand_forest = RandomForestRegressor(n_estimators=5)

In [64]:
rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
                      oob_score=False, random_state=None, verbose=0,
                      warm_start=False)

In [65]:
rand_forest.predict(X_test)

array([2.6, 4.8, 4.4, ..., 3.8, 3. , 3.4])

In [None]:
test_ratings = spark.read.json('./data/requests.json')

In [68]:
test_ratings = test_ratings.toPandas()
test_ratings.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [70]:
test_movies = pd.merge(test_ratings, movies_meta, how='left', left_on='movie_id', right_on='id')
test_movies_and_users = pd.merge(test_movies, users_meta, how='left', left_on='user_id', right_on='ID')
test_movies_and_users.drop(['id','ID'], axis=1, inplace=True)
test_movies_and_users = test_movies_and_users.fillna(0)
test_movies_and_users['sex'] = test_movies_and_users['sex'].map({'M': 0, 'F': 1})
test_movies_and_users['zipcode'] = test_movies_and_users['zipcode'][0:5]
features = test_movies_and_users.drop(['rating', 'title', 'title_year', 'zipcode', 'timestamp'], axis=1)
labels = test_movies_and_users['rating']

In [72]:
cs_predictions = rand_forest.predict(features)

In [77]:
test_movies_and_users['predictions'] = cs_predictions

In [78]:
import pickle

pickle.dump(test_movies_and_users, open('cs_model.pkl', 'wb'))