In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel

In [2]:
spark = SparkSession.builder.master("local").getOrCreate()

In [3]:
movie_ratings = spark.read.json('./data/ratings.json')
movie_ratings = movie_ratings.toPandas()

In [4]:
movie_ratings.shape

(719949, 4)

In [5]:
import pandas as pd

In [6]:
movies_meta = pd.read_csv('features.csv')

In [7]:
# movies_meta.drop('category', axis=1, inplace=True)
# movies_meta.head()

In [8]:
users_meta = pd.read_csv('users.csv')

In [9]:
users_meta.head()

Unnamed: 0.1,Unnamed: 0,ID,sex,age,occupation,zipcode
0,0,2,M,56,16,70072
1,1,3,M,25,15,55117
2,2,4,M,45,7,2460
3,3,5,M,25,20,55455
4,4,6,F,50,9,55117


In [10]:
movies = pd.merge(movie_ratings, movies_meta, how='left', left_on='movie_id', right_on='id')

In [11]:
movies_and_users = pd.merge(movies, users_meta, how='left', left_on='user_id', right_on='ID')

In [12]:
movies_and_users.drop(['id','ID'], axis=1, inplace=True)

In [13]:
movies_and_users['Percentile_rank']=movies_and_users.timestamp.rank(pct=True)
movies_and_users.head()

Unnamed: 0.1,movie_id,rating,timestamp,user_id,title,Adventure,Animation,Children's,Comedy,Crime,...,runtime,vote_average,popularity,vote_count,Unnamed: 0,sex,age,occupation,zipcode,Percentile_rank
0,858,4,956678732.0,6040,"Godfather, The",0,0,0,0,1,...,,,,,6038,M,25,6,11106,1e-06
1,2384,4,956678754.0,6040,Babe: Pig in the City,0,0,1,1,0,...,92.0,5.3,6.229629,312.0,6038,M,25,6,11106,3e-06
2,593,5,956678754.0,6040,"Silence of the Lambs, The",0,0,0,0,0,...,,,,,6038,M,25,6,11106,3e-06
3,1961,4,956678777.0,6040,Rain Man,0,0,0,0,0,...,133.0,7.6,11.267467,1762.0,6038,M,25,6,11106,6e-06
4,1419,3,956678856.0,6040,Walkabout,0,0,0,0,0,...,95.0,7.4,10.177086,111.0,6038,M,25,6,11106,1e-05


In [14]:
movies_and_users = movies_and_users.fillna(0)

In [15]:
movies_and_users.head()

Unnamed: 0.1,movie_id,rating,timestamp,user_id,title,Adventure,Animation,Children's,Comedy,Crime,...,runtime,vote_average,popularity,vote_count,Unnamed: 0,sex,age,occupation,zipcode,Percentile_rank
0,858,4,956678732.0,6040,"Godfather, The",0,0,0,0,1,...,0.0,0.0,0.0,0.0,6038,M,25,6,11106,1e-06
1,2384,4,956678754.0,6040,Babe: Pig in the City,0,0,1,1,0,...,92.0,5.3,6.229629,312.0,6038,M,25,6,11106,3e-06
2,593,5,956678754.0,6040,"Silence of the Lambs, The",0,0,0,0,0,...,0.0,0.0,0.0,0.0,6038,M,25,6,11106,3e-06
3,1961,4,956678777.0,6040,Rain Man,0,0,0,0,0,...,133.0,7.6,11.267467,1762.0,6038,M,25,6,11106,6e-06
4,1419,3,956678856.0,6040,Walkabout,0,0,0,0,0,...,95.0,7.4,10.177086,111.0,6038,M,25,6,11106,1e-05


In [16]:
movies_and_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 720674 entries, 0 to 720673
Data columns (total 36 columns):
movie_id           720674 non-null int64
rating             720674 non-null int64
timestamp          720674 non-null float64
user_id            720674 non-null int64
title              720674 non-null object
Adventure          720674 non-null int64
Animation          720674 non-null int64
Children's         720674 non-null int64
Comedy             720674 non-null int64
Crime              720674 non-null int64
Documentary        720674 non-null int64
Drama              720674 non-null int64
Fantasy            720674 non-null int64
Film-Noir          720674 non-null int64
Horror             720674 non-null int64
Musical            720674 non-null int64
Mystery            720674 non-null int64
Romance            720674 non-null int64
Sci-Fi             720674 non-null int64
Thriller           720674 non-null int64
War                720674 non-null int64
Western            720674

In [17]:
movies_and_users['sex'] = movies_and_users['sex'].map({'M': 0, 'F': 1})
movies_and_users['zipcode'] = movies_and_users['zipcode'][0:5]

In [18]:
train_df = movies_and_users.loc[movies_and_users["Percentile_rank"] <= .9]
train_df.shape

X_train = train_df.drop(['rating', 'title', 'title_year', 'zipcode', 'Percentile_rank', 'timestamp'], axis=1)
y_train = train_df['rating']

In [19]:
test_df = movies_and_users.loc[movies_and_users["Percentile_rank"] > .9]
test_df.shape

X_test = test_df.drop(['rating', 'title', 'title_year', 'zipcode', 'Percentile_rank', 'timestamp'], axis=1)
y_test = test_df['rating']

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
rand_forest = RandomForestRegressor(n_estimators=5)

In [22]:
rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [23]:
rand_forest.predict(X_test)

array([3.4, 4.8, 4.8, ..., 4.4, 3.6, 4. ])

In [24]:
test_ratings = spark.read.json('./data/requests.json')

In [25]:
test_ratings = test_ratings.toPandas()
test_ratings.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,2019,,956678777.0,6040
1,759,,956679248.0,6040
2,2858,,956679275.0,6040
3,246,,956679413.0,6040
4,1617,,956679473.0,6040


In [29]:
test_movies = pd.merge(test_ratings, movies_meta, how='left', left_on='movie_id', right_on='id')
test_movies_and_users = pd.merge(test_movies, users_meta, how='left', left_on='user_id', right_on='ID')
test_movies_and_users.drop(['id','ID'], axis=1, inplace=True)
test_movies_and_users['sex'] = test_movies_and_users['sex'].map({'M': 0, 'F': 1})
test_movies_and_users = test_movies_and_users.fillna(0)
test_movies_and_users['zipcode'] = test_movies_and_users['zipcode'][0:5]
features = test_movies_and_users.drop(['rating', 'title', 'title_year', 'zipcode', 'timestamp'], axis=1)
labels = test_movies_and_users['rating']

In [31]:
cs_predictions = rand_forest.predict(features)

In [32]:
test_movies_and_users['predictions'] = cs_predictions

In [34]:
test_movies_and_users.head(20)['predictions']

0     4.4
1     4.4
2     4.8
3     3.8
4     4.2
5     4.4
6     4.6
7     3.6
8     4.4
9     3.4
10    3.8
11    4.2
12    4.0
13    3.4
14    3.4
15    2.6
16    3.4
17    3.8
18    3.4
19    4.6
Name: predictions, dtype: float64

In [35]:
# import pickle

# pickle.dump(test_movies_and_users, open('cs_model.pkl', 'wb'))