In [1]:
# Netflix Prediction by Shayan Ray, Tan Le, Ce Wu
# All necessary imports placed here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, evaluate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
sns.set_style("darkgrid")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from surprise import SVDpp


In [2]:
# parameters/variables to set

baseDataPath='/users/tanle/A1/data/'
sampleFraction = 0.02 # 2% of the data
sampleSize=400000  # used only to get started or for a quick code run

In [3]:
# load the training dataset
df1 = pd.read_csv(baseDataPath + 'combined_data_1.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df2 = pd.read_csv(baseDataPath + 'combined_data_2.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df3 = pd.read_csv(baseDataPath + 'combined_data_3.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df4 = pd.read_csv(baseDataPath + 'combined_data_4.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
org_df = df1
org_df = org_df.append(df2)
org_df = org_df.append(df3)
org_df = org_df.append(df4)
org_df.shape

(100498277, 2)

In [4]:
#load the probe dataset
probe_df = pd.read_csv(baseDataPath +'/probe.txt', header = None, names = ['CustomerId'], usecols = [0])

In [5]:
#load the qualifying dataset
#qualify_df=pd.read_csv(baseDataPath +'qualifying.txt', header = None, names = ['CustomerId'], usecols = [0])

In [6]:
#work with samples for faster run-time
#org_df = org_df.iloc[0:sampleSize,:]
org_df = org_df.sample(frac=sampleFraction, replace=True)
#probe_df = probe_df.iloc[0:sampleSize, :]
#qualify_df = qualify_df.iloc[0:sampleSize, :]
org_df.shape

(2009966, 2)

In [7]:
# check for successful data load comment out as appropriate
probe_df.shape

(1425333, 1)

In [8]:
# store all movies in original dataframe np array
movie_np = []
movie_id = 0
for index,row in org_df.iterrows():
    if row['CustomerId'][-1] == ':' :
        movie_id = row['CustomerId'][:-1]
    else:
        movie_np.append(movie_id)
print(len(movie_np))


2009597


In [9]:
# remove those Movie ID rows from the original dataset
org_df = org_df[pd.notnull(org_df['Rating'])]
org_df.shape
movie_np = np.array(movie_np)
org_df['Movie_Id'] = movie_np.astype(int)
org_df['CustomerId'] = org_df['CustomerId'].astype(int)

In [10]:
# check the transformed output - first few values
org_df.shape

(2009597, 3)

In [11]:
# store all movies in probe dataframe in np array
pmovie_np = []
pmovie_id = 0
probe_movies = probe_df[probe_df['CustomerId'].str.contains(":")] #IDs that contain ':'
for index,row in probe_df.iterrows():
    if row['CustomerId'][-1] == ':' :
        
        pmovie_id = row['CustomerId'][:-1]
       
    else:
        pmovie_np.append(pmovie_id)
print(len(pmovie_np))

1408395


In [12]:
# remove those Movie ID rows and add it as a column to probe DF
probe_df = probe_df[~probe_df['CustomerId'].str.contains(":")] #IDs that dont contain ':'


pmovie_np = np.array(pmovie_np)
probe_df['Movie_Id'] = pmovie_np.astype(int)
probe_df['CustomerId'] = probe_df['CustomerId'].astype(int)
probe_df.shape


(1408395, 2)

In [13]:
#clean probe df and add available ratings from training data
probe_df = pd.merge(probe_df, org_df, on=['CustomerId','Movie_Id'])
probe_df.head()

Unnamed: 0,CustomerId,Movie_Id,Rating
0,440949,10036,5.0
1,1815129,10036,4.0
2,2090999,10036,5.0
3,1752442,10036,1.0
4,2206144,10042,3.0


In [14]:
# remove probe data from orig data
org_df = pd.merge(org_df, probe_df,how='left',  on=['CustomerId','Movie_Id', 'Rating'])
org_df.head()

Unnamed: 0,CustomerId,Rating,Movie_Id
0,1750396,5.0,0
1,340492,1.0,0
2,257382,4.0,0
3,505544,3.0,0
4,715966,4.0,0


In [15]:
# split the data in training and testing data during training (70% training, 30% testing)
train, test = train_test_split(org_df, random_state=42, test_size=0.3)

In [16]:
test.head()

Unnamed: 0,CustomerId,Rating,Movie_Id
220533,421839,5.0,15313
1261456,411030,5.0,4543
1871360,1430108,3.0,11701
1858368,1351132,5.0,4319
6079,2298601,4.0,15813


In [17]:
# Remove the label -Rating from the train and test features and labels respectively (X_train, y_train, X_test, y_test 
X_test = test.loc[:, train.columns != 'Rating']
y_test = test['Rating']
X_train = train.loc[:, train.columns != 'Rating']
y_train = train['Rating']

In [18]:
# evaluating SVDpp
from surprise import Reader, Dataset, SVD, evaluate
reader = Reader(rating_scale=(1, 5))

#load data for evaluation
data = Dataset.load_from_df(org_df[['CustomerId', 'Movie_Id', 'Rating']], reader) # if sampling, [:sampleSize]
data.split(n_folds=3)

svd = SVDpp()
#evaluate(svd, data, measures=['RMSE', 'MAE'])

In [19]:
# use GridSearch to finetune the hyper parameters for SVDpp namely,
# 1. epochs(number of iterations of the stochastic gradient descent),
# 2. learning rate(lr_all) and 
# 3. regularization(reg_all)

from surprise import GridSearch
param_grid = {'n_epochs': [5, 8], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE'])
grid_search.evaluate(data)

[{'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.002}, {'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.005}, {'reg_all': 0.4, 'n_epochs': 8, 'lr_all': 0.002}, {'reg_all': 0.4, 'n_epochs': 8, 'lr_all': 0.005}, {'reg_all': 0.6, 'n_epochs': 5, 'lr_all': 0.002}, {'reg_all': 0.6, 'n_epochs': 5, 'lr_all': 0.005}, {'reg_all': 0.6, 'n_epochs': 8, 'lr_all': 0.002}, {'reg_all': 0.6, 'n_epochs': 8, 'lr_all': 0.005}]
------------
Parameters combination 1 of 8
params:  {'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.002}
------------
Mean RMSE: 1.0712
------------
------------
Parameters combination 2 of 8
params:  {'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.005}
------------
Mean RMSE: 1.0602
------------
------------
Parameters combination 3 of 8
params:  {'reg_all': 0.4, 'n_epochs': 8, 'lr_all': 0.002}
------------
Mean RMSE: 1.0656
------------
------------
Parameters combination 4 of 8
params:  {'reg_all': 0.4, 'n_epochs': 8, 'lr_all': 0.005}
------------
Mean RMSE: 1.0534
------------
------------
Parame

In [20]:
# the best RMSE performance for SVDpp
print(grid_search.best_score['RMSE'])

1.05338928783


In [21]:
# the best RMSE performance parameters for SVDpp
print(grid_search.best_params['RMSE'])


{'reg_all': 0.4, 'n_epochs': 8, 'lr_all': 0.005}


In [22]:
# apply the above parameters to the training dataset for SVDpp

data = Dataset.load_from_df(train[['CustomerId', 'Movie_Id', 'Rating']], reader) #[:150000]
data.split(n_folds=3)

# applied the best hyper parameters for SVDapp found above
algo1 = SVDpp(n_epochs= 8, lr_all= 0.005, reg_all=0.4)
trainset = data.build_full_trainset()
algo1.train(trainset)

In [23]:
#predict for a random user id with test set of TRAINING data
uid = str(30878)  #  user id  
mid = str(1)  #  movie id  
# get a prediction for specific users and items.
pred1 = algo1.predict(uid, mid, r_ui=4, verbose=True)
print(pred1.est)

user: 30878      item: 1          r_ui = 4.00   est = 3.60   {'was_impossible': False}
3.6046286298


In [24]:
# predict each of the values for the test split of the training dataset
y_pred = list()
#np.empty([y_test.shape[0],])
np.set_printoptions(precision=3)
for index,row in X_test.iterrows():
    pred_val= algo1.predict(row['CustomerId'], row['Movie_Id'], verbose=False)
    y_pred.append(pred_val.est)# = np.append(y_pred, [float()])


y_pred_np = np.array(y_pred)
print(y_pred_np)


[ 3.921  3.492  3.596 ...,  3.667  3.684  3.685]


In [25]:
# Training error in RMSE format
print (np.sqrt(mean_squared_error(y_test, y_pred))) 


1.05150554602


In [26]:
#convert the probe dataframe to features and labels dataframe
X_probe_df = probe_df.loc[:, probe_df.columns != 'Rating']
y_probe_df = probe_df['Rating']


In [27]:
X_probe_df.shape

(468, 2)

In [28]:
# predict each of the ratings for the probe dataset
y_probe_pred = list() #np.empty([X_probe_df.shape[0],])


for index,row in X_probe_df.iterrows():
    probe_pred_val= algo1.predict(row['CustomerId'], row['Movie_Id'], verbose=False)
    y_probe_pred.append(probe_pred_val.est)
    y_probe_pred_np=np.array(y_probe_pred)

#print(y_probe_pred_np)

In [29]:
#predict for a random user id with probe data
uid = str(30878)  # raw user id 
mid = str(1)  # raw movie id 

# get a prediction for specific user and movie
#for (uid, mid) in X_test['CustomerId', 'MovieId']:
probe_pred = algo1.predict(uid, mid, r_ui=4, verbose=True) #r_ui is the actual rating
print(probe_pred.est)

user: 30878      item: 1          r_ui = 4.00   est = 3.60   {'was_impossible': False}
3.6046286298


In [30]:
# Test(PROBE) error in RMSE format
print (np.sqrt(mean_squared_error(y_probe_df, y_probe_pred))) 

0.983710088638


In [31]:
'1,2,3'.split('#')

['1,2,3']