In [1]:
# Netflix Prediction by Shayan Ray, Tan Le, Ce Wu
# All necessary imports placed here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, CoClustering, evaluate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
sns.set_style("darkgrid")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error



In [2]:
# parameters/variables to set

baseDataPath='/users/tanle/A1/data/'
sampleFraction = 0.02 # 2% of the data
sampleSize=10000  # used only to get started or for a quick code run

In [3]:
# load the training dataset
df1 = pd.read_csv(baseDataPath + 'combined_data_1.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df2 = pd.read_csv(baseDataPath + 'combined_data_2.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df3 = pd.read_csv(baseDataPath + 'combined_data_3.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
df4 = pd.read_csv(baseDataPath + 'combined_data_4.txt', header = None, names = ['CustomerId', 'Rating'], usecols = [0,1])
org_df = df1
org_df = org_df.append(df2)
org_df = org_df.append(df3)
org_df = org_df.append(df4)
org_df.shape

(100498277, 2)

In [4]:
#load the probe dataset
probe_df = pd.read_csv(baseDataPath +'/probe.txt', header = None, names = ['CustomerId'], usecols = [0])

In [5]:
#load the qualifying dataset
#qualify_df=pd.read_csv(baseDataPath +'qualifying.txt', header = None, names = ['CustomerId'], usecols = [0])

In [6]:
#work with samples for faster run-time
#org_df = org_df.iloc[0:sampleSize,:]
org_df = org_df.sample(frac=sampleFraction, replace=True)
#probe_df = probe_df.iloc[0:sampleSize, :]
#qualify_df = qualify_df.iloc[0:sampleSize, :]
org_df.shape

(2009966, 2)

In [7]:
# check for successful data load comment out as appropriate
probe_df.shape

(1425333, 1)

In [8]:
# store all movies from original dataset in np array
movie_np = []
movie_id = 0
for index,row in org_df.iterrows():
    if row['CustomerId'][-1] == ':' :
        movie_id = row['CustomerId'][:-1]
    else:
        movie_np.append(movie_id)
print(len(movie_np))

2009625


In [9]:
# remove those Movie ID rows from the original dataset
org_df = org_df[pd.notnull(org_df['Rating'])]
org_df.shape
movie_np = np.array(movie_np)
org_df['Movie_Id'] = movie_np.astype(int)
org_df['CustomerId'] = org_df['CustomerId'].astype(int)

In [10]:
# check the transformed output - first few values
org_df.shape

(2009625, 3)

In [11]:
# store all movies in probe dataframe in np array
pmovie_np = []
pmovie_id = 0
probe_movies = probe_df[probe_df['CustomerId'].str.contains(":")] #IDs that contain ':'
for index,row in probe_df.iterrows():
    if row['CustomerId'][-1] == ':' :
        
        pmovie_id = row['CustomerId'][:-1]
       
    else:
        pmovie_np.append(pmovie_id)
print(len(pmovie_np))

1408395


In [12]:
# remove those Movie ID rows and add it as a column to probe DF
probe_df = probe_df[~probe_df['CustomerId'].str.contains(":")] #IDs that dont contain ':'


pmovie_np = np.array(pmovie_np)
probe_df['Movie_Id'] = pmovie_np.astype(int)
probe_df['CustomerId'] = probe_df['CustomerId'].astype(int)
probe_df.shape


(1408395, 2)

In [13]:
#clean probe df and add available ratings from training data
probe_df = pd.merge(probe_df, org_df, on=['CustomerId','Movie_Id'])
probe_df.head()

Unnamed: 0,CustomerId,Movie_Id,Rating
0,2135891,10109,3.0
1,825779,1026,5.0
2,592734,1026,2.0
3,592734,1026,5.0
4,1808876,1026,3.0


In [14]:
probe_df.shape

(277, 3)

In [15]:
# remove probe data from orig data
org_df = pd.merge(org_df, probe_df,how='left',  on=['CustomerId','Movie_Id', 'Rating'])
org_df.head()

Unnamed: 0,CustomerId,Rating,Movie_Id
0,914708,3.0,0
1,1088633,5.0,0
2,910558,5.0,0
3,370508,4.0,0
4,1893187,3.0,0


In [16]:
org_df.shape

(2009633, 3)

In [17]:
# split the data in training and testing data during training (70% training, 30% testing)
train, test = train_test_split(org_df, random_state=42, test_size=0.3)

In [18]:
test.shape

(602890, 3)

In [19]:
# Remove the label -Rating from the train and test features and labels respectively (X_train, y_train, X_test, y_test 
X_test = test.loc[:, train.columns != 'Rating']
y_test = test['Rating']
X_train = train.loc[:, train.columns != 'Rating']
y_train = train['Rating']

In [20]:
#sample df for grid evaluation
#smpl_df = org_df.iloc[0:sampleSize,:]
#org_df = org_df.iloc[0:sampleSize,:]

#smpl_df = smpl_df1
#smpl_df.shape
#smpl_df.head()



In [21]:
# evaluating CoClustering
from surprise import Reader, Dataset, CoClustering, evaluate
reader = Reader(rating_scale=(1, 5))

#load data for evaluation
data = Dataset.load_from_df(org_df[['CustomerId', 'Movie_Id', 'Rating']], reader) # if sampling, [:sampleSize]
data.split(n_folds=3)

coClustering = CoClustering()

#evaluate(coClustering, data, measures=['RMSE', 'MAE'])

In [22]:
# use GridSearch to finetune the hyper parameters for CoClustering namely,
# 1. epochs(number of iterations of the stochastic gradient descent),
# 2. learning rate(lr_all) and 
# 3. regularization(reg_all)

from surprise import GridSearch
param_grid = {    'n_cltr_u': [2, 3],
                  'n_cltr_i': [2, 3],
                  'n_epochs': [5, 8]
             }
grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
grid_search.evaluate(data)

[{'n_cltr_i': 2, 'n_cltr_u': 2, 'n_epochs': 5}, {'n_cltr_i': 2, 'n_cltr_u': 2, 'n_epochs': 8}, {'n_cltr_i': 2, 'n_cltr_u': 3, 'n_epochs': 5}, {'n_cltr_i': 2, 'n_cltr_u': 3, 'n_epochs': 8}, {'n_cltr_i': 3, 'n_cltr_u': 2, 'n_epochs': 5}, {'n_cltr_i': 3, 'n_cltr_u': 2, 'n_epochs': 8}, {'n_cltr_i': 3, 'n_cltr_u': 3, 'n_epochs': 5}, {'n_cltr_i': 3, 'n_cltr_u': 3, 'n_epochs': 8}]
------------
Parameters combination 1 of 8
params:  {'n_cltr_i': 2, 'n_cltr_u': 2, 'n_epochs': 5}
------------
Mean RMSE: 1.1322
------------
------------
Parameters combination 2 of 8
params:  {'n_cltr_i': 2, 'n_cltr_u': 2, 'n_epochs': 8}
------------
Mean RMSE: 1.1328
------------
------------
Parameters combination 3 of 8
params:  {'n_cltr_i': 2, 'n_cltr_u': 3, 'n_epochs': 5}
------------
Mean RMSE: 1.1470
------------
------------
Parameters combination 4 of 8
params:  {'n_cltr_i': 2, 'n_cltr_u': 3, 'n_epochs': 8}
------------
Mean RMSE: 1.1478
------------
------------
Parameters combination 5 of 8
params:  {'n

In [23]:
# the best RMSE performance for CoClustering
print(grid_search.best_score['RMSE'])

1.13165874435


In [24]:
# the best RMSE performance parameters for CoClustering
print(grid_search.best_params['RMSE'])


{'n_cltr_i': 3, 'n_cltr_u': 2, 'n_epochs': 5}


In [34]:
# apply the above parameters to the training dataset for CoClustering

data = Dataset.load_from_df(train[['CustomerId', 'Movie_Id', 'Rating']], reader) #[:150000]
data.split(n_folds=3)

# applied the best hyper parameters for CoClustering found above
n_cltr_i= 3
n_epochs= 5
n_cltr_u=2


algo1 = CoClustering(n_cltr_i=n_cltr_i, n_epochs=n_epochs, n_cltr_u=n_cltr_u)

trainset = data.build_full_trainset()
algo1.train(trainset)

In [35]:
#predict for a random user id with test set of TRAINING data
uid = str(30878)  #  user id  
mid = str(1)  #  movie id  
# get a prediction for specific users and items.
pred1 = algo1.predict(uid, mid, r_ui=4, verbose=True)
print(pred1.est)

user: 30878      item: 1          r_ui = 4.00   est = 3.60   {'was_impossible': False}
3.60495342788


In [36]:
# predict each of the values for the test split of the training dataset
y_pred = list()
#np.empty([y_test.shape[0],])
np.set_printoptions(precision=3)
for index,row in X_test.iterrows():
    pred_val= algo1.predict(row['CustomerId'], row['Movie_Id'], verbose=False)
    y_pred.append(pred_val.est)# = np.append(y_pred, [float()])


y_pred_np = np.array(y_pred)
print(y_pred_np)


[ 3.93   3.031  3.328 ...,  2.376  4.263  3.412]


In [37]:
# Training error in RMSE format
print (np.sqrt(mean_squared_error(y_test, y_pred))) 


1.12775151513


In [38]:
#convert the probe dataframe to features and labels dataframe
X_probe_df = probe_df.loc[:, probe_df.columns != 'Rating']
y_probe_df = probe_df['Rating']


In [39]:
X_probe_df.shape

(277, 2)

In [40]:
# predict each of the ratings for the probe dataset
y_probe_pred = list() #np.empty([X_probe_df.shape[0],])


for index,row in X_probe_df.iterrows():
    probe_pred_val= algo1.predict(row['CustomerId'], row['Movie_Id'], verbose=False)
    y_probe_pred.append(probe_pred_val.est)
    y_probe_pred_np=np.array(y_probe_pred)

#print(y_probe_pred_np)

In [41]:
#predict for a random user id with probe data
uid = str(30878)  # raw user id 
mid = str(1)  # raw movie id 

# get a prediction for specific user and movie
#for (uid, mid) in X_test['CustomerId', 'MovieId']:
probe_pred = algo1.predict(uid, mid, r_ui=4, verbose=True) #r_ui is the actual rating
print(probe_pred.est)

user: 30878      item: 1          r_ui = 4.00   est = 3.60   {'was_impossible': False}
3.60495342788


In [42]:
# Test(PROBE) error in RMSE format
print (np.sqrt(mean_squared_error(y_probe_df, y_probe_pred))) 

0.977347955923
