In [1]:
# Netflix Prediction by Shayan Ray, Tan Le, Ce Wu
# All necessary imports placed here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, KNNBaseline, evaluate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
sns.set_style("darkgrid")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from surprise import KNNBaseline


In [2]:
# parameters/variables to set
import os

# Directory that holds the Netflix Prize text files. The original hard-coded
# location is kept as the default; set the NETFLIX_DATA_DIR environment variable
# to point somewhere else without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, which the load cells
# rely on because they build file names by plain string concatenation.
baseDataPath = os.path.join(os.environ.get('NETFLIX_DATA_DIR', '/users/tanle/A1/data/'), '')
sampleFraction = 0.1 # 10% of the data
sampleSize=10000  # used only to get started or for a quick code run

In [3]:
# load the training dataset
# Only the first two comma-separated fields of every line are read. Lines whose
# first field ends in ':' are movie-id headers (they are parsed in a later cell)
# and get a missing Rating.
# CustomerId is forced to str so that every value supports the string handling
# used by the header parsing, regardless of how pandas would infer the type.
_read_opts = dict(header=None, names=['CustomerId', 'Rating'], usecols=[0, 1],
                  dtype={'CustomerId': str, 'Rating': float})
df1 = pd.read_csv(baseDataPath + 'combined_data_1.txt', **_read_opts)
df2 = pd.read_csv(baseDataPath + 'combined_data_2.txt', **_read_opts)
df3 = pd.read_csv(baseDataPath + 'combined_data_3.txt', **_read_opts)
df4 = pd.read_csv(baseDataPath + 'combined_data_4.txt', **_read_opts)

# One concat instead of repeated DataFrame.append (deprecated, and each append
# copied the accumulated frame). ignore_index gives the result a unique index;
# file order is preserved, which the header parsing depends on.
org_df = pd.concat([df1, df2, df3, df4], ignore_index=True)
org_df.shape

(100498277, 2)

In [4]:
#load the probe dataset
# baseDataPath already ends with a separator, so no leading '/' on the file name.
# CustomerId is read as str because later cells use .str methods on it.
probe_df = pd.read_csv(baseDataPath + 'probe.txt', header = None, names = ['CustomerId'], usecols = [0],
                       dtype = {'CustomerId': str})

In [5]:
#load the qualifying dataset
#qualify_df=pd.read_csv(baseDataPath +'qualifying.txt', header = None, names = ['CustomerId'], usecols = [0])

In [6]:
# Optional down-sampling for faster run-time: uncomment one of the lines below
# to work on a subset (the first sampleSize rows, or a random sampleFraction of
# the rows) instead of the full data.
# NOTE(review): the .sample() variant uses replace=True, so the same row can be
# drawn more than once - confirm that is intended before enabling it.
# NOTE(review): qualify_df only exists if the commented-out qualifying load is
# enabled as well.
#org_df = org_df.iloc[0:sampleSize,:]
#org_df = org_df.sample(frac=sampleFraction, replace=True)
#probe_df = probe_df.iloc[0:sampleSize, :]
#qualify_df = qualify_df.iloc[0:sampleSize, :]
org_df.shape

(100498277, 2)

In [None]:
# sanity check that the probe file loaded: display its (rows, columns)
probe_df.shape

(1425333, 1)

In [None]:
# store the movie id of every rating row of the original dataset in a list
# Vectorised replacement for a Python-level iterrows() loop over the whole frame.
_ids = org_df['CustomerId'].astype(str).reset_index(drop=True)
_is_header = _ids.str.endswith(':')           # "<movie id>:" header rows

# Keep the id (without the trailing ':') on header rows only, then carry it
# forward onto the rating rows that follow. Rows that appear before the first
# header get '0', matching the loop's initial movie id of 0.
_movie_of_row = _ids.str[:-1].where(_is_header).ffill().fillna('0')

# one entry per non-header row, in file order
movie_np = _movie_of_row[~_is_header].tolist()
print(len(movie_np))

In [None]:
# remove those Movie ID rows from the original dataset
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
org_df = org_df[pd.notnull(org_df['Rating'])].copy()

# movie_np holds one movie id per remaining rating row, in the same order;
# pandas raises ValueError here if the lengths do not match.
movie_np = np.array(movie_np)
org_df['Movie_Id'] = movie_np.astype(int)
org_df['CustomerId'] = org_df['CustomerId'].astype(int)

In [None]:
# check the transformed output - (rows, columns) after dropping the movie
# header rows and adding the Movie_Id column
org_df.shape

In [None]:
# store the movie id of every probe entry in a list
# Vectorised replacement for a Python-level iterrows() loop.
_probe_ids = probe_df['CustomerId'].astype(str).reset_index(drop=True)
_is_header = _probe_ids.str.endswith(':')     # "<movie id>:" header rows

probe_movies = probe_df[probe_df['CustomerId'].astype(str).str.contains(":")] #IDs that contain ':'

# Keep the id (without the trailing ':') on header rows only, then carry it
# forward onto the customer rows that follow. Rows that appear before the first
# header get '0', matching the loop's initial movie id of 0.
_movie_of_row = _probe_ids.str[:-1].where(_is_header).ffill().fillna('0')

# one entry per non-header row, in file order
pmovie_np = _movie_of_row[~_is_header].tolist()
print(len(pmovie_np))

In [None]:
# remove those Movie ID rows and add it as a column to probe DF
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
probe_df = probe_df[~probe_df['CustomerId'].str.contains(":")].copy() #IDs that dont contain ':'

# pmovie_np holds one movie id per remaining probe row, in the same order
pmovie_np = np.array(pmovie_np)
probe_df['Movie_Id'] = pmovie_np.astype(int)
probe_df['CustomerId'] = probe_df['CustomerId'].astype(int)
probe_df.shape


In [None]:
# look up the rating of every probe (customer, movie) pair in the training data;
# the default inner join keeps only the pairs that are present in org_df
probe_df = probe_df.merge(org_df, on=['CustomerId', 'Movie_Id'])
probe_df.head()

In [None]:
# remove probe data from orig data
# A plain left merge keeps every row of org_df, so it removes nothing. Do an
# anti-join instead: tag each rating with whether its (customer, movie) pair is
# in the probe set and keep only the ratings that are not.
_probe_keys = probe_df[['CustomerId', 'Movie_Id']].drop_duplicates()
_tagged = org_df.merge(_probe_keys, how='left', on=['CustomerId', 'Movie_Id'], indicator=True)

# '_merge' == 'left_only' marks rows found in org_df but not in the probe keys
org_df = (_tagged[_tagged['_merge'] == 'left_only']
          .drop('_merge', axis=1)
          .reset_index(drop=True))
org_df.head()

In [None]:
# hold out 30% of the ratings for testing; the remaining 70% is the training split
# (random_state pins the shuffle so the split is the same on every run)
_split_frames = train_test_split(org_df, test_size=0.3, random_state=42)
train, test = _split_frames

In [None]:
# (rows, columns) of the held-out 30% split
test.shape

In [None]:
# Separate the label (Rating) from the features for the train and test splits
# (X_train, y_train, X_test, y_test)
# Each frame is masked with its own columns; the test frame was previously
# masked with train.columns, which only worked because both share one layout.
X_train = train.loc[:, train.columns != 'Rating']
y_train = train['Rating']
X_test = test.loc[:, test.columns != 'Rating']
y_test = test['Rating']

In [None]:
#sample df for grid evaluation
#smpl_df = org_df.iloc[0:sampleSize,:]
#org_df = org_df.iloc[0:sampleSize,:]

#smpl_df = smpl_df1
#smpl_df.shape
#smpl_df.head()



In [None]:
# evaluating KNNBaseline
# (Reader, Dataset, KNNBaseline and evaluate are imported in the first cell)
reader = Reader(rating_scale=(1, 5))  # ratings range from 1 to 5

#load data for evaluation
# load_from_df expects the columns in the order user, item, rating
data = Dataset.load_from_df(org_df[['CustomerId', 'Movie_Id', 'Rating']], reader) # if sampling, [:sampleSize]
data.split(n_folds=3)  # 3 folds for the (optional) cross-validated evaluation below

knn = KNNBaseline()

# evaluate() takes an algorithm instance, not the class
#evaluate(knn, data, measures=['RMSE', 'MAE'])

In [None]:
# use GridSearch to fine-tune the hyper-parameters of KNNBaseline, namely:
# 1. baseline options (bsl_options): estimation method ('als' or 'sgd') and regularization (reg),
# 2. number of neighbours (k) and
# 3. similarity options (sim_options): measure name, min_support and user_based

#from surprise import GridSearch
#param_grid = {'bsl_options': {'method': ['als', 'sgd'],
#                              'reg': [1, 2]},
#              'k': [2, 3],
#              'sim_options': {'name': ['msd', 'cosine'],
#                              'min_support': [1, 5],
#                              'user_based': [False]}
#              }
#grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
#grid_search.evaluate(data)


In [None]:
# the best RMSE performance for KNNBaseline
#print(grid_search.best_score['RMSE'])

In [None]:
# the best RMSE performance parameters for KNNBaseline
#print(grid_search.best_params['RMSE'])


In [None]:
# apply the above parameters to the training dataset for KNNBaseline

# Surprise dataset built from the training split only (columns: user, item, rating)
data = Dataset.load_from_df(train[['CustomerId', 'Movie_Id', 'Rating']], reader) #[:150000]

# applied the best hyper parameters for KNNBaseline found above
bsl_options = {'method': 'als', 'reg': 1}
sim_options = {'name': 'cosine', 'user_based': False, 'min_support': 1}
k = 3

algo1 = KNNBaseline(sim_options=sim_options, bsl_options=bsl_options, k=k)

# build_full_trainset() turns every rating in `data` into one trainset, so no
# fold split is needed first (the split only shuffled the ratings, unseeded).
trainset = data.build_full_trainset()
# NOTE(review): train() is the API this notebook's Surprise version uses; newer
# releases call it fit() - confirm against the installed version.
algo1.train(trainset)

In [None]:
#predict for a random user id with test set of TRAINING data
# The trainset was built with Dataset.load_from_df from integer id columns, so
# the raw ids are ints. Passing str ids would not match any known user/item and
# predict() would fall back to its default estimate.
uid = 30878  #  user id
mid = 1  #  movie id
# get a prediction for specific users and items.
pred1 = algo1.predict(uid, mid, r_ui=4, verbose=True)  # r_ui is the actual rating
print(pred1.est)

In [None]:
# predict each of the values for the test split of the training dataset
# Iterate over the two id columns directly instead of DataFrame.iterrows(),
# which builds a Series object for every single row.
y_pred = [
    algo1.predict(customer_id, movie_id, verbose=False).est
    for customer_id, movie_id in zip(X_test['CustomerId'], X_test['Movie_Id'])
]

np.set_printoptions(precision=3)
y_pred_np = np.array(y_pred)
print(y_pred_np)


In [None]:
# RMSE on the held-out test split: predicted vs. actual ratings
_holdout_mse = mean_squared_error(y_test, y_pred)
print(np.sqrt(_holdout_mse))


In [None]:
# split the probe dataframe into features (every column except Rating) and labels
_probe_feature_mask = probe_df.columns != 'Rating'
y_probe_df = probe_df['Rating']
X_probe_df = probe_df.loc[:, _probe_feature_mask]


In [None]:
# (rows, columns) of the probe feature frame (probe_df without Rating)
X_probe_df.shape

In [None]:
# predict each of the ratings for the probe dataset
# Iterate over the two id columns directly instead of DataFrame.iterrows().
y_probe_pred = [
    algo1.predict(customer_id, movie_id, verbose=False).est
    for customer_id, movie_id in zip(X_probe_df['CustomerId'], X_probe_df['Movie_Id'])
]

# Convert once, after all predictions are collected. Rebuilding the array inside
# the loop copied the whole list on every iteration and left the name undefined
# when the probe frame was empty.
y_probe_pred_np = np.array(y_probe_pred)

#print(y_probe_pred_np)

In [None]:
#predict for a random user id with probe data
# The trainset was built with Dataset.load_from_df from integer id columns, so
# the raw ids are ints. Passing str ids would not match any known user/item and
# predict() would fall back to its default estimate.
uid = 30878  # raw user id
mid = 1  # raw movie id

# get a prediction for specific user and movie
probe_pred = algo1.predict(uid, mid, r_ui=4, verbose=True) #r_ui is the actual rating
print(probe_pred.est)

In [None]:
# RMSE on the probe set: predicted vs. actual probe ratings
_probe_mse = mean_squared_error(y_probe_df, y_probe_pred)
print(np.sqrt(_probe_mse))