## Import Clean Data 

In [1]:
import pandas as pd

df = pd.read_csv("../csvs/vader_sent_filtered_philly_295k.csv")
df.drop(columns=["business_id", "user_id", "text", "date", "res_avg_stars", "review_count", "nltk_sentiment"], inplace=True)
df.rename(columns={'user_id_cat' : 'user_id', 'business_id_cat' : 'business_id'}, inplace=True)
df = df[["user_id", "business_id", "stars"]]
df

Unnamed: 0,user_id,business_id,stars
0,0,0,4.0
1,0,1342,4.0
2,0,2065,4.0
3,0,2500,3.0
4,0,2765,3.0
...,...,...,...
294998,10726,4837,4.0
294999,10726,5192,4.0
295000,10726,5271,2.0
295001,10726,5696,5.0


## Transform Data to Surpise Format

In [2]:
from surprise import Dataset, Reader, accuracy

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df, reader)

## Train Test Split

In [3]:
import random
from surprise.model_selection import GridSearchCV
from surprise import BaselineOnly, SVD, SVDpp, NMF

# Load the full dataset.
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.Random(123).shuffle(raw_ratings)

# A = 80% of the data, B = 20% of the data
threshold = int(0.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = A_raw_ratings  # data is now the set A

trainset = data.build_full_trainset() # trainset is now the set A as well
testset = data.construct_testset(B_raw_ratings)  # testset is now the set B

## Baseline Method

In [None]:
# Baseline Method
baseline_algo = BaselineOnly()
baseline_algo.fit(trainset)

predictions = baseline_algo.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

Estimating biases using als...
RMSE: 0.9929
0.9929128713663561
MAE:  0.7768
0.7767876435061934


## SVD

In [5]:
# SVD
svd_algo = SVD()
svd_algo.fit(trainset)

predictions = svd_algo.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 1.0035
1.0034551795033047
MAE:  0.7821
0.7820812535597816


## SVD++

In [6]:
# SVD++
svdpp_algo = SVDpp()
svdpp_algo.fit(trainset)

predictions = svdpp_algo.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 1.0025
1.0025187370458302
MAE:  0.7794
0.7793710998772974


## NMF 

In [7]:
# NMF
nmf_algo = NMF()
nmf_algo.fit(trainset)

predictions = nmf_algo.test(testset)
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 1.0710
1.0710177133767183
MAE:  0.8270
0.8270302647139903


## Hyperparameter Tuning

## SVD Tuning

In [8]:
# Tuning hyperparameters for SVD

# Select your best algo with grid search.
print("Grid Search...")
param_grid_1 = {"n_factors": [20, 50, 80, 100], "n_epochs": [40, 60, 80, 100], "lr_all": [0.002, 0.005, 0.007], "reg_all": [0.02, 0.05, 0.1]}
grid_search_1 = GridSearchCV(SVD, param_grid_1, measures=["rmse"], cv=5, n_jobs=-1)
grid_search_1.fit(data)

algo1 = grid_search_1.best_estimator["rmse"]

# Best params of SVD
print(grid_search_1.best_params["rmse"])

# Fit the best model with training data
algo1.fit(trainset)
predictions = algo1.test(testset)
print("Prediction accuracy on B,", end=" ")
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

Grid Search...
{'n_factors': 20, 'n_epochs': 60, 'lr_all': 0.002, 'reg_all': 0.1}
Prediction accuracy on B, RMSE: 0.9900
0.9899871088094258
MAE:  0.7714
0.7713744753661325


## SVD++ Tuning

In [9]:
# Tuning hyperparameters for SVDpp

# Select your best algo with grid search.
print("Grid Search...")
param_grid_2 = {"n_factors": [20, 50, 80], "n_epochs": [20, 40, 60], "lr_all": [0.002, 0.005, 0.007], "reg_all": [0.02, 0.05, 0.1]}
grid_search_2 = GridSearchCV(SVDpp, param_grid_2, measures=["rmse"], cv=5, n_jobs=-1)
grid_search_2.fit(data)

algo2 = grid_search_2.best_estimator["rmse"]

# Best params of SVD++
print(grid_search_2.best_params["rmse"])

# Fit the best model with training data
algo2.fit(trainset)
predictions = algo2.test(testset)
print("Prediction accuracy on B,", end=" ")
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

Grid Search...




{'n_factors': 20, 'n_epochs': 60, 'lr_all': 0.002, 'reg_all': 0.1}
Prediction accuracy on B, RMSE: 0.9901
0.990103180179642
MAE:  0.7717
0.7717101192882551


## NMF Tuning

In [10]:
# Tuning hyperparameters for NMF

# Select your best algo with grid search.
print("Grid Search...")
param_grid_3 = {"n_factors": [15, 30, 50], "n_epochs": [30, 50, 80, 100], "reg_qi": [0.06, 0.1, 0.15], "reg_pu": [0.06, 0.1, 0.15]}
grid_search_3 = GridSearchCV(NMF, param_grid_3, measures=["rmse"], cv=5, n_jobs=-1)
grid_search_3.fit(data)

algo3 = grid_search_3.best_estimator["rmse"]

# Best params of NMF
print(grid_search_3.best_params["rmse"])

# Fit the best model with training data
algo3.fit(trainset)
predictions = algo3.test(testset)
print("Prediction accuracy on B,", end=" ")
print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

Grid Search...
{'n_factors': 50, 'n_epochs': 30, 'reg_qi': 0.15, 'reg_pu': 0.15}
Prediction accuracy on B, RMSE: 1.0082
1.0082434354159713
MAE:  0.7717
0.7717046027186136
