In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Reader, Dataset
from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv("../dataset/ratings_clean_dataset.csv", dtype = {
    'users': 'uint16',
    'books': 'int32',
    'ratings': 'int8',  # Use 'category' for categorical data
})
len(dataset)

877855

## GridSearch for multiple algorithms

In [3]:
# fraction of the dataset for grid search
_, subset = train_test_split(dataset, test_size=0.4, random_state=15) 
reader = Reader(rating_scale=(1, 5))
subset = Dataset.load_from_df(subset, reader)

In [None]:
# grid search
from surprise import SVD, KNNBasic, KNNWithMeans, BaselineOnly
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV


# Define algorithms and their parameter grids
param_grids = {
    
    "KNNBasic": {"SVD": {"n_factors": [50, 100], "lr_all": [0.005, 0.01], "reg_all": [0.02, 0.1]},
        "k": [20, 40],
        "min_k": [1, 3],
        "sim_options": {"name": ["msd", "cosine"], "user_based": [True, False]},
    },
    "KNNWithMeans": {
        "k": [20, 40],
        "min_k": [1, 3],
        "sim_options": {"name": ["msd", "cosine"], "user_based": [True, False]},
    },
    "BaselineOnly": {
        "bsl_options": {
            "method": ["sgd", "als"],
            "learning_rate": [0.005, 0.01],
            "reg": [0.02, 0.1],
        }
    },
}

# Run GridSearchCV for each algorithm
best_algorithms = {}
for algo_name, param_grid in param_grids.items():
    print(f"Tuning {algo_name}...")
    algo_class = eval(algo_name)  # Dynamically get the algorithm class
    gs = GridSearchCV(algo_class, param_grid, measures=["rmse", "mae"], cv=2)
    gs.fit(subset) 

    # Save the best algorithm and its parameters
    best_algorithms[algo_name] = {
        "best_algo": gs.best_estimator["rmse"],
        "best_params": gs.best_params["rmse"],
        "best_rmse": gs.best_score["rmse"],
    }

In [14]:
best_algorithms

NameError: name 'best_algorithms' is not defined

## Prepare data 

In [3]:
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1, 5))
trainval_df, test_df = train_test_split(dataset, test_size=0.10, random_state=15)
train_df, val_df = train_test_split(trainval_df, test_size=0.10, random_state=15)

# make train_set a Trainset object
train_set = Dataset.load_from_df(
    train_df, reader
).build_full_trainset()

# make validation and test set in the right format: list of tuples
val_set = list(val_df.itertuples(index=False, name=None))  
test_set = list(
    test_df.itertuples(index=False, name=None)
)  

# sizes of each set
print(f"Train set size: {train_set.n_ratings}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

Train set size: 711062
Validation set size: 79007
Test set size: 87786


In [7]:
# full data 
full_data = Dataset.load_from_df(dataset, reader)

In [4]:
# normalized data

from sklearn.model_selection import train_test_split

# split
train_dataset, test_dataset = train_test_split(
    dataset, test_size=0.20, random_state=15
)

# calculate mean and std on the train set
mean = train_dataset["ratings"].mean()
std = train_dataset["ratings"].std()

# z score on the train set
train_dataset["ratings"] = (train_dataset["ratings"] - mean) / std

train_highest = train_dataset["ratings"].max()
train_lowest = train_dataset["ratings"].min()

# z score on the test set
test_dataset["ratings"] = (test_dataset["ratings"] - mean) / std

test_highest = test_dataset["ratings"].max()
test_lowest = test_dataset["ratings"].min()

train_reader = Reader(rating_scale=(train_lowest, train_highest))
test_reader = Reader(rating_scale=(test_lowest, test_highest))

norm_train_set = Dataset.load_from_df(train_dataset, train_reader).build_full_trainset()
norm_test_set = list(test_dataset.itertuples(index=False, name=None))

In [5]:
from surprise import BaselineOnly
from surprise.accuracy import rmse

bsl_options = {"method": "sgd", "learning_rate": 0.01, "reg": 0.02}
bsl = BaselineOnly(bsl_options=bsl_options)

bsl.fit(train_set)
predictions = bsl.test(val_set)
rmse(predictions)
# RMSE: 0.7950

Estimating biases using sgd...
RMSE: 0.7950


0.7950371179528264

In [None]:
from surprise import KNNBasic
from surprise.accuracy import rmse

sim_options = {'k': 20,
   'min_k': 1,
   'sim_options': {'name': 'msd', 'user_based': True}}
knn = KNNBasic(bsl_options=bsl_options)

knn.fit(train_set)
predictions = knn.test(val_set)
rmse(predictions)
# RMSE: 0.7587

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7587


0.7586833755463397

In [8]:
from surprise.model_selection import GridSearchCV

param_grid = {"k": [20, 21, 25, 30], "min_k": [1], "sim_options": {"name": ["msd"], "user_based": [True]}}
gs = GridSearchCV(KNNBasic, param_grid, measures=["rmse"], cv=3)
gs.fit(full_data)


# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# RMSE 0.7745137508746671
# {"k": 20, "min_k": 1, "sim_options": {"name": "msd", "user_based": True}}

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.7745137508746671
{'k': 20, 'min_k': 1, 'sim_options': {'name': 'msd', 'user_based': True}}


## SVD - grid search

In [None]:
from surprise.model_selection import GridSearchCV

from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {
        "n_factors": [40, 50, 60, 75],
        "lr_all": [0.007, 0.01, 0.015],
        "reg_all": [0.015, 0.02, 0.025],
        "n_epochs": [40, 50, 70],
        "random_state": [15]
    }
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(full_data)


# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# 0.6789877072818665
# {"n_factors": 75, "lr_all": 0.015, "reg_all": 0.025, "n_epochs": 70, "random_state": 15}

0.6789877072818665
{'n_factors': 75, 'lr_all': 0.015, 'reg_all': 0.025, 'n_epochs': 70, 'random_state': 15}


In [17]:
from surprise.model_selection import GridSearchCV

from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {
    "n_factors": [85, 90, 95],
    "lr_all": [0.017],
    "reg_all": [0.025],
    "n_epochs": [90, 100, 110],
    "random_state": [15],
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(full_data)


# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# 0.6728158195384913
# { "n_factors": 90,  "lr_all": 0.017, "reg_all": 0.025,  "n_epochs": 110, "random_state": 15}

0.6728158195384913
{'n_factors': 90, 'lr_all': 0.017, 'reg_all': 0.025, 'n_epochs': 110, 'random_state': 15}


In [1]:
from surprise.model_selection import GridSearchCV

from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {
    "n_factors": [90],
    "lr_all": [0.017],
    "reg_all": [0.025],
    "n_epochs": [110],
    "random_state": [15],
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(full_data)


# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# RMSE: 0.6395057888211728
# {'n_factors': 90, 'lr_all': 0.017, 'reg_all': 0.025, 'n_epochs': 110, 'random_state': 15}

NameError: name 'full_data' is not defined

In [None]:
from surprise.model_selection import GridSearchCV

from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {
    "n_factors": [83, 87, 91],
    "lr_all": [0.015],
    "reg_all": [0.025],
    "n_epochs": [150, 170],
    "random_state": [15],
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(full_data)


# best RMSE score
print(f'best RMSE: {gs.best_score["rmse"]}')
print(f'best param RMSE: {gs.best_params["rmse"]}')


# combination of parameters that gave the best RMSE score
print(f'best MAE: {gs.best_score["mae"]}')
print(f'best param MAE: {gs.best_params["mae"]}')

# best RMSE: 0.6377000105746375
# best param RMSE: {'n_factors': 91, 'lr_all': 0.015, 'reg_all': 0.025, 'n_epochs': 170, 'random_state': 15}

best RMSE: 0.6377000105746375
best param RMSE: {'n_factors': 91, 'lr_all': 0.015, 'reg_all': 0.025, 'n_epochs': 170, 'random_state': 15}


KeyError: 'mae'

In [29]:
from surprise.model_selection import GridSearchCV

from surprise.prediction_algorithms.matrix_factorization import SVD

param_grid = {
    "n_factors": [91, 95],
    "lr_all": [0.017, 0.015],
    "reg_all": [0.025],
    "n_epochs": [170, 190],
    "random_state": [15],
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(full_data)


# best RMSE score
print(f'best RMSE: {gs.best_score["rmse"]}')
print(f'best param RMSE: {gs.best_params["rmse"]}')


# combination of parameters that gave the best RMSE score
print(f'best MAE: {gs.best_score["mae"]}')
print(f'best param MAE: {gs.best_params["mae"]}')


best RMSE: 0.6371719890906221
best param RMSE: {'n_factors': 95, 'lr_all': 0.017, 'reg_all': 0.025, 'n_epochs': 190, 'random_state': 15}
best MAE: 0.37288016676181374
best param MAE: {'n_factors': 91, 'lr_all': 0.015, 'reg_all': 0.025, 'n_epochs': 190, 'random_state': 15}


## svd with un-normalized data

In [34]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
        n_factors=95,
        lr_all=0.017,
        reg_all=0.025,
        n_epochs=200,
        random_state=15,
)
algo.fit(train_set)
predictions = algo.test(test_set)

accuracy.rmse(predictions, verbose=True)

# RMSE: 0.6509

RMSE: 0.6509


0.6509079237321987

## svd with normalized data

In [38]:
algo = SVD(
    n_factors=95,
    lr_all=0.017,
    reg_all=0.025,
    n_epochs=200,
    random_state=15,
)
algo.fit(norm_train_set)

predictions = algo.test(norm_test_set)
 
score = accuracy.rmse(predictions, verbose=True)

# 0.6228416701242567

RMSE: 0.6228


multiply the rmse for the standard deviation for it not to be influeced by the scale of the data
for reference: https://www.marinedatascience.co/blog/2019/01/07/normalizing-the-rmse/

In [39]:
score * std

0.6567803270145557

u-normalized data do marginally better after this comparison

#### more epochs

In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
    n_factors=95,
    lr_all=0.017,
    reg_all=0.025,
    n_epochs=250,
    random_state=15,
)
algo.fit(train_set)
predictions = algo.test(test_set)

accuracy.rmse(predictions, verbose=True)

# 210 RMSE: 0.6508
# 215 RMSE: 0.6507
# 220 RMSE: 0.6506
# 230 RMSE: 0.6505
# 230 RMSE: 0.6503

RMSE: 0.6503


0.6503028640398695

In [18]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
    n_factors=95,
    lr_all=0.017,
    init_std_dev=0.05,
    reg_all=0.025,
    n_epochs=250,
    random_state=15,
)
algo.fit(train_set)
predictions = algo.test(val_set)

accuracy.rmse(predictions, verbose=True)

# init_std_dev=0.05 RMSE: RMSE: 0.6456

RMSE: 0.6456


0.645560037758737

reg_all

In [None]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
    n_factors=95,
    lr_all=0.013,
    init_std_dev=0.05,
    reg_all=0.01,
    n_epochs=250,
    random_state=15,
)
algo.fit(train_set)
predictions = algo.test(val_set)

accuracy.rmse(predictions, verbose=True)

# reg_all 0.02: RMSE: 0.6442
# reg_all 0.018: RMSE: 0.6438
# reg_all=0.01 + lr_all=0.015
# reg_all=0.01 + lr_all=0.013 RMSE: 0.6428

RMSE: 0.6428


0.6427741724028159

again more epochs

In [4]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
    n_factors=95,
    lr_all=0.013,
    init_std_dev=0.05,
    reg_all=0.01,
    n_epochs=300,
    random_state=15,
)
algo.fit(train_set)
predictions = algo.test(val_set)

accuracy.rmse(predictions, verbose=True)

# n_epo0chs: 300


RMSE: 0.6426


0.6425996572136261

## Test

In [4]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

algo = SVD(
    n_factors=95,
    lr_all=0.015,
    init_std_dev=0.05,
    reg_all=0.015,
    n_epochs=300,
    random_state=15,
)
algo.fit(train_set)
predictions = algo.test(test_set)

accuracy.rmse(predictions, verbose=True)


RMSE: 0.6518


0.6518133940299382

In [5]:
predictions

[Prediction(uid=35373, iid=13651, r_ui=5, est=4.97052926337442, details={'was_impossible': False}),
 Prediction(uid=26811, iid=111342, r_ui=4, est=4.017913234646036, details={'was_impossible': False}),
 Prediction(uid=3236, iid=62155, r_ui=4, est=4.458590638449327, details={'was_impossible': False}),
 Prediction(uid=22505, iid=92158, r_ui=5, est=4.9727858633774185, details={'was_impossible': False}),
 Prediction(uid=3893, iid=54226, r_ui=5, est=4.600761363445462, details={'was_impossible': False}),
 Prediction(uid=30691, iid=55110, r_ui=1, est=3.6974857595728623, details={'was_impossible': False}),
 Prediction(uid=30391, iid=39589, r_ui=2, est=3.856261257128117, details={'was_impossible': False}),
 Prediction(uid=43, iid=95034, r_ui=5, est=3.705507046202907, details={'was_impossible': False}),
 Prediction(uid=16233, iid=52389, r_ui=5, est=4.988171932507364, details={'was_impossible': False}),
 Prediction(uid=25162, iid=36593, r_ui=3, est=4.047075217735121, details={'was_impossible': Fa

In [12]:
# Function to get top-N recommendations
def get_top_n_from_test(user_id, model, testset, n=10):
    # Filter predictions for the given user in the test set
    user_predictions = [pred for pred in testset if pred[0] == user_id]

    # Use the model to predict ratings for these items
    predictions = [model.predict(uid=pred[0], iid=pred[1]) for pred in user_predictions]

    # Sort predictions by estimated rating
    predictions.sort(key=lambda x: x.est, reverse=True)

    # Get the top-N recommendations
    top_n = [(pred.iid, pred.est) for pred in predictions[:n]]

    return top_n


# Example: Get top-10 recommendations for user with ID '196'
user_id = 35373
top_n_recommendations = get_top_n_from_test(user_id, algo, test_set, n=10)

# Display recommendations
for item_id, rating in top_n_recommendations:
    print(f"Item ID: {item_id}, Predicted Rating: {rating}")

Item ID: 13651, Predicted Rating: 4.97052926337442


In [11]:
test_set

[(35373, 13651, 5),
 (26811, 111342, 4),
 (3236, 62155, 4),
 (22505, 92158, 5),
 (3893, 54226, 5),
 (30691, 55110, 1),
 (30391, 39589, 2),
 (43, 95034, 5),
 (16233, 52389, 5),
 (25162, 36593, 3),
 (29669, 36001, 5),
 (31376, 95745, 5),
 (25595, 110034, 4),
 (10825, 38004, 4),
 (23019, 111342, 5),
 (17722, 48042, 5),
 (5380, 3410, 4),
 (28780, 67239, 5),
 (31311, 73281, 4),
 (16526, 103939, 4),
 (34191, 78247, 3),
 (7297, 55738, 3),
 (30795, 81068, 5),
 (23745, 61449, 4),
 (18655, 63539, 5),
 (4154, 29233, 5),
 (8246, 2377, 5),
 (36150, 84414, 5),
 (15216, 104679, 5),
 (32448, 97451, 3),
 (32305, 98192, 5),
 (34197, 111342, 5),
 (28294, 94244, 1),
 (13378, 72629, 5),
 (19310, 81068, 4),
 (12164, 32322, 3),
 (17178, 72976, 5),
 (17533, 70717, 5),
 (3004, 88224, 5),
 (25070, 86951, 5),
 (32148, 111344, 5),
 (8251, 86659, 5),
 (2824, 83409, 3),
 (28159, 108898, 5),
 (28470, 78565, 4),
 (3449, 22108, 5),
 (33582, 77617, 5),
 (33578, 112001, 4),
 (8443, 47438, 1),
 (23416, 50738, 3),
 (5872,