In [22]:
import pandas as pd
from surprise import Reader, Dataset
from sklearn.model_selection import train_test_split

In [23]:
# Single random seed reused by the train/validation/test splits and by the SVD
# runs in this notebook (passed as random_state).
seed = 42

In [24]:
# Requested column dtypes for the cleaned ratings CSV.
# NOTE(review): these keys ('users', 'books', 'ratings') differ from the column
# names selected later in this notebook ('user', 'book_id', 'rating'). If the
# CSV has no columns with exactly these names, the mapping presumably has no
# effect -- confirm against the file header and rename the keys if so.
column_dtypes = {
    'users': 'uint16',
    'books': 'int32',
    'ratings': 'int8',
}
dataset = pd.read_csv("../datasets/clean_dataset.csv", dtype=column_dtypes)

# Row count, displayed as the cell's output.
len(dataset)

604906

In [25]:
# Keep only the (user, item, rating) triple, in the column order that
# surprise's Dataset.load_from_df expects. Selecting the wanted columns directly
# (instead of first dropping "categories", "description" and "title") yields the
# same frame and keeps the cell re-runnable: a second run no longer raises
# KeyError for columns that are already gone.
dataset = dataset[["user", "book_id", "rating"]]

## GridSearch for multiple algorithms

In [27]:
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader

# Ratings are parsed on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# First hold out 10% of all rows as the test split, then hold out 10% of the
# remainder as the validation split. Both splits use the shared seed.
trainval_df, test_df = train_test_split(dataset, test_size=0.10, random_state=seed)
train_df, val_df = train_test_split(trainval_df, test_size=0.10, random_state=seed)

# make train_set a Trainset object
train_set = Dataset.load_from_df(train_df, reader).build_full_trainset()

# make validation and test set in the right format: list of tuples
# (the test split was previously never converted, so only val_set existed)
val_set = list(val_df.itertuples(index=False, name=None))
test_set = list(test_df.itertuples(index=False, name=None))

# sizes of each set
print(f"Train set size: {train_set.n_ratings}")
print(f"Validation set size: {len(val_set)}")

Train set size: 489973
Validation set size: 54442


In [6]:
# Use a 40% sample of the training rows for the grid search.
# - random_state uses the shared `seed` instead of a second hard-coded 42.
# - The unused 60% part is discarded by indexing, not by assigning to `_`,
#   which is IPython's "last output" variable.
# - The sampled DataFrame gets its own name so `subset` only ever refers to the
#   surprise Dataset consumed by the grid search.
subset_df = train_test_split(train_df, test_size=0.4, random_state=seed)[1]
reader = Reader(rating_scale=(1, 5))
subset = Dataset.load_from_df(subset_df, reader)

In [None]:
# grid search
from surprise import SVD, KNNBasic, KNNWithMeans, BaselineOnly
from surprise.accuracy import rmse
from surprise.model_selection import GridSearchCV


# Parameter grid per algorithm, keyed by the algorithm's class name.
param_grids = {
    "SVD": {"n_factors": [50, 100], "lr_all": [0.005, 0.01], "reg_all": [0.02, 0.1]},
    "KNNBasic": {
        "k": [20, 40],
        "min_k": [1, 3],
        "sim_options": {"name": ["cosine"], "user_based": [True, False]},
    },
    "KNNWithMeans": {
        "k": [20, 40],
        "min_k": [1, 3],
        "sim_options": {"name": ["cosine"], "user_based": [True, False]},
    },
    "BaselineOnly": {
        "bsl_options": {
            "method": ["sgd", "als"],
            "learning_rate": [0.005, 0.01],
            "reg": [0.02, 0.1],
        }
    },
}

# Explicit name -> class lookup. This replaces eval() on the key string: a typo
# in a key now fails with a clear KeyError instead of evaluating arbitrary text.
algo_classes = {
    "SVD": SVD,
    "KNNBasic": KNNBasic,
    "KNNWithMeans": KNNWithMeans,
    "BaselineOnly": BaselineOnly,
}

# Run a 2-fold GridSearchCV (RMSE only) for each algorithm on `subset` and keep
# the winning configuration of each.
best_algorithms = {}
for algo_name, param_grid in param_grids.items():
    print(f"Tuning {algo_name}...")
    algo_class = algo_classes[algo_name]
    gs = GridSearchCV(algo_class, param_grid, measures=["rmse"], cv=2)
    gs.fit(subset)

    # Save the best algorithm instance, its parameters and its RMSE score
    best_algorithms[algo_name] = {
        "best_algo": gs.best_estimator["rmse"],
        "best_params": gs.best_params["rmse"],
        "best_rmse": gs.best_score["rmse"],
    }

In [None]:
# Recorded output of a previous run, kept for reference. It used to be a
# trailing triple-quoted string, which made that string (not the live dict) the
# value displayed by this cell.
# NOTE(review): the KNN entries below report 'msd' similarity, while the grid
# in this notebook only lists "cosine" -- presumably recorded with a different
# grid; confirm or re-run.
#
# {'SVD': {'best_algo': <surprise.prediction_algorithms.matrix_factorization.SVD at 0x3797f7800>,
#   'best_params': {'n_factors': 50, 'lr_all': 0.01, 'reg_all': 0.02},
#   'best_rmse': 0.8356303051692147},
#  'KNNBasic': {'best_algo': <surprise.prediction_algorithms.knns.KNNBasic at 0x1544c63f0>,
#   'best_params': {'k': 20,
#    'min_k': 1,
#    'sim_options': {'name': 'msd', 'user_based': True}},
#   'best_rmse': 0.8792703092704963},
#  'KNNWithMeans': {'best_algo': <surprise.prediction_algorithms.knns.KNNWithMeans at 0x31e2521e0>,
#   'best_params': {'k': 20,
#    'min_k': 3,
#    'sim_options': {'name': 'msd', 'user_based': False}},
#   'best_rmse': 0.8918201340711682},
#  'BaselineOnly': {'best_algo': <surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x127e4e780>,
#   'best_params': {'bsl_options': {'method': 'sgd',
#     'learning_rate': 0.01,
#     'reg': 0.02}},
#   'best_rmse': 0.8890501088103555}}

# Last expression of the cell, so the current results are displayed.
best_algorithms

### fine-tuning SVD

In [28]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# Starting point of the manual SVD tuning: 200 epochs.
svd_params = {
    "n_factors": 95,
    "lr_all": 0.017,
    "reg_all": 0.025,
    "n_epochs": 200,
    "random_state": seed,
}
algo = SVD(**svd_params)
algo.fit(train_set)

# Score on the validation split. rmse() prints the value (verbose=True) and,
# being the cell's last expression, its return value is displayed as well.
predictions = algo.test(val_set)
accuracy.rmse(predictions, verbose=True)

# RMSE: 0.6509

RMSE: 0.6710


0.6709833112719381

#### more epochs

In [29]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# Same configuration as the 200-epoch run, with n_epochs raised to 250.
svd_params = {
    "n_factors": 95,
    "lr_all": 0.017,
    "reg_all": 0.025,
    "n_epochs": 250,
    "random_state": seed,
}
algo = SVD(**svd_params)
algo.fit(train_set)

# Score on the validation split. rmse() prints the value (verbose=True) and,
# being the cell's last expression, its return value is displayed as well.
predictions = algo.test(val_set)
accuracy.rmse(predictions, verbose=True)

# 210 RMSE: 0.6508
# 215 RMSE: 0.6507
# 220 RMSE: 0.6506
# 230 RMSE: 0.6505
# 230 RMSE: 0.6503

RMSE: 0.6703


0.6703318604796966

In [30]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# 250-epoch configuration with init_std_dev set explicitly to 0.05.
svd_params = {
    "n_factors": 95,
    "lr_all": 0.017,
    "init_std_dev": 0.05,
    "reg_all": 0.025,
    "n_epochs": 250,
    "random_state": seed,
}
algo = SVD(**svd_params)
algo.fit(train_set)

# Score on the validation split. rmse() prints the value (verbose=True) and,
# being the cell's last expression, its return value is displayed as well.
predictions = algo.test(val_set)
accuracy.rmse(predictions, verbose=True)

# init_std_dev=0.05 RMSE: 0.6456

RMSE: 0.6685


0.6685258005938344

#### modifying reg_all

In [31]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# Lower regularisation (reg_all=0.01) combined with a lower learning rate
# (lr_all=0.013); other settings as in the init_std_dev run.
svd_params = {
    "n_factors": 95,
    "lr_all": 0.013,
    "init_std_dev": 0.05,
    "reg_all": 0.01,
    "n_epochs": 250,
    "random_state": seed,
}
algo = SVD(**svd_params)
algo.fit(train_set)

# Score on the validation split. rmse() prints the value (verbose=True) and,
# being the cell's last expression, its return value is displayed as well.
predictions = algo.test(val_set)
accuracy.rmse(predictions, verbose=True)

# reg_all 0.02: RMSE: 0.6442
# reg_all 0.018: RMSE: 0.6438
# reg_all=0.01 + lr_all=0.015
# reg_all=0.01 + lr_all=0.013 RMSE: 0.6428

RMSE: 0.6659


0.665886525561939

#### again more epochs

In [32]:
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy

# reg_all=0.01 / lr_all=0.013 configuration with n_epochs raised to 300.
svd_params = {
    "n_factors": 95,
    "lr_all": 0.013,
    "init_std_dev": 0.05,
    "reg_all": 0.01,
    "n_epochs": 300,
    "random_state": seed,
}
algo = SVD(**svd_params)
algo.fit(train_set)

# Score on the validation split. rmse() prints the value (verbose=True) and,
# being the cell's last expression, its return value is displayed as well.
predictions = algo.test(val_set)
accuracy.rmse(predictions, verbose=True)

# n_epochs: 300


RMSE: 0.6657


0.6656678535104626