In [None]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import PROCESSED_DATA_PATH, CONFIG_PATH

from utils.data_split import temporal_train_test_split
from utils.metrics import get_top_n, precision_recall_at_k
from utils.config_loader import load_config

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

import optuna


In [None]:
# Read the project settings file and pull out the sections used by this notebook
config = load_config(CONFIG_PATH / "settings.yaml")
model_cfg, model_features_cfg = config["model"], config["model_features"]
preproc_cfg, svd_hyperparams = config["preprocessing"], config["svd_hyperparams"]

In [None]:
# Location of the preprocessed dataset, then load it into a DataFrame
processed_data_path = PROCESSED_DATA_PATH / "processed_data.parquet"
data = pd.read_parquet(path=processed_data_path)

In [None]:
# Keep only the leading share of rows configured under `data_sample_fraction`
sample_fraction = model_cfg['data_sample_fraction']
n_sample_rows = int(len(data) * sample_fraction)
df = data.iloc[:n_sample_rows]
print(f"Using {sample_fraction*100}% of samples ({len(df)} rows)")

# Train Test Split

In [None]:
# Split the sampled frame with the project's temporal split helper;
# the held-out share comes from the model config.
train_df, test_df = temporal_train_test_split(
    df,
    test_size=model_cfg['test_size'],
)

# Model

In [None]:
# Columns handed to Surprise, in the order it expects: user id, item id, rating
model_features_list = [model_features_cfg['user'], model_features_cfg['item'], model_features_cfg['rating']]

reader = Reader(rating_scale=(preproc_cfg['min_rating'], preproc_cfg['max_rating']))
# Use a dedicated name for the Surprise dataset: rebinding `data` would clobber
# the DataFrame loaded earlier and break any re-run of the sampling cell.
surprise_train_data = Dataset.load_from_df(train_df[model_features_list], reader)

# All training rows go into the Surprise trainset (no further split happens here);
# the test set is a plain list of (user, item, rating) tuples built from test_df.
trainset = surprise_train_data.build_full_trainset()
testset = list(zip(*(test_df[col] for col in model_features_list)))

# Train the baseline model with Surprise's default SVD hyperparameters
model = SVD()
model.fit(trainset)

# Evaluation

In [None]:
# Score the held-out (user, item, rating) tuples with the baseline model,
# then compute RMSE (verbose=True prints the value as well as returning it).
predictions = model.test(testset)
rmse = accuracy.rmse(predictions, verbose=True)

In [None]:
# Cut-off used for both the top-N lists and the precision/recall metrics
top_k = model_cfg['top_n']

# Get top-N predictions via the project's helper
top_n = get_top_n(predictions, n=top_k)

# Calculate precision and recall at k; `threshold` is forwarded from the model config
precision, recall = precision_recall_at_k(predictions, k=top_k, threshold=model_cfg['threshold'])

# Label the metrics with the configured k rather than a hard-coded 10,
# so the output stays truthful when `top_n` is changed in the settings.
print(f"Precision@{top_k}: {precision:.4f}")
print(f"Recall@{top_k}: {recall:.4f}")


# Optimized model

In [None]:
import optuna
from surprise import SVD, accuracy

def objective(trial, config, fit_set=None, eval_set=None):
    """Optuna objective: fit an SVD with sampled hyperparameters and return its RMSE.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        config: Mapping holding the search bounds (``n_factors_min``/``n_factors_max``,
            ``n_epochs_min``/``n_epochs_max``, ``lr_all_min``/``lr_all_max``,
            ``reg_all_min``/``reg_all_max``).
        fit_set: Surprise trainset to fit on. Defaults to the notebook-level
            ``trainset`` when omitted.
        eval_set: List of (user, item, rating) tuples to score. Defaults to the
            notebook-level ``testset`` when omitted (the original behaviour);
            pass a validation set to avoid tuning on the test data.

    Returns:
        RMSE of the fitted model on ``eval_set`` (the study minimizes it).
    """
    if fit_set is None:
        fit_set = trainset
    if eval_set is None:
        eval_set = testset

    params = {
        'n_factors': trial.suggest_int('n_factors', config['n_factors_min'], config['n_factors_max']),
        'n_epochs': trial.suggest_int('n_epochs', config['n_epochs_min'], config['n_epochs_max']),
        # Learning rate and regularization are sampled on a log scale
        'lr_all': trial.suggest_float('lr_all', config['lr_all_min'], config['lr_all_max'], log=True),
        'reg_all': trial.suggest_float('reg_all', config['reg_all_min'], config['reg_all_max'], log=True),
    }

    candidate = SVD(**params)
    candidate.fit(fit_set)
    # verbose=False keeps the per-trial RMSE out of the cell output
    return accuracy.rmse(candidate.test(eval_set), verbose=False)


# Tune on a validation split carved out of the training data. Selecting
# hyperparameters against `testset` would leak the test data into model
# selection and make the final test metrics optimistic.
tune_train_df, valid_df = temporal_train_test_split(train_df, test_size=model_cfg['test_size'])
tune_trainset = Dataset.load_from_df(tune_train_df[model_features_list], reader).build_full_trainset()
validset = list(zip(*(valid_df[col] for col in model_features_list)))

study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, svd_hyperparams, fit_set=tune_trainset, eval_set=validset),
    n_trials=model_cfg['n_trials'],
)

# Best RMSE here is measured on the validation split, not on the test set
print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)


In [None]:
# Take the best hyperparameters found by the Optuna study and refit an SVD
# with them on the full `trainset`.
best_params = study.best_params
final_model = SVD(**best_params)
# Left as the last expression so the cell shows the result of the fit call
final_model.fit(trainset)

In [None]:
# Score the test tuples with the tuned model (rebinds `predictions`)
predictions = final_model.test(testset)

# Cut-off used for both the top-N lists and the precision/recall metrics
top_k = model_cfg['top_n']

# Get top-N predictions via the project's helper
top_n = get_top_n(predictions, n=top_k)

# Calculate precision and recall at k; `threshold` is forwarded from the model config
precision, recall = precision_recall_at_k(predictions, k=top_k, threshold=model_cfg['threshold'])

# Label the metrics with the configured k rather than a hard-coded 10,
# so the output stays truthful when `top_n` is changed in the settings.
print(f"Precision@{top_k}: {precision:.4f}")
print(f"Recall@{top_k}: {recall:.4f}")