# FM Optuna Pipeline

Run Factorization Machines hyperparameter optimization with Optuna and MLflow tracking.

In [None]:
import sagemaker
from sagemaker import get_execution_role
import boto3

from pipelines.sagemaker_nb_pipeline import (
    define_steps_for_pipeline,
    execute_local_sagemaker_pipeline,
    execute_sagemaker_pipeline,
)
from utils.mlflow_helpers import setup_mlflow_server_access
from scripts.aggregate_optuna_studies import execute_study_agg_pipeline

# SageMaker session created with default arguments.
# NOTE(review): not referenced by any later cell visible here — confirm it is
# still needed (e.g. by cells or helpers outside this view).
sagemaker_session = sagemaker.Session()

## Configuration

In [None]:
# S3 location for notebook artifacts (replace the placeholder bucket name).
bucket = "your-bucket-name"
subfolder_name = "fm-training"
notebook_artifacts = f"s3://{bucket}/{subfolder_name}"

# Pipeline name, training notebook, and the kernel name passed with it.
pipeline_name = "fm-optuna-pipeline"
train_notebook = "fm_train.ipynb"
kernel_name = "python3"

# Compute settings: instance type and container image
# (replace the placeholder account/region in the image URI).
instance_type = "ml.m5.xlarge"
image_uri = "your-account.dkr.ecr.region.amazonaws.com/sagemaker-distribution:latest"

# Training parameters. The numeric settings are deliberately kept as strings
# (presumably because they are forwarded as notebook parameters — confirm).
max_trials, early_stopping = "20", "5"
experiment_name = "fm_gambling_optuna_2025"

## Define Training Configurations

Define different configurations to run in parallel.

In [None]:
# Dataset configurations to test, keyed by size label.
# Each row is (label, n_users, n_games, n_days); values stay strings.
config_dict = {
    label: {"n_users": users, "n_games": games, "n_days": days}
    for label, users, games, days in (
        ("small", "1000", "50", "90"),
        ("medium", "5000", "100", "180"),
        ("large", "10000", "200", "365"),
    )
}

## Define Pipeline Steps

In [None]:
# Study-level settings shared by every configuration; unpacked below as
# keyword arguments to define_steps_for_pipeline.
params = {
    "max_trials": max_trials,
    "experiment_name": experiment_name,
    "early_stopping": early_stopping,
}

# Build the pipeline steps from the dataset configurations and the
# notebook/compute settings defined in the configuration cells.
# Positional argument order must match the helper's signature, which is
# defined in pipelines.sagemaker_nb_pipeline (not visible here).
pipeline_steps = define_steps_for_pipeline(
    config_dict,
    image_uri,
    notebook_artifacts,
    train_notebook,
    kernel_name,
    instance_type,
    **params
)

# Quick sanity check on how many steps were generated.
# NOTE(review): presumably one step per entry in config_dict — confirm.
print(f"Created {len(pipeline_steps)} pipeline steps")

## Local Mode Execution (Testing)

Use local mode to test the pipeline before running on SageMaker.

In [None]:
# Uncomment to run locally
# execute_local_sagemaker_pipeline(pipeline_name, pipeline_steps)

## SageMaker Pipeline Execution

In [None]:
# Execute on SageMaker using the steps built above.
# `execution` keeps whatever execute_sagemaker_pipeline returns; it is not
# used again in the cells visible here.
# NOTE(review): unclear from this notebook whether the call blocks until the
# pipeline finishes — confirm before running the aggregation cells.
execution = execute_sagemaker_pipeline(pipeline_name, pipeline_steps)

## Aggregate Results

After the pipeline completes, aggregate the Optuna studies from all runs.

In [None]:
# Set up MLflow access for the named tracking server.
# NOTE(review): "mlflow-server" may be a placeholder like the bucket and
# image URI in the configuration cell — confirm the real server name.
mlflow_server_name = "mlflow-server"
setup_mlflow_server_access(mlflow_server_name)

In [None]:
# Aggregate the Optuna studies recorded under `experiment_name`.
# The folder and database name passed here match the storage URL loaded in
# the "View Best Results" cell (sqlite:///results/all_fm_studies.db), so keep
# the two in sync if either is changed.
execute_study_agg_pipeline(
    experiment_name=experiment_name,
    optuna_target_db="all_fm_studies.db",
    local_folder="results",
)

## View Best Results

In [None]:
import optuna

# Load aggregated study from the SQLite file under results/.
# NOTE(review): study_name "fm_gambling" differs from the experiment_name
# ("fm_gambling_optuna_2025") set in the configuration cell — confirm this is
# the study name the aggregation step actually writes.
study = optuna.load_study(
    study_name="fm_gambling",
    storage="sqlite:///results/all_fm_studies.db"
)

# best_value is negated before printing.
# NOTE(review): presumably the objective is negative RMSE (maximized) —
# confirm; if the study minimizes RMSE directly, this sign flip is wrong.
print(f"Best RMSE: {-study.best_value:.4f}")
print(f"Best params: {study.best_params}")