# FM Training Notebook

This notebook runs as a SageMaker notebook job that tunes a Factorization Machine (FM) model with Optuna.
The pipeline injects the parameter values at run time.

In [36]:
import logging
import sys
from datetime import datetime
from functools import partial
from pathlib import Path
import os
import mlflow
import numpy as np
import optuna
import pandas as pd
import boto3
from data.simulate_gambling_data import generate_gambling_dataset
from scripts.fm_encoding import FMEncoder, create_user_item_matrix
from scripts.factorization_machines import LocalFMSimulator

In [37]:
# Parameters - these will be overwritten by the pipeline
# Every default is a string; the numeric ones are converted with int() further down.
config_name = "default"  # label embedded in the Optuna study name and the MLflow run name
n_users = "5000"  # forwarded to generate_gambling_dataset(n_users=...)
n_games = "100"  # forwarded to generate_gambling_dataset(n_games=...)
n_days = "180"  # forwarded to generate_gambling_dataset(n_days=...); also drives the train/validation cutoff
max_trials = "20"  # n_trials passed to study.optimize
early_stopping = "5"  # look-back window (rounds) handed to early_stopping_callback
experiment_name = "fm_gambling_optuna"  # MLflow experiment selected with mlflow.set_experiment
use_feature_store = "false"  # stays a string; only printed and logged as an MLflow param in this notebook
project_name = "fm-gambling-recommender"  # NOTE(review): not referenced by any visible cell - confirm it is still needed

In [38]:

# Look up the calling account through STS and the region of the default boto3 session
sts = boto3.client("sts")
caller_identity = sts.get_caller_identity()
account_id = caller_identity["Account"]

session = boto3.session.Session()
region = session.region_name

print(f"AWS Account: {account_id}")
print(f"Region: {region}")

AWS Account: 376337229415
Region: us-east-1


In [39]:
# The pipeline hands every parameter over as a string; cast the numeric ones to int
n_users, n_games, n_days, max_trials, early_stopping = (
    int(raw) for raw in (n_users, n_games, n_days, max_trials, early_stopping)
)

# MLflow settings: enable system-metrics logging and target the SageMaker MLflow app by ARN
os.environ.update(
    {
        "MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING": "true",
        "MLFLOW_TRACKING_URI": "arn:aws:sagemaker:us-east-1:376337229415:mlflow-app/app-JZITH5VWKAWZ",
    }
)


print(f"Config: {config_name}")
print(f"Users: {n_users}, Games: {n_games}, Days: {n_days}")
print(f"Max trials: {max_trials}, Early stopping: {early_stopping}")
print(f"Use Feature Store: {use_feature_store}")

Config: default
Users: 5000, Games: 100, Days: 180
Max trials: 20, Early stopping: 5
Use Feature Store: false


In [40]:

# Configure the root logger at INFO so the progress messages below appear in the job output.
logging.basicConfig(level=logging.INFO)
# Logger shared by the remaining cells and by the Optuna objective.
logger = logging.getLogger(__name__)

## Generate Data

In [41]:
# Simulate the dataset from the injected sizes; the session average and seed are fixed here.
logger.info("Generating gambling dataset...")
dataset_options = {
    "n_users": n_users,
    "n_games": n_games,
    "n_days": n_days,
    "avg_sessions_per_user": 30,
    "seed": 42,
}
# The generator's fourth return value is not used by this notebook.
users, games, interactions, _ = generate_gambling_dataset(**dataset_options)

print(f"Generated {len(users)} users, {len(games)} games, {len(interactions)} interactions")

INFO:__main__:Generating gambling dataset...


Generated 5000 users, 100 games, 150000 interactions


## Prepare Data

In [42]:
# Chronological split: the cutoff sits int(0.8 * n_days) days after the earliest summary_date
interactions["summary_date"] = pd.to_datetime(interactions["summary_date"])
summary_dates = interactions["summary_date"]

train_days = int(n_days * 0.8)
min_date = summary_dates.min()
cutoff_date = min_date + pd.Timedelta(days=train_days)

# Rows dated on or before the cutoff train the model; strictly later rows validate it
train_df = interactions.loc[summary_dates <= cutoff_date].copy()
valid_df = interactions.loc[summary_dates > cutoff_date].copy()

print(f"Train: {len(train_df)}, Validation: {len(valid_df)}")

Train: 120486, Validation: 29514


In [43]:
# Join user and game attributes onto each interaction, then encode both splits
encoder = FMEncoder()
categorical_cols = ["game_type", "vip_tier", "region"]
encode_options = {"target_col": "bet_qty", "categorical_cols": categorical_cols}


def _with_attributes(frame):
    """Return ``frame`` merged with ``users`` on user_id and ``games`` on game_id."""
    return frame.merge(users, on="user_id").merge(games, on="game_id")


train_features = _with_attributes(train_df)
valid_features = _with_attributes(valid_df)

# The encoder is fitted on the training split; the validation split only goes through transform()
X_train, y_train = encoder.fit_transform(train_features, **encode_options)
X_valid, y_valid = encoder.transform(valid_features, **encode_options)

print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")

## Run Optuna Optimization

In [None]:
# Bundle the encoded splits under the keys objective() reads; the dict is bound to it with functools.partial.
data = {
    "X_train": X_train,
    "y_train": y_train,
    "X_valid": X_valid,
    "y_valid": y_valid,
}

def objective(trial, data):
    """Train and score one FM configuration for a single Optuna trial.

    The trial runs inside a nested MLflow run named ``Trial-<number>``; the
    sampled hyperparameters and both scores are logged to that run.

    Args:
        trial: Optuna trial used to sample ``num_factors`` (8-64),
            ``epochs`` (10-30) and ``learning_rate`` (0.001-0.1, log scale).
        data: Mapping with the keys ``X_train``, ``y_train``, ``X_valid``
            and ``y_valid``.

    Returns:
        The negated validation score (logged as ``valid_rmse``), so a study
        that maximizes the objective minimizes that score.
    """
    with mlflow.start_run(run_name=f"Trial-{trial.number}", nested=True):
        # Sampled in a fixed order: num_factors, epochs, learning_rate.
        hyperparams = {
            "num_factors": trial.suggest_int("num_factors", 8, 64),
            "epochs": trial.suggest_int("epochs", 10, 30),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        }
        mlflow.log_params(hyperparams)

        model = LocalFMSimulator(**hyperparams)
        model.fit(data["X_train"], data["y_train"])

        # Score the training split first, then the validation split.
        scores = {
            "train_rmse": model.score(data["X_train"], data["y_train"]),
            "valid_rmse": model.score(data["X_valid"], data["y_valid"]),
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        valid_rmse = scores["valid_rmse"]
        logger.info(f"Trial {trial.number}: RMSE={valid_rmse:.4f}")
        return -valid_rmse


def early_stopping_callback(study, trial, rounds=5):
    """Stop the study once the last ``rounds`` trials all trail the best value.

    Written for a study that maximizes its objective: the study is stopped
    when every one of the most recent ``rounds`` trials has a value at least
    0.001 below ``study.best_value``. Nothing happens while fewer than
    ``rounds`` trials exist or while any trial in the window has no value.

    Args:
        study: Study being optimized; its ``trials``, ``best_value`` and
            ``stop()`` are used.
        trial: The trial that just finished. Unused, but part of the
            callback signature.
        rounds: Size of the look-back window. Values <= 0 disable early
            stopping (a non-positive window would otherwise make the
            ``[-rounds:]`` slice select the wrong trials).
    """
    if rounds <= 0:
        return

    # Read the trial list once and reuse it for both the length check and
    # the window; Optuna documents Study.trials as get_trials(deepcopy=True).
    trials = study.trials
    if len(trials) < rounds:
        return

    recent = [t.value for t in trials[-rounds:] if t.value is not None]
    if len(recent) < rounds:
        return

    best_value = study.best_value
    if best_value is not None and max(recent) <= best_value - 0.001:
        study.stop()

In [None]:
# Select the MLflow experiment and derive a timestamped study name for this execution
mlflow.set_experiment(experiment_name)
current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
study_name = f"fm_{config_name}_{current_time}"

# Optuna's SQLite file lives under ./results, created on first use
results_dir = Path("results")
results_dir.mkdir(exist_ok=True)
storage_path = f"results/optuna_{study_name}.db"

In [None]:
# Parent MLflow run for the whole search; objective() opens one nested run per trial.
with mlflow.start_run(run_name=f"FM-{config_name}-{current_time}"):
    mlflow.log_params({
        "config_name": config_name,
        "n_users": n_users,
        "n_games": n_games,
        "n_days": n_days,
        "max_trials": max_trials,
        # Logged next to max_trials: both settings decide when the search ends.
        "early_stopping": early_stopping,
        "aws_account_id": account_id,
        "aws_region": region,
        "use_feature_store": use_feature_store,
    })

    # Create study. The objective returns the negated validation RMSE, hence "maximize".
    study = optuna.create_study(
        direction="maximize",
        study_name=study_name,
        storage=f"sqlite:///{storage_path}",
        load_if_exists=True,
    )

    # Run optimization with the encoded data and the early-stopping window bound in
    fn = partial(objective, data=data)
    es_callback = partial(early_stopping_callback, rounds=early_stopping)

    study.optimize(
        fn,
        n_trials=max_trials,
        callbacks=[es_callback],
        gc_after_trial=True,
    )

    # Log results; the sign is flipped back so the metric is the RMSE itself
    mlflow.log_params({f"best_{k}": v for k, v in study.best_params.items()})
    mlflow.log_metric("best_valid_rmse", -study.best_value)

    # Save artifacts: the per-trial table and the Optuna SQLite database
    trials_df = study.trials_dataframe()
    trials_path = f"results/{study_name}_trials.parquet"
    trials_df.to_parquet(trials_path)
    mlflow.log_artifact(trials_path, artifact_path="trials")
    mlflow.log_artifact(storage_path, artifact_path="optuna_db")

# No explicit mlflow.end_run() here: leaving the with-block already ends the run.

## Results

In [None]:
# Report the best trial; best_value holds the negated RMSE, so flip the sign for display
print(f"\nBest RMSE: {-study.best_value:.4f}")
print("Best parameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

In [None]:
# Show the per-trial table; as the cell's last expression it is rendered by the notebook.
study.trials_dataframe()