# Boosted Tree Hyperparameter Search

In [44]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import pandas as pd
import numpy as np

from startup import setup_environment
from src.modeling.training import train_and_log
from src.features.preprocessors import load_dataset, list_datasets
from src.modeling.evaluate import load_model


### Datasets

In [None]:
datasets = list_datasets()
names = ['dataset_v5_full_hourly_selected_rfecv', 'dataset_v5_full_hourly_selected_shap', 
         'dataset_v5_slim_hourly', 'dataset_v5_full_hourly']
data_ids = [
    datasets[datasets['dataset_name'] == name].sort_values('version', ascending=False).iloc[0]['run_id']
    for name in names 
    if (datasets['dataset_name'] == name).any()
]
print(data_ids)
datasets

### Regularisation configs

In [None]:

reg_configs = {
    'Model A (moderate)': {
        'n_estimators': 1000,
        'learning_rate': 0.02,
        'num_leaves': 31,
        'max_depth': 8,
        'min_child_samples': 50,
        'subsample': 0.7,
        'subsample_freq': 1,
        'colsample_bytree': 0.7,
        'reg_alpha': 5.0,
        'reg_lambda': 5.0,
        'min_gain_to_split': 0.1,
        'min_child_weight': 10,
        'boosting_type': 'gbdt',
        'verbosity': -1,
    },
    'Model B (strong)': {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'num_leaves': 21,
        'max_depth': 6,
        'min_child_samples': 100,
        'subsample': 0.5,
        'subsample_freq': 1,
        'colsample_bytree': 0.5,
        'reg_alpha': 10.0,
        'reg_lambda': 10.0,
        'min_gain_to_split': 0.5,
        'min_child_weight': 20,
        'boosting_type': 'gbdt',
        'verbosity': -1,
    },
    'Model C (very strong)': {
        'n_estimators': 1000,
        'learning_rate': 0.005,
        'num_leaves': 15,
        'max_depth': 4,
        'min_child_samples': 200,
        'subsample': 0.4,
        'subsample_freq': 1,
        'colsample_bytree': 0.4,
        'reg_alpha': 20.0,
        'reg_lambda': 20.0,
        'min_gain_to_split': 1.0,
        'min_child_weight': 50,
        'boosting_type': 'gbdt',
        'verbosity': -1,
    }
}




### Model configs

In [None]:
models = {
    'LGBM': {
        'class': lgb.LGBMRegressor,
        'param_map': lambda p: {k: v for k, v in p.items() if k not in ['subsample_freq']},
        'objective_map': {
            'default': {'objective': 'regression', 'metric': 'rmse'},
            'mae': {'objective': 'mae', 'metric': 'mae'},
            'huber': {'objective': 'huber', 'metric': 'huber', 'alpha': 10.0}
        }
    },
    'XGBoost': {
        'class': xgb.XGBRegressor,
        'param_map': lambda p: {
            'n_estimators': p['n_estimators'],
            'learning_rate': p['learning_rate'],
            'max_depth': p['max_depth'],
            'min_child_weight': p['min_child_samples'],
            'subsample': p['subsample'],
            'colsample_bytree': p['colsample_bytree'],
            'reg_alpha': p['reg_alpha'],
            'reg_lambda': p['reg_lambda'],
            'gamma': p['min_gain_to_split'],
            'verbosity': 0
        },
        'objective_map': {
            'default': {'objective': 'reg:squarederror', 'eval_metric': 'rmse'},
            'mae': {'objective': 'reg:absoluteerror', 'eval_metric': 'mae'},
            'huber': {'objective': 'reg:pseudohubererror', 'eval_metric': 'mae', 'huber_slope': 10.0}
        }
    },
    'CatBoost': {
        'class': cb.CatBoostRegressor,
        'param_map': lambda p: {
            'iterations': p['n_estimators'],
            'learning_rate': p['learning_rate'],
            'depth': p['max_depth'],
            'min_child_samples': p['min_child_samples'],
            'subsample': p['subsample'],
            'rsm': p['colsample_bytree'],
            'l2_leaf_reg': p['reg_lambda'],
            'verbose': False
        },
        'objective_map': {
            'default': {'loss_function': 'RMSE'},
            'mae': {'loss_function': 'MAE'},
            'huber': {'loss_function': 'Huber:delta=10'}
        }
    }
}

#### Restart logic

In [7]:
start_from = (data_ids[0], 'CatBoost', 'Model A (moderate)', 'default')  # adjust as needed
resume = True 

## Main model loop

In [None]:
all_run_ids = []

for data_id in data_ids:
    for model_name, model_info in models.items():
        for config_name, base_params in reg_configs.items():
            model_params = model_info['param_map'](base_params)
            config_letter = config_name[6]

            for obj_name, obj_params in model_info['objective_map'].items():
                current = (data_id, model_name, config_name, obj_name)
                if resume:
                    if current == start_from:
                        resume = False
                    else:
                        continue

                full_params = {**model_params, **obj_params}
                model = model_info['class'](**full_params)

                run_name = f"{model_name}_hourly_{config_letter}_{obj_name}"
                description = (f"{model_name} hourly global model, "
                               f"regularisation {config_name}, loss={obj_name}")
                tags = {
                    "model_family": "boosted_tree",
                    "model": model_name,
                    "regularisation_level": config_letter,
                    "loss": obj_name,
                    "dataset_id": data_id
                }

                run_id = train_and_log(
                    dataset_run_id=data_id,
                    model=model,
                    model_name=f"{model_name}_reg_{config_letter}",
                    target_transform="none",
                    experiment=f"{model_name}_hourly",
                    run_name=run_name,
                    description=description,
                    tags=tags,
                    group_size=24,
                    y_baseline=None,
                    test_size=0.1,
                    weight_half_life=730
                )
                all_run_ids.append(run_id)

## Evaluation

In [None]:
import mlflow
import pandas as pd

def get_run_metadata(run_id: str) -> dict:
    """Return metadata for a given MLflow run as a flat dictionary."""
    run = mlflow.get_run(run_id)
    info = run.info
    return {
        "run_name": info.run_name,
        "run_id": info.run_id,
        "experiment_id": info.experiment_id,
        "status": info.status,
        "start_time": pd.Timestamp(info.start_time, unit="ms", tz="UTC"),
        "end_time": pd.Timestamp(info.end_time, unit="ms", tz="UTC") if info.end_time else None,
        "artifact_uri": info.artifact_uri,
        "params": run.data.params,
        "metrics": run.data.metrics,
        "tags": run.data.tags,
    }

In [None]:
rows = []
for run_id in all_run_ids:
    run = mlflow.get_run(run_id)
    rows.append({
        "run_name": run.info.run_name,
        "dataset_name": run.data.params.get("dataset_name"),
        "model": run.data.tags.get("model"),
        "regularisation_level": run.data.tags.get("regularisation_level"),
        "loss": run.data.tags.get("loss"),
        "rmse": run.data.metrics.get("rmse"),
        "mae": run.data.metrics.get("mae"),
        "me": run.data.metrics.get("me"),
        "r2": run.data.metrics.get("r2"),
    })

results_df = pd.DataFrame(rows)
results_df

In [43]:
groupings = []
groupings.append(results_df.groupby('model').agg({'rmse': 'mean', 'mae': 'mean', 'me': 'mean', 'r2': 'mean'}).round(4).assign(grouping='model'))
groupings.append(results_df.groupby('regularisation_level').agg({'rmse': 'mean', 'mae': 'mean', 'me': 'mean', 'r2': 'mean'}).round(4).assign(grouping='regularisation_level'))
groupings.append(results_df.groupby('loss').agg({'rmse': 'mean', 'mae': 'mean', 'me': 'mean', 'r2': 'mean'}).round(4).assign(grouping='loss'))
groupings.append(results_df.groupby('dataset_name').agg({'rmse': 'mean', 'mae': 'mean', 'me': 'mean', 'r2': 'mean'}).round(4).assign(grouping='dataset_name'))
avg_df = pd.concat(groupings).reset_index()
avg_df

Unnamed: 0,index,rmse,mae,me,r2,grouping
0,CatBoost,22.9639,14.0237,0.9427,0.7965,model
1,LGBM,23.1844,13.6261,1.6908,0.7881,model
2,XGBoost,22.4497,13.4348,1.8729,0.8054,model
3,A,21.4566,13.019,1.8479,0.8226,regularisation_level
4,B,22.3889,13.3846,0.9693,0.8067,regularisation_level
5,C,24.7819,14.7797,1.5214,0.7605,regularisation_level
6,default,22.4109,13.9161,4.0216,0.8066,loss
7,huber,24.2556,14.5683,0.3438,0.7695,loss
8,mae,21.9609,12.699,-0.0268,0.8137,loss
9,dataset_v5_full_hourly,22.8712,13.6448,1.9377,0.7966,dataset_name


## Random search for refinement

In [None]:
best_data_id = '247ccd98f2614cc5aa0bf834c1f1835e'

param_ranges = {
    'n_estimators': (800, 1200),
    'learning_rate': (0.005, 0.03),
    'num_leaves': (15, 40),
    'max_depth': (4, 10),
    'min_child_samples': (30, 120),
    'subsample': (0.4, 0.8),
    'colsample_bytree': (0.4, 0.8),
    'reg_alpha': (2.0, 12.0),
    'reg_lambda': (2.0, 12.0),
    'min_gain_to_split': (0.05, 0.7),
    'min_child_weight': (5, 25),
}

weight_half_life_range = (365, 365 * 3)

In [54]:
models = {
    'LGBM': {
        'class': lgb.LGBMRegressor,
        'param_map': lambda p: {**p, 'objective': 'mae', 'metric': 'mae', 
                                 'verbosity': -1, 'boosting_type': 'gbdt'}
    },
    'XGBoost': {
        'class': xgb.XGBRegressor,
        'param_map': lambda p: {
            'n_estimators': p['n_estimators'],
            'learning_rate': p['learning_rate'],
            'max_depth': p['max_depth'],
            'min_child_weight': p['min_child_samples'],
            'subsample': p['subsample'],
            'colsample_bytree': p['colsample_bytree'],
            'reg_alpha': p['reg_alpha'],
            'reg_lambda': p['reg_lambda'],
            'gamma': p['min_gain_to_split'],
            'objective': 'reg:absoluteerror',
            'eval_metric': 'mae',
            'verbosity': 0
        }
    },
    'CatBoost': {
        'class': cb.CatBoostRegressor,
        'param_map': lambda p: {
            'iterations': p['n_estimators'],
            'learning_rate': p['learning_rate'],
            'depth': p['max_depth'],
            'min_child_samples': p['min_child_samples'],
            'subsample': p['subsample'],
            'rsm': p['colsample_bytree'],
            'l2_leaf_reg': p['reg_lambda'],
            'loss_function': 'MAE',
            'verbose': False
        }
    }
}


### Random search loop

In [None]:
np.random.seed(123)
n_trials = 30
refinement_run_ids = []

int_params = {'n_estimators', 'num_leaves', 'max_depth', 'min_child_samples', 'min_child_weight'}

for trial in range(n_trials):
    sampled_lgb_params = {
        key: np.random.randint(int(round(low)), int(round(high)) + 1)
        if key in int_params else np.random.uniform(low, high)
        for key, (low, high) in param_ranges.items()
    }

    whl_low, whl_high = weight_half_life_range
    weight_half_life = np.random.randint(whl_low, whl_high + 1)

    for model_name, model_info in models.items():
        model_params = model_info['param_map'](sampled_lgb_params)
        model = model_info['class'](**model_params)

        run_name = f"{model_name}_rand_trial_{trial:02d}"
        tags = {
            "model_family": "boosted_tree",
            "model": model_name,
            "strategy": "hourly_global",
            "loss": "mae",
            "refinement": "random_search",
            "trial": str(trial)
        }

        run_id = train_and_log(
            dataset_run_id=best_data_id,
            model=model,
            model_name=f"{model_name}_rand_trial_{trial:02d}",
            target_transform="none",
            experiment=f"{model_name}_hourly",
            run_name=run_name,
            description=f"{model_name} random search trial {trial}",
            tags=tags,
            group_size=24,
            y_baseline=None,
            test_size=0.1,
            weight_half_life=730
        )
        refinement_run_ids.append(run_id)

print(f"Completed {n_trials * len(models)} random search runs.")