# Model Building
Train and tune a classification model using Optuna for hyperparameter optimisation.


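This notebook loads the engineered training features, uses Optuna to tune a RandomForest classifier (scored by 5-fold stratified cross-validated accuracy), then refits the best configuration on the full training set and saves the model, the trial history, and the best parameters.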


In [None]:
# Run configuration for the notebook: input locations, output locations,
# and the Optuna search budget / seed.
params = dict(
    # Inputs: engineered training table and the JSON describing its columns.
    train_path='data/train_features.csv',
    feature_metadata_path='data/feature_metadata.json',
    # Outputs: fitted model, per-trial history, and the winning hyperparameters.
    model_output_path='models/random_forest.pkl',
    study_output_path='models/optuna_trials.csv',
    best_params_path='models/best_params.json',
    # Search budget: number of trials and an optional time limit
    # (None means no time limit is applied).
    n_trials=20,
    timeout=None,
    # Seed shared by the sampler, the CV splitter and the forest.
    random_state=42,
)


In [None]:
from pathlib import Path
import json

import joblib
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# --- Resolve run configuration -------------------------------------------
# Rebind to a plain-dict shallow copy of the incoming parameters.
params = dict(params)
train_path = Path(params['train_path'])
metadata_path = Path(params['feature_metadata_path'])
model_output_path = Path(params['model_output_path'])
study_output_path = Path(params['study_output_path'])
best_params_path = Path(params['best_params_path'])
n_trials = int(params.get('n_trials', 20))
timeout = params.get('timeout')
# Accept both a real None and the string 'None' as "no time limit";
# anything else is coerced to a float.
timeout = None if timeout in (None, 'None') else float(timeout)
random_state = int(params.get('random_state', 42))

# Make sure every output file's parent directory exists before any work starts.
for output_path in (model_output_path, study_output_path, best_params_path):
    output_path.parent.mkdir(parents=True, exist_ok=True)

# --- Load training data and column metadata ------------------------------
train_df = pd.read_csv(train_path)
# Decode the metadata explicitly as UTF-8 rather than relying on the
# platform's locale encoding, so the JSON parses the same on every OS.
metadata = json.loads(metadata_path.read_text(encoding='utf-8'))
target_column = metadata['target_column']
feature_columns = metadata['feature_columns']

# Feature matrix and target vector used by the search and the final fit.
X = train_df[feature_columns]
y = train_df[target_column]

def objective(trial: optuna.Trial) -> float:
    """Score one hyperparameter configuration by cross-validated accuracy.

    Draws a RandomForest configuration from ``trial``, evaluates it on the
    module-level ``X``/``y`` with 5-fold shuffled stratified cross-validation,
    and returns the mean fold accuracy.

    Args:
        trial: Optuna trial used to suggest the hyperparameter values.

    Returns:
        Mean accuracy across the five folds.
    """
    # Suggestions are requested one per statement, in a fixed order.
    n_estimators = trial.suggest_int('n_estimators', 100, 500, step=50)
    max_depth = trial.suggest_int('max_depth', 3, 12)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 6)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=random_state,
        n_jobs=-1,
    )
    # Same seed for the splitter on every trial, so all configurations are
    # compared on identical folds.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    fold_accuracy = cross_val_score(
        forest, X, y, cv=folds, scoring='accuracy', n_jobs=-1
    )
    return fold_accuracy.mean()

# --- Hyperparameter search ------------------------------------------------
# Seeded TPE sampler; the study maximises the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=random_state)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=n_trials, timeout=timeout)

# --- Refit the winning configuration on the full training set -------------
# The tuned values are extended with the fixed, non-tuned settings
# used during the search.
best_params = {
    **study.best_trial.params,
    'random_state': random_state,
    'n_jobs': -1,
}
best_model = RandomForestClassifier(**best_params).fit(X, y)

# --- Persist artefacts: model, trial history, best parameters -------------
joblib.dump(best_model, model_output_path)

study_df = study.trials_dataframe()
study_df.to_csv(study_output_path, index=False)

with best_params_path.open('w') as fp:
    summary = {'best_params': best_params, 'best_value': study.best_value}
    json.dump(summary, fp, indent=2)

# Final expression of the cell: a short summary of the search.
dict(
    best_score=study.best_value,
    best_params=best_params,
    n_trials=len(study.trials),
)
