# Model Building

### Import Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle
import warnings
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score,
    precision_score, recall_score
)

# Suppress every UserWarning and pandas PerformanceWarning for the rest of the session
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)

### Load Datasets

In [2]:
# Read the train/test feature and label CSVs; pd.read_csv returns a DataFrame for each file
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "train_test_datasets/X_train2.csv",
        "train_test_datasets/X_test2.csv",
        "train_test_datasets/y_train2.csv",
        "train_test_datasets/y_test2.csv",
    )
)

### Function to build models on different hyperparameters

We can now tune a Random Forest and an XGBoost classifier with Bayesian hyperparameter optimisation. For each model, the best configuration is chosen by cross-validated ROC-AUC on the training set and then evaluated once on the test set. The function returns a DataFrame with the test-set performance metrics of the best version of each model, sorted by ROC-AUC:

In [3]:

def train_and_evaluate_models(X_train, X_test, y_train, y_test, n_iter=25, cv=3, n_jobs=-1):
    """
    Tunes Random Forest and XGBoost classifiers with Bayesian optimisation and
    scores the best configuration of each on the test set.

    For each model, BayesSearchCV selects the hyperparameters with the highest
    cross-validated ROC-AUC on the training data. The best estimator is then
    evaluated once on the test data, and those test-set metrics are reported.

    Parameters:
    ----------
    X_train : pd.DataFrame
        Training feature set.
    X_test : pd.DataFrame
        Test feature set.
    y_train : pd.Series, np.ndarray or single-column pd.DataFrame
        Training target labels (binary). A single-column DataFrame, such as
        the one pd.read_csv returns for a label file, is flattened to a Series.
    y_test : pd.Series, np.ndarray or single-column pd.DataFrame
        Test target labels; handled like y_train.
    n_iter : int
        Number of optimisation iterations for BayesSearchCV.
    cv : int
        Cross-validation setting passed to BayesSearchCV (an int is the number
        of folds). Defaults to 3.
    n_jobs : int
        Number of parallel jobs passed to BayesSearchCV. Defaults to -1.

    Returns:
    -------
    pd.DataFrame
        One row per model, sorted by test-set ROC-AUC (highest first), with the
        columns "Model", "Best Params", "ROC-AUC", "Accuracy", "F1-Score",
        "Precision" and "Recall".

    Raises:
    ------
    ValueError
        If the features and labels of a split have different numbers of rows.
    """

    def _as_1d(labels):
        # A label CSV read with pd.read_csv is an (n, 1) DataFrame; the estimators
        # and metrics expect a 1-D target, so take the single column as a Series.
        if isinstance(labels, pd.DataFrame) and labels.shape[1] == 1:
            return labels.iloc[:, 0]
        return labels

    def _n_rows(data):
        # Prefer .shape so array-likes that do not support len() still work.
        shape = getattr(data, "shape", None)
        return shape[0] if shape else len(data)

    y_train = _as_1d(y_train)
    y_test = _as_1d(y_test)

    # Fail fast: a mismatched test split would otherwise only surface after the
    # first (slow) hyperparameter search has already finished.
    for split, features, labels in (("train", X_train, y_train), ("test", X_test, y_test)):
        if _n_rows(features) != _n_rows(labels):
            raise ValueError(
                f"X_{split} and y_{split} have different numbers of rows: "
                f"{_n_rows(features)} vs {_n_rows(labels)}"
            )

    # Base estimator plus the hyperparameter space to explore for each model type.
    models = {
        "RandomForest": {
            "model": RandomForestClassifier(random_state=42),
            "search_space": {
                "n_estimators": Integer(100, 500),
                "max_depth": Integer(5, 50),
                "min_samples_split": Integer(2, 20),
                "min_samples_leaf": Integer(1, 10),
                "bootstrap": Categorical([True, False])
            }
        },
        "XGBoost": {
            "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
            "search_space": {
                "n_estimators": Integer(100, 500),
                "max_depth": Integer(3, 12),
                "learning_rate": Real(0.01, 0.3, prior="log-uniform"),
                "subsample": Real(0.5, 1.0),
                "colsample_bytree": Real(0.5, 1.0)
            }
        }
    }

    results = []

    for model_name, config in models.items():
        print(f"Optimising {model_name}...")

        opt = BayesSearchCV(
            estimator=config["model"],
            search_spaces=config["search_space"],
            n_iter=n_iter,
            scoring='roc_auc',
            cv=cv,
            n_jobs=n_jobs,
            random_state=42,
            verbose=0
        )

        opt.fit(X_train, y_train)

        # Score the best estimator found by the search on the held-out test split.
        best_model = opt.best_estimator_
        y_pred = best_model.predict(X_test)
        # Column 1 of predict_proba is used as the positive-class score.
        y_proba = best_model.predict_proba(X_test)[:, 1]

        results.append({
            "Model": model_name,
            "Best Params": opt.best_params_,
            "ROC-AUC": roc_auc_score(y_test, y_proba),
            "Accuracy": accuracy_score(y_test, y_pred),
            "F1-Score": f1_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred)
        })

    results_df = pd.DataFrame(results).sort_values(by="ROC-AUC", ascending=False)

    return results_df


In [4]:
# Tune both models and collect their test-set metrics
results_df = train_and_evaluate_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Optimising RandomForest...
Optimising XGBoost...


In [5]:
# Show the per-model metrics table (rows are sorted by ROC-AUC, highest first)
results_df

Unnamed: 0,Model,Best Params,ROC-AUC,Accuracy,F1-Score,Precision,Recall
1,XGBoost,"{'colsample_bytree': 0.5485095094956723, 'lear...",0.735725,0.676006,0.688781,0.675464,0.702634
0,RandomForest,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",0.734648,0.674947,0.688617,0.673535,0.704389


### Function to save best performing model

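We can now build a function that takes the best-performing configuration of a chosen model type, refits it on the training data, and saves the fitted model as a pickle file: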

In [6]:
def save_best_model(results_df, X_train, y_train, model_name, output_path=None):
    """
    Selects the best-performing model of the specified type from the results DataFrame,
    refits it on the full training data, and saves it to a file.

    This function:
    - Filters the results to include only the specified model type.
    - Selects the configuration with the highest ROC-AUC score.
    - Instantiates the model with its best parameters, plus the same fixed settings
      used during the search, and fits it to the full training data.
    - Saves the fitted model as a pickle file.

    Parameters:
        results_df (pandas.DataFrame): A DataFrame containing the columns 'Model',
                                       'Best Params' (a mapping of hyperparameters)
                                       and 'ROC-AUC'.
        X_train (pandas.DataFrame): The training features.
        y_train (pandas.Series): The training labels. A single-column DataFrame
                                 (e.g. from pd.read_csv) is flattened to a Series.
        model_name (str): The name of the model type to extract (e.g. "XGBoost", "RandomForest").
        output_path (str, optional): Path to save the trained model. If not provided,
                                     a default name is used based on the model type.

    Returns:
        The fitted estimator (RandomForestClassifier or xgb.XGBClassifier). The same
        object is written to output_path as a pickle.

    Raises:
        ValueError: If results_df has no row for model_name, or if model_name is not
                    one of the supported model types.
    """

    # Filter for the specified model and select the best one by AUC
    filtered = results_df[results_df["Model"] == model_name]
    if filtered.empty:
        raise ValueError(f"No entries found in results_df for model type: '{model_name}'")

    # Supported model types: estimator class plus the fixed constructor settings.
    # Without random_state the refitted forest would differ from run to run.
    model_mapping = {
        "RandomForest": (RandomForestClassifier, {"random_state": 42}),
        "XGBoost": (xgb.XGBClassifier, {"eval_metric": "logloss"}),
    }
    if model_name not in model_mapping:
        raise ValueError(
            f"Unsupported model type: '{model_name}'. "
            f"Supported types: {sorted(model_mapping)}"
        )

    best_row = filtered.sort_values(by="ROC-AUC", ascending=False).iloc[0]
    best_params = best_row["Best Params"]

    # The estimators expect a 1-D target; take the single column of an (n, 1) DataFrame.
    if isinstance(y_train, pd.DataFrame) and y_train.shape[1] == 1:
        y_train = y_train.iloc[:, 0]

    # Instantiate and fit the model; tuned values win over the fixed settings on a clash
    model_class, fixed_params = model_mapping[model_name]
    model = model_class(**{**fixed_params, **best_params})
    model.fit(X_train, y_train)

    # Determine default output path if none provided
    if output_path is None:
        output_path = f"best_{model_name.lower()}_model.pkl"

    # Save model to file
    with open(output_path, "wb") as file:
        pickle.dump(model, file)

    print(f"Saved best '{model_name}' model with ROC-AUC {best_row['ROC-AUC']:.4f} to '{output_path}'")

    return model


In [7]:
# Refit the best XGBoost configuration and pickle it to xg_model.pkl
xg = save_best_model(
    results_df=results_df,
    X_train=X_train,
    y_train=y_train,
    model_name="XGBoost",
    output_path="xg_model.pkl",
)

Saved best 'XGBoost' model with ROC-AUC 0.7357 to 'xg_model.pkl'
