# Model Building

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import pickle
import warnings

# Silence two noisy warning categories for the rest of the session:
# generic UserWarnings and pandas PerformanceWarnings.
for _ignored_category in (UserWarning, pd.errors.PerformanceWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

### Load Datasets

In [None]:
# Load the train/test feature and target tables from CSV.
# - Forward slashes keep the relative paths valid on every OS; a backslash is
#   only a separator on Windows, and "\X" / "\y" are invalid string escapes.
# - `read_csv` has no `index` argument (that belongs to `DataFrame.to_csv`),
#   so it is not passed here.
X_train = pd.read_csv('train_test_datasets/X_train.csv')
X_test = pd.read_csv('train_test_datasets/X_test.csv')
y_train = pd.read_csv('train_test_datasets/y_train.csv')
y_test = pd.read_csv('train_test_datasets/y_test.csv')

### Function to build models on different hyperparameters

We can now train each model type over a grid of hyperparameters, using 3-fold cross-validated ROC-AUC to select the best configuration, and return a DataFrame of test-set performance metrics for the best-performing version of each model:

In [299]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """
    Grid-search a Random Forest and an XGBoost classifier, then score each winner.

    Every model type is tuned with a 3-fold ``GridSearchCV`` that optimises
    ROC-AUC on the training data. The best estimator found for each type is then
    scored on the test set (ROC-AUC, accuracy, F1, precision, recall).

    Parameters:
    ----------
    X_train : pd.DataFrame
        Features used for the cross-validated search.
    X_test : pd.DataFrame
        Features used for the final evaluation.
    y_train : pd.Series or np.ndarray
        Labels matching ``X_train``.
    y_test : pd.Series or np.ndarray
        Labels matching ``X_test``.

    Returns:
    -------
    pd.DataFrame
        One row per model type with its best hyperparameters and test-set
        metrics, ordered from highest to lowest ROC-AUC.
    """

    # Search space for the Random Forest
    forest_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False],
    }

    # Search space for XGBoost
    boosting_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [3, 6, 9],
        "learning_rate": [0.01, 0.1, 0.2],
        "subsample": [0.8, 1],
        "colsample_bytree": [0.8, 1],
    }

    # (display name, base estimator, hyperparameter grid), in training order
    candidates = [
        ("RandomForest", RandomForestClassifier(random_state=42), forest_grid),
        (
            "XGBoost",
            xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
            boosting_grid,
        ),
    ]

    rows = []
    for name, estimator, param_grid in candidates:
        print(f"Training {name}...")
        search = GridSearchCV(estimator, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
        search.fit(X_train, y_train)

        # Score the refit winner on the held-out test data
        tuned = search.best_estimator_
        predicted_labels = tuned.predict(X_test)
        positive_scores = tuned.predict_proba(X_test)[:, 1]  # second probability column

        rows.append({
            "Model": name,
            "Best Params": search.best_params_,
            "ROC-AUC": roc_auc_score(y_test, positive_scores),
            "Accuracy": accuracy_score(y_test, predicted_labels),
            "F1-Score": f1_score(y_test, predicted_labels),
            "Precision": precision_score(y_test, predicted_labels),
            "Recall": recall_score(y_test, predicted_labels),
        })

    return pd.DataFrame(rows).sort_values(by="ROC-AUC", ascending=False)


In [300]:
# Tune both model types and collect their test-set metrics
results_df = train_and_evaluate_models(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Training RandomForest...
Training XGBoost...


In [302]:
# Display the per-model metrics table returned by train_and_evaluate_models
results_df

Unnamed: 0,Model,Best Params,ROC-AUC,Accuracy,F1-Score,Precision,Recall
0,RandomForest,"{'bootstrap': True, 'max_depth': 10, 'min_samp...",0.731616,0.665177,0.684581,0.659731,0.711376
1,XGBoost,"{'colsample_bytree': 1, 'learning_rate': 0.01,...",0.723939,0.661438,0.685347,0.652336,0.721877


### Function to save best performing model

We can now build a function that refits the best-performing configuration of a chosen model type on the training data and saves it to disk:

In [303]:
def save_best_model(results_df, X_train, y_train, model_name, output_path=None, model_kwargs=None):
    """
    Refit the best configuration of one model type and pickle the fitted model.

    This function:
    - Filters the results to include only the specified model type.
    - Selects the configuration with the highest ROC-AUC score.
    - Instantiates the model with its best parameters and fits it to the training data.
    - Saves the fitted model as a pickle file.

    Parameters:
        results_df (pandas.DataFrame): A DataFrame with at least the columns 'Model',
                                       'Best Params' (a dict of constructor keyword
                                       arguments) and 'ROC-AUC'.
        X_train (pandas.DataFrame): The training features.
        y_train (pandas.Series): The training labels.
        model_name (str): The name of the model type to extract. Supported values are
                          "RandomForest", "XGBoost", "LightGBM", "LogisticRegression"
                          and "GradientBoosting".
        output_path (str, optional): Path to save the trained model. If not provided,
                                     a default name is used based on the model type.
        model_kwargs (dict, optional): Extra constructor arguments applied in addition
                                       to the best parameters (e.g. a fixed
                                       ``random_state``). Values in 'Best Params' take
                                       precedence on conflict. Defaults to none.

    Returns:
        The fitted model instance (the same object that was written to ``output_path``).

    Raises:
        ValueError: If ``results_df`` has no rows for ``model_name``.
        KeyError: If ``model_name`` is present in ``results_df`` but is not a
                  supported model type.
        ImportError: If the library providing the requested model is not installed.
    """
    # Local import keeps this function self-contained.
    import importlib

    # Model classes are described by (module, class name) and imported only when
    # requested, so a missing optional library (e.g. lightgbm) or a missing
    # file-level import cannot break saving an unrelated model type.
    supported_models = {
        "RandomForest": ("sklearn.ensemble", "RandomForestClassifier"),
        "XGBoost": ("xgboost", "XGBClassifier"),
        "LightGBM": ("lightgbm", "LGBMClassifier"),
        "LogisticRegression": ("sklearn.linear_model", "LogisticRegression"),
        "GradientBoosting": ("sklearn.ensemble", "GradientBoostingClassifier"),
    }

    # Filter for the specified model and select the best one by AUC
    filtered = results_df[results_df["Model"] == model_name]
    if filtered.empty:
        raise ValueError(f"No entries found in results_df for model type: '{model_name}'")

    if model_name not in supported_models:
        raise KeyError(
            f"Unsupported model type: '{model_name}'. "
            f"Supported types: {sorted(supported_models)}"
        )

    best_row = filtered.sort_values(by="ROC-AUC", ascending=False).iloc[0]
    best_params = best_row["Best Params"]

    # Resolve the class lazily, then instantiate and fit the model.
    module_name, class_name = supported_models[model_name]
    model_class = getattr(importlib.import_module(module_name), class_name)
    # Tuned parameters come last so they override any overlapping extras.
    constructor_args = {**(model_kwargs or {}), **best_params}
    model = model_class(**constructor_args)
    model.fit(X_train, y_train)

    # Determine default output path if none provided
    if output_path is None:
        output_path = f"best_{model_name.lower()}_model.pkl"

    # Save model to file
    with open(output_path, "wb") as file:
        pickle.dump(model, file)

    print(f"Saved best '{model_name}' model with ROC-AUC {best_row['ROC-AUC']:.4f} to '{output_path}'")

    return model


In [None]:
# Refit the best Random Forest configuration and pickle it to rf_model.pkl
rf = save_best_model(
    results_df=results_df,
    X_train=X_train,
    y_train=y_train,
    model_name='RandomForest',
    output_path='rf_model.pkl',
)

Saved best 'RandomForest' model with ROC-AUC 0.7316 to 'rf_model.pkl'
