In [1]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import PCA

import sys
import os
import logging

# Make the project's utils.py importable. The directory defaults to the
# original hard-coded checkout location, but can be overridden by setting the
# TRAIN_DELAY_UTILS_DIR environment variable so the notebook also runs on
# machines where the project lives elsewhere.
_utils_dir = os.environ.get("TRAIN_DELAY_UTILS_DIR") or os.path.dirname(
    "/Users/mac/Desktop/train_delay_prediction/utils.py"
)
# Re-running this cell must not keep appending the same entry to sys.path.
if _utils_dir not in sys.path:
    sys.path.append(_utils_dir)

from utils import *

# Send INFO-and-above records from the root logger to a log file (relative
# path, so it lands in the current working directory); every line carries a
# timestamp and the severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='xgboost_pca_evaluation.log',
)
# First record of the run, marking where this execution starts in the log.
logging.info("Starting PCA-based eXtreme Gradient Boosting evaluation script.")

In [2]:
# Hyperparameters handed to the XGBRegressor further down in the notebook.
n_estimators, max_depth = 50, 10

# Echo the run configuration so it is visible in the cell output.
print(
    f"Running XGBoost with PCA (75 components), n_estimators={n_estimators}, and max_depth={max_depth}"
)

Running XGBoost with PCA (75 components), n_estimators=50, and max_depth=10


In [None]:
# Load the train/test matrices through the project's loader, using all data.
data = load_data_more_features(percentage_of_data_usage=1.0)

X_train = data["X_train"]
y_train = data["y_train"]
X_test = data["X_test"]
y_test = data["y_test"]

# Column positions inside the raw (pre-PCA) feature matrix.
# NOTE(review): these indices are hard-coded; presumably they match the column
# layout produced by load_data_more_features — confirm against utils.py.
past_delay_index = 4
futur_planned_start = 200
futur_planned_end = 205

# Slicing past the last column silently returns fewer columns instead of
# failing, so check the width up front rather than breaking later in evaluation.
assert X_train.shape[1] >= futur_planned_end and X_test.shape[1] >= futur_planned_end, (
    f"Expected at least {futur_planned_end} feature columns, "
    f"got {X_train.shape[1]} (train) and {X_test.shape[1]} (test)."
)

# Keep the raw columns now: after PCA the original columns can no longer be
# read back from the projected matrix.
past_delay_train = X_train[:, past_delay_index]
past_delay_test = X_test[:, past_delay_index]
futur_planned_train = X_train[:, futur_planned_start:futur_planned_end]
futur_planned_test = X_test[:, futur_planned_start:futur_planned_end]

# Fit the projection on the training split only and reuse it for the test
# split. random_state is fixed (same seed as the regressor) so the projection
# is reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=75, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

logging.info(f"PCA applied: reduced to {pca.n_components_} components.")

In [4]:
# Registry of fitted models keyed by name; reset every time this cell runs.
trained_models = {}
model_name = "XGBoost_PCA"

# Settings for the single-output XGBoost estimator; the tree count and depth
# come from the configuration cell.
xgb_params = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=42,
    n_jobs=-1,
)

# MultiOutputRegressor wraps the single-output estimator so it can be fitted
# against the multi-column target matrix.
xgb_regressor = MultiOutputRegressor(XGBRegressor(**xgb_params))

# Fit on the PCA-projected training features via the project's train helper.
# NOTE(review): savemodel=False presumably skips persisting the model — confirm in utils.
trained_model_data = train(
    xgb_regressor,
    X_train_pca,
    y_train,
    model_name,
    savemodel=False,
)
trained_models[model_name] = trained_model_data

In [None]:
def evaluate_pca(trained_model, X_test_pca, y_test, past_delay_test, futur_planned_test, model_name,
                 delay_delta_bins=np.array([-np.inf, -5, 0, 5, 10, 15, 20, 25, 30, np.inf]) * 60,
                 horizon_obs_bins=np.array([0, 5, 10, 15, np.inf]) * 60,
                 results_folder="./results", plots_folder="./plots", suffix="_pca"):
    """
    Evaluate a model trained on PCA-projected features against a translation baseline.

    The model's predictions and the targets are converted with
    ``transform_to_seconds``; absolute errors are computed for the model and for
    a "translation" baseline built from the raw past-delay column. Metrics are
    then computed and saved by ``compute_and_save_metrics`` and plotted by
    ``plot_mae``.

    Args:
        trained_model (dict): Must contain the fitted estimator under the key
            ``"model"``; the whole dict is also forwarded to
            ``compute_and_save_metrics``.
        X_test_pca (np.ndarray): PCA-transformed test feature matrix.
        y_test (np.ndarray): Test target matrix (2-D; its column count sets the
            width of the baseline).
        past_delay_test (np.ndarray): Past delay values taken from the raw
            features before PCA, one per test row.
        futur_planned_test (np.ndarray): Future planned values taken from the
            raw features before PCA (2-D; the first column is used for the
            baseline correction).
        model_name (str): Name of the model, forwarded for saving results.
        delay_delta_bins (np.ndarray): Bins for delay-based MAE computation
            (defaults are multiplied by 60 — presumably minutes to seconds).
        horizon_obs_bins (np.ndarray): Bins for horizon-based MAE computation
            (same scaling as above).
        results_folder (str): Directory to save evaluation results (created if missing).
        plots_folder (str): Directory to save evaluation plots (created if missing).
        suffix (str): Suffix for filenames.

    Returns:
        The ``metrics`` object returned by ``compute_and_save_metrics``.

    Raises:
        ValueError: If predictions and targets do not have the same shape.
    """
    # Logging is configured once at the top of the notebook. A basicConfig call
    # here would be a no-op once the root logger has handlers, and a hidden
    # global side effect otherwise, so none is made.
    log_time("Starting PCA-based evaluation.")

    os.makedirs(results_folder, exist_ok=True)
    os.makedirs(plots_folder, exist_ok=True)
    log_time("Created results and plots directories if not already existing.")

    model = trained_model["model"]
    log_time("Loaded trained PCA-based model.")

    y_pred = model.predict(X_test_pca)
    log_time("Model predictions completed.")

    y_pred = transform_to_seconds(y_pred)
    y_test = transform_to_seconds(y_test)

    # A shape mismatch would otherwise be silently broadcast by the subtraction
    # below and produce a meaningless error matrix.
    if y_pred.shape != y_test.shape:
        raise ValueError(
            f"Prediction shape {y_pred.shape} does not match target shape {y_test.shape}."
        )

    log_time("Transformed predictions and ground truth to seconds.")

    past_delay_sec = transform_to_seconds(past_delay_test)
    futur_planned_sec = transform_to_seconds(futur_planned_test)

    # Translation baseline: carry the past delay forward, except where the first
    # planned value plus the past delay is negative; there the (negative) sum is
    # subtracted, which leaves minus the first planned value.
    shifted_first_planned = futur_planned_sec[:, 0] + past_delay_sec
    corrected_translate_test = past_delay_sec - (shifted_first_planned < 0) * shifted_first_planned

    # Use the same baseline value for every target column.
    corrected_translate_test = np.repeat(corrected_translate_test[:, np.newaxis], y_test.shape[1], axis=1)

    log_time("Computed corrected translation-based estimates.")

    # Element-wise absolute errors; aggregation happens in compute_and_save_metrics.
    mae_model = np.abs(y_pred - y_test)
    mae_translation = np.abs(corrected_translate_test - y_test)

    log_time("Computed MAE for model and translation approach.")

    # Compute evaluation metrics (same as evaluate_2)
    metrics, mae_delay, mae_horizon = compute_and_save_metrics(
        y_test, corrected_translate_test, mae_model, mae_translation,
        delay_delta_bins, horizon_obs_bins, futur_planned_sec, trained_model,
        results_folder, model_name, suffix
    )

    # Plot MAE results
    plot_mae(mae_delay, mae_horizon, delay_delta_bins, horizon_obs_bins, plots_folder, model_name, suffix)
    log_time("PCA evaluation complete.")

    return metrics

# Evaluate the fitted model on the held-out split and keep the returned
# metrics. Arguments are passed positionally, in the order of evaluate_pca's
# signature; bins, output folders and suffix use their defaults.
metrics_2 = evaluate_pca(
    trained_model_data,
    X_test_pca,
    y_test,
    past_delay_test,     # raw column stored before PCA
    futur_planned_test,  # raw columns stored before PCA
    model_name,
)

In [10]:
# Feature importance over the PCA-projected test features for every model in
# trained_models, via the project's calculate_feature_importance helper.
calculate_feature_importance(
    trained_models=trained_models,
    X_test=X_test_pca,
    y_test=y_test,
    # Label each projected column PC1..PCn. The count is read from the matrix
    # itself so the labels cannot drift from the actual number of components.
    feature_mapping={i: f'PC{i+1}' for i in range(X_test_pca.shape[1])},
    top_features_threshold=0.01,
    n_repeats=5
)

Calculating feature importance: 100%|██████████| 1/1 [05:31<00:00, 331.12s/it]
