In [None]:
import pandas as pd
import numpy as np
import joblib
import warnings
from collections import deque
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# --- Constants (from your script) ---
WINDOW_SIZE = 50
WINDOW_STRIDE = 5
DATA_PATH = "my_master_dataset_full.csv"
FEATURE_COLS = [
    'att_roll', 'att_pitch', 'att_yaw',
    'pos_lat', 'pos_lon', 'pos_alt_rel',
    'pos_vx', 'pos_vy', 'pos_vz',
    'nav_roll', 'nav_pitch', 'nav_alt_error',
    'sys_voltage_battery', 'sys_load',
    'vib_x', 'vib_y', 'vib_z'
]

def extract_window_features_safe(df: pd.DataFrame) -> dict:
    """
    Extract robust window-level features, safely handling NaNs.
    """
    features = {}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for col in df.columns:
            col_data = df[col].astype(float).values

            if np.all(np.isnan(col_data)):
                features[f"{col}_mean"] = np.nan
                features[f"{col}_std"] = np.nan
                features[f"{col}_min"] = np.nan
                features[f"{col}_max"] = np.nan
                features[f"{col}_slope"] = np.nan
                features[f"{col}_range"] = np.nan
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
                continue

            features[f"{col}_mean"] = np.nanmean(col_data)
            features[f"{col}_std"] = np.nanstd(col_data)
            features[f"{col}_min"] = np.nanmin(col_data)
            features[f"{col}_max"] = np.nanmax(col_data)

            try:
                valid_indices = np.where(~np.isnan(col_data))[0]
                first_valid_idx = valid_indices[0]
                last_valid_idx = valid_indices[-1]
                features[f"{col}_slope"] = col_data[last_valid_idx] - col_data[first_valid_idx]
            except IndexError:
                features[f"{col}_slope"] = np.nan

            features[f"{col}_range"] = np.nanmax(col_data) - np.nanmin(col_data)
            diff = np.diff(col_data)

            if len(diff[~np.isnan(diff)]) > 0:
                features[f"{col}_diff_mean"] = np.nanmean(diff)
                features[f"{col}_diff_std"] = np.nanstd(diff)
            else:
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
    return features

# --- create_windows function (unchanged) ---
def create_windows(df):
    windows = []
    labels = []
    groups = []
    print("Starting window creation...")
    for fid, g in df.groupby("flight_id"):
        arr = g[FEATURE_COLS].values
        lbl = g["label"].values
        n = len(arr)

        if n < WINDOW_SIZE:
            continue

        for start in range(0, n - WINDOW_SIZE + 1, WINDOW_STRIDE):
            w = arr[start:start+WINDOW_SIZE]
            wdf = pd.DataFrame(w, columns=FEATURE_COLS).replace([np.inf, -np.inf], np.nan)

            feats = extract_window_features_safe(wdf)
            windows.append(feats)
            labels.append(int(np.any(lbl[start:start+WINDOW_SIZE] == 1)))
            groups.append(fid)

    print("Window creation finished.")
    return pd.DataFrame(windows), np.array(labels), np.array(groups)


def main():
    print("ðŸ“˜ Loading dataset...")
    df = pd.read_csv(DATA_PATH)
    df = df.dropna(subset=FEATURE_COLS)

    print("ðŸ“¦ Creating sliding windows...")
    X, y, groups = create_windows(df)

    print(f"Total windows: {len(X)} | Features per window: {X.shape[1]}")
    print(f"Attack windows: {sum(y)} | Normal windows: {len(y) - sum(y)}")
    print(f"Attack ratio: {sum(y) / len(y):.3f}")

    # --- Define Models ---
    # Calculate scale_pos_weight for XGBoost
    try:
        count_neg = (y == 0).sum()
        count_pos = (y == 1).sum()
        scale_pos_weight = count_neg / count_pos
        print(f"Calculated XGB scale_pos_weight: {scale_pos_weight:.2f}")
    except ZeroDivisionError:
        scale_pos_weight = 1
        print("Warning: No positive samples (label=1) found.")

    models = {
        "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=14, class_weight='balanced', n_jobs=-1, random_state=42),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=200, max_depth=12, n_jobs=-1, random_state=42),
        "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced', n_jobs=-1),
        "SVC-RBF": SVC(kernel="rbf", probability=True, class_weight='balanced'),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42),
        "XGBoost": XGBClassifier(
            n_estimators=200, learning_rate=0.05, max_depth=8, n_jobs=-1,
            use_label_encoder=False, eval_metric='logloss',
            scale_pos_weight=scale_pos_weight
        )
    }

    # --- Stratified GroupKFold Setup ---
    print("\nðŸ§ª Setting up Stratified GroupKFold...")
    group_df = pd.DataFrame({'group': groups, 'label': y})
    group_labels_df = group_df.groupby('group')['label'].max()
    unique_groups = group_labels_df.index.values
    group_labels = group_labels_df.values

    print(f"Total unique groups (flights): {len(unique_groups)}")
    print(f"Attack flights: {sum(group_labels)} | Normal flights: {len(group_labels) - sum(group_labels)}")

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    # --- Run Model Comparison ---
    for name, model in models.items():
        print(f"\n--- Training and Evaluating: {name} ---")
        fold_metrics = []

        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        for fold, (train_group_indices, test_group_indices) in enumerate(skf.split(unique_groups, group_labels)):

            train_groups = unique_groups[train_group_indices]
            test_groups = unique_groups[test_group_indices]

            train_idx = np.where(np.isin(groups, train_groups))[0]
            test_idx = np.where(np.isin(groups, test_groups))[0]

            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            if len(np.unique(y_test)) < 2:
                print(f"Fold {fold+1}: WARNING: Test set has only one class. Skipping fold.")
                continue

            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)

            y_prob = (
                pipe.predict_proba(X_test)[:, 1]
                if hasattr(pipe.named_steps["model"], "predict_proba")
                else y_pred
            )

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_prob)
            fold_metrics.append((acc, f1, auc))
            print(f"Fold {fold+1}: Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")

        if not fold_metrics:
            print(f"Model {name} failed to train (all folds skipped).")
            continue

        acc_mean = np.mean([m[0] for m in fold_metrics])
        f1_mean = np.mean([m[1] for m in fold_metrics])
        auc_mean = np.nanmean([m[2] for m in fold_metrics])

        results.append({
            "Model": name,
            "Accuracy": acc_mean,
            "F1": f1_mean,
            "AUC": auc_mean
        })
        print(f"--- {name} Mean: Acc: {acc_mean:.4f} | F1: {f1_mean:.4f} | AUC: {auc_mean:.4f} ---")

    # --- Print Final Results ---
    results_df = pd.DataFrame(results).sort_values("AUC", ascending=False)
    print("\n" + "="*30)
    print("  Final Window-Based Model Comparison")
    print("="*30)
    print(results_df)


if __name__ == "__main__":
    main()

ðŸ“˜ Loading dataset...
ðŸ“¦ Creating sliding windows...
Starting window creation...
Window creation finished.
Total windows: 7282 | Features per window: 136
Attack windows: 2643 | Normal windows: 4639
Attack ratio: 0.363
Calculated XGB scale_pos_weight: 1.76

ðŸ§ª Setting up Stratified GroupKFold...
Total unique groups (flights): 24
Attack flights: 10 | Normal flights: 14

--- Training and Evaluating: RandomForest ---
Fold 1: Acc: 0.7706 | F1: 0.6362 | AUC: 0.7860
Fold 2: Acc: 0.7927 | F1: 0.6209 | AUC: 0.8230
Fold 3: Acc: 0.8095 | F1: 0.6471 | AUC: 0.8480
Fold 4: Acc: 0.8457 | F1: 0.7537 | AUC: 0.8873
Fold 5: Acc: 0.7926 | F1: 0.8032 | AUC: 0.9075
--- RandomForest Mean: Acc: 0.8022 | F1: 0.6922 | AUC: 0.8503 ---

--- Training and Evaluating: ExtraTrees ---
Fold 1: Acc: 0.7493 | F1: 0.4938 | AUC: 0.7854
Fold 2: Acc: 0.8499 | F1: 0.6471 | AUC: 0.8531
Fold 3: Acc: 0.7653 | F1: 0.4067 | AUC: 0.8520
Fold 4: Acc: 0.7937 | F1: 0.5687 | AUC: 0.8709
Fold 5: Acc: 0.7411 | F1: 0.7188 | AUC: 0.9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1: Acc: 0.7335 | F1: 0.5890 | AUC: 0.7616


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2: Acc: 0.6921 | F1: 0.4118 | AUC: 0.7563


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3: Acc: 0.7838 | F1: 0.6356 | AUC: 0.8353


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4: Acc: 0.8182 | F1: 0.7220 | AUC: 0.8812


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5: Acc: 0.7578 | F1: 0.7674 | AUC: 0.8824
--- XGBoost Mean: Acc: 0.7571 | F1: 0.6252 | AUC: 0.8234 ---

  Final Window-Based Model Comparison
                Model  Accuracy        F1       AUC
1          ExtraTrees  0.779878  0.566985  0.856961
0        RandomForest  0.802219  0.692206  0.850348
4    GradientBoosting  0.768500  0.634715  0.839018
5             XGBoost  0.757064  0.625155  0.823374
3             SVC-RBF  0.773092  0.599623  0.774273
2  LogisticRegression  0.728838  0.570487  0.725715


In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# --- Constants ---
WINDOW_SIZE = 50
WINDOW_STRIDE = 5  # This is only for training; in real-time, the stride is 1
DATA_PATH = "my_master_dataset_RELABELED.csv"
FEATURE_COLS = [
    'att_roll', 'att_pitch', 'att_yaw',
    'pos_lat', 'pos_lon', 'pos_alt_rel',
    'pos_vx', 'pos_vy', 'pos_vz',
    'nav_roll', 'nav_pitch', 'nav_alt_error',
    'sys_voltage_battery', 'sys_load',
    'vib_x', 'vib_y', 'vib_z'
]

# --- Final Model Filenames ---
MODEL_SAVE_PATH = "window_ids_model.pkl"
FEATURE_LIST_PATH = "window_feature_list.pkl"
WINDOW_SIZE_PATH = "WINDOW_SIZE.pkl"

# --- NAN-SAFE FEATURE EXTRACTOR ---
# This must be the same as 'window_feature.py'
def extract_window_features_safe(df: pd.DataFrame) -> dict:
    """
    Extract robust window-level features, safely handling NaNs.
    """
    features = {}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for col in df.columns:
            col_data = df[col].astype(float).values

            if np.all(np.isnan(col_data)):
                features[f"{col}_mean"] = np.nan
                features[f"{col}_std"] = np.nan
                features[f"{col}_min"] = np.nan
                features[f"{col}_max"] = np.nan
                features[f"{col}_slope"] = np.nan
                features[f"{col}_range"] = np.nan
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
                continue

            features[f"{col}_mean"] = np.nanmean(col_data)
            features[f"{col}_std"] = np.nanstd(col_data)
            features[f"{col}_min"] = np.nanmin(col_data)
            features[f"{col}_max"] = np.nanmax(col_data)

            try:
                valid_indices = np.where(~np.isnan(col_data))[0]
                first_valid_idx = valid_indices[0]
                last_valid_idx = valid_indices[-1]
                features[f"{col}_slope"] = col_data[last_valid_idx] - col_data[first_valid_idx]
            except IndexError:
                features[f"{col}_slope"] = np.nan

            features[f"{col}_range"] = np.nanmax(col_data) - np.nanmin(col_data)
            diff = np.diff(col_data)

            if len(diff[~np.isnan(diff)]) > 0:
                features[f"{col}_diff_mean"] = np.nanmean(diff)
                features[f"{col}_diff_std"] = np.nanstd(diff)
            else:
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
    return features

# --- create_windows function ---
def create_windows(df):
    windows = []
    labels = []
    groups = []
    print("Starting window creation...")
    for fid, g in df.groupby("flight_id"):
        arr = g[FEATURE_COLS].values
        lbl = g["label"].values
        n = len(arr)

        if n < WINDOW_SIZE:
            continue

        for start in range(0, n - WINDOW_SIZE + 1, WINDOW_STRIDE):
            w = arr[start:start+WINDOW_SIZE]
            wdf = pd.DataFrame(w, columns=FEATURE_COLS).replace([np.inf, -np.inf], np.nan)

            feats = extract_window_features_safe(wdf)
            windows.append(feats)
            labels.append(int(np.any(lbl[start:start+WINDOW_SIZE] == 1)))
            groups.append(fid) # We don't use groups, but good to keep

    print("Window creation finished.")
    return pd.DataFrame(windows), np.array(labels), np.array(groups)


def main():
    print("ðŸ“˜ Loading dataset...")
    df = pd.read_csv(DATA_PATH)
    df = df.dropna(subset=FEATURE_COLS)

    print("ðŸ“¦ Creating sliding windows for training...")
    X, y, groups = create_windows(df)

    print(f"Total windows: {len(X)} | Features per window: {X.shape[1]}")
    print(f"Attack windows: {sum(y)} | Normal windows: {len(y) - sum(y)}")

    # Save the exact feature list the model will be trained on
    feature_list = list(X.columns)

    # --- Define Our Winning Model ---
    # RandomForestClassifier with parameters from the comparison script
    final_model = RandomForestClassifier(
        n_estimators=300,
        max_depth=14,
        class_weight='balanced',
        n_jobs=-1,
        random_state=42
    )

    # --- Build Final Pipeline ---
    pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="mean")), # Handles any NaNs from feature extraction
        ("scaler", StandardScaler()),
        ("model", final_model)
    ])

    print("\nðŸš€ Training final RandomForest model on ALL data...")
    # Train the pipeline on the entire dataset
    pipeline.fit(X, y)

    print("âœ… Model training complete.")

    # --- Save Artifacts for Dashboard ---
    joblib.dump(pipeline, MODEL_SAVE_PATH)
    print(f"ðŸ’¾ Final model saved to: {MODEL_SAVE_PATH}")

    joblib.dump(feature_list, FEATURE_LIST_PATH)
    print(f"ðŸ’¾ Feature list saved to: {FEATURE_LIST_PATH}")

    joblib.dump(WINDOW_SIZE, WINDOW_SIZE_PATH)
    print(f"ðŸ’¾ Window size saved to: {WINDOW_SIZE_PATH}")

    print("\nðŸŽ‰ All artifacts are ready for the dashboard!")

if __name__ == "__main__":
    main()

ðŸ“˜ Loading dataset...
ðŸ“¦ Creating sliding windows for training...
Starting window creation...
Window creation finished.
Total windows: 7282 | Features per window: 136
Attack windows: 2158 | Normal windows: 5124

ðŸš€ Training final RandomForest model on ALL data...
âœ… Model training complete.
ðŸ’¾ Final model saved to: window_ids_model.pkl
ðŸ’¾ Feature list saved to: window_feature_list.pkl
ðŸ’¾ Window size saved to: WINDOW_SIZE.pkl

ðŸŽ‰ All artifacts are ready for the dashboard!


In [3]:
!pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/home/sarthak/anaconda3/bin/python -m pip install --upgrade pip[0m


In [5]:
import pandas as pd
import numpy as np
import joblib
import warnings
from collections import deque
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

# --- Constants (from your script) ---
WINDOW_SIZE = 50
WINDOW_STRIDE = 5
DATA_PATH = "my_master_dataset_RELABELED.csv"
FEATURE_COLS = [
    'att_roll', 'att_pitch', 'att_yaw',
    'pos_lat', 'pos_lon', 'pos_alt_rel',
    'pos_vx', 'pos_vy', 'pos_vz',
    'nav_roll', 'nav_pitch', 'nav_alt_error',
    'sys_voltage_battery', 'sys_load',
    'vib_x', 'vib_y', 'vib_z'
]

def extract_window_features_safe(df: pd.DataFrame) -> dict:
    """
    Extract robust window-level features, safely handling NaNs.
    """
    features = {}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        for col in df.columns:
            col_data = df[col].astype(float).values

            if np.all(np.isnan(col_data)):
                features[f"{col}_mean"] = np.nan
                features[f"{col}_std"] = np.nan
                features[f"{col}_min"] = np.nan
                features[f"{col}_max"] = np.nan
                features[f"{col}_slope"] = np.nan
                features[f"{col}_range"] = np.nan
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
                continue

            features[f"{col}_mean"] = np.nanmean(col_data)
            features[f"{col}_std"] = np.nanstd(col_data)
            features[f"{col}_min"] = np.nanmin(col_data)
            features[f"{col}_max"] = np.nanmax(col_data)

            try:
                valid_indices = np.where(~np.isnan(col_data))[0]
                first_valid_idx = valid_indices[0]
                last_valid_idx = valid_indices[-1]
                features[f"{col}_slope"] = col_data[last_valid_idx] - col_data[first_valid_idx]
            except IndexError:
                features[f"{col}_slope"] = np.nan

            features[f"{col}_range"] = np.nanmax(col_data) - np.nanmin(col_data)
            diff = np.diff(col_data)

            if len(diff[~np.isnan(diff)]) > 0:
                features[f"{col}_diff_mean"] = np.nanmean(diff)
                features[f"{col}_diff_std"] = np.nanstd(diff)
            else:
                features[f"{col}_diff_mean"] = np.nan
                features[f"{col}_diff_std"] = np.nan
    return features

# --- create_windows function (unchanged) ---
def create_windows(df):
    windows = []
    labels = []
    groups = []
    print("Starting window creation...")
    for fid, g in df.groupby("flight_id"):
        arr = g[FEATURE_COLS].values
        lbl = g["label"].values
        n = len(arr)

        if n < WINDOW_SIZE:
            continue

        for start in range(0, n - WINDOW_SIZE + 1, WINDOW_STRIDE):
            w = arr[start:start+WINDOW_SIZE]
            wdf = pd.DataFrame(w, columns=FEATURE_COLS).replace([np.inf, -np.inf], np.nan)

            feats = extract_window_features_safe(wdf)
            windows.append(feats)
            labels.append(int(np.any(lbl[start:start+WINDOW_SIZE] == 1)))
            groups.append(fid)

    print("Window creation finished.")
    return pd.DataFrame(windows), np.array(labels), np.array(groups)


def main():
    print("ðŸ“˜ Loading dataset...")
    df = pd.read_csv(DATA_PATH)
    df = df.dropna(subset=FEATURE_COLS)

    print("ðŸ“¦ Creating sliding windows...")
    X, y, groups = create_windows(df)

    print(f"Total windows: {len(X)} | Features per window: {X.shape[1]}")
    print(f"Attack windows: {sum(y)} | Normal windows: {len(y) - sum(y)}")
    print(f"Attack ratio: {sum(y) / len(y):.3f}")

    # --- Define Models ---
    # Calculate scale_pos_weight for XGBoost
    try:
        count_neg = (y == 0).sum()
        count_pos = (y == 1).sum()
        scale_pos_weight = count_neg / count_pos
        print(f"Calculated XGB scale_pos_weight: {scale_pos_weight:.2f}")
    except ZeroDivisionError:
        scale_pos_weight = 1
        print("Warning: No positive samples (label=1) found.")

    models = {
        "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=14, class_weight='balanced', n_jobs=-1, random_state=42),
        "ExtraTrees": ExtraTreesClassifier(n_estimators=200, max_depth=12, n_jobs=-1, random_state=42),
        "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced', n_jobs=-1),
        "SVC-RBF": SVC(kernel="rbf", probability=True, class_weight='balanced'),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42),
        "XGBoost": XGBClassifier(
            n_estimators=200, learning_rate=0.05, max_depth=8, n_jobs=-1,
            use_label_encoder=False, eval_metric='logloss',
            scale_pos_weight=scale_pos_weight
        )
    }

    # --- Stratified GroupKFold Setup ---
    print("\nðŸ§ª Setting up Stratified GroupKFold...")
    group_df = pd.DataFrame({'group': groups, 'label': y})
    group_labels_df = group_df.groupby('group')['label'].max()
    unique_groups = group_labels_df.index.values
    group_labels = group_labels_df.values

    print(f"Total unique groups (flights): {len(unique_groups)}")
    print(f"Attack flights: {sum(group_labels)} | Normal flights: {len(group_labels) - sum(group_labels)}")

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    results = []

    # --- Run Model Comparison ---
    for name, model in models.items():
        print(f"\n--- Training and Evaluating: {name} ---")
        fold_metrics = []

        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        for fold, (train_group_indices, test_group_indices) in enumerate(skf.split(unique_groups, group_labels)):

            train_groups = unique_groups[train_group_indices]
            test_groups = unique_groups[test_group_indices]

            train_idx = np.where(np.isin(groups, train_groups))[0]
            test_idx = np.where(np.isin(groups, test_groups))[0]

            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            if len(np.unique(y_test)) < 2:
                print(f"Fold {fold+1}: WARNING: Test set has only one class. Skipping fold.")
                continue

            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)

            y_prob = (
                pipe.predict_proba(X_test)[:, 1]
                if hasattr(pipe.named_steps["model"], "predict_proba")
                else y_pred
            )

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_prob)
            fold_metrics.append((acc, f1, auc))
            print(f"Fold {fold+1}: Acc: {acc:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")

        if not fold_metrics:
            print(f"Model {name} failed to train (all folds skipped).")
            continue

        acc_mean = np.mean([m[0] for m in fold_metrics])
        f1_mean = np.mean([m[1] for m in fold_metrics])
        auc_mean = np.nanmean([m[2] for m in fold_metrics])

        results.append({
            "Model": name,
            "Accuracy": acc_mean,
            "F1": f1_mean,
            "AUC": auc_mean
        })
        print(f"--- {name} Mean: Acc: {acc_mean:.4f} | F1: {f1_mean:.4f} | AUC: {auc_mean:.4f} ---")

    # --- Print Final Results ---
    results_df = pd.DataFrame(results).sort_values("AUC", ascending=False)
    print("\n" + "="*30)
    print("  Final Window-Based Model Comparison")
    print("="*30)
    print(results_df)


if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'xgboost'