# Classic ML Baseline: TF-IDF + XGBoost on 48 Subcategory Labels

This notebook applies the same pipeline as notebook 02 (cell 4) to the finer-grained 48-label subcategory dataset.
The results can then be compared with the 13-label parent-level baseline (Macro-F1 0.691 / Micro-F1 0.746).

In [1]:
import pandas as pd
import numpy as np
import os, time, warnings

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from tqdm import tqdm
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Locations of the input datasets and of the generated artifacts,
# expressed relative to the notebook's working directory.
DATA_DIR, RESULTS_DIR = "../data", "../results"

# Make sure the output directory exists before any cell tries to write to it.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Suppress every warning category for the remainder of the kernel session.
warnings.simplefilter("ignore")

print("Setup complete.")

Setup complete.


In [2]:
# Cell 2: Load Subcategory Train/Test Data
#
# Reads the train/test CSVs, derives the label columns from the train set,
# checks that the test set carries the same columns, and prints the label
# prevalence plus labels-per-report statistics for the train set.

# Columns that are not labels: the report identifier and the report text.
NON_LABEL_COLS = ("ACN", "Narrative")

train_df = pd.read_csv(os.path.join(DATA_DIR, "subcategory_train_set.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "subcategory_test_set.csv"))

missing_in_train = [c for c in NON_LABEL_COLS if c not in train_df.columns]
if missing_in_train:
    raise ValueError(f"Train set is missing required columns: {missing_in_train}")

LABEL_COLS = [c for c in train_df.columns if c not in NON_LABEL_COLS]

# Fail fast on schema drift: later cells index test_df by every name checked
# here, so a missing column would otherwise only surface as a KeyError after
# the TF-IDF fit or part-way through training.
missing_in_test = [c for c in (*NON_LABEL_COLS, *LABEL_COLS) if c not in test_df.columns]
if missing_in_test:
    raise ValueError(f"Test set is missing columns present in the train set: {missing_in_test}")

print(f"Train: {len(train_df):,} rows | Test: {len(test_df):,} rows")
print(f"Label columns: {len(LABEL_COLS)}")
print(f"\nLabel prevalence (train):")
for col in LABEL_COLS:
    pct = train_df[col].mean() * 100
    cnt = train_df[col].sum()
    print(f"  {col:<55} {cnt:>6,} ({pct:>5.2f}%)")

# Number of labels set on each train report (row-wise sum over label columns).
train_lc = train_df[LABEL_COLS].sum(axis=1)
print(f"\nLabels per report: mean={train_lc.mean():.2f}, median={train_lc.median():.0f}, max={train_lc.max():.0f}")

Train: 32,089 rows | Test: 8,017 rows
Label columns: 48

Label prevalence (train):
  ATC Issue                                                5,472 (17.05%)
  Aircraft Equipment Problem: Critical                     5,307 (16.54%)
  Aircraft Equipment Problem: Less Severe                  3,883 (12.10%)
  Airspace Violation                                       1,271 ( 3.96%)
  Conflict: Airborne Conflict                              3,955 (12.33%)
  Conflict: Ground Conflict                                2,462 ( 7.67%)
  Conflict: NMAC                                           2,359 ( 7.35%)
  Deviation - Altitude: Crossing Restriction Not Met         965 ( 3.01%)
  Deviation - Altitude: Excursion From Assigned Altitude   2,489 ( 7.76%)
  Deviation - Altitude: Overshoot                          2,055 ( 6.40%)
  Deviation - Altitude: Undershoot                           537 ( 1.67%)
  Deviation - Procedural: Clearance                        9,454 (29.46%)
  Deviation - Procedural: FAR

In [3]:
# Cell 3: TF-IDF + XGBoost (48 binary classifiers)
#
# Fits one XGBoost binary classifier per label column on shared TF-IDF
# features, scores each on the test split, appends MACRO/MICRO aggregate rows,
# and caches both the metrics table and the per-report predictions as CSV
# files under RESULTS_DIR. When a matching cache exists it is loaded instead.

pred_path = os.path.join(RESULTS_DIR, "classic_ml_subcategory_predictions.csv")
metrics_path = os.path.join(RESULTS_DIR, "classic_ml_subcategory_metrics.csv")

# Names of the aggregate rows appended after the per-label rows.
AGGREGATE_ROWS = ("MACRO", "MICRO")

# Reuse the checkpoint only if it describes exactly the current label set.
# Otherwise a cache written for different label columns would be loaded
# silently and every later table would be built from stale numbers.
use_checkpoint = os.path.exists(pred_path) and os.path.exists(metrics_path)
if use_checkpoint:
    print("Checkpoint: predictions and metrics exist, loading...")
    predictions_df = pd.read_csv(pred_path)
    metrics_df = pd.read_csv(metrics_path)
    if set(metrics_df["Category"]) == set(LABEL_COLS) | set(AGGREGATE_ROWS):
        print(metrics_df.to_string(index=False))
    else:
        print("Checkpoint does not match the current label columns; recomputing and overwriting it...")
        use_checkpoint = False

if not use_checkpoint:
    # Fit TF-IDF on the train narratives only, then project the test narratives.
    t0 = time.time()
    tfidf = TfidfVectorizer(
        max_features=50_000,
        ngram_range=(1, 2),
        sublinear_tf=True,
        dtype=np.float32,
    )
    X_train = tfidf.fit_transform(train_df["Narrative"])
    X_test = tfidf.transform(test_df["Narrative"])
    print(f"TF-IDF shape: train {X_train.shape}, test {X_test.shape} ({time.time() - t0:.1f}s)")

    # Train one classifier per label
    results = []
    all_preds = {"ACN": test_df["ACN"].values}
    total_t0 = time.time()

    for i, col in enumerate(tqdm(LABEL_COLS, desc="Training classifiers")):
        t0 = time.time()
        y_tr = train_df[col].values
        y_te = test_df[col].values

        # Weight positives by the negative/positive ratio of this label;
        # fall back to 1.0 when the label has no positives in the train set.
        n_pos = y_tr.sum()
        n_neg = len(y_tr) - n_pos
        spw = n_neg / n_pos if n_pos > 0 else 1.0

        clf = XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=spw,
            n_jobs=-1,
            eval_metric="logloss",
            random_state=42,
            verbosity=0,
            tree_method="hist",
        )
        clf.fit(X_train, y_tr)

        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]

        p = precision_score(y_te, y_pred, zero_division=0)
        r = recall_score(y_te, y_pred, zero_division=0)
        f1 = f1_score(y_te, y_pred, zero_division=0)
        # roc_auc_score raises when only one class is present in y_te. Record
        # NaN for such a label instead of aborting the run: nothing is saved
        # until after the loop, so an exception here would discard every
        # classifier trained so far.
        if np.unique(y_te).size > 1:
            auc = roc_auc_score(y_te, y_proba)
        else:
            auc = float("nan")
        elapsed = time.time() - t0

        results.append({"Category": col, "Precision": p, "Recall": r, "F1": f1, "ROC-AUC": auc})
        all_preds[f"{col}_true"] = y_te
        all_preds[f"{col}_pred"] = y_pred
        all_preds[f"{col}_proba"] = y_proba
        print(f"  [{i+1}/{len(LABEL_COLS)}] {col:<55} P={p:.3f} R={r:.3f} F1={f1:.3f} AUC={auc:.3f}  ({elapsed:.1f}s)")

    total_elapsed = time.time() - total_t0
    print(f"\nTotal training time: {total_elapsed:.1f}s")

    # Compute macro and micro aggregates on 2D arrays
    y_true_all = test_df[LABEL_COLS].values  # (n_samples, n_labels)
    y_pred_all = np.column_stack([all_preds[f"{c}_pred"] for c in LABEL_COLS])
    y_proba_all = np.column_stack([all_preds[f"{c}_proba"] for c in LABEL_COLS])

    macro_p = precision_score(y_true_all, y_pred_all, average="macro", zero_division=0)
    macro_r = recall_score(y_true_all, y_pred_all, average="macro", zero_division=0)
    macro_f1 = f1_score(y_true_all, y_pred_all, average="macro", zero_division=0)

    # Macro ROC-AUC averages per-label AUCs, so it is computed over the labels
    # for which AUC is defined (both classes present in the test split).
    # When every label qualifies this is the same call on the same data.
    auc_cols = [j for j in range(len(LABEL_COLS)) if np.unique(y_true_all[:, j]).size > 1]
    if len(auc_cols) < len(LABEL_COLS):
        skipped = [c for j, c in enumerate(LABEL_COLS) if j not in set(auc_cols)]
        print(f"Note: macro ROC-AUC excludes single-class test labels: {skipped}")
    if auc_cols:
        macro_auc = roc_auc_score(y_true_all[:, auc_cols], y_proba_all[:, auc_cols], average="macro")
    else:
        macro_auc = float("nan")

    micro_p = precision_score(y_true_all, y_pred_all, average="micro", zero_division=0)
    micro_r = recall_score(y_true_all, y_pred_all, average="micro", zero_division=0)
    micro_f1 = f1_score(y_true_all, y_pred_all, average="micro", zero_division=0)
    micro_auc = roc_auc_score(y_true_all, y_proba_all, average="micro")

    results.append({"Category": "MACRO", "Precision": macro_p, "Recall": macro_r, "F1": macro_f1, "ROC-AUC": macro_auc})
    results.append({"Category": "MICRO", "Precision": micro_p, "Recall": micro_r, "F1": micro_f1, "ROC-AUC": micro_auc})

    metrics_df = pd.DataFrame(results)
    metrics_df.to_csv(metrics_path, index=False)

    predictions_df = pd.DataFrame(all_preds)
    predictions_df.to_csv(pred_path, index=False)

    print(f"\nSaved {metrics_path} and {pred_path}")
    print(metrics_df.to_string(index=False))

# Print headline numbers (macro_row / micro_row are reused by later cells)
macro_row = metrics_df[metrics_df["Category"] == "MACRO"].iloc[0]
micro_row = metrics_df[metrics_df["Category"] == "MICRO"].iloc[0]
print(f"\n{'='*60}")
print(f"Macro-F1: {macro_row['F1']:.4f}  |  Micro-F1: {micro_row['F1']:.4f}")
print(f"Macro-AUC: {macro_row['ROC-AUC']:.4f}  |  Micro-AUC: {micro_row['ROC-AUC']:.4f}")
print(f"{'='*60}")

Checkpoint: predictions and metrics exist, loading...
                                                    Category  Precision   Recall       F1  ROC-AUC
                                                   ATC Issue   0.615929 0.765957 0.682799 0.928528
                        Aircraft Equipment Problem: Critical   0.612233 0.802572 0.694599 0.931667
                     Aircraft Equipment Problem: Less Severe   0.368594 0.580145 0.450783 0.822381
                                          Airspace Violation   0.500000 0.719243 0.589909 0.937360
                                 Conflict: Airborne Conflict   0.543255 0.751521 0.630638 0.925124
                                   Conflict: Ground Conflict   0.454357 0.714519 0.555485 0.927023
                                              Conflict: NMAC   0.580605 0.784014 0.667149 0.960395
          Deviation - Altitude: Crossing Restriction Not Met   0.641732 0.676349 0.658586 0.978809
      Deviation - Altitude: Excursion From Assigned Alt

In [4]:
# Cell 4: Parent-Level Comparison Summary

# Map each subcategory to its parent
def get_parent(col):
    """Return the parent category encoded in a subcategory label.

    For a label containing a colon, the result is the text before the first
    colon with surrounding whitespace stripped. A label without a colon is
    returned exactly as given.
    """
    prefix, colon, _ = col.partition(":")
    if not colon:
        return col
    return prefix.strip()

# Lookup from each subcategory label to its parent category.
parent_map = dict(zip(LABEL_COLS, map(get_parent, LABEL_COLS)))

# Per-subcategory rows only: the MACRO/MICRO aggregate rows are dropped.
is_aggregate_row = metrics_df["Category"].isin(["MACRO", "MICRO"])
sub_metrics = metrics_df.loc[~is_aggregate_row].copy()
sub_metrics["Parent"] = sub_metrics["Category"].map(parent_map)

# Mean subcategory F1 and subcategory count within each parent group.
parent_avg = (
    sub_metrics.groupby("Parent")["F1"]
    .agg(Avg_Subcategory_F1="mean", Num_Subcategories="count")
    .reset_index()
    .sort_values("Parent")
)

# Parent-level baseline metrics, read from the results directory.
parent_metrics = pd.read_csv(os.path.join(RESULTS_DIR, "classic_ml_text_metrics.csv"))
parent_is_aggregate = parent_metrics["Category"].isin(["MACRO", "MICRO"])
parent_f1 = parent_metrics.loc[~parent_is_aggregate].set_index("Category")["F1"]

# Side-by-side table: parent-level F1 next to the mean F1 of its subcategories.
print(f"{'Parent Category':<30} {'Parent F1':>10} {'Avg Sub F1':>11} {'Delta':>8} {'#Subs':>6}")
print("-" * 67)
comparison_cols = ["Parent", "Avg_Subcategory_F1", "Num_Subcategories"]
for parent, sf1, n_subs in parent_avg[comparison_cols].itertuples(index=False, name=None):
    # Parents absent from the baseline file are shown as NaN.
    pf1 = parent_f1.get(parent, float("nan"))
    if np.isnan(pf1):
        delta = float("nan")
    else:
        delta = sf1 - pf1
    print(f"{parent:<30} {pf1:>10.4f} {sf1:>11.4f} {delta:>+8.4f} {int(n_subs):>6}")

# Aggregate scores: parent-level baseline vs. this notebook's subcategory run.
parent_macro_f1 = parent_metrics.loc[parent_metrics["Category"] == "MACRO", "F1"].iloc[0]
parent_micro_f1 = parent_metrics.loc[parent_metrics["Category"] == "MICRO", "F1"].iloc[0]
sub_macro_f1 = macro_row["F1"]
sub_micro_f1 = micro_row["F1"]

print(f"\n{'Aggregate':<30} {'Parent':>10} {'Subcategory':>11} {'Delta':>8}")
print("-" * 61)
for agg_name, parent_score, sub_score in (
    ("Macro-F1", parent_macro_f1, sub_macro_f1),
    ("Micro-F1", parent_micro_f1, sub_micro_f1),
):
    print(f"{agg_name:<30} {parent_score:>10.4f} {sub_score:>11.4f} {sub_score - parent_score:>+8.4f}")

Parent Category                 Parent F1  Avg Sub F1    Delta  #Subs
-------------------------------------------------------------------
ATC Issue                          0.6719      0.6828  +0.0109      1
Aircraft Equipment Problem         0.8157      0.5727  -0.2430      2
Airspace Violation                 0.5681      0.5899  +0.0218      1
Conflict                           0.8011      0.6178  -0.1833      3
Deviation - Altitude               0.7289      0.5429  -0.1860      4
Deviation - Procedural             0.7951      0.4967  -0.2983     10
Deviation - Speed                  0.5767      0.5196  -0.0571      1
Deviation - Track/Heading          0.6550      0.6631  +0.0081      1
Flight Deck/Cabin Event            0.7378      0.6369  -0.1009      4
Ground Event/Encounter             0.5922      0.2677  -0.3245      8
Ground Excursion                   0.5723      0.4613  -0.1110      2
Ground Incursion                   0.7294      0.5422  -0.1872      2
Inflight Event/Encount

In [5]:
# Cell 5: Summary File
#
# Assembles a plain-text report (run configuration, per-category metrics,
# parent-vs-subcategory comparison), writes it to RESULTS_DIR and prints it.

summary_path = os.path.join(RESULTS_DIR, "classic_ml_subcategory_summary.txt")

banner = "=" * 65
metrics_rule = "-" * 97
comparison_rule = "-" * 67

# Header: run configuration followed by the metrics table heading.
lines = [
    "Classic ML Baseline: TF-IDF + XGBoost (48 Subcategory Labels)",
    banner,
    f"Train set: {len(train_df):,} reports | Test set: {len(test_df):,} reports",
    "TF-IDF: max_features=50000, ngram_range=(1,2), sublinear_tf=True",
    "XGBoost: n_estimators=300, max_depth=6, lr=0.1, scale_pos_weight=auto",
    "",
    f"{'Category':<55} {'Precision':>10} {'Recall':>10} {'F1':>10} {'ROC-AUC':>10}",
    metrics_rule,
]

# One line per metrics row; a rule separates the aggregate rows from the labels.
metric_cols = ["Category", "Precision", "Recall", "F1", "ROC-AUC"]
for cat, prec_val, rec_val, f1_val, auc_val in metrics_df[metric_cols].itertuples(index=False, name=None):
    if cat == "MACRO":
        lines.append(metrics_rule)
    lines.append(f"{cat:<55} {prec_val:>10.4f} {rec_val:>10.4f} {f1_val:>10.4f} {auc_val:>10.4f}")

# Parent-group comparison
lines += [
    "",
    "",
    "Parent-Level Comparison (13-label vs 48-label)",
    banner,
    f"{'Parent Category':<30} {'Parent F1':>10} {'Avg Sub F1':>11} {'Delta':>8} {'#Subs':>6}",
    comparison_rule,
]
comparison_cols = ["Parent", "Avg_Subcategory_F1", "Num_Subcategories"]
for parent, sf1, n_subs in parent_avg[comparison_cols].itertuples(index=False, name=None):
    # Parents absent from the baseline are reported as NaN.
    pf1 = parent_f1.get(parent, float("nan"))
    if np.isnan(pf1):
        delta = float("nan")
    else:
        delta = sf1 - pf1
    lines.append(f"{parent:<30} {pf1:>10.4f} {sf1:>11.4f} {delta:>+8.4f} {int(n_subs):>6}")
lines.append(comparison_rule)
for agg_name, parent_score, sub_score in (
    ("Macro-F1", parent_macro_f1, sub_macro_f1),
    ("Micro-F1", parent_micro_f1, sub_micro_f1),
):
    lines.append(f"{agg_name:<30} {parent_score:>10.4f} {sub_score:>11.4f} {sub_score - parent_score:>+8.4f}")

summary = "\n".join(lines)
with open(summary_path, "w") as f:
    f.write(summary)
print(summary)
print(f"\nSaved {summary_path}")

Classic ML Baseline: TF-IDF + XGBoost (48 Subcategory Labels)
Train set: 32,089 reports | Test set: 8,017 reports
TF-IDF: max_features=50000, ngram_range=(1,2), sublinear_tf=True
XGBoost: n_estimators=300, max_depth=6, lr=0.1, scale_pos_weight=auto

Category                                                 Precision     Recall         F1    ROC-AUC
-------------------------------------------------------------------------------------------------
ATC Issue                                                   0.6159     0.7660     0.6828     0.9285
Aircraft Equipment Problem: Critical                        0.6122     0.8026     0.6946     0.9317
Aircraft Equipment Problem: Less Severe                     0.3686     0.5801     0.4508     0.8224
Airspace Violation                                          0.5000     0.7192     0.5899     0.9374
Conflict: Airborne Conflict                                 0.5433     0.7515     0.6306     0.9251
Conflict: Ground Conflict                           

In [6]:
# Cell 6: F1 Bar Chart
#
# Horizontal bar chart of per-subcategory F1 (sorted ascending) with the
# macro-F1 drawn as a dashed reference line; the figure is saved as a PNG.

is_aggregate = metrics_df["Category"].isin(["MACRO", "MICRO"])
cat_metrics = metrics_df.loc[~is_aggregate].sort_values("F1", ascending=True)

macro_f1 = metrics_df.loc[metrics_df["Category"] == "MACRO", "F1"].iloc[0]

fig, ax = plt.subplots(figsize=(12, 14))
bars = ax.barh(cat_metrics["Category"], cat_metrics["F1"], color="steelblue", edgecolor="white")
ax.axvline(macro_f1, color="red", linestyle="--", linewidth=1.5, label=f"Macro-F1 = {macro_f1:.3f}")

# Write each bar's F1 value just past its right edge, vertically centred.
for patch in bars.patches:
    f1_value = patch.get_width()
    y_center = patch.get_y() + patch.get_height() / 2
    ax.text(f1_value + 0.005, y_center, f"{f1_value:.3f}", va="center", fontsize=8)

ax.set(
    xlabel="F1 Score",
    title="TF-IDF + XGBoost: Per-Subcategory F1 Scores (48 Labels)",
    xlim=(0, 1.0),
)
ax.legend(loc="lower right")
fig.tight_layout()
fig.savefig(os.path.join(RESULTS_DIR, "classic_ml_subcategory_f1_barchart.png"), dpi=150)
plt.show()
print("Saved classic_ml_subcategory_f1_barchart.png")

Saved classic_ml_subcategory_f1_barchart.png
