# Part 2: Stratified Sampling & Classic ML Baseline

Uses `data/asrs_multilabel.csv` (172K rows) from notebook 01.
1. Stratified ~40K sample -> ~32K train / ~8K test (sizes are approximate; the recorded run produced 31,850 / 8,044)
2. TF-IDF + XGBoost multi-label baseline
3. Structured feature extraction
4. Modal LLM script scaffolds

In [1]:
import importlib
import importlib.util
import subprocess, sys

# Bootstrap the third-party packages that the imports below rely on.
# Keys are pip distribution names, values are the module names actually
# imported further down (`iterstrat`, `tqdm`). pip is only invoked for
# modules that cannot be found, so a normal re-run does not shell out.
_REQUIRED_PACKAGES = {"iterative-stratification": "iterstrat", "tqdm": "tqdm"}
_missing_packages = [
    dist for dist, module in _REQUIRED_PACKAGES.items()
    if importlib.util.find_spec(module) is None
]
if _missing_packages:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *_missing_packages])
    # Let the import system notice packages installed while the kernel is running.
    importlib.invalidate_caches()

import pandas as pd
import numpy as np
import os, time, glob, warnings
from contextlib import contextmanager

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from tqdm import tqdm
import joblib
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Project directories, expressed relative to the notebook's working directory.
RAW_DIR = "../raw data"
DATA_DIR = "../data"
RESULTS_DIR = "../results"
SCRIPTS_DIR = "../scripts"

# Make sure the data/results/scripts directories exist (RAW_DIR is not created here).
for _project_dir in (DATA_DIR, RESULTS_DIR, SCRIPTS_DIR):
    os.makedirs(_project_dir, exist_ok=True)

@contextmanager
def timer(label):
    """Print how long the enclosed ``with`` body took, tagged with ``label``.

    Args:
        label: Short tag shown in square brackets in the printed line.

    Yields:
        None; the context manager is used only for its timing side effect.

    The report is emitted from a ``finally`` clause, so a body that raises
    still prints its elapsed time before the exception propagates unchanged.
    """
    # perf_counter is monotonic, so the measured interval cannot go negative
    # if the system clock is adjusted while the body runs.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print(f"  [{label}] {time.perf_counter() - t0:.1f}s")

print("Setup complete.")

Setup complete.


In [2]:
# Cell 2: Load Data & Create 40K Stratified Sample
#
# Defines full_df, LABEL_COLS, train_df and test_df. If both split files are
# already on disk they are reused; otherwise the split is rebuilt from the
# full dataset and written to DATA_DIR.

train_path = os.path.join(DATA_DIR, "train_set.csv")
test_path = os.path.join(DATA_DIR, "test_set.csv")

if not (os.path.exists(train_path) and os.path.exists(test_path)):
    with timer("Load full dataset"):
        full_df = pd.read_csv(os.path.join(DATA_DIR, "asrs_multilabel.csv"))
        print(f"Loaded {len(full_df):,} rows")

    # Every column except the report id and the narrative text is a label.
    LABEL_COLS = [name for name in full_df.columns if name not in ("ACN", "Narrative")]
    print(f"Label columns ({len(LABEL_COLS)}): {LABEL_COLS}")

    def _stratified_holdout(frame, holdout_size):
        """Return (remaining_idx, holdout_idx) positional indices for one
        multilabel-stratified shuffle split of ``frame`` on LABEL_COLS."""
        splitter = MultilabelStratifiedShuffleSplit(
            n_splits=1, test_size=holdout_size, random_state=42
        )
        positions = np.arange(len(frame))
        return next(splitter.split(positions, frame[LABEL_COLS].values))

    # Stage 1: the "holdout" of the full dataset is the 40K working sample.
    with timer("40K stratified sample"):
        _, sample_idx = _stratified_holdout(full_df, 40_000)
        sample_df = full_df.iloc[sample_idx].reset_index(drop=True)
        print(f"Sample size: {len(sample_df):,}")

    # Stage 2: hold 8K of the sample out as the test set, the rest is train.
    with timer("32K/8K split"):
        train_idx, test_idx = _stratified_holdout(sample_df, 8_000)
        train_df = sample_df.iloc[train_idx].reset_index(drop=True)
        test_df = sample_df.iloc[test_idx].reset_index(drop=True)
        print(f"Train: {len(train_df):,}  Test: {len(test_df):,}")

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
    print(f"Saved {train_path} and {test_path}")
else:
    print("Checkpoint: train_set.csv and test_set.csv already exist, loading...")
    train_df, test_df = (pd.read_csv(path) for path in (train_path, test_path))
    # The full dataset is still loaded: Cell 3 compares the splits against it.
    full_df = pd.read_csv(os.path.join(DATA_DIR, "asrs_multilabel.csv"))
    LABEL_COLS = [name for name in full_df.columns if name not in ("ACN", "Narrative")]
    print(f"Full: {len(full_df):,}  Train: {len(train_df):,}  Test: {len(test_df):,}")

Checkpoint: train_set.csv and test_set.csv already exist, loading...


Full: 172,183  Train: 31,850  Test: 8,044


In [3]:
# Cell 3: Stratification Quality Check
#
# Table 1: share of rows (in percent) carrying each label in the full
# dataset, the train split and the test split.
# Table 2: number of reports per label count, with every count of 5 or more
# collapsed into a single "5+" row.

print(f"{'Category':<30} {'Full %':>8} {'Train %':>8} {'Test %':>8}")
print("-" * 56)
for col in LABEL_COLS:
    pf, pt, pe = (frame[col].mean() * 100 for frame in (full_df, train_df, test_df))
    print(f"{col:<30} {pf:>7.2f}% {pt:>7.2f}% {pe:>7.2f}%")

print("\nLabel Count Distribution")
print(f"{'# Labels':<10} {'Train':>8} {'Test':>8}")
print("-" * 28)
train_lc = train_df[LABEL_COLS].sum(axis=1)
test_lc = test_df[LABEL_COLS].sum(axis=1)

# Clipping at the cap folds every count >= 5 into one bucket, so each bucket
# is tallied once instead of re-deriving the set of counts on every iteration.
LABEL_COUNT_CAP = 5
train_buckets = train_lc.clip(upper=LABEL_COUNT_CAP).value_counts()
test_buckets = test_lc.clip(upper=LABEL_COUNT_CAP).value_counts()

for n in sorted(set(train_buckets.index) | set(test_buckets.index)):
    label = f"{int(n)}" if n < LABEL_COUNT_CAP else f"{LABEL_COUNT_CAP}+"
    # A bucket present in only one split counts as 0 in the other.
    tr_cnt = train_buckets.get(n, 0)
    te_cnt = test_buckets.get(n, 0)
    print(f"{label:<10} {tr_cnt:>8,} {te_cnt:>8,}")

Category                         Full %  Train %   Test %
--------------------------------------------------------
Aircraft Equipment Problem       28.64%   28.75%   28.56%
Airspace Violation                3.97%    3.99%    3.95%
ATC Issue                        17.09%   17.16%   17.04%
Conflict                         26.88%   26.99%   26.80%
Deviation - Altitude             16.48%   16.54%   16.43%
Deviation - Procedural           65.40%   65.66%   65.22%
Deviation - Speed                 2.90%    2.92%    2.90%
Deviation - Track/Heading        11.77%   11.82%   11.74%
Flight Deck/Cabin Event           7.14%    7.16%    7.12%
Ground Event/Encounter            8.27%    8.30%    8.24%
Ground Excursion                  2.16%    2.17%    2.15%
Ground Incursion                  7.32%    7.35%    7.30%
Inflight Event/Encounter         22.45%   22.54%   22.39%

Label Count Distribution
# Labels      Train     Test
----------------------------
1             6,936    1,770
2            14,44

In [4]:
# Cell 4: TF-IDF + XGBoost (13 binary classifiers)
#
# One binary XGBoost model is trained per label column on TF-IDF features of
# the narrative text. Per-label metrics plus macro/micro aggregates go to
# metrics_path; per-report truth, prediction and probability go to pred_path.
# When both files already exist they are loaded instead of retraining.

pred_path = os.path.join(RESULTS_DIR, "classic_ml_text_predictions.csv")
metrics_path = os.path.join(RESULTS_DIR, "classic_ml_text_metrics.csv")

if not (os.path.exists(pred_path) and os.path.exists(metrics_path)):
    # The vectorizer is fitted on the training narratives only and then
    # applied to the test narratives.
    with timer("TF-IDF vectorization"):
        tfidf = TfidfVectorizer(
            max_features=50_000,
            ngram_range=(1, 2),
            sublinear_tf=True,
            dtype=np.float32,
        )
        X_train = tfidf.fit_transform(train_df["Narrative"])
        X_test = tfidf.transform(test_df["Narrative"])
        print(f"TF-IDF shape: train {X_train.shape}, test {X_test.shape}")

    results = []
    all_preds = {"ACN": test_df["ACN"].values}
    n_labels = len(LABEL_COLS)
    total_t0 = time.time()

    for position, col in enumerate(LABEL_COLS, start=1):
        t0 = time.time()
        y_tr, y_te = train_df[col].values, test_df[col].values

        # scale_pos_weight = negatives / positives in the training labels,
        # falling back to 1.0 when the label has no positive example.
        n_pos = y_tr.sum()
        spw = (len(y_tr) - n_pos) / n_pos if n_pos > 0 else 1.0

        clf = XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.1,
            scale_pos_weight=spw,
            n_jobs=-1,
            eval_metric="logloss",
            random_state=42,
            verbosity=0,
            tree_method="hist",
        )
        clf.fit(X_train, y_tr)

        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]

        scores = {
            "Category": col,
            "Precision": precision_score(y_te, y_pred, zero_division=0),
            "Recall": recall_score(y_te, y_pred, zero_division=0),
            "F1": f1_score(y_te, y_pred, zero_division=0),
            "ROC-AUC": roc_auc_score(y_te, y_proba),
        }
        elapsed = time.time() - t0

        results.append(scores)
        all_preds.update({
            f"{col}_true": y_te,
            f"{col}_pred": y_pred,
            f"{col}_proba": y_proba,
        })
        print(
            f"  [{position}/{n_labels}] {col:<30} "
            f"P={scores['Precision']:.3f} R={scores['Recall']:.3f} "
            f"F1={scores['F1']:.3f} AUC={scores['ROC-AUC']:.3f}  ({elapsed:.1f}s)"
        )

    total_elapsed = time.time() - total_t0
    print(f"\nTotal training time: {total_elapsed:.1f}s")

    # Stack the per-label columns into (reports x labels) matrices for the
    # macro- and micro-averaged rows appended after the per-label rows.
    y_true_all = test_df[LABEL_COLS].values
    y_pred_all = np.column_stack([all_preds[f"{c}_pred"] for c in LABEL_COLS])
    y_proba_all = np.column_stack([all_preds[f"{c}_proba"] for c in LABEL_COLS])

    for averaging in ("macro", "micro"):
        results.append({
            "Category": averaging.upper(),
            "Precision": precision_score(y_true_all, y_pred_all, average=averaging, zero_division=0),
            "Recall": recall_score(y_true_all, y_pred_all, average=averaging, zero_division=0),
            "F1": f1_score(y_true_all, y_pred_all, average=averaging, zero_division=0),
            "ROC-AUC": roc_auc_score(y_true_all, y_proba_all, average=averaging),
        })

    metrics_df = pd.DataFrame(results)
    metrics_df.to_csv(metrics_path, index=False)

    predictions_df = pd.DataFrame(all_preds)
    predictions_df.to_csv(pred_path, index=False)

    print(f"\nSaved {metrics_path} and {pred_path}")
    print(metrics_df.to_string(index=False))
else:
    print("Checkpoint: predictions and metrics exist, loading...")
    predictions_df = pd.read_csv(pred_path)
    metrics_df = pd.read_csv(metrics_path)
    print(metrics_df.to_string(index=False))

Checkpoint: predictions and metrics exist, loading...
                  Category  Precision   Recall       F1  ROC-AUC
Aircraft Equipment Problem   0.813420 0.818024 0.815715 0.943793
        Airspace Violation   0.480435 0.694969 0.568123 0.937690
                 ATC Issue   0.605263 0.754923 0.671860 0.915598
                  Conflict   0.769099 0.835807 0.801067 0.943318
      Deviation - Altitude   0.663563 0.808623 0.728946 0.949377
    Deviation - Procedural   0.811718 0.779070 0.795059 0.794006
         Deviation - Speed   0.550781 0.605150 0.576687 0.948998
 Deviation - Track/Heading   0.591959 0.733051 0.654993 0.927470
   Flight Deck/Cabin Event   0.701258 0.778360 0.737800 0.963287
    Ground Event/Encounter   0.503165 0.719457 0.592179 0.923195
          Ground Excursion   0.572254 0.572254 0.572254 0.973433
          Ground Incursion   0.645669 0.838160 0.729429 0.975834
  Inflight Event/Encounter   0.704499 0.765130 0.733564 0.920235
                     MACRO   0.64716

In [5]:
# Cell 5: Structured Features Extraction from Raw CSVs
#
# Collects a fixed set of structured columns from the raw CSVs in RAW_DIR for
# the ACNs present in train_df/test_df, keeps one row per ACN (first non-null
# value of each column) and caches the result at struct_path.

struct_path = os.path.join(DATA_DIR, "structured_features.csv")

if os.path.exists(struct_path):
    print("Checkpoint: structured_features.csv exists, loading...")
    struct_df = pd.read_csv(struct_path)
    print(f"Shape: {struct_df.shape}")
else:
    target_cols = ["ACN", "Local Time Of Day", "Light", "Flight Phase",
                   "Make Model Name", "Weather Elements / Visibility", "Locale Reference"]
    feature_cols = [c for c in target_cols if c != "ACN"]

    sample_acns = set(train_df["ACN"].tolist() + test_df["ACN"].tolist())
    print(f"Looking for {len(sample_acns):,} ACNs in raw CSVs")

    raw_files = sorted(glob.glob(os.path.join(RAW_DIR, "*.csv")))
    if not raw_files:
        # Fail with a clear message instead of pd.concat's "No objects to concatenate".
        raise FileNotFoundError(f"No raw CSV files found in {RAW_DIR!r}")
    chunks = []

    with timer("Read raw CSVs"):
        for i, f in enumerate(raw_files):
            # header=1 takes the column names from the second line of each file;
            # names are compared and stored with surrounding whitespace stripped.
            df_raw = pd.read_csv(
                f, header=1, low_memory=False,
                usecols=lambda c: c.strip() in target_cols,
            )
            df_raw.columns = df_raw.columns.str.strip()
            df_raw = df_raw[df_raw["ACN"].isin(sample_acns)]
            if len(df_raw) > 0:
                chunks.append(df_raw)
            if (i + 1) % 10 == 0:
                print(f"  Read {i+1}/{len(raw_files)} files...")

    if not chunks:
        raise ValueError(
            f"None of the {len(sample_acns):,} sample ACNs were found in "
            f"{len(raw_files)} raw CSV file(s) under {RAW_DIR!r}"
        )

    struct_all = pd.concat(chunks, ignore_index=True)
    print(f"Total rows before dedup: {len(struct_all):,}")

    # GroupBy.first() returns the first non-null entry of each column per
    # group (NaN when a group has none), replacing a hand-written aggregator.
    struct_df = (
        struct_all.groupby("ACN", sort=False)[feature_cols]
        .first()
        .reset_index()
    )
    print(f"After dedup: {len(struct_df):,} rows")

    # struct_df holds one row per matched ACN, so the gap is the number of
    # sample ACNs that never appeared in the raw files.
    n_unmatched = len(sample_acns) - len(struct_df)
    if n_unmatched > 0:
        print(f"WARNING: {n_unmatched:,} sample ACNs not found in raw CSVs")

    struct_df.to_csv(struct_path, index=False)
    print(f"Saved {struct_path}")

# Print value distributions
print(f"\n{'='*60}")
for col in struct_df.columns:
    if col == "ACN":
        continue
    n_unique = struct_df[col].nunique()
    n_null = struct_df[col].isna().sum()
    print(f"\n{col} ({n_unique} unique, {n_null} missing):")
    print(struct_df[col].value_counts().head(10).to_string())

# TODO: encoding + structured-only XGBoost model deferred to future cell

Checkpoint: structured_features.csv exists, loading...
Shape: (39894, 7)


Local Time Of Day (5 unique, 2380 missing):
Local Time Of Day
1201-1800    14980
0601-1200    11655
1801-2400     8281
0001-0600     2591
ZZZ              7

Light (4 unique, 8612 missing):
Light
Daylight    23139
Night        6010
Dusk         1524
Dawn          609

Flight Phase (971 unique, 1032 missing):
Flight Phase
Cruise              4851
Initial Approach    4037
Climb               3751
Taxi                3219
Parked              2586
Descent             2257
Landing             1989
Cruise; Cruise      1947
Takeoff / Launch    1822
Initial Climb       1464

Make Model Name (655 unique, 250 missing):
Make Model Name
Commercial Fixed Wing                                     3062
Medium Large Transport; Low Wing; 2 Turbojet Eng          1860
Any Unknown or Unlisted Aircraft Manufacturer             1223
Skyhawk 172/Cutlass 172                                   1186
B737 Undifferentiated or Other Model    

In [6]:
# Cell 6: Results Summary
#
# Renders the metrics table as plain text and stores it at summary_path.
# An existing summary file is printed as-is and not regenerated.

summary_path = os.path.join(RESULTS_DIR, "classic_ml_summary.txt")

if not os.path.exists(summary_path):
    # Fall back to the metrics CSV when metrics_df is not defined in this session.
    if "metrics_df" not in globals():
        metrics_df = pd.read_csv(os.path.join(RESULTS_DIR, "classic_ml_text_metrics.csv"))

    rule = "-" * 72
    header_lines = [
        "Classic ML Baseline: TF-IDF + XGBoost (Text Only)",
        "=" * 55,
        f"Train set: {len(train_df):,} reports | Test set: {len(test_df):,} reports",
        "TF-IDF: max_features=50000, ngram_range=(1,2), sublinear_tf=True",
        "XGBoost: n_estimators=300, max_depth=6, lr=0.1, scale_pos_weight=auto",
        "",
        f"{'Category':<30} {'Precision':>10} {'Recall':>10} {'F1':>10} {'ROC-AUC':>10}",
        rule,
    ]

    table_lines = []
    for record in metrics_df.to_dict("records"):
        # A rule is inserted right before the MACRO row.
        if record["Category"] == "MACRO":
            table_lines.append(rule)
        table_lines.append(
            f"{record['Category']:<30} {record['Precision']:>10.4f} "
            f"{record['Recall']:>10.4f} {record['F1']:>10.4f} {record['ROC-AUC']:>10.4f}"
        )

    summary = "\n".join(header_lines + table_lines)
    with open(summary_path, "w") as f:
        f.write(summary)
    print(summary)
    print(f"\nSaved {summary_path}")
else:
    print("Checkpoint: classic_ml_summary.txt exists.")
    with open(summary_path) as f:
        print(f.read())

Checkpoint: classic_ml_summary.txt exists.
Classic ML Baseline: TF-IDF + XGBoost (Text Only)
Train set: 31,850 reports | Test set: 8,044 reports
TF-IDF: max_features=50000, ngram_range=(1,2), sublinear_tf=True
XGBoost: n_estimators=300, max_depth=6, lr=0.1, scale_pos_weight=auto

Category                        Precision     Recall         F1    ROC-AUC
------------------------------------------------------------------------
Aircraft Equipment Problem         0.8134     0.8180     0.8157     0.9438
Airspace Violation                 0.4804     0.6950     0.5681     0.9377
ATC Issue                          0.6053     0.7549     0.6719     0.9156
Conflict                           0.7691     0.8358     0.8011     0.9433
Deviation - Altitude               0.6636     0.8086     0.7289     0.9494
Deviation - Procedural             0.8117     0.7791     0.7951     0.7940
Deviation - Speed                  0.5508     0.6052     0.5767     0.9490
Deviation - Track/Heading          0.5920     

In [7]:
# Cell 7: F1 Bar Chart Visualization
#
# Horizontal bars of per-category F1 (lowest at the bottom), with the macro-F1
# drawn as a dashed vertical reference line. The figure is saved to RESULTS_DIR.

if "metrics_df" not in globals():
    metrics_df = pd.read_csv(os.path.join(RESULTS_DIR, "classic_ml_text_metrics.csv"))

# Split the aggregate rows from the per-category rows.
is_aggregate = metrics_df["Category"].isin(["MACRO", "MICRO"])
cat_metrics = metrics_df[~is_aggregate].sort_values("F1", ascending=True)

macro_f1 = metrics_df.loc[metrics_df["Category"] == "MACRO", "F1"].values[0]

fig, ax = plt.subplots(figsize=(10, 7))
bars = ax.barh(cat_metrics["Category"], cat_metrics["F1"], color="steelblue", edgecolor="white")
ax.axvline(macro_f1, color="red", linestyle="--", linewidth=1.5, label=f"Macro-F1 = {macro_f1:.3f}")

# Write each bar's value just past its right end, vertically centred on the bar.
for bar in bars:
    f1_value = bar.get_width()
    y_center = bar.get_y() + bar.get_height() / 2
    ax.text(f1_value + 0.005, y_center, f"{f1_value:.3f}", va="center", fontsize=9)

ax.set_xlabel("F1 Score")
ax.set_title("TF-IDF + XGBoost: Per-Category F1 Scores")
ax.legend(loc="lower right")
ax.set_xlim(0, 1.0)
plt.tight_layout()
chart_path = os.path.join(RESULTS_DIR, "classic_ml_f1_barchart.png")
plt.savefig(chart_path, dpi=150)
plt.show()
print("Saved classic_ml_f1_barchart.png")

Saved classic_ml_f1_barchart.png


---
## Part 3: Modal LLM Scripts (Zero-Shot, Few-Shot, Fine-Tune)

In [8]:
# Cell 9: Verify scripts/modal_zero_shot.py
#
# Confirms the zero-shot script exists and prints the first 500 characters.

zero_shot_path = os.path.join(SCRIPTS_DIR, "modal_zero_shot.py")

if os.path.exists(zero_shot_path):
    print(f"Checkpoint: {zero_shot_path} exists.")
    # Decode explicitly as UTF-8 so the preview does not depend on the
    # platform's default encoding; undecodable bytes are replaced rather than
    # raising, since this is only a preview. read(500) avoids loading the
    # whole file just to show its head.
    with open(zero_shot_path, encoding="utf-8", errors="replace") as f:
        print(f.read(500))
else:
    print("ERROR: modal_zero_shot.py not found.")

Checkpoint: ../scripts\modal_zero_shot.py exists.
"""Zero-shot classification of ASRS reports using Llama 3.1 8B-Instruct on Modal."""
import modal
import json
import csv
import io

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
GPU = "L4"  # 24GB VRAM, ~6GB for 4-bit 8B model

app = modal.App("asrs-zero-shot")

vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("vllm", "torch", "transformers", "huggingface_hub")
)


@app.cls(
    image=vllm_image,
    gpu=GPU,
    timeout=3600,
    secrets=[modal.Secret.from_name


In [9]:
# Cell 10: Verify scripts/modal_few_shot.py and scripts/modal_finetune.py
#
# Confirms both scripts exist and prints the first 300 characters of each.

few_shot_path = os.path.join(SCRIPTS_DIR, "modal_few_shot.py")
finetune_path = os.path.join(SCRIPTS_DIR, "modal_finetune.py")

if os.path.exists(few_shot_path) and os.path.exists(finetune_path):
    print("Checkpoint: few-shot and fine-tune scripts exist.")
    for p in [few_shot_path, finetune_path]:
        print(f"\n--- {p} ---")
        # Decode explicitly as UTF-8 so the preview does not depend on the
        # platform's default encoding; undecodable bytes are replaced rather
        # than raising. read(300) avoids loading the whole file for a preview.
        with open(p, encoding="utf-8", errors="replace") as f:
            print(f.read(300))
else:
    print("ERROR: Modal scripts not found.")

Checkpoint: few-shot and fine-tune scripts exist.

--- ../scripts\modal_few_shot.py ---
"""Few-shot classification of ASRS reports using Llama 3.1 8B-Instruct on Modal."""
import modal
import json

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
GPU = "L4"  # 24GB VRAM

app = modal.App("asrs-few-shot")

vllm_image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install("v

--- ../scripts\modal_finetune.py ---
"""QLoRA fine-tuning of Llama 3.1 8B for ASRS report classification on Modal."""
import modal
import json

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
GPU = "L4"  # 24GB VRAM; QLoRA: 4-bit base ~6GB + adapters fits in 24GB

app = modal.App("asrs-finetune")

train_image = (
    modal.Image.debian_s
