# 04 - Model Training (Baseline & Tree-Based Models)

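In this notebook we train a logistic-regression baseline, several tree-based models (HistGradientBoosting, XGBoost, Decision Tree) and two distance/kernel-based models (SVC, k-nearest neighbours) for flight delay prediction. All models are compared on the same held-out test set, and a fitted pipeline is saved to `models/best_flight_delay_model.pkl`.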

In [None]:
# 04_model_training.ipynb

import pandas as pd
import numpy as np
import os
os.makedirs("models", exist_ok=True)

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import joblib

# ======================
# 1. LOAD ENGINEERED DATA
# ======================

# Path is relative to the notebook's working directory.
data_path = "data/flight_data_2018_2024_engineered.csv"
df = pd.read_csv(data_path)

# Remove stray leading/trailing whitespace from the header names so that the
# name-based column lookups further down match exactly.
df.columns = [col.strip() for col in df.columns]

print("Data shape:", df.shape)

# ======================
# 2. DEFINE TARGET & DROP NON-PREDICTIVE COLUMNS
# ======================

target = "DELAYED"

# The label itself plus columns that are kept out of the feature matrix.
cols_to_remove = [
    target,
    "FlightDate",
    "Duplicate",
    "DivAirportLandings",
    "CRSArrTime",
    "ArrTimeBlk",
]

# Restrict the drop list to columns that really exist, so a column missing
# from the CSV does not make DataFrame.drop raise.
cols_to_drop_final = list(filter(lambda col: col in df.columns, cols_to_remove))

# NOTE(review): the numeric column list printed further down still contains
# "Cancelled" and "Diverted". Confirm these are known at prediction time;
# if they are outcomes of the flight they would leak the target.
X = df.drop(columns=cols_to_drop_final)
y = df[target]

print("Feature shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

# ======================
# 3. TRAIN–TEST SPLIT
# ======================

# 80/20 split; stratifying on y keeps the class proportions the same in both
# parts, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ======================
# 4. IDENTIFY COLUMN TYPES
# ======================

# Object-dtype columns are treated as categorical; every other column is
# treated as numeric (original column order is preserved in both lists).
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = [col for col in X.columns if col not in cat_cols]

print("Categorical columns:", cat_cols)
print("Numerical columns:", num_cols)

# Downcast numeric to float32 to save memory
float32_map = dict.fromkeys(num_cols, np.float32)
X_train = X_train.astype(float32_map)
X_test = X_test.astype(float32_map)

# ======================
# 5. PREPROCESSOR FOR LOGISTIC (SPARSE, FULL FEATURES)
# ======================

# Every categorical column is one-hot encoded with a sparse encoder (categories
# unseen during fit are ignored at transform time); every numeric column is
# standardised.
logit_transformers = [
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), cat_cols),
    ("num", StandardScaler(), num_cols),
]
preprocessor_logit = ColumnTransformer(transformers=logit_transformers)

# ======================
# 6. EVALUATION FUNCTION
# ======================

def evaluate_model(name, y_true, y_pred, y_prob):
    """Print a metric summary for one fitted classifier.

    Parameters
    ----------
    name : label shown in the banner line.
    y_true : ground-truth labels.
    y_pred : predicted hard labels (used for accuracy, precision, recall,
        F1 and the classification report).
    y_prob : predicted scores/probabilities (used for ROC-AUC only).

    Returns
    -------
    None. The function only prints.
    """
    print(f"\n===== {name} Performance =====")

    # (label, scorer, estimate) rows; each score is computed right before it
    # is printed, so earlier lines still appear if a later scorer raises.
    scoreboard = (
        ("Accuracy :", accuracy_score, y_pred),
        ("Precision:", precision_score, y_pred),
        ("Recall   :", recall_score, y_pred),
        ("F1-score :", f1_score, y_pred),
        ("ROC-AUC  :", roc_auc_score, y_prob),
    )
    for label, scorer, estimate in scoreboard:
        print(label, scorer(y_true, estimate))

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report)

# ======================
# 7. LOGISTIC REGRESSION (BASELINE, SAME AS BEFORE)
# ======================

# Baseline: sparse one-hot + scaled numerics feeding a logistic regression.
baseline_steps = [
    ("preprocessor", preprocessor_logit),
    ("model", LogisticRegression(max_iter=3000, n_jobs=-1)),
]
log_reg_pipeline = Pipeline(steps=baseline_steps)

print("\nFitting Logistic Regression (baseline)...")
log_reg_pipeline.fit(X_train, y_train)

# Hard labels feed the threshold metrics; the second predict_proba column is
# used as the score for ROC-AUC.
y_pred_lr = log_reg_pipeline.predict(X_test)
proba_lr = log_reg_pipeline.predict_proba(X_test)
y_prob_lr = proba_lr[:, 1]

evaluate_model("Logistic Regression", y_test, y_pred_lr, y_prob_lr)

# ======================
# 8. PREPROCESSOR FOR HIST GRADIENT BOOSTING (MEMORY-SAFE)
# ======================

# Categorical columns with more distinct values than this are left out of the
# reduced feature set, because they are one-hot encoded into a DENSE matrix.
MAX_ONEHOT_CARDINALITY = 20

# Use only low-cardinality categorical features for HGB
# (to avoid huge dense one-hot matrices).
# Cardinality is measured on the training split only: deciding which features
# to keep is part of preprocessing, so it must not look at held-out test rows.
low_card_cats = [
    c for c in cat_cols if X_train[c].nunique() <= MAX_ONEHOT_CARDINALITY
]

print("\nLow-cardinality categorical columns for HGB:", low_card_cats)

# High-cardinality features like Route, StateRoute, city names etc.
# are DROPPED for HGB to avoid memory explosion.
hgb_feature_cols = num_cols + low_card_cats

# Independent copies so later changes to these frames cannot touch X_train / X_test.
X_train_hgb = X_train[hgb_feature_cols].copy()
X_test_hgb = X_test[hgb_feature_cols].copy()

preprocessor_hgb = ColumnTransformer(
    transformers=[
        # Few low-cardinality cats → dense one-hot is fine
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), low_card_cats),
        # Tree-based models don't need scaling; pass numeric as-is
        ("num", "passthrough", num_cols),
    ]
)

# ======================
# 9. HIST GRADIENT BOOSTING (FAST, STRONG MODEL)
# ======================

# Hyper-parameters collected in one place; the seed keeps the fit repeatable.
hgb_params = {
    "max_depth": 10,
    "learning_rate": 0.1,
    "max_iter": 200,
    "random_state": 42,
}
hgb_steps = [
    ("preprocessor", preprocessor_hgb),
    ("model", HistGradientBoostingClassifier(**hgb_params)),
]
hgb_pipeline = Pipeline(steps=hgb_steps)

print("\nFitting HistGradientBoostingClassifier on reduced feature set...")
hgb_pipeline.fit(X_train_hgb, y_train)

# Hard labels for the threshold metrics, second probability column for ROC-AUC.
y_pred_hgb = hgb_pipeline.predict(X_test_hgb)
proba_hgb = hgb_pipeline.predict_proba(X_test_hgb)
y_prob_hgb = proba_hgb[:, 1]

evaluate_model("HistGradientBoosting (reduced features)", y_test, y_pred_hgb, y_prob_hgb)

# ======================
# 10. ADDITIONAL MODELS - XGBOOST, SVC, KNN, DECISION TREE
# ======================

# SVC fit time grows at least quadratically with the number of training rows,
# and probability=True adds an internal cross-validation on top, so fitting it
# on the full training split is impractical. Above this cap SVC is fitted on a
# stratified subsample; every model is still scored on the full test split.
SVC_MAX_TRAIN_ROWS = 50_000

# Distance/kernel-based estimators are sensitive to feature scale, so their
# numeric inputs are standardised. Tree-based models receive the raw values.
SCALE_SENSITIVE_MODELS = {"SVC", "KNeighbors"}

additional_models = {
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss'),
    "SVC": SVC(random_state=42, probability=True),
    "KNeighbors": KNeighborsClassifier(n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(random_state=42)
}


def build_reduced_preprocessor(scale_numeric):
    """Return a fresh, unfitted preprocessor for the reduced feature set.

    A new object is built for every pipeline. Pipelines keep the transformer
    objects they are given, so handing one shared instance to several
    pipelines would make each fit overwrite the fitted state used by the
    others (including the already-fitted HistGradientBoosting pipeline).

    Parameters
    ----------
    scale_numeric : bool
        True -> numeric columns go through StandardScaler;
        False -> numeric columns are passed through unchanged.
    """
    numeric_step = StandardScaler() if scale_numeric else "passthrough"
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), low_card_cats),
            ("num", numeric_step, num_cols),
        ]
    )


def summarize_predictions(y_true, y_pred, y_prob):
    """Bundle predictions with the three metrics shown in the comparison table."""
    return {
        "y_pred": y_pred,
        "y_prob": y_prob,
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_prob),
    }


# Fitted pipelines by display name, so the best one can be saved afterwards.
fitted_pipelines = {
    "Logistic Regression": log_reg_pipeline,
    "HistGradientBoosting": hgb_pipeline,
}

model_results = {
    "Logistic Regression": summarize_predictions(y_test, y_pred_lr, y_prob_lr),
    "HistGradientBoosting": summarize_predictions(y_test, y_pred_hgb, y_prob_hgb),
}

for name, model in additional_models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print('='*60)

    # Each pipeline gets its own preprocessor instance (see helper docstring).
    pipeline = Pipeline([
        ("preprocessor", build_reduced_preprocessor(name in SCALE_SENSITIVE_MODELS)),
        ("model", model),
    ])

    # Default: train on the full reduced training split.
    X_fit, y_fit = X_train_hgb, y_train
    if name == "SVC" and len(X_train_hgb) > SVC_MAX_TRAIN_ROWS:
        # Draw a class-stratified, seeded sample of row positions.
        sample_idx = train_test_split(
            np.arange(len(X_train_hgb)),
            train_size=SVC_MAX_TRAIN_ROWS,
            random_state=42,
            stratify=y_train,
        )[0]
        X_fit = X_train_hgb.iloc[sample_idx]
        y_fit = y_train.iloc[sample_idx]
        print(
            f"SVC is fitted on a stratified sample of {len(X_fit)} "
            f"of {len(X_train_hgb)} training rows"
        )

    # Train
    pipeline.fit(X_fit, y_fit)

    # Predict on the full test split
    y_pred = pipeline.predict(X_test_hgb)
    y_prob = pipeline.predict_proba(X_test_hgb)[:, 1]

    # Evaluate
    evaluate_model(name, y_test, y_pred, y_prob)

    # Store results and the fitted pipeline
    model_results[name] = summarize_predictions(y_test, y_pred, y_prob)
    fitted_pipelines[name] = pipeline

# ======================
# 11. MODEL COMPARISON SUMMARY
# ======================

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print(f"{'Model':<30} | {'Accuracy':<10} | {'F1-Score':<10} | {'ROC-AUC':<10}")
print("-"*80)

for name, res in model_results.items():
    print(f"{name:<30} | {res['accuracy']:<10.4f} | {res['f1']:<10.4f} | {res['roc_auc']:<10.4f}")

print("="*80)

# Find best model by F1-score (ties go to the model evaluated first)
best_model_name = max(model_results, key=lambda model_name: model_results[model_name]['f1'])
print(f"\nBest model (by F1-score): {best_model_name}")
print(f"  Accuracy: {model_results[best_model_name]['accuracy']:.4f}")
print(f"  F1-Score: {model_results[best_model_name]['f1']:.4f}")
print(f"  ROC-AUC:  {model_results[best_model_name]['roc_auc']:.4f}")

# ======================
# 12. SAVE BEST MODEL
# ======================

# Persist the pipeline that actually won the comparison, so the file name
# "best_flight_delay_model.pkl" matches its contents. The saved object is a
# full Pipeline (preprocessor + model) and selects its input columns by name.
best_model_path = "models/best_flight_delay_model.pkl"
joblib.dump(fitted_pipelines[best_model_name], best_model_path)
print(f"\nSaved {best_model_name} model to {best_model_path}")


Data shape: (582425, 51)
Feature shape: (582425, 45)
Target distribution:
 DELAYED
0    0.616778
1    0.383222
Name: proportion, dtype: float64
Train shape: (465940, 45) Test shape: (116485, 45)
Categorical columns: ['Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'IATA_Code_Marketing_Airline', 'Operating_Airline', 'IATA_Code_Operating_Airline', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateName', 'Dest', 'DestCityName', 'DestState', 'DestStateName', 'DepTimeBlk', 'DistanceGroup', 'DepPartOfDay', 'Route', 'StateRoute', 'Airline']
Numerical columns: ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'DOT_ID_Marketing_Airline', 'Flight_Number_Marketing_Airline', 'DOT_ID_Operating_Airline', 'Flight_Number_Operating_Airline', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'OriginStateFips', 'OriginWac', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestStateFips', 'DestWac', 'CRSDepTime', 'Cancelled', 'Diverted', 'CRSElapsed