# 06 - Final Evaluation & Model Comparison

**Objective**: Comprehensive evaluation of all models with detailed metrics and visualizations.

**Evaluation includes**:
- Multiple model comparison (HistGradientBoosting, XGBoost, SVC, KNeighbors, DecisionTree)
- Performance metrics: Accuracy, Precision, Recall, F1-score, ROC-AUC
- Confusion matrices
- ROC curves
- Feature importance analysis

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display options: no limit on the number of columns
# shown, and a display width of 120 characters.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': 120,
}.items():
    pd.set_option(option_name, option_value)

# Apply seaborn's default styling to the figures drawn in this notebook.
sns.set()


In [2]:
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ======================
# 1. LOAD ENGINEERED DATA
# ======================

# CSV with the engineered features; the path is relative to the working directory.
data_path = "data/flight_data_2018_2024_engineered.csv"
df = pd.read_csv(data_path)

# Header names may carry stray leading/trailing whitespace; normalise them
# so the column lookups below match exactly.
df.columns = [column_name.strip() for column_name in df.columns]

print("Data shape:", df.shape)

# ======================
# 2. DEFINE TARGET & DROP NON-PREDICTIVE COLUMNS
# ======================

target = "DELAYED"

# Columns kept out of the feature matrix. The target is listed here as well
# so it can never end up among the predictors.
cols_to_remove = [
    "DELAYED",
    "FlightDate",
    "Duplicate",
    "DivAirportLandings",
    "CRSArrTime",
    "ArrTimeBlk",
]

# Restrict the list to columns this file actually contains, so that drop()
# does not raise on a name that is absent.
available_columns = set(df.columns)
cols_to_drop_final = [name for name in cols_to_remove if name in available_columns]

y = df[target]
X = df.drop(columns=cols_to_drop_final)

print("Feature shape:", X.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

# ======================
# 3. TRAIN-TEST SPLIT (SAME AS TRAINING)
# ======================

# Stratified 80/20 split with a fixed seed. The saved model is only scored on
# unseen rows if the training notebook split the same file with these exact
# arguments -- NOTE(review): confirm against the training notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ======================
# 4. IDENTIFY COLUMN TYPES
# ======================

# Object-dtype columns are treated as categorical; everything else as numeric.
cat_cols = X.select_dtypes(include="object").columns.tolist()
num_cols = X.select_dtypes(exclude="object").columns.tolist()

# Downcast numeric columns to float32 to save memory.
# astype() with a per-column mapping returns new frames, which avoids assigning
# into the subsets returned by train_test_split (a pattern that can trigger
# pandas' SettingWithCopyWarning) and keeps this step safe to re-run.
float32_dtypes = {col: np.float32 for col in num_cols}
X_train = X_train.astype(float32_dtypes)
X_test = X_test.astype(float32_dtypes)

# ======================
# 5. PREPARE FEATURES FOR MODEL (SAME AS TRAINING)
# ======================

# Keep only categorical columns with at most 20 distinct values (same rule as
# training). Cardinality is measured on the full frame `df`, i.e. on train and
# test rows together.
category_cardinality = {col: df[col].nunique() for col in cat_cols}
low_card_cats = [col for col, n_levels in category_cardinality.items() if n_levels <= 20]

print("\nLow-cardinality categorical columns:", low_card_cats)

# Column order matters: numeric columns first, then the retained categoricals.
hgb_feature_cols = [*num_cols, *low_card_cats]

# Independent copies, so later changes to X_train / X_test cannot leak in.
X_train_hgb = X_train.loc[:, hgb_feature_cols].copy()
X_test_hgb = X_test.loc[:, hgb_feature_cols].copy()

print(f"\nTrain features shape: {X_train_hgb.shape}")
print(f"Test features shape for prediction: {X_test_hgb.shape}")

# ======================
# LOAD BEST MODEL & COMPARE
# ======================

# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that were produced by this project.
best_model = joblib.load('models/best_flight_delay_model.pkl')
print("\nLoaded saved model (HistGradientBoosting)")

# Hard class predictions and the predicted probability of the positive class
# (second column of predict_proba).
y_pred_best = best_model.predict(X_test_hgb)
y_prob_best = best_model.predict_proba(X_test_hgb)[:, 1]

# Use the same keys as the per-model entries built later in this notebook.
# The comparison table, ROC curves and confusion matrices read 'y_pred' and
# 'y_prob' from every model; without them the saved model raises KeyError there.
best_results = {
    'y_pred': y_pred_best,
    'y_prob': y_prob_best,
    'accuracy': accuracy_score(y_test, y_pred_best),
    'precision': precision_score(y_test, y_pred_best),
    'recall': recall_score(y_test, y_pred_best),
    'f1': f1_score(y_test, y_pred_best),
    'roc_auc': roc_auc_score(y_test, y_prob_best)
}

print(f"Best Model - Accuracy: {best_results['accuracy']:.4f}, F1: {best_results['f1']:.4f}, ROC-AUC: {best_results['roc_auc']:.4f}")

Data shape: (582425, 51)
Feature shape: (582425, 45)
Target distribution:
 DELAYED
0    0.616778
1    0.383222
Name: proportion, dtype: float64
Train shape: (465940, 45) Test shape: (116485, 45)

Low-cardinality categorical columns: ['Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners', 'IATA_Code_Marketing_Airline', 'DepTimeBlk', 'DistanceGroup', 'DepPartOfDay']

Train features shape: (465940, 32)
Test features shape for prediction: (116485, 32)

Loaded saved model (HistGradientBoosting)
Best Model - Accuracy: 0.7100, F1: 0.5555, ROC-AUC: 0.7580


In [None]:
# ======================
# ADDITIONAL MODELS - TRAIN & PREDICT
# ======================

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Kernel SVC fit time grows at least quadratically with the number of rows,
# and probability=True adds internal cross-validation on top. Fitting it on
# the full training set is impractical, so SVC is trained on a stratified
# sample of at most this many rows (tunable).
SVC_MAX_TRAIN_ROWS = 20_000

# Models whose predictions depend on distances/margins between samples and
# therefore need numeric features on a comparable scale.
SCALED_MODELS = {"SVC", "KNeighbors"}


def build_preprocessor(numeric_step="passthrough"):
    """Return a new, unfitted ColumnTransformer for the model pipelines.

    Parameters
    ----------
    numeric_step : transformer or "passthrough", default "passthrough"
        Step applied to the numeric columns (`num_cols`).

    Returns
    -------
    ColumnTransformer
        One-hot encodes `low_card_cats` (categories unseen at fit time are
        ignored) and applies `numeric_step` to `num_cols`.
    """
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), low_card_cats),
            ("num", numeric_step, num_cols),
        ]
    )


# Preprocessor for the tree-based models (numeric columns passed through).
# It stays a module-level name because the feature-importance section below
# reads the fitted one-hot encoder from it.
preprocessor = build_preprocessor()

# Train and predict with additional models
models = {
    "XGBoost": XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss'),
    "SVC": SVC(random_state=42, probability=True),
    "KNeighbors": KNeighborsClassifier(n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Scale-sensitive models get their own preprocessor with standardized
    # numerics. A fresh instance per model matters: Pipeline fits its steps
    # in place, so a shared instance would be refitted by the next model.
    if name in SCALED_MODELS:
        model_preprocessor = build_preprocessor(StandardScaler())
    else:
        model_preprocessor = preprocessor

    # Create pipeline
    pipeline = Pipeline([
        ("preprocessor", model_preprocessor),
        ("model", model)
    ])

    # Choose the training rows: everything, except for SVC (see the cap above).
    fit_X, fit_y = X_train_hgb, y_train
    if name == "SVC" and len(X_train_hgb) > SVC_MAX_TRAIN_ROWS:
        svc_parts = train_test_split(
            X_train_hgb,
            y_train,
            train_size=SVC_MAX_TRAIN_ROWS,
            random_state=42,
            stratify=y_train
        )
        fit_X, fit_y = svc_parts[0], svc_parts[2]
        del svc_parts  # release the unused remainder of the split
        print(f"  (SVC fitted on a stratified sample of {len(fit_X):,} of {len(X_train_hgb):,} training rows)")

    # Train
    pipeline.fit(fit_X, fit_y)

    # Predict on the full test set for every model so metrics stay comparable
    y_pred = pipeline.predict(X_test_hgb)
    y_prob = pipeline.predict_proba(X_test_hgb)[:, 1]

    # Store results
    results[name] = {
        'y_pred': y_pred,
        'y_prob': y_prob,
        'pipeline': pipeline,
        'accuracy': accuracy_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_prob)
    }

    print(f"{name} - Accuracy: {results[name]['accuracy']:.4f}, F1: {results[name]['f1']:.4f}, ROC-AUC: {results[name]['roc_auc']:.4f}")

# ======================
# COMPREHENSIVE MODEL COMPARISON
# ======================

from sklearn.metrics import roc_curve, precision_recall_curve, precision_score, recall_score

# Add the saved model to the results for comparison.
# The code below and the plotting sections read 'y_pred' / 'y_prob' from every
# entry, so the saved model's predictions are attached explicitly here.
# Without them the precision/recall computation raises KeyError: 'y_pred'.
# A new dict is built so `best_results` itself is not mutated.
all_results = {
    'HistGradientBoosting': {
        **best_results,
        'y_pred': y_pred_best,
        'y_prob': y_prob_best,
    }
}
all_results.update(results)

# Print comparison table
print("\n" + "="*80)
print("ALL MODELS COMPARISON")
print("="*80)
print(f"{'Model':<25} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10} | {'ROC-AUC':<10}")
print("-"*80)

for name, res in all_results.items():
    # Fill in whichever of the two metrics an entry is still missing
    if 'precision' not in res:
        res['precision'] = precision_score(y_test, res['y_pred'])
    if 'recall' not in res:
        res['recall'] = recall_score(y_test, res['y_pred'])

    print(f"{name:<25} | {res['accuracy']:<10.4f} | {res['precision']:<10.4f} | "
          f"{res['recall']:<10.4f} | {res['f1']:<10.4f} | {res['roc_auc']:<10.4f}")

print("="*80)

# ======================
# ROC CURVES
# ======================

# One curve per model on a shared axes, labelled with the model's ROC-AUC.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))
for name, res in all_results.items():
    fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
    curve_label = f"{name} (AUC={res['roc_auc']:.3f})"
    roc_ax.plot(fpr, tpr, label=curve_label, linewidth=2)

# Diagonal reference line, labelled as a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)

roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curves - Model Comparison', fontsize=14, fontweight='bold')
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

# ======================
# CONFUSION MATRICES
# ======================

# Size the grid from the number of models instead of hard-coding 2x3:
# up to three panels per row, as many rows as needed. With five models this
# reproduces the 2x3 layout and the (18, 12) figure size.
n_models = len(all_results)
n_cols = min(3, max(n_models, 1))
n_rows = max(1, -(-n_models // n_cols))  # ceiling division

# squeeze=False guarantees a 2-D array of axes even for a single row/column
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, (name, res) in enumerate(all_results.items()):
    # Rows are actual classes, columns are predicted classes
    cm = confusion_matrix(y_test, res['y_pred'])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=['On-time', 'Delayed'],
                yticklabels=['On-time', 'Delayed'])
    axes[idx].set_title(f'{name}\nAcc: {res["accuracy"]:.3f}, F1: {res["f1"]:.3f}',
                        fontsize=11, fontweight='bold')
    axes[idx].set_xlabel('Predicted', fontsize=10)
    axes[idx].set_ylabel('Actual', fontsize=10)

# Hide grid cells that have no model, so no empty frame is drawn
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# ======================
# FEATURE IMPORTANCE (for tree-based models)
# ======================

# Best-effort section: any failure is reported and the notebook continues.
try:
    # XGBoost feature importance
    if 'XGBoost' in results and 'pipeline' in results['XGBoost']:
        xgb_pipeline = results['XGBoost']['pipeline']
        xgb_model = xgb_pipeline.named_steps['model']

        # Read the feature names from the preprocessor fitted inside this
        # pipeline, so names and importances always come from the same fit.
        # The order mirrors the ColumnTransformer: one-hot columns first,
        # then the numeric columns.
        xgb_preprocessor = xgb_pipeline.named_steps['preprocessor']
        feature_names = (
            xgb_preprocessor.named_transformers_['cat'].get_feature_names_out(low_card_cats).tolist()
            + list(num_cols)
        )

        if hasattr(xgb_model, 'feature_importances_'):
            importances = xgb_model.feature_importances_
            top_features = pd.DataFrame({
                'Feature': feature_names,
                'Importance': importances
            }).sort_values('Importance', ascending=False).head(15)

            plt.figure(figsize=(10, 8))
            plt.barh(range(len(top_features)), top_features['Importance'], color='steelblue')
            plt.yticks(range(len(top_features)), top_features['Feature'])
            plt.xlabel('Feature Importance', fontsize=12)
            plt.title('Top 15 Most Important Features (XGBoost)', fontsize=14, fontweight='bold')
            plt.gca().invert_yaxis()  # most important feature at the top
            plt.grid(axis='x', alpha=0.3)
            plt.tight_layout()
            plt.show()
except Exception as e:
    print(f"Could not plot feature importance: {e}")

# ======================
# FINAL SUMMARY
# ======================

# Pick the model with the highest F1 score; on a tie the first entry in
# all_results wins.
banner = "=" * 80
best_model_name = max(all_results, key=lambda model_name: all_results[model_name]['f1'])
best_res = all_results[best_model_name]

print("\n" + banner)
print("BEST MODEL (by F1-Score)")
print(banner)
print(f"Model: {best_model_name}")
print(f"  Accuracy:  {best_res['accuracy']:.4f}")
print(f"  Precision: {best_res['precision']:.4f}")
print(f"  Recall:    {best_res['recall']:.4f}")
print(f"  F1-Score:  {best_res['f1']:.4f}")
print(f"  ROC-AUC:   {best_res['roc_auc']:.4f}")
print(banner)



Training XGBoost...
XGBoost - Accuracy: 0.7128, F1: 0.5686, ROC-AUC: 0.7629

Training SVC...
