## Creating compact versions of the models

In [1]:
import pandas as pd
import numpy as np
import sys
import os
import json
from pathlib import Path
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Detect project root.
# The notebook normally runs from notebooks/, whose parent is the project root,
# but the kernel may also be started from the root itself or from a nested
# folder. Walk upwards from the working directory and take the nearest
# directory that contains the `src` package imported below.
def find_project_root(start, marker="src"):
    """Return the nearest directory at or above ``start`` containing ``marker``.

    Parameters
    ----------
    start : str or Path
        Directory where the upward search begins.
    marker : str, default "src"
        Name of a sub-directory that identifies the project root.

    Returns
    -------
    Path
        The first directory (``start`` itself, then each ancestor) that has a
        ``marker`` sub-directory. If none does, ``start.parent`` is returned,
        which is the previous "parent of notebooks/" behaviour.
    """
    start = Path(start).absolute()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


ROOT = find_project_root(Path.cwd())

# Add root to path if not present
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

# Define paths
DATA_TRAIN   = ROOT / "data" / "interim" / "training_fe.csv"
DATA_LABELS  = ROOT / "data" / "raw" / "training_set_labels.csv"
ARTIFACTS    = ROOT / "artifacts"

# Import utility functions from src.modeling
from src.modeling import load_training_data, clean_feature_names, compute_metrics, save_confusion_matrices, TARGET_COLS

In [2]:
# Load Data
# Both locations are handed to the loader as plain strings.
print("Loading training data...")
train_csv, labels_csv = str(DATA_TRAIN), str(DATA_LABELS)
X, y = load_training_data(train_csv, labels_csv)
print(f"Loaded {X.shape[0]} samples with {X.shape[1]} features")

Loading training data...
Loaded 26707 samples with 72 features


In [61]:
# Define Reduced Feature Sets
#
# Each list names the columns of X that one compact model is trained on.
# The lists are used as X[<list>] further down, so the order of the names is
# also the column order handed to the model. Names that are commented out are
# not part of the feature set.

# Columns used for the h1n1_vaccine target.
H1N1_FEATURES = [
    "doctor_recc_h1n1",
    "health_insurance_1_0",
    "opinion_h1n1_vacc_effective",
    "opinion_h1n1_risk",
    "opinion_h1n1_sick_from_vacc",
    "age_group",
    "sex_Male",  # Mapped from 'sex' (original note; the mapping itself is not shown in this notebook)
    "health_worker",
    # Disabled entries (not passed to the model):
#    "employment_industry_te_h1n1_vaccine",
#    "employment_occupation_te_h1n1_vaccine",
    "education",
    # Disabled entries (not passed to the model):
#    "income_poverty",
#    "PVAS_h1n1",
#    "HRI"
]

# Columns used for the seasonal_vaccine target. Unlike the H1N1 list, this one
# has no "sex_Male" entry.
SEASONAL_FEATURES = [
    # Disabled entry (not passed to the model):
#    "PVAS_seas",
    "doctor_recc_seasonal",
    "age_group",
    "health_insurance_1_0",
    "opinion_seas_vacc_effective",
    "opinion_seas_risk",
    "opinion_seas_sick_from_vacc",
    "health_worker",
    "education",
]

# Verify features exist
def report_missing_features(label, features, columns):
    """Report which of ``features`` are absent from ``columns``.

    Prints a "Verifying ..." header, then either a warning listing the missing
    names or a confirmation that all are present. Nothing is raised: a missing
    feature only produces the printed warning.

    Parameters
    ----------
    label : str
        Human-readable name of the feature set, used in the printed messages.
    features : iterable of str
        Feature names that are expected to exist.
    columns : iterable of str
        Available column names (e.g. ``X.columns``).

    Returns
    -------
    list of str
        The missing feature names, in the order they appear in ``features``.
    """
    print(f"Verifying {label} features...")
    available = set(columns)
    missing = [name for name in features if name not in available]
    if missing:
        print(f"Warning: Missing {label} features: {missing}")
    else:
        print(f"All {label} features present.")
    return missing


missing_h1n1 = report_missing_features("H1N1", H1N1_FEATURES, X.columns)
print()  # blank line between the two reports
missing_seasonal = report_missing_features("Seasonal", SEASONAL_FEATURES, X.columns)

Verifying H1N1 features...
All H1N1 features present.

Verifying Seasonal features...
All Seasonal features present.


In [62]:
# Load Hyperparameters
# The values are written out inline in this cell; nothing is read from disk.

# H1N1 Best Params (from artifacts/h1n1_vaccine_best_params.json)
h1n1_best_params = dict(
    bagging_fraction=0.7380284992106732,
    bagging_freq=1,
    feature_fraction=0.695824756266789,
    lambda_l1=0.1448948720912231,
    lambda_l2=0.489452760277563,
    max_depth=8,
    min_child_samples=41,
    num_leaves=74,
)

# Seasonal Best Params (from artifacts/seasonal_vaccine_best_params.json)
seasonal_best_params = dict(
    bagging_fraction=0.9627313766183017,
    bagging_freq=1,
    feature_fraction=0.6911740650167767,
    lambda_l1=0.4271077886262563,
    lambda_l2=0.8180147659224931,
    max_depth=11,
    min_child_samples=12,
    num_leaves=58,
)

In [63]:
# Function to train and evaluate reduced model
def train_evaluate_reduced(X, y, target, features, best_params, outdir="artifacts",
                           random_state=42, threshold=0.5, scale_pos_weight=None):
    """Train a LightGBM classifier on a feature subset and score it on a hold-out split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain every name in ``features``.
    y : pandas.DataFrame
        Label table; must contain the column ``target``.
    target : str
        Name of the label column to predict.
    features : sequence of str
        Columns of ``X`` to train on, in the order they are passed to the model.
    best_params : dict
        Extra keyword arguments forwarded to ``LGBMClassifier``.
    outdir : str, default "artifacts"
        Passed unchanged to ``save_confusion_matrices``.
    random_state : int, default 42
        Seed used for both the train/validation split and the model.
    threshold : float, default 0.5
        Probability cut-off at or above which a validation sample is predicted
        as the positive class.
    scale_pos_weight : float or None, default None
        Positive-class weight for LightGBM. When ``None``, 2.7 is used for
        ``target == "h1n1_vaccine"`` and 1.0 for any other target.

    Returns
    -------
    tuple
        ``(metrics, model)``: the result of ``compute_metrics`` on the
        validation split and the fitted ``LGBMClassifier``.

    Raises
    ------
    KeyError
        If a requested feature is not a column of ``X`` or ``target`` is not in ``y``.
    ValueError
        If ``threshold`` is outside ``[0, 1]``.
    """
    # A list (not a tuple) so that X[features] always selects several columns.
    features = list(features)

    # Fail early, before any training work, with a message naming the problem.
    missing = [name for name in features if name not in X.columns]
    if missing:
        raise KeyError(f"Features missing from X: {missing}")
    if target not in y:
        raise KeyError(f"Target column {target!r} not found in y")
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")

    print(f"Training reduced model for {target}...")

    # Select features
    X_reduced = X[features]

    # Split data (same random_state as tuning to be comparable, per the original note)
    X_train, X_val, y_train, y_val = train_test_split(
        X_reduced, y[target], test_size=0.2, random_state=random_state, stratify=y[target]
    )

    # Default class weighting: only the h1n1_vaccine target is up-weighted.
    if scale_pos_weight is None:
        scale_pos_weight = 2.7 if target == "h1n1_vaccine" else 1.0

    model = LGBMClassifier(
        objective="binary",
        boosting_type="gbdt",
        learning_rate=0.03,
        n_estimators=500,
        scale_pos_weight=scale_pos_weight,
        verbose=-1,
        random_state=random_state,
        **best_params
    )

    model.fit(X_train, y_train)

    # Evaluate: column 1 of predict_proba is taken as the positive-class probability
    y_prob_val = model.predict_proba(X_val)[:, 1]
    y_pred_val = (y_prob_val >= threshold).astype(int)

    metrics = compute_metrics(y_val, y_pred_val, y_prob_val)

    # Save confusion matrices
    save_confusion_matrices(y_val, y_pred_val, f"reduced_{target}", outdir)

    print(f"Metrics for {target} (Reduced):")
    for k, v in metrics.items():
        print(f"  {k}: {v:.4f}")

    return metrics, model

# Confusion matrices are written to the project-level artifacts directory
# (ARTIFACTS) instead of the function's default "artifacts", which is resolved
# relative to the current working directory.

# Train H1N1 Reduced
h1n1_reduced_metrics, h1n1_reduced_model = train_evaluate_reduced(
    X, y, "h1n1_vaccine", H1N1_FEATURES, h1n1_best_params, outdir=str(ARTIFACTS)
)

# Train Seasonal Reduced
seasonal_reduced_metrics, seasonal_reduced_model = train_evaluate_reduced(
    X, y, "seasonal_vaccine", SEASONAL_FEATURES, seasonal_best_params, outdir=str(ARTIFACTS)
)

Training reduced model for h1n1_vaccine...
Metrics for h1n1_vaccine (Reduced):
  accuracy: 0.8139
  precision: 0.5500
  recall: 0.6828
  f1: 0.6093
  roc_auc: 0.8445
Training reduced model for seasonal_vaccine...
Metrics for seasonal_vaccine (Reduced):
  accuracy: 0.7746
  precision: 0.7670
  recall: 0.7411
  f1: 0.7538
  roc_auc: 0.8519


In [64]:
# Compare Metrics
# For every target, print the tuned model's stored metrics next to the reduced
# model's metrics, with the absolute and relative difference.
print("\n=== PERFORMANCE COMPARISON (Tuned vs Reduced) ===")

reduced_metrics_dict = dict(
    h1n1_vaccine=h1n1_reduced_metrics,
    seasonal_vaccine=seasonal_reduced_metrics,
)
rule = "-" * 65

for target in TARGET_COLS:
    print(f"\n{target.upper().replace('_', ' ')}:")
    print(rule)

    # Load tuned metrics
    with open(ARTIFACTS / f"tuned_{target}_metrics.json") as f:
        tuned = json.load(f)

    reduced = reduced_metrics_dict[target]

    print(f"{'Metric':<15} {'Tuned':<12} {'Reduced':<12} {'Change':<12}")
    print(rule)

    for metric in ("accuracy", "precision", "recall", "f1", "roc_auc"):
        tune_val, red_val = tuned[metric], reduced[metric]
        change = red_val - tune_val
        # Relative change is reported as 0 when the tuned value is not positive.
        if tune_val > 0:
            change_pct = change / tune_val * 100
        else:
            change_pct = 0

        print(f"{metric:<15} {tune_val:<12.4f} {red_val:<12.4f} {change:+.4f} ({change_pct:+.2f}%)")


=== PERFORMANCE COMPARISON (Tuned vs Reduced) ===

H1N1 VACCINE:
-----------------------------------------------------------------
Metric          Tuned        Reduced      Change      
-----------------------------------------------------------------
accuracy        0.8261       0.8139       -0.0122 (-1.47%)
precision       0.5728       0.5500       -0.0227 (-3.97%)
recall          0.7000       0.6828       -0.0172 (-2.45%)
f1              0.6300       0.6093       -0.0208 (-3.29%)
roc_auc         0.8650       0.8445       -0.0205 (-2.37%)

SEASONAL VACCINE:
-----------------------------------------------------------------
Metric          Tuned        Reduced      Change      
-----------------------------------------------------------------
accuracy        0.7937       0.7746       -0.0191 (-2.41%)
precision       0.7809       0.7670       -0.0140 (-1.79%)
recall          0.7650       0.7411       -0.0239 (-3.13%)
f1              0.7729       0.7538       -0.0191 (-2.47%)
roc_auc   

In [66]:
# Retrain H1N1 model on full dataset
print("Retraining H1N1 model on full dataset (Reduced Features)...")

# Fixed settings, combined with the H1N1 hyperparameters when the classifier is built.
h1n1_fixed_params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.03,
    n_estimators=500,
    scale_pos_weight=2.7,
    verbose=-1,
    random_state=42,
)
h1n1_final_reduced_model = LGBMClassifier(**h1n1_fixed_params, **h1n1_best_params)

# fit() stays the cell's last expression so the notebook displays its return value.
h1n1_final_reduced_model.fit(X[H1N1_FEATURES], y["h1n1_vaccine"])

Retraining H1N1 model on full dataset (Reduced Features)...


0,1,2
,boosting_type,'gbdt'
,num_leaves,74
,max_depth,8
,learning_rate,0.03
,n_estimators,500
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Retrain Seasonal model on full dataset
print("Retraining Seasonal model on full dataset (Reduced Features)...")

# Fixed settings, combined with the seasonal hyperparameters when the classifier is built.
seasonal_fixed_params = dict(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.03,
    n_estimators=500,
    scale_pos_weight=1.0,
    verbose=-1,
    random_state=42,
)
seasonal_final_reduced_model = LGBMClassifier(**seasonal_fixed_params, **seasonal_best_params)

# Every row of X is used here; no validation split is held out.
seasonal_final_reduced_model.fit(X[SEASONAL_FEATURES], y["seasonal_vaccine"])
print("Retraining complete.")

Retraining Seasonal model on full dataset (Reduced Features)...
Retraining complete.


In [69]:
# Create output directory
models_dir = ARTIFACTS / "models"
models_dir.mkdir(parents=True, exist_ok=True)

# Save models: one pickle file per retrained classifier
for filename, fitted_model in (
    ("h1n1_reduced_model.pkl", h1n1_final_reduced_model),
    ("seasonal_reduced_model.pkl", seasonal_final_reduced_model),
):
    pd.to_pickle(fitted_model, models_dir / filename)

print(f"Models saved as pickles to {models_dir}")

Models saved as pickles to c:\Projects\Flushot\artifacts\models
