In [41]:
# Day 9 — Evaluate Models on New Data

import pandas as pd
import joblib
import json
from sklearn.metrics import roc_auc_score, average_precision_score

# -----------------------------
# Load new dataset
# -----------------------------
DATA_NEW = Path("../data/processed/report_features.csv")
df_new = pd.read_csv(DATA_NEW)

TARGET = "is_serious_report"

# -----------------------------
# Preprocess new dataset
# -----------------------------
X_new = df_new.drop(columns=[TARGET, "safetyreportid", "receivedate"])
y_new = df_new[TARGET]

print("Original X_new shape:", X_new.shape)
print("Original columns:", X_new.columns.tolist())

# -----------------------------
# Reload logistic regression model & features
# -----------------------------
logreg = joblib.load(MODEL_DIR / "logreg_baseline.joblib")

with open(MODEL_DIR / "logreg_features.json") as f:
    feature_names = json.load(f)

print("\nModel expects these features:")
print(f"Number of features: {len(feature_names)}")
print("First 10 features:", feature_names[:10])

# -----------------------------
# DEBUG: Check what's happening with age_bin
# -----------------------------
print("\n=== DEBUG INFORMATION ===")
print("Unique age_bin values in new data:", X_new['age_bin'].unique())
print("Data types in new data:")
print(X_new.dtypes.value_counts())

# Check which features are missing
missing_features = set(feature_names) - set(X_new.columns)
print(f"\nMissing features: {missing_features}")

# Check which features are extra
extra_features = set(X_new.columns) - set(feature_names)
print(f"Extra features: {extra_features}")

# -----------------------------
# FIX: Proper one-hot encoding with all expected categories
# -----------------------------

# First, let's see what age_bin categories the model expects
expected_age_bins = [f for f in feature_names if f.startswith('age_bin_')]
print(f"\nExpected age_bin columns: {expected_age_bins}")

# Get the actual age bins from the expected feature names
expected_age_values = [f.split('age_bin_')[1] for f in expected_age_bins]
print(f"Expected age values: {expected_age_values}")

# Ensure all expected age categories exist in the new data
for age_val in expected_age_values:
    if age_val not in X_new['age_bin'].unique():
        print(f"Warning: Age bin '{age_val}' not found in new data, will be created with 0 values")

# Now do one-hot encoding properly
X_new_encoded = pd.get_dummies(X_new, columns=['age_bin'], prefix='age_bin', drop_first=False)

print(f"\nAfter encoding, shape: {X_new_encoded.shape}")
print("Encoded columns:", [col for col in X_new_encoded.columns if 'age_bin' in col])

# -----------------------------
# FIX: Ensure ALL expected features are present
# -----------------------------

# Create a DataFrame with all expected features, filled with 0
X_new_aligned = pd.DataFrame(0, index=X_new.index, columns=feature_names)

# Copy over existing data where features match
for col in X_new_encoded.columns:
    if col in feature_names:
        X_new_aligned[col] = X_new_encoded[col]

print(f"\nFinal aligned shape: {X_new_aligned.shape}")
print("Missing values in aligned data:", X_new_aligned.isnull().sum().sum())

# Verify alignment
print("\n=== VERIFICATION ===")
print("Features in model vs aligned data:")
print(f"Model expects: {len(feature_names)} features")
print(f"Aligned data has: {len(X_new_aligned.columns)} features")
print("All features present:", set(feature_names) == set(X_new_aligned.columns))

# -----------------------------
# Evaluate Logistic Regression
# -----------------------------
try:
    probs_new = logreg.predict_proba(X_new_aligned)[:, 1]
    roc_new = roc_auc_score(y_new, probs_new)
    pr_new = average_precision_score(y_new, probs_new)

    print(f"\n✅ Logistic Regression — ROC AUC on new data: {roc_new:.3f}")
    print(f"✅ Logistic Regression — PR AUC on new data: {pr_new:.3f}")
    
except Exception as e:
    print(f"❌ Error: {e}")
    
    # Additional debug: Check first few rows
    print("\nFirst 5 rows of aligned data:")
    print(X_new_aligned.iloc[:5, :10])  # First 10 columns

# -----------------------------
# Continue with RF and GB evaluation...
# -----------------------------

Original X_new shape: (100, 36)
Original columns: ['sex', 'age_mean', 'age_bin', 'n_reactions', 'n_drugs', 'aspirin_present', 'react_Anaemia', 'react_Blood count abnormal', 'react_Constipation', 'react_Contusion', 'react_Death', 'react_Diarrhoea', 'react_Dizziness', 'react_Drug administration error', 'react_Drug ineffective', 'react_Drug interaction', 'react_Dyspnoea', 'react_Fatigue', 'react_Gastrointestinal haemorrhage', 'react_Haemoglobin decreased', 'react_Headache', 'react_Hypertension', 'react_Insomnia', 'react_Loss of consciousness', 'react_Nausea', 'react_Oedema peripheral', 'react_Overdose', 'react_Pain', 'react_Pain in extremity', 'react_Palpitations', 'react_Platelet count decreased', 'react_Pruritus', 'react_Rash', 'react_Respiratory failure', 'react_Type 2 diabetes mellitus', 'react_Weight increased']

Model expects these features:
Number of features: 36
First 10 features: ['sex', 'age_mean', 'age_bin', 'n_reactions', 'n_drugs', 'aspirin_present', 'react_Anaemia', 'react_B

In [42]:
# Day 9 — Evaluate Models on New Data

import pandas as pd
import numpy as np
import joblib
import json
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# Load new dataset
# -----------------------------
DATA_NEW = Path("../data/processed/report_features.csv")
df_new = pd.read_csv(DATA_NEW)

TARGET = "is_serious_report"

# -----------------------------
# Preprocess new dataset - CONVERT ALL TO NUMERIC
# -----------------------------
X_new = df_new.drop(columns=[TARGET, "safetyreportid", "receivedate"])
y_new = df_new[TARGET]

print("Original X_new shape:", X_new.shape)

# Convert categorical columns to numeric using Label Encoding
X_new_numeric = X_new.copy()
label_encoders = {}

for col in X_new_numeric.columns:
    if X_new_numeric[col].dtype == 'object':
        le = LabelEncoder()
        X_new_numeric[col] = le.fit_transform(X_new_numeric[col].astype(str))
        label_encoders[col] = le
        print(f"Encoded column '{col}' with {len(le.classes_)} categories")

# Fill missing values
X_new_numeric = X_new_numeric.fillna(0)

print(f"After encoding, shape: {X_new_numeric.shape}")

# -----------------------------
# Reload models and features
# -----------------------------

# Initialize variables to store results
results = {}

try:
    # Logistic Regression
    logreg = joblib.load(MODEL_DIR / "logreg_baseline.joblib")
    with open(MODEL_DIR / "logreg_features.json") as f:
        logreg_features = json.load(f)
    
    X_new_logreg = X_new_numeric.reindex(columns=logreg_features, fill_value=0)
    probs_logreg = logreg.predict_proba(X_new_logreg)[:, 1]
    results['Logistic Regression'] = {
        'roc_auc': roc_auc_score(y_new, probs_logreg),
        'pr_auc': average_precision_score(y_new, probs_logreg)
    }
    print(f"✅ Logistic Regression — ROC AUC: {results['Logistic Regression']['roc_auc']:.3f}, PR AUC: {results['Logistic Regression']['pr_auc']:.3f}")
    
except Exception as e:
    print(f"❌ Logistic Regression failed: {e}")

try:
    # Random Forest
    rf = joblib.load(MODEL_DIR / "rf_model.joblib")
    with open(MODEL_DIR / "rf_model.json") as f:
        rf_features = json.load(f)
    
    X_new_rf = X_new_numeric.reindex(columns=rf_features, fill_value=0)
    probs_rf = rf.predict_proba(X_new_rf)[:, 1]
    results['Random Forest'] = {
        'roc_auc': roc_auc_score(y_new, probs_rf),
        'pr_auc': average_precision_score(y_new, probs_rf)
    }
    print(f"✅ Random Forest — ROC AUC: {results['Random Forest']['roc_auc']:.3f}, PR AUC: {results['Random Forest']['pr_auc']:.3f}")
    
except Exception as e:
    print(f"❌ Random Forest failed: {e}")

try:
    # Gradient Boosting
    gb = joblib.load(MODEL_DIR / "gb_model.joblib")
    with open(MODEL_DIR / "gb_model.json") as f:
        gb_features = json.load(f)
    
    X_new_gb = X_new_numeric.reindex(columns=gb_features, fill_value=0)
    probs_gb = gb.predict_proba(X_new_gb)[:, 1]
    results['Gradient Boosting'] = {
        'roc_auc': roc_auc_score(y_new, probs_gb),
        'pr_auc': average_precision_score(y_new, probs_gb)
    }
    print(f"✅ Gradient Boosting — ROC AUC: {results['Gradient Boosting']['roc_auc']:.3f}, PR AUC: {results['Gradient Boosting']['pr_auc']:.3f}")
    
except Exception as e:
    print(f"❌ Gradient Boosting failed: {e}")

# -----------------------------
# Display Results Summary
# -----------------------------
print("\n" + "="*50)
print("FINAL PERFORMANCE SUMMARY ON NEW DATA")
print("="*50)

if results:
    for model_name, metrics in results.items():
        print(f"{model_name:20} ROC AUC: {metrics['roc_auc']:.3f} | PR AUC: {metrics['pr_auc']:.3f}")
    
    # Find best model based on ROC AUC
    best_model = max(results.items(), key=lambda x: x[1]['roc_auc'])
    print(f"\n🏆 Best model: {best_model[0]} (ROC AUC: {best_model[1]['roc_auc']:.3f})")
    
    # Compare with your original performance if available
    print("\n📊 Comparison with original test performance:")
    # You can add your original performance metrics here for comparison
    # print("Original - Logistic Regression: ROC AUC = 0.XXX")
    # print("Original - Random Forest: ROC AUC = 0.XXX") 
    # print("Original - Gradient Boosting: ROC AUC = 0.XXX")
    
else:
    print("❌ No models were successfully evaluated.")
    
    # Debug information
    print("\n🔧 Debug Information:")
    print(f"New data shape: {X_new_numeric.shape}")
    print(f"Target distribution: {y_new.value_counts().to_dict()}")
    print(f"Available models: {[f.stem for f in MODEL_DIR.glob('*.joblib')]}")

Original X_new shape: (100, 36)
Encoded column 'age_bin' with 6 categories
After encoding, shape: (100, 36)
❌ Logistic Regression failed: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- age_bin
Feature names seen at fit time, yet now missing:
- age_bin_35-49
- age_bin_50-64
- age_bin_65+

✅ Random Forest — ROC AUC: 0.940, PR AUC: 0.926
✅ Gradient Boosting — ROC AUC: 0.942, PR AUC: 0.940

FINAL PERFORMANCE SUMMARY ON NEW DATA
Random Forest        ROC AUC: 0.940 | PR AUC: 0.926
Gradient Boosting    ROC AUC: 0.942 | PR AUC: 0.940

🏆 Best model: Gradient Boosting (ROC AUC: 0.942)

📊 Comparison with original test performance:
