In [1]:
import pandas as pd
import numpy as np
import os
import shap
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import dask.dataframe as dd

def ensure_directories():
    os.makedirs("reports/figures", exist_ok=True)

def load_data():
    df = dd.read_parquet("../data/processed/insurance_data_with_features.parquet")
    df = df[df["totalpremium"] > 0].compute()  # Load necessary rows and convert to DataFrame
    
    df = drop_empty_columns(df)  # Drop columns with all missing values
    df = optimize_dataframe(df)
    
    df["has_claim"] = (df["totalclaims"] > 0).astype(int)
    df["loss_ratio"] = df["totalclaims"] / df["totalpremium"]
    df["margin"] = df["totalpremium"] - df["totalclaims"]
    
    print(f"✅ Loaded data: {df.shape[0]:,} rows")
    print(f"🔍 Valid claim records: {(df['has_claim'] == 1).sum():,}")
    return df

def optimize_dataframe(df):
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = df[col].astype('int32')
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype('category')
    return df

def drop_empty_columns(df):
    return df.dropna(axis=1, how='all')  # Drop columns with all missing values

def encode_features(df, exclude=[]):
    return df.drop(columns=exclude, errors="ignore")

def run_severity_model(df):
    df_sev = df[df["has_claim"] == 1].copy()
    if df_sev.shape[0] < 20:
        print("⚠️ Not enough data for severity modeling.")
        return None, None

    X = encode_features(df_sev, exclude=["totalclaims"])
    y = df_sev["totalclaims"]
    X = X.select_dtypes(include=[np.number])  # Keep only numeric columns

    imputer = SimpleImputer(strategy='mean')
    model = xgb.XGBRegressor(tree_method="hist", random_state=42)
    pipeline = Pipeline(steps=[('imputer', imputer), ('regressor', model)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"\n📊 Severity Model — RMSE: {rmse:.2f}, R²: {r2:.4f}")
    return pipeline, X_train

def run_premium_model(df):
    df = drop_empty_columns(df)  # Drop columns with all missing values
    X = encode_features(df, exclude=["totalpremium"])
    y = df["totalpremium"]
    X = X.select_dtypes(include=[np.number])  # Keep only numeric columns

    imputer = SimpleImputer(strategy='mean')
    model = RandomForestRegressor(max_depth=12, n_estimators=100, random_state=42)
    pipeline = Pipeline(steps=[('imputer', imputer), ('regressor', model)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"\n📊 Premium Model — RMSE: {rmse:.2f}, R²: {r2:.4f}")

def run_claim_probability_model(df):
    df = drop_empty_columns(df)  # Drop columns with all missing values
    X = encode_features(df, exclude=["has_claim"])
    y = df["has_claim"]
    X = X.select_dtypes(include=[np.number])  # Keep only numeric columns

    imputer = SimpleImputer(strategy='mean')
    model = RandomForestClassifier(random_state=42)
    pipeline = Pipeline(steps=[('imputer', imputer), ('classifier', model)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    proba = pipeline.predict_proba(X_test)[:, 1]
    print(f"\n📊 Claim Probability Model — AUC: {roc_auc_score(y_test, proba):.4f}")
    print(classification_report(y_test, preds))

def explain_shap(pipeline, X_train):
    try:
        # Extract the model from the pipeline
        model = pipeline.named_steps['regressor']
        
        print("\n🔎 SHAP Summary — Severity Model")
        explainer = shap.Explainer(model)
        shap_values = explainer(X_train)
        
        # Convert SHAP values to DataFrame for analysis
        shap_df = pd.DataFrame(shap_values.values, columns=X_train.columns)

        # Calculate mean absolute SHAP values for feature importance
        feature_importance = np.abs(shap_df).mean(axis=0).sort_values(ascending=False)

        # Plot SHAP summary
        shap.summary_plot(shap_values, X_train, show=False)
        plt.title("SHAP Summary — Claim Severity")
        plt.savefig("reports/figures/severity_shap_summary.png")
        plt.close()

        print("\n🧠 Top SHAP Features:")
        for feat, val in feature_importance.head(10).items():
            print(f"  • {feat}: {val:.4f}")
    except Exception as e:
        print(f"❌ SHAP explanation failed: {e}")

def main():
    ensure_directories()
    df = load_data()

    print("\n📍 Running Claim Severity Model...")
    sev_model, X_train = run_severity_model(df)

    print("\n📍 Running Claim Probability Model...")
    run_claim_probability_model(df)

    print("\n📍 Running Premium Prediction Model...")
    run_premium_model(df)

    if sev_model and X_train is not None:
        explain_shap(sev_model, X_train)  # Passing the pipeline directly

    print("\n✅ Task 4 Complete — Results printed and SHAP summary saved.")

if __name__ == "__main__":
    main()

✅ Loaded data: 618,176 rows
🔍 Valid claim records: 2,641

📍 Running Claim Severity Model...

📊 Severity Model — RMSE: 1358.45, R²: 0.9981

📍 Running Claim Probability Model...

📊 Claim Probability Model — AUC: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    153884
           1       1.00      1.00      1.00       660

    accuracy                           1.00    154544
   macro avg       1.00      1.00      1.00    154544
weighted avg       1.00      1.00      1.00    154544


📍 Running Premium Prediction Model...

📊 Premium Model — RMSE: 35.05, R²: 0.9721

🔎 SHAP Summary — Severity Model

🧠 Top SHAP Features:
  • margin: 24851.8633
  • suminsured: 285.7473
  • loss_ratio: 193.8185
  • totalpremium: 143.3830
  • customvalueestimate: 100.3822
  • numberofdoors: 83.8465
  • calculatedpremiumperterm: 77.2729
  • registrationyear: 59.7718
  • underwrittencoverid: 56.3280
  • postalcode: 47.7493

✅ Task 4 Complete — Results prin