In [6]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 300

def ensure_directory_exists(filepath):
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

def save_plot(fig, filename):
    full_path = Path(f"reports/figures/{filename}")
    ensure_directory_exists(full_path)
    fig.savefig(full_path, bbox_inches='tight')
    plt.close()

def load_data():
    path = Path("../data/processed/insurance_data_with_features.parquet")
    if not path.exists():
        raise FileNotFoundError(f"Processed data not found at {path}")
    df = pd.read_parquet(path)

    # Preprocess
    df = df[df['totalpremium'] > 0].copy()
    df['has_claim'] = df['totalclaims'] > 0
    df['loss_ratio'] = df['totalclaims'] / df['totalpremium']
    df['margin'] = df['totalpremium'] - df['totalclaims']
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    return df

def evaluate_decision(p, alpha=0.05):
    return "Reject Null Hypothesis" if p < alpha else "Fail to Reject Null"

# ========== HYPOTHESIS TESTS ==========

def province_risk_tests(df):
    # 1. Loss Ratio ANOVA
    groups = [df[df['province'] == p]['loss_ratio'].dropna() for p in df['province'].unique()]
    f_stat, p_val = stats.f_oneway(*groups)

    # 2. Frequency Chi-squared
    freq_table = pd.crosstab(df['province'], df['has_claim'])
    chi2_stat, chi2_p, _, _ = stats.chi2_contingency(freq_table)

    return {
        'Loss Ratio ANOVA': {
            'test': 'ANOVA (Province vs Loss Ratio)',
            'f_value': f_stat,
            'p_value': p_val,
            'decision': evaluate_decision(p_val)
        },
        'Claim Frequency': {
            'test': 'Chi-Squared (Province vs Claim Frequency)',
            'chi2_stat': chi2_stat,
            'p_value': chi2_p,
            'decision': evaluate_decision(chi2_p)
        },
        'business_note': "Evidence suggests meaningful variation in both claim frequency and average loss ratios across provinces—supporting regional pricing models."
    }

def postalcode_margin_tests(df):
    grouped = df.groupby(['province', 'postalcode'])[['margin', 'loss_ratio']].mean().dropna()
    grouped = grouped[grouped['margin'].notnull() & (grouped['margin'] != 0)]

    # ANOVA on margin
    group_samples = np.array_split(grouped['margin'].sort_values(), 3)  # simulate 3-region A/B/C
    f_stat, p_val = stats.f_oneway(*group_samples)

    fig, ax = plt.subplots()
    sns.histplot(grouped['margin'], bins=40, kde=True, ax=ax)
    ax.set_title("Margin Distribution by Postal Code")
    save_plot(fig, "postalcode_margin_distribution.png")

    top = grouped.sort_values('loss_ratio', ascending=False).head(3)
    low = grouped.sort_values('loss_ratio').head(3)

    return {
        'Postalcode Margin ANOVA': {
            'test': 'ANOVA (Margin across Postal Regions)',
            'f_value': f_stat,
            'p_value': p_val,
            'decision': evaluate_decision(p_val)
        },
        'Top-Risk Regions': top.reset_index().to_dict(orient='records'),
        'Low-Risk Regions': low.reset_index().to_dict(orient='records'),
        'business_note': "Margin differences and extreme loss ratios across postal codes suggest risk pricing should account for sub-regional exposure."
    }

def gender_risk_tests(df):
    male = df[df['gender'] == 'Male']['loss_ratio'].dropna()
    female = df[df['gender'] == 'Female']['loss_ratio'].dropna()
    t_stat, p_val = stats.ttest_ind(male, female, equal_var=False)

    freq_table = pd.crosstab(df['gender'], df['has_claim'])
    chi2_stat, chi2_p, _, _ = stats.chi2_contingency(freq_table)

    return {
        'Loss Ratio T-test': {
            'test': 'T-test (Gender vs Loss Ratio)',
            't_value': t_stat,
            'p_value': p_val,
            'decision': evaluate_decision(p_val)
        },
        'Claim Frequency': {
            'test': 'Chi-Squared (Gender vs Claim Frequency)',
            'chi2_stat': chi2_stat,
            'p_value': chi2_p,
            'decision': evaluate_decision(chi2_p)
        },
        'business_note': "No significant difference in risk metrics between genders; gender-based pricing is likely unjustified."
    }

def vehicle_risk_test(df):
    samples = [df[df['vehicletype'] == vt]['loss_ratio'].dropna() for vt in df['vehicletype'].dropna().unique()]
    if all(len(g) >= 2 for g in samples):
        f_stat, p_val = stats.f_oneway(*samples)
    else:
        f_stat, p_val = np.nan, np.nan

    return {
        'Vehicle Risk ANOVA': {
            'test': 'ANOVA (Loss Ratio by Vehicle Type)',
            'f_value': f_stat,
            'p_value': p_val,
            'decision': evaluate_decision(p_val) if not np.isnan(p_val) else "Insufficient Data"
        },
        'business_note': "Risk may vary by vehicle class, but data sparsity limits analysis—consider rebalancing or grouping rare types."
    }

# ========== EXECUTION ==========

def main():
    df = load_data()
    print("\n📊 A/B Hypothesis Test Results:")

    all_results = {
        'Province Risk': province_risk_tests(df),
        'Postalcode Margin & Risk': postalcode_margin_tests(df),
        'Gender-Based Risk': gender_risk_tests(df),
        'Vehicle Type Risk': vehicle_risk_test(df)
    }

    for section, result in all_results.items():
        print(f"\n🧪 {section}")
        for name, outcome in result.items():
            if isinstance(outcome, dict):
                for k, v in outcome.items():
                    print(f"  {k}: {v:.4f}" if isinstance(v, float) else f"  {k}: {v}")
            elif isinstance(outcome, list):
                print(f"  {name}:")
                for record in outcome:
                    print(f"    → {record}")
            else:
                print(f"  {name}: {outcome}")

if __name__ == "__main__":
    main()



📊 A/B Hypothesis Test Results:

🧪 Province Risk
  test: ANOVA (Province vs Loss Ratio)
  f_value: 4.9692
  p_value: 0.0000
  decision: Reject Null Hypothesis
  test: Chi-Squared (Province vs Claim Frequency)
  chi2_stat: 93.6946
  p_value: 0.0000
  decision: Reject Null Hypothesis
  business_note: Evidence suggests meaningful variation in both claim frequency and average loss ratios across provinces—supporting regional pricing models.

🧪 Postalcode Margin & Risk
  test: ANOVA (Margin across Postal Regions)
  f_value: 123.4868
  p_value: 0.0000
  decision: Reject Null Hypothesis
  Top-Risk Regions:
    → {'province': 'Western Cape', 'postalcode': 6571, 'margin': -18.504444444444786, 'loss_ratio': 6.963629629629858}
    → {'province': 'Mpumalanga', 'postalcode': 1342, 'margin': -4157.687765846076, 'loss_ratio': 5.817310659475216}
    → {'province': 'North West', 'postalcode': 466, 'margin': -2104.003715493932, 'loss_ratio': 4.485526854357317}
  Low-Risk Regions:
    → {'province': 'East