In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, f_oneway, ttest_ind

# --- Configuration ---
# Path to the claims extract (assumes the CSV was saved in the parent directory).
file_path = '../insurance_claims_data.csv'

# Significance level read by the hypothesis-test cells further down. It must be
# assigned before those cells run; without it a fresh "Restart Kernel -> Run All"
# stops with NameError: name 'alpha' is not defined.
# 0.05 is the conventional 5% level -- NOTE(review): confirm this is the intended value.
alpha = 0.05

# Number of highest-volume provinces kept for the province-level tests.
top_n_provinces = 5

# --- Load Data ---
data = pd.read_csv(file_path)

# --- Data Preparation and Metric Creation ---
# HadClaim is 1 when TotalClaims is strictly positive and 0 otherwise. A missing
# TotalClaims compares False against 0, so such rows are counted as "no claim".
data['HadClaim'] = np.where(data['TotalClaims'] > 0, 1, 0)

# Severity is only defined for rows that actually claimed. The .copy() makes
# claims_only_data an independent frame, so adding a column to it does not
# touch (or warn about) the parent `data` frame.
claims_only_data = data[data['HadClaim'] == 1].copy()
claims_only_data['ClaimSeverity'] = claims_only_data['TotalClaims']

# --- Provinces with sufficient volume for testing ---
# value_counts() orders provinces by descending row count, so head() returns
# the provinces with the most rows.
province_counts = data['Province'].value_counts()
top_provinces = province_counts.head(top_n_provinces).index.tolist()

# Restrict both the full data set and the claims-only subset to those provinces.
data_test = data[data['Province'].isin(top_provinces)]
claims_only_test = claims_only_data[claims_only_data['Province'].isin(top_provinces)]

In [None]:
print("\n--- H1: Claim Severity Across Provinces (ANOVA Test) ---")

# Build one ClaimSeverity sample per top province.
# top_provinces is ranked by row count in the full data set, so a province can
# be in the list and still have no rows in the claims-only subset.
severity_by_province = {
    prov: claims_only_test.loc[claims_only_test['Province'] == prov, 'ClaimSeverity']
    for prov in top_provinces
}

# f_oneway returns NaN as soon as any input sample is empty, so empty samples
# are dropped (and reported) instead of silently invalidating the whole test.
provinces_without_claims = [prov for prov, sample in severity_by_province.items() if len(sample) == 0]
province_groups = [sample for sample in severity_by_province.values() if len(sample) > 0]

if provinces_without_claims:
    print(f"Note: excluded from ANOVA (no claims): {', '.join(map(str, provinces_without_claims))}")

# Perform ANOVA test (needs at least two groups to compare).
if len(province_groups) >= 2:
    f_stat, p_value_sev = f_oneway(*province_groups)
else:
    f_stat, p_value_sev = np.nan, np.nan

print(f"ANOVA F-statistic: {f_stat:.2f}")
print(f"P-value: {p_value_sev:.4f}")

# A NaN p-value means the test could not be evaluated. `NaN < alpha` is False,
# so without this branch the cell would wrongly report "fail to reject".
if np.isnan(p_value_sev):
    print("\nConclusion: Test inconclusive (p-value is undefined).")
    print("Too few provinces with claims, or no variation in Claim Severity, to run the ANOVA.")
elif p_value_sev < alpha:
    print(f"\nConclusion: Reject the Null Hypothesis (p < {alpha}).")
    print("There is a **statistically significant difference** in Claim Severity across the top provinces.")
else:
    print(f"\nConclusion: Fail to Reject the Null Hypothesis (p >= {alpha}).")
    print("There is no statistically significant difference in Claim Severity across the top provinces.")

# Print Claim Severity for Business Interpretation: mean severity per province,
# largest first, formatted with thousands separators and two decimals.
claim_severity = claims_only_test.groupby('Province')['ClaimSeverity'].mean().sort_values(ascending=False)
print("\nAverage Claim Severity (Rand) by Province:")
print(claim_severity.map('{:,.2f}'.format))

In [None]:
print("\n--- H4: Risk Differences Between Women and Men ---")

# Keep only rows whose Gender is exactly 'Male' or 'Female'; any other value
# (including missing) is left out of this comparison.
gender_data = data[data['Gender'].isin(['Male', 'Female'])].copy()
gender_claims_only = gender_data[gender_data['HadClaim'] == 1].copy()

#### Metric 1: Claim Frequency (Chi-Squared Test)
# Contingency table of Gender x HadClaim, tested for independence.
gender_contingency = pd.crosstab(gender_data['Gender'], gender_data['HadClaim'])
chi2_gender, p_value_freq_gender, dof_gender, expected_gender = chi2_contingency(gender_contingency)

print(f"\nClaim Frequency (Chi-Squared): P-value: {p_value_freq_gender:.4f}")

# Print Claim Frequencies for Interpretation (mean of the 0/1 flag = claim rate).
gender_freq = gender_data.groupby('Gender')['HadClaim'].mean().sort_values(ascending=False)
print("Claim Frequency (%) by Gender:")
print((gender_freq * 100).round(2))

#### Metric 2: Claim Severity (Two-Sample T-test)
male_severity = gender_claims_only.loc[gender_claims_only['Gender'] == 'Male', 'TotalClaims']
female_severity = gender_claims_only.loc[gender_claims_only['Gender'] == 'Female', 'TotalClaims']

# Welch's t-test (equal_var=False): does not assume equal variances in the two groups.
t_stat, p_value_sev_gender = ttest_ind(male_severity, female_severity, equal_var=False, nan_policy='omit')

print(f"\nClaim Severity (T-Test): P-value: {p_value_sev_gender:.4f}")

# Print Claim Severity for Interpretation
gender_severity = gender_claims_only.groupby('Gender')['TotalClaims'].mean().sort_values(ascending=False)
print("Average Claim Severity (Rand) by Gender:")
print(gender_severity.map('{:,.2f}'.format))

# Overall Conclusion for H4
# H0 is rejected when *either* metric is significant. Running two tests at the
# full alpha each would push the chance of a false rejection to roughly
# 2 * alpha, so the per-test threshold is Bonferroni-adjusted (alpha / number
# of tests that produced a p-value).
# NaN p-values (e.g. a gender with fewer than two claims) are excluded:
# `NaN < alpha` is False and would otherwise read as "no significant difference".
h4_p_values = [p for p in (p_value_freq_gender, p_value_sev_gender) if not np.isnan(p)]

if not h4_p_values:
    print("\nConclusion: Test inconclusive (no valid p-value could be computed).")
    print("There is not enough data to compare risk between Men and Women.")
else:
    alpha_h4 = alpha / len(h4_p_values)
    print(f"\nBonferroni-adjusted threshold: {alpha_h4:.4f} (alpha = {alpha}, {len(h4_p_values)} test(s))")
    if any(p < alpha_h4 for p in h4_p_values):
        print("\nConclusion: Reject the Null Hypothesis (at least one risk metric is significant).")
        print("There is a **statistically significant risk difference** between Men and Women.")
    else:
        print("\nConclusion: Fail to Reject the Null Hypothesis.")
        print("There is no statistically significant risk difference between Men and Women.")