In [1]:
import sys
# Extend the import search path with ../scripts (a relative path) before the
# two project imports below — presumably that is where the local `utils` and
# `hypothesis_testing` modules live; confirm against the repository layout.
sys.path.append('../scripts')
from utils import DataLoader
from hypothesis_testing import HypothesisTester

In [2]:
# Create the project's DataLoader (imported from `utils`); it is used in the
# next cell to read the dataset.
data_loader = DataLoader()

In [3]:
# Read the insurance dataset through the loader. The path is relative —
# presumably resolved against the notebook's working directory (TODO confirm
# how `load_data` resolves paths).
data = data_loader.load_data("../data/processed/insurance_data.txt")

### Hypothesis Testing

In [4]:
# Wrap the loaded data in a HypothesisTester; every test cell below calls a
# method on this single instance.
tester = HypothesisTester(data)

#### Calculate KPIs

In [5]:
# Compute the KPIs. Left as the cell's last expression so the returned
# mapping (Claim Frequency, Claim Severity, Average Margin) is displayed.
tester.calculate_metrics()

Metrics calculated: Claim Frequency=0.0028, Claim Severity=23273.39, Average Margin=-2.96


{'Claim Frequency': np.float64(0.002787726802773328),
 'Claim Severity': np.float64(23273.387063228372),
 'Average Margin': np.float64(-2.955693816275505)}

#### Test for risk differences across provinces
Chi-square on claim frequency by province.

In [6]:
# Run the province-level risk test and print a heading followed by one
# "- key: value" bullet per entry of the returned mapping.
province_result = tester.test_province_risk()
province_report = ["Province Risk Test"]
province_report.extend(
    f"- {field}: {value}" for field, value in province_result.items()
)
print("\n".join(province_report))

Province Risk Test
- Test: Chi-Square (Claim Frequency)
- Feature: Province
- P-Value: 5.925510718204677e-19
- Reject_H0: True
- Interpretation: Reject H₀ for Province (p=0.0000) → Statistically significant difference.


#### Test for risk differences between two zip codes
Chi-square on claim frequency for the two zip codes.

In [7]:
# Name the two postal codes once so the test call and the printed heading
# cannot drift apart when the codes are changed.
zip_a, zip_b = 2000, 7405

# Run the claim-frequency test for the pair and print one "- key: value"
# bullet per entry of the returned mapping.
zipcode_result = tester.test_zipcode_risk(zip_a, zip_b)
print(f"### ZIP Code Risk Test ({zip_a} vs {zip_b})")
for k, v in zipcode_result.items():
    print(f"- {k}: {v}")

### ZIP Code Risk Test (2000 vs 7405)
- Test: Chi-Square (Claim Frequency)
- Feature: PostalCode
- P-Value: 7.282436419882232e-06
- Reject_H0: True
- Interpretation: Reject H₀ for PostalCode (p=0.0000) → Statistically significant difference.


#### Test for margin differences between two zip codes
T-test on Margin for the two zip codes.

In [8]:
# Name the two postal codes once so the test call and the printed heading
# cannot drift apart when the codes are changed.
zip_a, zip_b = 2000, 7405

# Run the margin test for the pair and print one "- key: value" bullet per
# entry of the returned mapping.
zipcode_margin_result = tester.test_zipcode_margin(zip_a, zip_b)
print(f"ZIP Code Margin Test ({zip_a} vs {zip_b})")
for k, v in zipcode_margin_result.items():
    print(f"- {k}: {v}")

ZIP Code Margin Test (2000 vs 7405)
- Test: T-Test (Margin)
- Feature: PostalCode
- Groups: [2000, 7405]
- Group_1_Mean: -8.11194427227065
- Group_2_Mean: 17.51682453042594
- P-Value: 0.00656886191231903
- Reject_H0: True
- Interpretation: Reject H₀ for PostalCode (p=0.0066) → Statistically significant difference.


#### Test for risk differences between genders
Chi-square on claim frequency and t-test on claim severity (TotalClaims) by gender.

In [9]:
# test_gender_risk() returns two result mappings: one for claim frequency
# and one for claim severity. Each is printed under its own heading as
# "- key: value" bullets.
gender_freq_result, gender_sev_result = tester.test_gender_risk()
print("Gender Risk Test")
gender_sections = (
    ("**Frequency (ClaimOccurred)**", gender_freq_result),
    # The leading newline puts a blank line between the two sections.
    ("\n**Severity (TotalClaims)**", gender_sev_result),
)
for section_heading, section_result in gender_sections:
    print(section_heading)
    for field, value in section_result.items():
        print(f"- {field}: {value}")

Gender Risk Test
**Frequency (ClaimOccurred)**
- Test: Chi-Square (Claim Frequency)
- Feature: Gender
- P-Value: 0.9514644755420456
- Reject_H0: False
- Interpretation: Fail to reject H₀ for Gender (p=0.9515) → No evidence of difference.

**Severity (TotalClaims)**
- Test: T-Test (TotalClaims)
- Feature: Gender
- Groups: ['Male', 'Female']
- Group_1_Mean: 32.620312390266385
- Group_2_Mean: 37.04605451452467
- P-Value: 0.7669656471629475
- Reject_H0: False
- Interpretation: Fail to reject H₀ for Gender (p=0.7670) → No evidence of difference.
