In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("../MachineLearningRating_v3.txt", sep='|')

# Preprocessing: Create useful fields
df['HasClaim'] = df['TotalClaims'] > 0
df['Margin'] = df['TotalPremium'] - df['TotalClaims']


  df = pd.read_csv("../MachineLearningRating_v3.txt", sep='|')


No risk differences across provinces

In [7]:
import scipy.stats as stats

# Claim Frequency by Province
province_freq = df.groupby('Province')['HasClaim'].mean()

# Claim Severity by Province (for those who claimed)
severity_data = [group['TotalClaims'] for name, group in df[df['HasClaim']].groupby('Province')]

# Test 1: Chi-Squared for frequency difference
contingency = pd.crosstab(df['Province'], df['HasClaim'])
chi2, p_freq, _, _ = stats.chi2_contingency(contingency)

# Test 2: ANOVA for claim severity
anova_p = stats.f_oneway(*severity_data).pvalue

print(f"Province Frequency p-value: {p_freq}")
print(f"Province Severity p-value: {anova_p}")


Province Frequency p-value: 5.925510718204678e-19
Province Severity p-value: 6.304916760425176e-06


No risk differences between Zip Codes

In [9]:
# Top 5 zip codes by policy count
top_zip = df['PostalCode'].value_counts().nlargest(5).index
zip_df = df[df['PostalCode'].isin(top_zip)]

# Claim Frequency and Chi-Squared
contingency = pd.crosstab(zip_df['PostalCode'], zip_df['HasClaim'])
chi2, p_freq_zip, _, _ = stats.chi2_contingency(contingency)

# Claim Severity ANOVA
zip_severity = [group['TotalClaims'] for name, group in zip_df[zip_df['HasClaim']].groupby('PostalCode')]
p_anova_zip = stats.f_oneway(*zip_severity).pvalue

print(f"Zip Frequency p: {p_freq_zip}, Zip Severity p: {p_anova_zip}")


Zip Frequency p: 2.5997263446847924e-12, Zip Severity p: 0.0038826391783015673


No significant margin difference between zip codes

In [10]:
# Compare average Margin across top zip codes
margin_by_zip = [group['Margin'] for _, group in zip_df.groupby('PostalCode')]
p_margin = stats.f_oneway(*margin_by_zip).pvalue

print(f"Zip Margin Difference p-value: {p_margin}")


Zip Margin Difference p-value: 0.04693183137128581


No significant risk difference between Women and Men

In [11]:
# Frequency Chi-Squared
contingency_gender = pd.crosstab(df['Gender'], df['HasClaim'])
chi2_gender, p_freq_gender, _, _ = stats.chi2_contingency(contingency_gender)

# Severity t-test
men_claims = df[(df['Gender'] == 'Male') & (df['HasClaim'])]['TotalClaims']
women_claims = df[(df['Gender'] == 'Female') & (df['HasClaim'])]['TotalClaims']
p_gender_severity = stats.ttest_ind(men_claims, women_claims, equal_var=False).pvalue

print(f"Gender Frequency p-value: {p_freq_gender}")
print(f"Gender Severity p-value: {p_gender_severity}")


Gender Frequency p-value: 0.026570248768437145
Gender Severity p-value: 0.5680286951630678


| Hypothesis                                                    | p-value(s)                                                | Interpretation                                                                                               | Business Insight                                                                                                                                |
| ------------------------------------------------------------- | --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| **Risk differences across Provinces (Frequency & Severity)**  | Frequency: 5.93e-19 < 0.05 <br> Severity: 6.30e-06 < 0.05 | **Reject H₀** — Significant difference in risk between provinces both in claim frequency and claim severity. | Some provinces are riskier; premium pricing should vary regionally to reflect this risk.                                                        |
| **Risk differences between Zip Codes (Frequency & Severity)** | Frequency: 2.60e-12 < 0.05 <br> Severity: 0.00388 < 0.05  | **Reject H₀** — Statistically significant difference in claim frequency and severity across zip codes.       | Zip code is a meaningful factor to segment risk; pricing and marketing strategies should consider location at this granularity.                 |
| **Margin difference between Zip Codes**                       | 0.0469 < 0.05                                             | **Reject H₀** — Significant difference in profitability across zip codes.                                    | Some zip codes yield better profit margins; targeting low-margin areas with pricing or marketing adjustments could improve profitability.       |
| **Risk difference between Gender (Frequency & Severity)**     | Frequency: 0.027 < 0.05 <br> Severity: 0.568 > 0.05       | **Partial Rejection:** Claim frequency differs by gender, but claim severity does not.                       | Gender influences how often claims are made, but not the size of claims. Consider gender in risk segmentation mainly for frequency predictions. |
