In [3]:
# notebooks/02_hypothesis_testing.ipynb

import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, f_oneway, ttest_ind
import statsmodels.api as sm
from statsmodels.formula.api import ols

import sys
import os

# Get the absolute path of the current working directory (where the notebook is launched from)
current_working_directory = os.getcwd()

# Assuming 'src' is one level up from the 'notebooks' directory
# So, if CWD is 'project_root/notebooks', then project_root is os.path.dirname(CWD)
project_root = os.path.dirname(current_working_directory)

# Construct the path to the 'src' directory
src_path = os.path.join(project_root, 'src')

# Add the 'src' directory to the system path if it's not already there
if src_path not in sys.path:
    sys.path.append(src_path)
    print(f"Added '{src_path}' to sys.path.")

# --- Optional: Add checks for debugging (keep these, they are helpful) ---
if not os.path.exists(src_path):
    print(f"ERROR: The 'src' directory does not exist at '{src_path}'. Please check your project structure.")
elif not os.path.isdir(src_path):
    print(f"ERROR: '{src_path}' is not a directory.")
elif not os.path.exists(os.path.join(src_path, 'data_tools.py')):
    print(f"ERROR: 'data_tools.py' not found in '{src_path}'. Please ensure the file exists and is named 'data_tools.py'.")
# --- End Optional Checks ---

# CORRECTED IMPORTS: Import the correct functions from data_tools and constants from utils
from data_tools import preprocess_data
from utils import RAW_DATA_PATH # Import RAW_DATA_PATH from utils

# Load the raw data
# Use RAW_DATA_PATH from utils, assuming it's correctly configured
# The MachineLearningRating_v3.txt file is pipe-separated
raw_data_file_path = RAW_DATA_PATH

# Read the raw data with the correct separator
raw_df = pd.read_csv(raw_data_file_path, sep='|')

# Preprocess the raw data using the imported preprocess_data function
df_processed = preprocess_data(raw_df)

print("Data loaded and preprocessed successfully for Task 3.")
print(f"Shape of df_processed: {df_processed.shape}")

# Create 'has_claim' column (for Claim Frequency)
# If total_claims is > 0, then a claim occurred (1), else no claim (0)
df_processed['has_claim'] = (df_processed['total_claims'] > 0).astype(int)

# Create 'margin' column
df_processed['margin'] = df_processed['total_premium'] - df_processed['total_claims']

# Prepare data for Claim Severity: only policies where a claim occurred
df_claims_only = df_processed[df_processed['has_claim'] == 1].copy()

print("\nDerived metrics added:")
print(f"Claim Frequency (policies with claims): {df_processed['has_claim'].mean():.4f}")
print(f"Overall Claim Severity (average claim amount for claims): {df_claims_only['total_claims'].mean():.2f}")
print(f"Overall Average Margin: {df_processed['margin'].mean():.2f}")


# --- Hypothesis 1: Risk differences across provinces ---
print("\n--- Hypothesis 1: Risk differences across provinces ---")

# --- Claim Frequency by Province (Chi-squared Test) ---
print("\nClaim Frequency by Province:")
contingency_table_province_freq = pd.crosstab(df_processed['province'], df_processed['has_claim'])
print(contingency_table_province_freq)

# Ensure enough data for Chi-squared test (at least 2x2 table and not all zero counts)
if contingency_table_province_freq.shape[0] > 1 and contingency_table_province_freq.shape[1] > 1 and contingency_table_province_freq.min().min() > 0:
    chi2_freq, p_value_freq, _, _ = chi2_contingency(contingency_table_province_freq)
    print(f"\nChi-squared test for Claim Frequency (Province):")
    print(f"Chi2 Stat: {chi2_freq:.2f}, P-value: {p_value_freq:.4f}")

    if p_value_freq < 0.05:
        print("Decision: Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is a statistically significant difference in claim frequency across provinces.")
        # Business Recommendation for frequency
        province_freq = df_processed.groupby('province')['has_claim'].mean().sort_values(ascending=False)
        print("\nClaim Frequency by Province (proportion of policies with claims):")
        print(province_freq)
        print(f"Business Recommendation: Provinces like {province_freq.index[0]} exhibit higher claim frequencies compared to {province_freq.index[-1]} and others. This suggests that regional adjustments to premiums based on the likelihood of a claim might be warranted.")
    else:
        print("Decision: Fail to Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is no statistically significant difference in claim frequency across provinces.")
else:
    print("Not enough unique provinces or sufficient data for Chi-squared test on Claim Frequency.")


# --- Claim Severity by Province (ANOVA Test) ---
print("\nClaim Severity by Province:")
# Filter for claims only for severity analysis
province_groups_severity = [df_claims_only['total_claims'][df_claims_only['province'] == p] for p in df_claims_only['province'].unique()]

# Filter out empty groups or groups with only one observation for ANOVA
province_groups_severity = [group for group in province_groups_severity if len(group) > 1]

if len(province_groups_severity) > 1: # Ensure there's more than one group to compare
    f_stat_severity, p_value_severity = f_oneway(*province_groups_severity)
    print(f"ANOVA test for Claim Severity (Province):")
    print(f"F-statistic: {f_stat_severity:.2f}, P-value: {p_value_severity:.4f}")

    if p_value_severity < 0.05:
        print("Decision: Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is a statistically significant difference in claim severity across provinces.")
        # Business Recommendation for severity
        province_severity = df_claims_only.groupby('province')['total_claims'].mean().sort_values(ascending=False)
        print("\nClaim Severity by Province (average claim amount for policies with claims):")
        print(province_severity)
        print(f"Business Recommendation: Provinces like {province_severity.index[0]} show higher average claim amounts when a claim occurs, compared to {province_severity.index[-1]}. This indicates a need to consider regional factors when setting cover limits or excess options to manage potential payout costs.")
    else:
        print("Decision: Fail to Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is no statistically significant difference in claim severity across provinces.")
else:
    print("Not enough unique provinces with sufficient claims to perform ANOVA for Claim Severity.")




  raw_df = pd.read_csv(raw_data_file_path, sep='|')


Columns cleaned.
Transaction_month converted to datetime.
Specific 'Not specified' values handled.
Loss Ratio calculated.
Key numerical columns converted to numeric.
Filled NaN in custom_value_estimate with median: 220000.0
Filled NaN in capital_outstanding with median: 0.0
Filled NaN in kilowatts with median: 111.0
Filled NaN in cubic_capacity with median: 2694.0
Filled NaN in gender with 'Unknown'.
Filled NaN in marital_status with 'Unknown'.
Filled NaN in vehicle_type with 'Unknown'.
Filled NaN in body_type with 'Unknown'.
Filled NaN in make with 'Unknown'.
Filled NaN in model with 'Unknown'.
Data loaded and preprocessed successfully for Task 3.
Shape of df_processed: (1000098, 53)

Derived metrics added:
Claim Frequency (policies with claims): 0.0028
Overall Claim Severity (average claim amount for claims): 23273.39
Overall Average Margin: -2.96

--- Hypothesis 1: Risk differences across provinces ---

Claim Frequency by Province:
has_claim           0     1
province               

In [4]:
# --- Hypothesis 2: Risk differences between zip codes (postal codes) ---
print("\n--- Hypothesis 2: Risk differences between zip codes (postal codes) ---")

# --- Claim Frequency by Postal Code (Chi-squared Test) ---
print("\nClaim Frequency by Postal Code:")
contingency_table_zip_freq = pd.crosstab(df_processed['postal_code'], df_processed['has_claim'])

# Filter out postal codes with very few entries if chi-squared gives errors
min_observations_for_chi2 = 10 # Adjust as needed
valid_zip_codes_freq = contingency_table_zip_freq[contingency_table_zip_freq.sum(axis=1) >= min_observations_for_chi2].index
contingency_table_zip_freq_filtered = contingency_table_zip_freq.loc[valid_zip_codes_freq]

if not contingency_table_zip_freq_filtered.empty and \
   contingency_table_zip_freq_filtered.shape[0] > 1 and \
   contingency_table_zip_freq_filtered.shape[1] > 1 and \
   contingency_table_zip_freq_filtered.min().min() > 0: # Ensure enough data for test
    chi2_zip_freq, p_value_zip_freq, _, _ = chi2_contingency(contingency_table_zip_freq_filtered)
    print(f"\nChi-squared test for Claim Frequency (Postal Code - filtered for >= {min_observations_for_chi2} policies):")
    print(f"Chi2 Stat: {chi2_zip_freq:.2f}, P-value: {p_value_zip_freq:.4f}")

    if p_value_zip_freq < 0.05:
        print("Decision: Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is a statistically significant difference in claim frequency across postal codes.")
        # Business Recommendation for frequency
        zip_freq = df_processed.groupby('postal_code')['has_claim'].mean().sort_values(ascending=False)
        print("\nTop 5 Postal Codes by Claim Frequency:")
        print(zip_freq.head(5))
        print("\nBottom 5 Postal Codes by Claim Frequency:")
        print(zip_freq.tail(5))
        print(f"Business Recommendation: The analysis indicates significant variation in claim frequency by postal code. Specific postal codes show notably higher or lower claim rates, which could be leveraged to refine geographical rating factors in premium calculations. Further granular analysis of high-frequency postal codes is recommended.")
    else:
        print("Decision: Fail to Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is no statistically significant difference in claim frequency across postal codes.")
else:
    print("Not enough valid postal codes or sufficient data to perform Chi-squared test for Claim Frequency.")


# --- Claim Severity by Postal Code (ANOVA Test) ---
print("\nClaim Severity by Postal Code:")
# Filter for claims only for severity analysis
postal_code_groups_severity = [df_claims_only['total_claims'][df_claims_only['postal_code'] == z] for z in df_claims_only['postal_code'].unique()]

# Filter out groups with insufficient data (e.g., less than 2 observations for ANOVA)
postal_code_groups_severity = [group for group in postal_code_groups_severity if len(group) > 1]

if len(postal_code_groups_severity) > 1: # Ensure there's more than one group to compare
    f_stat_zip_severity, p_value_zip_severity = f_oneway(*postal_code_groups_severity)
    print(f"\nANOVA test for Claim Severity (Postal Code):")
    print(f"F-statistic: {f_stat_zip_severity:.2f}, P-value: {p_value_zip_severity:.4f}")

    if p_value_zip_severity < 0.05:
        print("Decision: Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is a statistically significant difference in claim severity across postal codes.")
        # Business Recommendation for severity
        zip_severity = df_claims_only.groupby('postal_code')['total_claims'].mean().sort_values(ascending=False)
        print("\nTop 5 Postal Codes by Claim Severity:")
        print(zip_severity.head(5))
        print("\nBottom 5 Postal Codes by Claim Severity:")
        print(zip_severity.tail(5))
        print(f"Business Recommendation: Claim severity varies significantly by postal code. Certain areas are associated with higher average claim costs, which could necessitate higher excesses or tailored product offerings in those regions to maintain profitability.")
    else:
        print("Decision: Fail to Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is no statistically significant difference in claim severity across postal codes.")
else:
    print("Not enough unique postal codes with sufficient claims to perform ANOVA for Claim Severity.")




--- Hypothesis 2: Risk differences between zip codes (postal codes) ---

Claim Frequency by Postal Code:
Not enough valid postal codes or sufficient data to perform Chi-squared test for Claim Frequency.

Claim Severity by Postal Code:

ANOVA test for Claim Severity (Postal Code):
F-statistic: 1.18, P-value: 0.0271
Decision: Reject the Null Hypothesis (H₀).
Interpretation: There is a statistically significant difference in claim severity across postal codes.

Top 5 Postal Codes by Claim Severity:
postal_code
4680    260087.719298
4820    181480.798246
1791    173481.535088
1610    163104.473684
9756    157520.359649
Name: total_claims, dtype: float64

Bottom 5 Postal Codes by Claim Severity:
postal_code
7503    614.035088
721     610.198830
9306    531.350877
6506    490.000000
1559    389.956140
Name: total_claims, dtype: float64
Business Recommendation: Claim severity varies significantly by postal code. Certain areas are associated with higher average claim costs, which could necess

In [5]:

# --- Hypothesis 3: Margin differences between zip codes (postal codes) ---
print("\n--- Hypothesis 3: Margin differences between zip codes (postal codes) ---")
print("\nMargin by Postal Code:")
# Group postal codes for ANOVA on margin
# Filter out groups with insufficient data (e.g., less than 2 observations for ANOVA)
postal_code_groups_margin = [df_processed['margin'][df_processed['postal_code'] == z] for z in df_processed['postal_code'].unique()]
postal_code_groups_margin = [group for group in postal_code_groups_margin if len(group) > 1]

if len(postal_code_groups_margin) > 1: # Ensure there's more than one group to compare
    f_stat_zip_margin, p_value_zip_margin = f_oneway(*postal_code_groups_margin)
    print(f"\nANOVA test for Margin (Postal Code):")
    print(f"F-statistic: {f_stat_zip_margin:.2f}, P-value: {p_value_zip_margin:.4f}")

    if p_value_zip_margin < 0.05:
        print("Decision: Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is a statistically significant difference in average margin across postal codes.")
        zip_margin = df_processed.groupby('postal_code')['margin'].mean().sort_values(ascending=False)
        print("\nTop 5 Postal Codes by Average Margin:")
        print(zip_margin.head(5))
        print("\nBottom 5 Postal Codes by Average Margin:")
        print(zip_margin.tail(5))
        print(f"Business Recommendation: Significant variations in average margin by postal code indicate that pricing strategies could be further optimized at a granular geographic level. Areas with lower margins might require premium adjustments or closer review of associated risks/costs.")
    else:
        print("Decision: Fail to Reject the Null Hypothesis (H₀).")
        print("Interpretation: There is no statistically significant difference in average margin across postal codes.")
else:
    print("Not enough unique postal codes with sufficient data to perform ANOVA for Margin.")





--- Hypothesis 3: Margin differences between zip codes (postal codes) ---

Margin by Postal Code:

ANOVA test for Margin (Postal Code):
F-statistic: 0.89, P-value: 0.9911
Decision: Fail to Reject the Null Hypothesis (H₀).
Interpretation: There is no statistically significant difference in average margin across postal codes.


In [6]:
# --- Hypothesis 4: Risk differences between Women and Men ---
print("\n--- Hypothesis 4: Risk differences between Women and Men ---")

# Ensure 'gender' column is cleaned (e.g., 'Not specified' handled and filled with 'Unknown').
# Filter to only 'Male' and 'Female' for comparison.
df_gender_filtered = df_processed[df_processed['gender'].isin(['Male', 'Female'])].copy()

if df_gender_filtered.empty:
    print("Not enough 'Male' or 'Female' gender data to perform analysis for Hypothesis 4.")
else:
    # --- Claim Frequency by Gender (Chi-squared Test) ---
    print("\nClaim Frequency by Gender:")
    contingency_table_gender_freq = pd.crosstab(df_gender_filtered['gender'], df_gender_filtered['has_claim'])
    print(contingency_table_gender_freq)

    # Ensure enough data for Chi-squared test (at least 2x2 table and not all zero counts)
    if contingency_table_gender_freq.shape[0] > 1 and contingency_table_gender_freq.shape[1] > 1 and contingency_table_gender_freq.min().min() > 0:
        chi2_gender_freq, p_value_gender_freq, _, _ = chi2_contingency(contingency_table_gender_freq)
        print(f"\nChi-squared test for Claim Frequency (Gender):")
        print(f"Chi2 Stat: {chi2_gender_freq:.2f}, P-value: {p_value_gender_freq:.4f}")

        if p_value_gender_freq < 0.05:
            print("Decision: Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is a statistically significant difference in claim frequency between genders.")
            gender_freq = df_gender_filtered.groupby('gender')['has_claim'].mean().sort_values(ascending=False)
            print("\nClaim Frequency by Gender (proportion of policies with claims):")
            print(gender_freq)
            print(f"Business Recommendation: Gender appears to be a significant factor in claim frequency. Policies for {gender_freq.index[0]} show a higher likelihood of claims, warranting gender-based premium adjustments.")
        else:
            print("Decision: Fail to Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is no statistically significant difference in claim frequency between genders.")
    else:
        print("Not enough gender categories or observations for Chi-squared test on Claim Frequency.")


    # --- Claim Severity by Gender (T-test) ---
    print("\nClaim Severity by Gender:")
    # Filter for claims only and valid genders
    df_claims_only_gender = df_claims_only[df_claims_only['gender'].isin(['Male', 'Female'])].copy()

    male_claims = df_claims_only_gender[df_claims_only_gender['gender'] == 'Male']['total_claims']
    female_claims = df_claims_only_gender[df_claims_only_gender['gender'] == 'Female']['total_claims']

    if len(male_claims) > 1 and len(female_claims) > 1: # Ensure enough data for T-test
        # Using Welch's t-test (equal_var=False) as it's more robust to unequal variances and sample sizes
        t_stat_gender_severity, p_value_gender_severity = ttest_ind(male_claims, female_claims, equal_var=False)
        print(f"\nT-test for Claim Severity (Gender - Welch's):")
        print(f"T-statistic: {t_stat_gender_severity:.2f}, P-value: {p_value_gender_severity:.4f}")

        if p_value_gender_severity < 0.05:
            print("Decision: Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is a statistically significant difference in claim severity between genders.")
            gender_severity = df_claims_only_gender.groupby('gender')['total_claims'].mean().sort_values(ascending=False)
            print("\nClaim Severity by Gender (average claim amount for policies with claims):")
            print(gender_severity)
            print(f"Business Recommendation: When claims occur, the average claim amount differs significantly by gender. Policies for {gender_severity.index[0]} show higher severity, suggesting a need for gender-specific considerations in payout estimations or deductible offerings.")
        else:
            print("Decision: Fail to Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is no statistically significant difference in claim severity between genders.")
    else:
        print("Not enough 'Male' or 'Female' policies with claims to perform T-test for Claim Severity.")


    # --- Margin by Gender (T-test) ---
    print("\nMargin by Gender:")
    # Use the df_gender_filtered which contains all policies with 'Male'/'Female' gender
    male_margin = df_gender_filtered[df_gender_filtered['gender'] == 'Male']['margin']
    female_margin = df_gender_filtered[df_gender_filtered['gender'] == 'Female']['margin']

    if len(male_margin) > 1 and len(female_margin) > 1: # Ensure enough data for T-test
        # Using Welch's t-test (equal_var=False)
        t_stat_gender_margin, p_value_gender_margin = ttest_ind(male_margin, female_margin, equal_var=False)
        print(f"\nT-test for Margin (Gender - Welch's):")
        print(f"T-statistic: {t_stat_gender_margin:.2f}, P-value: {p_value_gender_margin:.4f}")

        if p_value_gender_margin < 0.05:
            print("Decision: Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is a statistically significant difference in average margin between genders.")
            gender_margin = df_gender_filtered.groupby('gender')['margin'].mean().sort_values(ascending=False)
            print("\nAverage Margin by Gender:")
            print(gender_margin)
            print(f"Business Recommendation: The average margin significantly differs between genders. Policies for {gender_margin.index[0]} generate higher average margins, which could influence targeted marketing or product development strategies based on profitability by gender segment.")
        else:
            print("Decision: Fail to Reject the Null Hypothesis (H₀).")
            print("Interpretation: There is no statistically significant difference in average margin between genders.")
    else:
        print("Not enough 'Male' or 'Female' policies with margin data to perform T-test for Margin.")


--- Hypothesis 4: Risk differences between Women and Men ---

Claim Frequency by Gender:
has_claim      0   1
gender              
Female      6741  14
Male       42723  94

Chi-squared test for Claim Frequency (Gender):
Chi2 Stat: 0.00, P-value: 0.9515
Decision: Fail to Reject the Null Hypothesis (H₀).
Interpretation: There is no statistically significant difference in claim frequency between genders.

Claim Severity by Gender:

T-test for Claim Severity (Gender - Welch's):
T-statistic: -0.58, P-value: 0.5680
Decision: Fail to Reject the Null Hypothesis (H₀).
Interpretation: There is no statistically significant difference in claim severity between genders.

Margin by Gender:

T-test for Margin (Gender - Welch's):
T-statistic: -0.25, P-value: 0.8015
Decision: Fail to Reject the Null Hypothesis (H₀).
Interpretation: There is no statistically significant difference in average margin between genders.
