### Importing the Necessary Packages

In [47]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, ttest_ind
from sklearn.model_selection import StratifiedShuffleSplit

### Loading the Dataset

In [11]:
data = pd.read_csv('..\data\insurance_data.csv')

  data = pd.read_csv('..\data\insurance_data.csv')


### Ensure the data types are correct (e.g., categorical columns are converted to category type)

In [21]:

categorical_columns = ['Province', 'PostalCode', 'Gender']
for col in categorical_columns:
    data[col] = data[col].astype('category')

# Create a new column for profit margin
data['ProfitMargin'] = data['TotalPremium'] - data['TotalClaims']
data['RiskCategory'] = pd.cut(data['TotalClaims'], bins=[-1, 1000, 5000, np.inf], labels=['Low', 'Medium', 'High'])

In [22]:
data.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims,ProfitMargin,RiskCategory
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,21.929825,Low
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0,21.929825,Low
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0.0,Low
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,2.191491,0.0,2.191491,Low
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0,0.0,Low


### Hypothesis Testing

#### Hypothesis 1: "There are no risk differences across provinces"

##### Step 1: Select Metrics and Segment Data


In [87]:
# Group data by Province and calculate total claims
province_claims = data.groupby('Province')['TotalClaims'].sum()

# Check the total claims by province
print(province_claims)


Province
Eastern Cape     1.356427e+06
Free State       3.549223e+05
Gauteng          2.888766e+07
KwaZulu-Natal    1.430138e+07
Limpopo          1.016477e+06
Mpumalanga       2.044675e+06
North West       5.920250e+06
Northern Cape    8.949051e+04
Western Cape     9.831361e+06
Name: TotalClaims, dtype: float64


  province_claims = data.groupby('Province')['TotalClaims'].sum()


##### Step 2:  Feature Selection and Segmentation into Groups

In [99]:
# Splitting data into two provinces
group_A = data[data['Province'] == 'Gauteng']  # Example province
group_B = data[data['Province'] == 'Western Cape']  # Another province


##### Step 3:  Statistical Equivalence Check

In [100]:

# Create a contingency table for Marital Status in Group A and Group B
contingency_table_marital_status = pd.crosstab(data['MaritalStatus'], data['Province'])

# Perform Chi-Square Test for Marital Status
chi2_marital, p_marital, dof_marital, expected_marital = chi2_contingency(contingency_table_marital_status)

print(f"Marital Status: Chi-Square: {chi2_marital}, p-value: {p_marital}")

if p_marital < 0.05:
    print("Group A and Group B differ significantly in marital status. Adjust the groups.")
else:
    print("No significant difference in marital status between the groups.")


Marital Status: Chi-Square: 11959.167120437687, p-value: 0.0
Group A and Group B differ significantly in marital status. Adjust the groups.


In [90]:
# Create a contingency table for VehicleType in Group A and Group B
contingency_table_vehicle_type = pd.crosstab(data['VehicleType'], data['Province'])

# Perform Chi-Square Test for Vehicle Type
chi2_vehicle, p_vehicle, dof_vehicle, expected_vehicle = chi2_contingency(contingency_table_vehicle_type)

print(f"Vehicle Type: Chi-Square: {chi2_vehicle}, p-value: {p_vehicle}")

if p_vehicle < 0.05:
    print("Group A and Group B differ significantly in Vehicle Type. Adjust the groups.")
else:
    print("No significant difference in Vehicle Type between the groups.")


Vehicle Type: Chi-Square: 21634.925901482635, p-value: 0.0
Group A and Group B differ significantly in Vehicle Type. Adjust the groups.


In [101]:

# Perform T-test for SumInsured between Group A and Group B
t_stat_suminsured, p_suminsured = ttest_ind(group_A['SumInsured'], group_B['SumInsured'])

print(f"Sum Insured: T-Statistic: {t_stat_suminsured}, p-value: {p_suminsured}")

if p_suminsured < 0.05:
    print("Group A and Group B differ significantly in Sum Insured. Adjust the groups.")
else:
    print("No significant difference in Sum Insured between the groups.")


Sum Insured: T-Statistic: -3.5346734614051325, p-value: 0.0004083124994400181
Group A and Group B differ significantly in Sum Insured. Adjust the groups.


##### Step 4: Adjust Groups 
For example, using sklearn’s StratifiedShuffleSplit to balance the groups while keeping the feature distributions equivalent

In [102]:
from sklearn.model_selection import StratifiedShuffleSplit

# Let's say we want to stratify based on 'MaritalStatus' while splitting into two groups
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_index, test_index in split.split(data, data['MaritalStatus']):
    group_A = data.loc[train_index]
    group_B = data.loc[test_index]

# Now we have two groups A and B that are balanced in terms of 'MaritalStatus'


In [103]:
from sklearn.model_selection import StratifiedShuffleSplit

# Let's say we want to stratify based on 'Gender' while splitting into two groups
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_index, test_index in split.split(data, data['Gender']):
    group_A = data.loc[train_index]
    group_B = data.loc[test_index]

# Now we have two groups A and B that are balanced in terms of 'Gender'


In [104]:

split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_index, test_index in split.split(data, data['VehicleType']):
    group_A = data.loc[train_index]
    group_B = data.loc[test_index]

# Now we have two groups A and B that are balanced in terms of 'VehicleType'


In [105]:

split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

for train_index, test_index in split.split(data, data['CoverType']):
    group_A = data.loc[train_index]
    group_B = data.loc[test_index]

# Now we have two groups A and B that are balanced in terms of 'CoverType'

##### Step 5: Checking if our adjustment worked

In [106]:
# Perform T-test for SumInsured between Group A and Group B
t_stat_suminsured, p_suminsured = ttest_ind(group_A['SumInsured'], group_B['SumInsured'])

print(f"Sum Insured: T-Statistic: {t_stat_suminsured}, p-value: {p_suminsured}")

if p_suminsured < 0.05:
    print("Group A and Group B differ significantly in Sum Insured. Adjust the groups.")
else:
    print("No significant difference in Sum Insured between the groups.")

Sum Insured: T-Statistic: -0.03583851955332036, p-value: 0.9714111258502445
No significant difference in Sum Insured between the groups.


##### Step 5: Hypothesis 1 (Risk Differences Across Provinces):

In [107]:
# Perform T-test on TotalClaims
t_stat_claims, p_claims = ttest_ind(group_A['TotalClaims'], group_B['TotalClaims'], nan_policy='omit')

# Output result
print(f"T-Test for Total Claims between Gauteng and Western Cape: T-Statistic = {t_stat_claims}, p-value = {p_claims}")

# Interpret the result
if p_claims < 0.05:
    print("Reject the null hypothesis: There are significant risk differences across provinces.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences across provinces.")



T-Test for Total Claims between Gauteng and Western Cape: T-Statistic = 0.02485786356179346, p-value = 0.9801683418578361
Fail to reject the null hypothesis: There are no significant risk differences across provinces.


#### Hypothesis 2: Risk Differences Between Zip Codes

In [108]:
# Split data based on Postal Code
group_A = data[data['PostalCode'] == 122]  # Group A: Postal Code 122
group_B = data[data['PostalCode'] == 8000]  # Group B: Postal Code 8000

# Perform statistical tests for equivalence (repeat the chi-square and t-test as above for other features)


In [109]:
# Perform T-test on TotalClaims
t_stat_claims, p_claims = ttest_ind(group_A['TotalClaims'], group_B['TotalClaims'], nan_policy='omit')

# Output result
print(f"T-Test for Total Claims between Postal Code 1001 and 2001: T-Statistic = {t_stat_claims}, p-value = {p_claims}")

# Interpret the result
if p_claims < 0.05:
    print("Reject the null hypothesis: There are significant risk differences between postal codes.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences between postal codes.")


T-Test for Total Claims between Postal Code 1001 and 2001: T-Statistic = -0.7916125704983136, p-value = 0.42858970670051955
Fail to reject the null hypothesis: There are no significant risk differences between postal codes.


#### Hypothesis 3: Profit Margin Differences Between Zip Codes

In [112]:
# Calculate Profit for both groups
group_A['Profit'] = group_A['TotalPremium'] - group_A['TotalClaims']
group_B['Profit'] = group_B['TotalPremium'] - group_B['TotalClaims']

# Perform T-test on Profit
t_stat_profit, p_profit = ttest_ind(group_A['Profit'], group_B['Profit'], nan_policy='omit')

# Output result
print(f"T-Test for Profit between Postal Code 1001 and 2001: T-Statistic = {t_stat_profit}, p-value = {p_profit}")

# Interpret the result
if p_profit < 0.05:
    print("Reject the null hypothesis: There are significant profit margin differences between postal codes.")
else:
    print("Fail to reject the null hypothesis: There are no significant profit margin differences between postal codes.")


T-Test for Profit between Postal Code 1001 and 2001: T-Statistic = -0.7915742130225282, p-value = 0.4286091775658968
Fail to reject the null hypothesis: There are no significant profit margin differences between postal codes.


#### Hypothesis 4: Risk Differences Between Men and Women

##### Step 1:  Data Segmentation

In [113]:
# Split data by Gender
group_A = data[data['Gender'] == 'Male']  # Group A: Male
group_B = data[data['Gender'] == 'Female']  # Group B: Female



##### Step 2: Perform Hypothesis Testing

In [114]:
# Perform T-test on TotalClaims
t_stat_claims, p_claims = ttest_ind(group_A['TotalClaims'], group_B['TotalClaims'], nan_policy='omit')

# Output result
print(f"T-Test for Total Claims between Men and Women: T-Statistic = {t_stat_claims}, p-value = {p_claims}")

# Interpret the result
if p_claims < 0.05:
    print("Reject the null hypothesis: There are significant risk differences between men and women.")
else:
    print("Fail to reject the null hypothesis: There are no significant risk differences between men and women")

T-Test for Total Claims between Men and Women: T-Statistic = 2.106521469743276, p-value = 0.03515933145724895
Reject the null hypothesis: There are significant risk differences between men and women.
