##### Import the required library and load the data

In [1]:
# Importing necessary libraries
import pandas as pd
from scipy import stats
import numpy as np

# Load the data
df = pd.read_csv('../Data/MachineLearningRating_v3.txt', sep='|', low_memory=False)

##### Convert TotalPremium and TotalClaims to numeric

In [2]:
df['TotalPremium'] = pd.to_numeric(df['TotalPremium'], errors='coerce')
df['TotalClaims'] = pd.to_numeric(df['TotalClaims'], errors='coerce')

##### Hypothesis 1: Test risk differences across provinces

In [3]:
# For the purpose of hypothesis testing, let's choose 'TotalPremium' as our key performance indicator (KPI).

# Let's segment the data based on the feature we want to test. 
# For example, if we want to test the impact of provinces on TotalPremium:
# Group A (Control Group): Plans with Province = 'Gauteng'
control_group_province = df[df['Province'] == 'Gauteng']['TotalPremium']
# Group B (Test Group): Plans with Province = 'Western Cape'
test_group_province = df[df['Province'] == 'Western Cape']['TotalPremium']

# Perform t-test
t_statistic, p_value = stats.ttest_ind(control_group_province, test_group_province)

# Print results
print("Hypothesis 1: Test risk differences across provinces")
print("-----------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value
alpha = 0.05
if p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences across provinces.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences across provinces.")

Hypothesis 1: Test risk differences across provinces
-----------------------------------------------------
T-statistic: 4.6537430154774375
P-value: 3.260368633786302e-06
Reject Null Hypothesis: There are significant risk differences across provinces.


##### Hypothesis 2: Test risk differences across Zip Codes

In [5]:
control_group_zipcode = df[df['PostalCode'] == '1459']
# Group B (Test Group): Plans with ZipCode = 'ZipCode_B'
test_group_zipcode = df[df['PostalCode'] == '7784']

# Perform t-test
t_statistic, p_value = stats.ttest_ind(control_group_zipcode['TotalPremium'], test_group_zipcode['TotalPremium'])

# Print results
print("Hypothesis 2: There are no risk differences between zip codes")
print("--------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value
if p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences between zip codes.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences between zip codes.")

Hypothesis 2: There are no risk differences between zip codes
--------------------------------------------------------------
T-statistic: nan
P-value: nan
Fail to reject Null Hypothesis: There are no significant risk differences between zip codes.


  t_statistic, p_value = stats.ttest_ind(control_group_zipcode['TotalPremium'], test_group_zipcode['TotalPremium'])


##### Hypothesis 3: Test significant margin (profit) difference between zip codes

In [7]:
# For the purpose of hypothesis testing, let's choose 'ProfitMargin' as our key performance indicator (KPI).
df['ProfitMargin'] = df['TotalPremium'] - df['TotalClaims']

# Data Segmentation
# Let's segment the data based on the feature we want to test. 
# For example, if we want to test the impact of zip codes on ProfitMargin:
# Group A (Control Group): Plans with ZipCode = 'ZipCode_A'
control_group_zipcode = df[df['PostalCode'] == '1459']
# Group B (Test Group): Plans with ZipCode = 'ZipCode_B'
test_group_zipcode = df[df['PostalCode'] == '7784']

# Perform t-test
t_statistic, p_value = stats.ttest_ind(control_group_zipcode['ProfitMargin'], test_group_zipcode['ProfitMargin'])

# Print results
print("Hypothesis 3: Test significant margin (profit) difference between zip codes")
print("------------------------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value
if p_value < alpha:
    print("Reject Null Hypothesis: There are significant margin (profit) differences between zip codes.")
else:
    print("Fail to reject Null Hypothesis: There are no significant margin (profit) differences between zip codes.")

Hypothesis 3: Test significant margin (profit) difference between zip codes
------------------------------------------------------------------------------
T-statistic: nan
P-value: nan
Fail to reject Null Hypothesis: There are no significant margin (profit) differences between zip codes.


  t_statistic, p_value = stats.ttest_ind(control_group_zipcode['ProfitMargin'], test_group_zipcode['ProfitMargin'])


##### Hypothesis 4: Test significant margin (profit) differences between zip codes

In [9]:
# Let's segment the data based on the feature we want to test. 
# For example, if we want to test the impact of Gender on TotalPremium:
# Group A (Control Group): Plans with Gender = 'Male'
control_group_gender = df[df['Gender'] == 'Male']
# Group B (Test Group): Plans with Gender = 'Female'
test_group_gender = df[df['Gender'] == 'Female']

# Perform t-test
t_statistic, p_value = stats.ttest_ind(control_group_gender['TotalPremium'], test_group_gender['TotalPremium'])

# Print results
print("Hypothesis 4: Test significant risk differences between Women and Men")
print("-----------------------------------------------------------------------")
print(f"T-statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Analyze p-value
if p_value < alpha:
    print("Reject Null Hypothesis: There are significant risk differences between Women and Men.")
else:
    print("Fail to reject Null Hypothesis: There are no significant risk differences between Women and Men.")

Hypothesis 4: Test significant risk differences between Women and Men
-----------------------------------------------------------------------
T-statistic: -5.118420932688848
P-value: 3.0925282750010697e-07
Reject Null Hypothesis: There are significant risk differences between Women and Men.
