In [1]:
import sys
import numpy as np
import os
import pandas as pd
from scipy.stats import ttest_ind, chi2_contingency
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score

# Project locations used by the rest of the notebook.
# NOTE(review): these are machine-specific absolute paths; anyone running this
# elsewhere has to edit them here.
data_dir = r'E:\code\WEEK3\IINSURANCE_ANALAYSIS\data'
src_dir = r'E:\code\WEEK3\IINSURANCE_ANALAYSIS\src'

# Register both directories on the import path so the project modules imported
# right after this can be found. Paths already present are skipped, so
# re-running the cell does not keep growing sys.path.
for _extra_path in (src_dir, data_dir):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)
from Modeling_utils import InsuranceModeling 
from futureutils  import InsuranceDataUtils

In [2]:
# Load the cleaned dataset from the project's data directory.
# The path is built from `data_dir` (set in the first cell) so the data
# location is configured in exactly one place.
csv_file_path = os.path.join(data_dir, 'cleaned_data.csv')
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on wide files.
df = pd.read_csv(csv_file_path, low_memory=False)
df.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [3]:
# Count nulls per column, then report only the columns that have at least one.
null_counts = df.isna().sum()
missing_values = null_counts[null_counts.gt(0)]
print(missing_values)

Series([], dtype: int64)


In [4]:
# Feature engineering on the cleaned data, with TotalPremium as the target.
# NOTE(review): the helpers below come from futureutils.InsuranceDataUtils,
# which is not visible here. Judging by the preview columns they appear to add
# TransactionYear/TransactionMonthOnly, VehicleAge, ProvinceZone and the
# Log* columns -- confirm against the module.
utils = InsuranceDataUtils(df, target_column='TotalPremium')
utils.add_date_features()
utils.add_vehicle_age()
utils.combine_province_zone()
utils.apply_log_transformation()

# Keep only the features selected for modelling (fetched once).
selected_features_df = utils.get_selected_features_df()

# Preview the first rows of the selected-feature frame.
selected_features_df.head()

       LogSumInsured  LogCapitalOutstanding
count   1.000098e+06           1.000098e+06
mean    9.412743e+00           2.550096e+00
std     4.009265e+00           5.008409e+00
min     9.950331e-03           9.999950e-06
25%     8.517393e+00           9.999950e-06
50%     8.922792e+00           9.999950e-06
75%     1.242922e+01           9.999950e-06
max     1.635208e+01           1.345884e+01


Unnamed: 0,TransactionYear,TransactionMonthOnly,VehicleAge,ProvinceZone,LogSumInsured,LogCapitalOutstanding,TotalPremium,TotalClaims
0,2015,3,20,Gauteng_Rand East,0.00995,11.689405,21.929825,0.0
1,2015,5,20,Gauteng_Rand East,0.00995,11.689405,21.929825,0.0
2,2015,7,20,Gauteng_Rand East,0.00995,11.689405,0.0,0.0
3,2015,5,20,Gauteng_Rand East,11.689405,11.689405,512.84807,0.0
4,2015,7,20,Gauteng_Rand East,11.689405,11.689405,0.0,0.0


In [5]:
# Preprocess the selected features and split them for the TotalPremium target.
encode_premium = InsuranceDataUtils(selected_features_df, target_column='TotalPremium')
encode_premium.preprocess_features()
encode_premium.split_data()
preprocessed_df = encode_premium.get_preprocessed_df()

# Train/test partitions for TotalPremium, consumed later by the modelling step.
X_train_premium, X_test_premium, y_train_premium, y_test_premium = encode_premium.get_train_test_data()

# Kept as the last expression so the notebook actually renders the preview;
# a bare `.head()` in the middle of a cell is evaluated and then discarded.
preprocessed_df.head()

Starting feature preprocessing...
Preprocessing successful.
DataFrame created successfully with shape: (1000098, 27)
Training data shape for TotalPremium: (800078, 26)
Testing data shape for TotalPremium: (200020, 26)


In [6]:
# Preprocess the selected features and split them for the TotalClaims target.
encode_claims = InsuranceDataUtils(selected_features_df, target_column='TotalClaims')
encode_claims.preprocess_features()
encode_claims.split_data()
# NOTE: this rebinds `preprocessed_df`, replacing the TotalPremium frame from
# the previous cell.
preprocessed_df = encode_claims.get_preprocessed_df()

# Train/test partitions for TotalClaims, consumed later by the modelling step.
X_train_claims, X_test_claims, y_train_claims, y_test_claims = encode_claims.get_train_test_data()

# Kept as the last expression so the notebook actually renders the preview;
# a bare `.head()` in the middle of a cell is evaluated and then discarded.
preprocessed_df.head()

Starting feature preprocessing...
Preprocessing successful.
DataFrame created successfully with shape: (1000098, 27)
Training data shape for TotalClaims: (800078, 26)
Testing data shape for TotalClaims: (200020, 26)


In [7]:
# Report the train/test feature-matrix shapes for each target's encoder.
for _header, _encoder in (
    ("Data split for TotalPremium:", encode_premium),
    ("Data split for TotalClaims:", encode_claims),
):
    print(_header)
    print("Training set shape:", _encoder.X_train.shape)
    print("Testing set shape:", _encoder.X_test.shape)


Data split for TotalPremium:
Training set shape: (800078, 26)
Testing set shape: (200020, 26)
Data split for TotalClaims:
Training set shape: (800078, 26)
Testing set shape: (200020, 26)


In [8]:
# Group each target's train/test partitions, then pass them positionally to
# the modelling wrapper: the premium split first, the claims split second.
premium_split = (X_train_premium, X_test_premium, y_train_premium, y_test_premium)
claims_split = (X_train_claims, X_test_claims, y_train_claims, y_test_claims)
modeling = InsuranceModeling(*premium_split, *claims_split)


In [9]:
# Fit the models, then print their error metrics. Order matters: evaluation
# needs the fitted models. Per the saved output, this trains Linear Regression,
# Random Forest, XGBoost and Decision Tree for both TotalPremium and
# TotalClaims, and reports RMSE and MAE for each.
# NOTE(review): which data the metrics are computed on (presumably the test
# split) is decided inside Modeling_utils -- confirm there.
modeling.train_models()
modeling.evaluate_models()



Training Linear Regression for TotalPremium...
Training Linear Regression for TotalClaims...
Training Random Forest for TotalPremium...
Training Random Forest for TotalClaims...
Training XGBoost for TotalPremium...
Training XGBoost for TotalClaims...
Training Decision Tree for TotalPremium...
Training Decision Tree for TotalClaims...

Evaluation for TotalPremium:
Linear Regression: RMSE = 0.70, MAE = 0.40
Random Forest: RMSE = 0.41, MAE = 0.17
XGBoost: RMSE = 0.47, MAE = 0.22
Decision Tree: RMSE = 0.41, MAE = 0.17

Evaluation for TotalClaims:
Linear Regression: RMSE = 0.92, MAE = 0.06
Random Forest: RMSE = 1.02, MAE = 0.05
XGBoost: RMSE = 0.97, MAE = 0.06
Decision Tree: RMSE = 1.06, MAE = 0.06


In [10]:
# Print per-model feature importances for both targets. Requires `modeling`
# from the cell that constructs InsuranceModeling, with the models already trained.
# NOTE(review): the saved output ends with "NameError: name 'modeling' is not
# defined", so this cell was at some point run on a kernel where `modeling`
# did not exist -- re-check with Restart Kernel -> Run All.
# NOTE(review): in the saved output the Random Forest importances for
# TotalClaims match the TotalPremium ones in every visible row; verify that
# feature_importance_analysis reads the claims model rather than reusing the
# premium one. Features are also reported by index, not by column name.
modeling.feature_importance_analysis()
# Optional LIME explanations, currently disabled; uncomment one call to explain
# the corresponding model.
#modeling.lime_analysis('Linear Regression')
#modeling.lime_analysis('XGBoost')  # Replace with the desired model name
#modeling.lime_analysis('Decision Tree')
#modeling.lime_analysis('Random Forest')



Feature Importance for Linear Regression (TotalPremium):
Feature importance not available for this model.

Feature Importance for Linear Regression (TotalClaims):
Feature importance not available for this model.

Feature Importance for Random Forest (TotalPremium):
Feature 0: 0.0712
Feature 1: 0.2629
Feature 2: 0.0754
Feature 3: 0.5051
Feature 4: 0.0000
Feature 5: 0.0000
Feature 6: 0.0007
Feature 7: 0.0000
Feature 8: 0.0004
Feature 9: 0.0045
Feature 10: 0.0009
Feature 11: 0.0115
Feature 12: 0.0047
Feature 13: 0.0100
Feature 14: 0.0054
Feature 15: 0.0069
Feature 16: 0.0084
Feature 17: 0.0009
Feature 18: 0.0047
Feature 19: 0.0071
Feature 20: 0.0001
Feature 21: 0.0000
Feature 22: 0.0149
Feature 23: 0.0014
Feature 24: 0.0027
Feature 25: 0.0000

Feature Importance for Random Forest (TotalClaims):
Feature 0: 0.0712
Feature 1: 0.2629
Feature 2: 0.0754
Feature 3: 0.5051
Feature 4: 0.0000
Feature 5: 0.0000
Feature 6: 0.0007
Feature 7: 0.0000
Feature 8: 0.0004
Feature 9: 0.0045
Feature 10: 0.00

NameError: name 'modeling' is not defined