In [1]:
import numpy
import pandas as pd
from sklearn import preprocessing 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
label_encoder = preprocessing.LabelEncoder() 
df = pd.read_csv('Data.csv')
pd.set_option('display.max_columns', None)
# Split max_torque and max_power into separate columns
df[['torque_value', 'torque_rpm']] = df['max_torque'].str.split('@', expand=True)
df[['power_value', 'power_rpm']] = df['max_power'].str.split('@', expand=True)

# Remove units and convert to numeric
df['torque_value'] = df['torque_value'].str.replace('Nm', '').astype(float)
df['torque_rpm'] = df['torque_rpm'].str.replace('rpm', '').astype(int)
df['power_value'] = df['power_value'].str.replace('bhp', '').astype(float)
df['power_rpm'] = df['power_rpm'].str.replace('rpm', '').astype(int)
df_proc = df
display(df.head(2))


Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,torque_value,torque_rpm,power_value,power_rpm
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0,60.0,3500,40.36,6000
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,No,No,No,Yes,No,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,No,No,No,No,No,No,No,Yes,No,No,No,Yes,0,0,60.0,3500,40.36,6000


In [2]:
df.dtypes

policy_id                            object
policy_tenure                       float64
age_of_car                          float64
age_of_policyholder                 float64
area_cluster                         object
population_density                    int64
make                                  int64
segment                              object
model                                object
fuel_type                            object
max_torque                           object
max_power                            object
engine_type                          object
airbags                               int64
is_esc                               object
is_adjustable_steering               object
is_tpms                              object
is_parking_sensors                   object
is_parking_camera                    object
rear_brakes_type                     object
displacement                          int64
cylinder                              int64
transmission_type               

In [3]:
#Creating new feature columns by combining one or more columns
df_proc["policy_age_at_claim"] = df_proc["policy_tenure"] * 12 #Policy Age at Claim Time
df_proc["car_age_at_claim"] = df_proc["age_of_car"] + df_proc["policy_tenure"] # Car Age at Claim Time

#Modifying the Boolean fields to 0 or 1
df_proc["is_speed_alert"].replace({'No': 0, 'Yes': 1}, inplace=True)
df_proc["is_tpms"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_esc"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_parking_sensors"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_parking_camera"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_brake_assist"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_adjustable_steering"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_ecw"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_day_night_rear_view_mirror"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_driver_seat_height_adjustable"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_power_steering"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_central_locking"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_power_door_locks"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_rear_window_defogger"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_rear_window_washer"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_rear_window_wiper"].replace({'No': 0, 'Yes': 1},inplace=True)
df_proc["is_front_fog_lights"].replace({'No': 0, 'Yes': 1},inplace=True)

df_proc["risk_factor"] = (df_proc["age_of_policyholder"] / df_proc["policy_tenure"]) * df_proc["is_speed_alert"].astype(int)
df_proc["vehicle_safety_index"] = df_proc["ncap_rating"] + df_proc["is_esc"] + df_proc["is_tpms"]
df_proc["advanced_sefety_features"] = df_proc["is_esc"] + df_proc["is_parking_sensors"] + df_proc["is_brake_assist"]
df_proc["safety_score"] = df_proc["is_esc"] + df_proc["is_parking_sensors"] + df_proc["is_brake_assist"] + df_proc["is_tpms"] + df_proc["is_parking_camera"]
df_proc["car_type"] = df_proc["segment"].astype(str) +"_"+ df_proc["make"].astype(str) +"_"+ df_proc["model"].astype(str)
df_proc["car_usage"] = df_proc["segment"].astype(str) + "_" + df_proc["fuel_type"].astype(str)
df_proc["performance_index"] = df_proc["torque_value"] + df_proc["torque_rpm"] + df_proc["power_value"] + df_proc["power_rpm"]
df_proc['policyholder_risk'] = df_proc['age_of_policyholder'] / df_proc['policy_tenure']

# Using Label encoding to convert categorical values to numerical values based on the field type
df_proc["model_en"] = label_encoder.fit_transform(df_proc["model"])
df_proc["segment_en"] = label_encoder.fit_transform(df_proc["segment"])
df_proc["make_en"] = label_encoder.fit_transform(df_proc["make"])
df_proc["fuel_type_en"] = label_encoder.fit_transform(df_proc["fuel_type"])
df_proc["car_type_encode"] = label_encoder.fit_transform(df_proc["car_type"])
df_proc["car_usage_encode"] = label_encoder.fit_transform(df_proc["car_usage"])
df_final = df_proc
display(df_final.head(2))


Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,max_power,engine_type,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,rear_brakes_type,displacement,cylinder,transmission_type,gear_box,steering_type,turning_radius,length,width,height,gross_weight,is_front_fog_lights,is_rear_window_wiper,is_rear_window_washer,is_rear_window_defogger,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim,torque_value,torque_rpm,power_value,power_rpm,policy_age_at_claim,car_age_at_claim,risk_factor,vehicle_safety_index,advanced_sefety_features,safety_score,car_type,car_usage,performance_index,policyholder_risk,model_en,segment_en,make_en,fuel_type_en,car_type_encode,car_usage_encode
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0,60.0,3500,40.36,6000,6.190483,0.565874,1.248815,0,1,1,A_1_M1,A_CNG,9600.36,1.248815,0,0,0,0,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,60Nm@3500rpm,40.36bhp@6000rpm,F8D Petrol Engine,2,0,0,0,1,0,Drum,796,3,Manual,5,Power,4.6,3445,1515,1475,1185,0,0,0,0,0,0,0,1,0,0,0,1,0,0,60.0,3500,40.36,6000,8.071422,0.692619,0.557523,0,1,1,A_1_M1,A_CNG,9600.36,0.557523,0,0,0,0,0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix


df_final = df_final.fillna(0)
#display(df_final.head(2))
X = df_final.drop(['is_claim','policy_id','area_cluster','segment','model','fuel_type','max_torque','max_power','engine_type'
                  ,'rear_brakes_type','transmission_type','steering_type','car_type','car_usage'],axis=1)

#X = df_final.drop(string_cols,axis=1,inplace=True)
y = df_final["is_claim"]

#print(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=900, solver='lbfgs', C=0.1, tol=1e-4)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
# Comparing actual and predicted values percentage
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
category_counts = comparison['Actual'].value_counts().add(
    comparison['Predicted'].value_counts(), fill_value=0
)
total_count = len(comparison)
percentages = (category_counts / total_count) * 100

print(percentages)

0    193.548938
1      6.451062
Name: count, dtype: float64


In [5]:
#Evaluate Logistic Regression Model Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.9354893762266405
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC AUC Score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
#Trying with One Hot encoding technique to check the model evaluation
from sklearn.model_selection import train_test_split

# Create new features
df_proc["policy_age_at_claim"] = df_proc["policy_tenure"] * 12  # Policy Age at Claim Time
df_proc["car_age_at_claim"] = df_proc["age_of_car"] + df_proc["policy_tenure"]  # Car Age at Claim Time
df_proc["risk_factor"] = (df_proc["age_of_policyholder"] / df_proc["policy_tenure"]) * df_proc["is_speed_alert"].astype(int)
df_proc["vehicle_safety_index"] = df_proc["ncap_rating"] + df_proc["is_esc"].astype(int) + df_proc["is_tpms"].astype(int)
df_proc["advanced_sefety_features"] = df_proc["is_esc"].astype(int) + df_proc["is_parking_sensors"].astype(int) + df_proc["is_brake_assist"].astype(int)
df_proc["safety_score"] = df_proc["is_esc"].astype(int) + df_proc["is_parking_sensors"].astype(int) + df_proc["is_brake_assist"].astype(int) + df_proc["is_tpms"].astype(int) + df_proc["is_parking_camera"].astype(int)
df_proc["car_type"] = df_proc["segment"].astype(str) + "_" + df_proc["make"].astype(str) + "_" + df_proc["model"].astype(str)
df_proc["car_usage"] = df_proc["segment"].astype(str) + "_" + df_proc["fuel_type"].astype(str)
df_proc["performance_index"] = df_proc["torque_value"] + df_proc["torque_rpm"] + df_proc["power_value"] + df_proc["power_rpm"]
df_proc['policyholder_risk'] = df_proc['age_of_policyholder'] / df_proc['policy_tenure']

# Replace 'Yes'/'No' with 1/0 in boolean fields
bool_fields = [
    "is_speed_alert", "is_tpms", "is_esc", "is_parking_sensors", "is_parking_camera",
    "is_brake_assist", "is_adjustable_steering", "is_ecw", "is_day_night_rear_view_mirror",
    "is_driver_seat_height_adjustable", "is_power_steering", "is_central_locking",
    "is_power_door_locks", "is_rear_window_defogger", "is_rear_window_washer",
    "is_rear_window_wiper", "is_front_fog_lights"
]

for field in bool_fields:
    df_proc[field].replace({'No': 0, 'Yes': 1}, inplace=True)

# Apply one-hot encoding to categorical variables
categorical_features = [
    'area_cluster', 'segment', 'model', 'fuel_type', 
    'engine_type', 'rear_brakes_type', 'transmission_type', 'steering_type'
]

df_encoded = pd.get_dummies(df_proc, columns=categorical_features, drop_first=True)

# Define feature set and target variable
X = df_encoded.drop(columns=['is_claim', 'policy_id','max_torque','max_power','car_type','car_usage'])  # Exclude 'policy_id' if it's not a feature
y = df_encoded['is_claim']

#display(X.head(2))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.9354893762266405
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC AUC Score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# Comparing actual and predicted values percentage
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
category_counts = comparison['Actual'].value_counts().add(
    comparison['Predicted'].value_counts(), fill_value=0
)
total_count = len(comparison)
percentages = (category_counts / total_count) * 100

print(percentages)

0    193.548938
1      6.451062
Name: count, dtype: float64


In [8]:
#Since Using the Logisticregression weight=balanced feature. 
print(y_train.value_counts())


from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1800, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)



is_claim
0    43881
1     2992
Name: count, dtype: int64


In [12]:
# Evaluate model performance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'ROC AUC Score: {roc_auc}')

Accuracy: 0.5410017919617714
Precision: 0.08010899182561308
Recall: 0.5833333333333334
F1 Score: 0.14087206516530906
ROC AUC Score: 0.5607079874730153


In [15]:
#Using the RandomForestClassifier method
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

model_rf.fit(X_train_resampled, y_train_resampled)

# Predict on test data
y_pred_rf = model_rf.predict(X_test)

# Evaluate model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_rf)

print(f'Accuracy: {accuracy_rf}')
print(f'Precision: {precision_rf}')
print(f'Recall: {recall_rf}')
print(f'F1 Score: {f1_rf}')
print(f'ROC AUC Score: {roc_auc_rf}')


Accuracy: 0.8281423329635634
Precision: 0.09261658031088082
Recall: 0.18915343915343916
F1 Score: 0.12434782608695649
ROC AUC Score: 0.5306799759846371


In [16]:
#Hyperparameter Tuning with XGBoost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Define the model
xgb_model = XGBClassifier(scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train), random_state=42)

# Define parameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and estimator
best_xgb_model = grid_search.best_estimator_

# Predict with the best estimator
y_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_xgb)

print(f'Accuracy: {accuracy_xgb}')
print(f'Precision: {precision_xgb}')
print(f'Recall: {recall_xgb}')
print(f'F1 Score: {f1_xgb}')
print(f'ROC AUC Score: {roc_auc_xgb}')


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Accuracy: 0.84776858093694
Precision: 0.10763358778625955
Recall: 0.1865079365079365
F1 Score: 0.13649564375605036
ROC AUC Score: 0.5399382699961921


In [17]:
import sklearn
print(sklearn.__version__)
import pandas as pd
print("Pandas version:", pd.__version__)

import imblearn
print("Imbalanced-learn version:", imblearn.__version__)


1.1.3
Pandas version: 2.1.4
Imbalanced-learn version: 0.11.0
