# Model Training and Evaluation Notebook for Insurance Sell Prediction

## Importing all libraries

In [42]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [43]:
df = pd.read_csv('dataset/resampled_dataset.csv')

In [44]:
df.head(10)

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Female,22,1,19.0,1,< 1 Year,No,54559.0,152.0,87,0
1,Male,57,1,18.0,0,1-2 Year,Yes,33439.0,124.0,63,0
2,Female,64,1,28.0,0,1-2 Year,Yes,39327.0,122.0,45,0
3,Male,52,1,41.0,1,1-2 Year,No,26536.0,124.0,161,0
4,Female,21,1,7.0,0,< 1 Year,No,28416.0,152.0,92,0
5,Male,59,1,28.0,0,> 2 Years,Yes,31842.0,60.0,296,0
6,Female,48,1,40.0,0,1-2 Year,Yes,33565.0,26.0,202,0
7,Male,43,1,28.0,0,1-2 Year,No,38786.0,15.0,261,0
8,Male,24,1,47.0,1,< 1 Year,No,29291.0,152.0,226,0
9,Male,47,1,0.0,0,> 2 Years,Yes,2630.0,26.0,296,0


In [45]:
X = df.drop(columns=['Response'], axis=1)

In [46]:
X.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Female,22,1,19.0,1,< 1 Year,No,54559.0,152.0,87
1,Male,57,1,18.0,0,1-2 Year,Yes,33439.0,124.0,63
2,Female,64,1,28.0,0,1-2 Year,Yes,39327.0,122.0,45
3,Male,52,1,41.0,1,1-2 Year,No,26536.0,124.0,161
4,Female,21,1,7.0,0,< 1 Year,No,28416.0,152.0,92


In [47]:
df.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [48]:
print("Categories in 'Gender' variable:     ",end=" " )
print(df['Gender'].unique())

print("Categories in 'Vehicle_Age' variable:  ",end=" ")
print(df['Vehicle_Age'].unique())

print("Categories in 'Vehicle_Damage' variable:",end=" " )
print(df['Vehicle_Damage'].unique())

Categories in 'Gender' variable:      ['Female' 'Male']
Categories in 'Vehicle_Age' variable:   ['< 1 Year' '1-2 Year' '> 2 Years']
Categories in 'Vehicle_Damage' variable: ['No' 'Yes']


In [49]:
y = df['Response']

In [50]:
y

0        0
1        0
2        0
3        0
4        0
        ..
93415    1
93416    1
93417    1
93418    1
93419    1
Name: Response, Length: 93420, dtype: int64

In [51]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [52]:
X = preprocessor.fit_transform(X)

In [53]:
X.shape

(93420, 14)

In [54]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)
X_train.shape, X_test.shape

((74736, 14), (18684, 14))

In [55]:
# Define the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier()
}

# Function to evaluate classifier
def evaluate_classifier(true, predicted, predicted_proba=None):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    roc_auc = roc_auc_score(true, predicted_proba) if predicted_proba is not None else None
    conf_matrix = confusion_matrix(true, predicted)
    
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }
    
    return metrics

# Lists to store model performance
model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Predicted probabilities for ROC AUC (if applicable)
    y_train_proba = model.predict_proba(X_train)[:, 1] if hasattr(model, "predict_proba") else None
    y_test_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Evaluate Train dataset
    train_metrics = evaluate_classifier(y_train, y_train_pred, y_train_proba)

    # Evaluate Test dataset
    test_metrics = evaluate_classifier(y_test, y_test_pred, y_test_proba)

    print(model_name)
    model_list.append(model_name)
    
    print('Model performance for Training set')
    for metric, value in train_metrics.items():
        if metric == 'Confusion Matrix':
            print(f"- {metric}:\n{value}")
        elif value is not None:
            print(f"- {metric}: {value:.4f}")
    print('----------------------------------')
    
    print('Model performance for Test set')
    for metric, value in test_metrics.items():
        if metric == 'Confusion Matrix':
            print(f"- {metric}:\n{value}")
        elif value is not None:
            print(f"- {metric}: {value:.4f}")

    accuracy_list.append(test_metrics['Accuracy'])
    precision_list.append(test_metrics['Precision'])
    recall_list.append(test_metrics['Recall'])
    f1_list.append(test_metrics['F1 Score'])
    roc_auc_list.append(test_metrics['ROC AUC'] if test_metrics['ROC AUC'] is not None else 0)
    
    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy: 0.7852
- Precision: 0.7073
- Recall: 0.9738
- F1 Score: 0.8194
- ROC AUC: 0.8380
- Confusion Matrix:
[[22251 15073]
 [  982 36430]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7784
- Precision: 0.6997
- Recall: 0.9718
- F1 Score: 0.8136
- ROC AUC: 0.8351
- Confusion Matrix:
[[5507 3879]
 [ 262 9036]]


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8303
- Precision: 0.7889
- Recall: 0.9025
- F1 Score: 0.8419
- ROC AUC: 0.9148
- Confusion Matrix:
[[28289  9035]
 [ 3648 33764]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7548
- Precision: 0.7203
- Recall: 0.8294
- F1 Score: 0.7710
- ROC AUC: 0.8089
- Confusion Matrix:
[[6391 2995]
 [1586 7712]]


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- Precision: 1.0000
- Recall: 0.9999
- F1 Score: 1.0000
- ROC AUC: 1.0000
- Confusion Matrix:
[[37324     0]
 [    

Parameters: { "use_label_encoder" } are not used.



XGBClassifier
Model performance for Training set
- Accuracy: 0.8264
- Precision: 0.7642
- Recall: 0.9449
- F1 Score: 0.8450
- ROC AUC: 0.8962
- Confusion Matrix:
[[26415 10909]
 [ 2062 35350]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7905
- Precision: 0.7326
- Recall: 0.9120
- F1 Score: 0.8125
- ROC AUC: 0.8505
- Confusion Matrix:
[[6290 3096]
 [ 818 8480]]


CatBoost Classifier
Model performance for Training set
- Accuracy: 0.8223
- Precision: 0.7602
- Recall: 0.9423
- F1 Score: 0.8415
- ROC AUC: 0.8907
- Confusion Matrix:
[[26203 11121]
 [ 2160 35252]]
----------------------------------
Model performance for Test set
- Accuracy: 0.7936
- Precision: 0.7347
- Recall: 0.9160
- F1 Score: 0.8154
- ROC AUC: 0.8540
- Confusion Matrix:
[[6310 3076]
 [ 781 8517]]


AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.7984
- Precision: 0.7330
- Recall: 0.9394
- F1 Score: 0.8234
- ROC AUC: 0.8551
- Confusion Matrix:
[[24522 12802]
 [ 2268 3

In [56]:
# Create DataFrame and sort by accuracy
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=["Accuracy"], ascending=False)

Unnamed: 0,Model Name,Accuracy
5,CatBoost Classifier,0.793567
7,Gradient Boosting Classifier,0.793353
6,AdaBoost Classifier,0.791426
4,XGBClassifier,0.790516
0,Logistic Regression,0.778367
3,Random Forest Classifier,0.778099
1,K-Neighbors Classifier,0.754817
2,Decision Tree,0.711946
