# Insurance Sell Prediction

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('dataset/train.csv', index_col='id')

In [5]:
df.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [6]:
X = df.drop(columns=['Response'], axis=1)

In [7]:
X.head()

Unnamed: 0_level_0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217
2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183
3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27
4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203
5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39


In [8]:
df.dtypes

Gender                   object
Age                       int64
Driving_License           int64
Region_Code             float64
Previously_Insured        int64
Vehicle_Age              object
Vehicle_Damage           object
Annual_Premium          float64
Policy_Sales_Channel    float64
Vintage                   int64
Response                  int64
dtype: object

In [9]:
print("Categories in 'Gender' variable:     ",end=" " )
print(df['Gender'].unique())

print("Categories in 'Vehicle_Age' variable:  ",end=" ")
print(df['Vehicle_Age'].unique())

print("Categories in 'Vehicle_Damage' variable:",end=" " )
print(df['Vehicle_Damage'].unique())

Categories in 'Gender' variable:      ['Male' 'Female']
Categories in 'Vehicle_Age' variable:   ['> 2 Years' '1-2 Year' '< 1 Year']
Categories in 'Vehicle_Damage' variable: ['Yes' 'No']


In [10]:
y = df['Response']

In [11]:
y

id
1         1
2         0
3         1
4         0
5         0
         ..
381105    0
381106    0
381107    0
381108    0
381109    0
Name: Response, Length: 381109, dtype: int64

In [13]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [14]:
X = preprocessor.fit_transform(X)

In [15]:
X.shape

(381109, 14)

In [16]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((304887, 14), (76222, 14))

In [20]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Define the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
    "CatBoost Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier()
}

# Function to evaluate classifier
def evaluate_classifier(true, predicted, predicted_proba=None):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    f1 = f1_score(true, predicted)
    roc_auc = roc_auc_score(true, predicted_proba) if predicted_proba is not None else None
    conf_matrix = confusion_matrix(true, predicted)
    
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc,
        'Confusion Matrix': conf_matrix
    }
    
    return metrics

# Lists to store model performance
model_list = []
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Predicted probabilities for ROC AUC (if applicable)
    y_train_proba = model.predict_proba(X_train)[:, 1] if hasattr(model, "predict_proba") else None
    y_test_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Evaluate Train dataset
    train_metrics = evaluate_classifier(y_train, y_train_pred, y_train_proba)

    # Evaluate Test dataset
    test_metrics = evaluate_classifier(y_test, y_test_pred, y_test_proba)

    print(model_name)
    model_list.append(model_name)
    
    print('Model performance for Training set')
    for metric, value in train_metrics.items():
        if metric == 'Confusion Matrix':
            print(f"- {metric}:\n{value}")
        elif value is not None:
            print(f"- {metric}: {value:.4f}")
    print('----------------------------------')
    
    print('Model performance for Test set')
    for metric, value in test_metrics.items():
        if metric == 'Confusion Matrix':
            print(f"- {metric}:\n{value}")
        elif value is not None:
            print(f"- {metric}: {value:.4f}")

    accuracy_list.append(test_metrics['Accuracy'])
    precision_list.append(test_metrics['Precision'])
    recall_list.append(test_metrics['Recall'])
    f1_list.append(test_metrics['F1 Score'])
    roc_auc_list.append(test_metrics['ROC AUC'] if test_metrics['ROC AUC'] is not None else 0)
    
    print('='*35)
    print('\n')


  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression
Model performance for Training set
- Accuracy: 0.8780
- Precision: 0.4615
- Recall: 0.0002
- F1 Score: 0.0003
- ROC AUC: 0.8359
- Confusion Matrix:
[[267693      7]
 [ 37181      6]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8751
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- ROC AUC: 0.8401
- Confusion Matrix:
[[66699     0]
 [ 9523     0]]


K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.8969
- Precision: 0.6490
- Recall: 0.3377
- F1 Score: 0.4442
- ROC AUC: 0.9191
- Confusion Matrix:
[[260909   6791]
 [ 24629  12558]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8550
- Precision: 0.3433
- Recall: 0.1764
- F1 Score: 0.2331
- ROC AUC: 0.7597
- Confusion Matrix:
[[63486  3213]
 [ 7843  1680]]


Decision Tree
Model performance for Training set
- Accuracy: 0.9999
- Precision: 1.0000
- Recall: 0.9991
- F1 Score: 0.9995
- ROC AUC: 1.0000
- Confusion Matrix:
[[267

Parameters: { "use_label_encoder" } are not used.



XGBClassifier
Model performance for Training set
- Accuracy: 0.8815
- Precision: 0.7187
- Recall: 0.0472
- F1 Score: 0.0886
- ROC AUC: 0.8792
- Confusion Matrix:
[[267013    687]
 [ 35432   1755]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8748
- Precision: 0.4833
- Recall: 0.0289
- F1 Score: 0.0545
- ROC AUC: 0.8587
- Confusion Matrix:
[[66405   294]
 [ 9248   275]]


CatBoost Classifier
Model performance for Training set
- Accuracy: 0.8830
- Precision: 0.7603
- Recall: 0.0594
- F1 Score: 0.1101
- ROC AUC: 0.8808
- Confusion Matrix:
[[267004    696]
 [ 34979   2208]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8750
- Precision: 0.4976
- Recall: 0.0322
- F1 Score: 0.0606
- ROC AUC: 0.8598
- Confusion Matrix:
[[66389   310]
 [ 9216   307]]




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoost Classifier
Model performance for Training set
- Accuracy: 0.8780
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- ROC AUC: 0.8515
- Confusion Matrix:
[[267700      0]
 [ 37187      0]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8751
- Precision: 0.0000
- Recall: 0.0000
- F1 Score: 0.0000
- ROC AUC: 0.8545
- Confusion Matrix:
[[66699     0]
 [ 9523     0]]


Gradient Boosting Classifier
Model performance for Training set
- Accuracy: 0.8780
- Precision: 1.0000
- Recall: 0.0001
- F1 Score: 0.0002
- ROC AUC: 0.8563
- Confusion Matrix:
[[267700      0]
 [ 37183      4]]
----------------------------------
Model performance for Test set
- Accuracy: 0.8751
- Precision: 1.0000
- Recall: 0.0001
- F1 Score: 0.0002
- ROC AUC: 0.8593
- Confusion Matrix:
[[66699     0]
 [ 9522     1]]




In [22]:
# Create DataFrame and sort by accuracy
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy']).sort_values(by=["Accuracy"], ascending=False)

Unnamed: 0,Model Name,Accuracy
7,Gradient Boosting Classifier,0.875075
0,Logistic Regression,0.875062
6,AdaBoost Classifier,0.875062
5,CatBoost Classifier,0.875023
4,XGBClassifier,0.874813
3,Random Forest Classifier,0.865144
1,K-Neighbors Classifier,0.85495
2,Decision Tree,0.821285
