In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Loading the Dataset

In [2]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


- id: Unique ID for the customer
- Gender: Gender of the customer
- Age: Age of the customer
- Driving_License: [0 : Customer does not have DL, 1 : Customer already has DL]
- Region_Code: Unique code for the region of the customer
- Previously_Insured: [1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance]
- Vehicle_Age: Age of the Vehicle
- Vehicle_Damage: [1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.]
- Annual_Premium: The amount customer needs to pay as premium in the year
- Policy_Sales_Channel: Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.
- Vintage: Number of Days, Customer has been associated with the company
- Response: [1 : Customer is interested, 0 : Customer is not interested]

# EDA

In [3]:
df.shape

(381109, 12)

In [4]:
df.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [6]:
df.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0
mean,190555.0,38.822584,0.997869,26.388807,0.45821,30564.389581,112.034295,154.347397,0.122563
std,110016.836208,15.511611,0.04611,13.229888,0.498251,17213.155057,54.203995,83.671304,0.327936
min,1.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,95278.0,25.0,1.0,15.0,0.0,24405.0,29.0,82.0,0.0
50%,190555.0,36.0,1.0,28.0,0.0,31669.0,133.0,154.0,0.0
75%,285832.0,49.0,1.0,35.0,1.0,39400.0,152.0,227.0,0.0
max,381109.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [7]:
df["Response"].value_counts()

Response
0    334399
1     46710
Name: count, dtype: int64

# Data Preprocessing

In [9]:
num_features = ["Age","Vintage"]
cat_features = df.drop(["Age","Vintage","Response"],axis=1).columns

In [17]:
num_features

['Age', 'Vintage']

In [18]:
cat_features

Index(['id', 'Gender', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel'],
      dtype='object')

In [10]:
df["Gender"] = df["Gender"].map({"Female":0,"Male":1}).astype(int)
df.head(2)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,1,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,1,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0


In [11]:
for col in df.columns:
    print(f"{col} -> {df[col].dtype}")

id -> int64
Gender -> int64
Age -> int64
Driving_License -> int64
Region_Code -> float64
Previously_Insured -> int64
Vehicle_Age -> object
Vehicle_Damage -> object
Annual_Premium -> float64
Policy_Sales_Channel -> float64
Vintage -> int64
Response -> int64


In [14]:
df = pd.get_dummies(df,drop_first=True).astype(int)
df.head(2)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
0,1,1,44,1,28,0,40454,26,217,1,0,1,1
1,2,1,76,1,3,0,33536,26,183,0,0,0,0


In [15]:
for col in df.columns:
    print(f"{col} -> {df[col].dtype}")

id -> int64
Gender -> int64
Age -> int64
Driving_License -> int64
Region_Code -> int64
Previously_Insured -> int64
Annual_Premium -> int64
Policy_Sales_Channel -> int64
Vintage -> int64
Response -> int64
Vehicle_Age_< 1 Year -> int64
Vehicle_Age_> 2 Years -> int64
Vehicle_Damage_Yes -> int64


In [16]:
df = df.rename(columns={"Vehicle_Age_< 1 Year":"Vehicle_Age_lt_1_Year","Vehicle_Age_> 2 Years":"Vehicle_Age_gt_2_Years"})
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Vehicle_Age_lt_1_Year,Vehicle_Age_gt_2_Years,Vehicle_Damage_Yes
0,1,1,44,1,28,0,40454,26,217,1,0,1,1
1,2,1,76,1,3,0,33536,26,183,0,0,0,0
2,3,1,47,1,28,0,38294,26,27,1,0,1,1
3,4,1,21,1,11,1,28619,152,203,0,1,0,0
4,5,0,29,1,41,1,27496,152,39,0,1,0,0


In [19]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler

ss = StandardScaler()
df[num_features] = ss.fit_transform(df[num_features])

mm = MinMaxScaler()
df[["Annual_Premium"]] = mm.fit_transform(df[["Annual_Premium"]])

In [20]:
id = df.id
df = df.drop("id",axis=1)

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
train_target = df["Response"]
train = df.drop(["Response"],axis=1)

X_train, X_test,y_train, y_test = train_test_split(train,train_target,random_state=42)

# Model Trainer

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

random_search = {
    "criterion": ["entropy","gini"],
    "max_depth":[2,3,4,5,6,7,10],
    "min_samples_leaf":[4,6,8],
    "min_samples_split":[5,7,10],
    "n_estimators":[300]
}

clf = RandomForestClassifier()

model = RandomizedSearchCV(estimator=clf,param_distributions=random_search,n_iter=10,cv=4,verbose=4,random_state=42,n_jobs=-1)
model.fit(X_train,y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


In [24]:
print(model.best_params_)
print(model.best_score_)

{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 10, 'criterion': 'gini'}
0.8777774279529266


In [25]:
model = RandomForestClassifier(n_estimators=300, min_samples_split=5, min_samples_leaf= 4, max_depth=10, criterion='gini')
model.fit(X_train,y_train)

In [26]:
import pickle

file_name = "rf_model.pkl"
pickle.dump(model,open(file_name,"wb"))

In [27]:
rf_load = pickle.load(open(file_name,"rb"))

# Model Evaluation

In [28]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93     83513
           1       0.50      0.00      0.00     11765

    accuracy                           0.88     95278
   macro avg       0.69      0.50      0.47     95278
weighted avg       0.83      0.88      0.82     95278

