# Importing Libraries

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
from sklearn.metrics import accuracy_score, f1_score, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [18]:
# Load the labelled training data (it carries the Response column used as the
# target later); the trailing expression displays (rows, columns).
train = pd.read_csv('train.csv')
train.shape

(381109, 12)

In [30]:
# Load the test data; the trailing expression displays (rows, columns).
# NOTE(review): the shape output shows one column fewer than train —
# presumably the missing Response label; confirm against the file.
test = pd.read_csv('test.csv')
test.shape

(127037, 11)

In [19]:
# Preview the first five rows to see how each raw column is represented.
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [20]:
# Check column dtypes and non-null counts to find the object (string) columns
# that need encoding before modelling.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [21]:
# Count missing values per column to decide whether imputation is needed.
train.isna().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

# Data Preprocessing
* One-hot encode the categorical features (`Gender`, `Vehicle_Damage`, `Vehicle_Age`)
* Divide the numerical `Age` feature into bins
* Drop the unnecessary columns (`id`, `Vintage`)

In [22]:
# List the distinct values to see whether the column is already a 0/1 flag.
train['Driving_License'].unique()

array([1, 0])

In [23]:
# List the distinct values to see whether the column is already a 0/1 flag.
train['Previously_Insured'].unique()

array([0, 1])

In [24]:
# List the distinct region codes to gauge the column's cardinality.
train['Region_Code'].unique()

array([28.,  3., 11., 41., 33.,  6., 35., 50., 15., 45.,  8., 36., 30.,
       26., 16., 47., 48., 19., 39., 23., 37.,  5., 17.,  2.,  7., 29.,
       46., 27., 25., 13., 18., 20., 49., 22., 44.,  0.,  9., 31., 12.,
       34., 21., 10., 14., 38., 24., 40., 43., 32.,  4., 51., 42.,  1.,
       52.])

In [25]:
# List the distinct labels of this string column; data_prep one-hot encodes it.
train['Vehicle_Age'].unique()

array(['> 2 Years', '1-2 Year', '< 1 Year'], dtype=object)

In [26]:
# List the distinct sales-channel codes to gauge the column's cardinality.
train['Policy_Sales_Channel'].unique()

array([ 26., 152., 160., 124.,  14.,  13.,  30., 156., 163., 157., 122.,
        19.,  22.,  15., 154.,  16.,  52., 155.,  11., 151., 125.,  25.,
        61.,   1.,  86.,  31., 150.,  23.,  60.,  21., 121.,   3., 139.,
        12.,  29.,  55.,   7.,  47., 127., 153.,  78., 158.,  89.,  32.,
         8.,  10., 120.,  65.,   4.,  42.,  83., 136.,  24.,  18.,  56.,
        48., 106.,  54.,  93., 116.,  91.,  45.,   9., 145., 147.,  44.,
       109.,  37., 140., 107., 128., 131., 114., 118., 159., 119., 105.,
       135.,  62., 138., 129.,  88.,  92., 111., 113.,  73.,  36.,  28.,
        35.,  59.,  53., 148., 133., 108.,  64.,  39.,  94., 132.,  46.,
        81., 103.,  90.,  51.,  27., 146.,  63.,  96.,  40.,  66., 100.,
        95., 123.,  98.,  75.,  69., 130., 134.,  49.,  97.,  38.,  17.,
       110.,  80.,  71., 117.,  58.,  20.,  76., 104.,  87.,  84., 137.,
       126.,  68.,  67., 101., 115.,  57.,  82.,  79., 112.,  99.,  70.,
         2.,  34.,  33.,  74., 102., 149.,  43.,   

In [27]:
# List the distinct labels of this string column; data_prep one-hot encodes it.
train['Vehicle_Damage'].unique()

array(['Yes', 'No'], dtype=object)

Possible extension: `Annual_Premium` could also be divided into bins, in the same way as `Age`.

In [29]:
def data_prep(data, age_bins=(0, 29, 35, 50, 100)):
    """Return a model-ready copy of a raw insurance frame.

    The input frame is not modified. Three steps are applied:

    1. Drop the ``id`` and ``Vintage`` columns.
    2. One-hot encode ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` as
       integer 0/1 columns. The first level of each (in sorted order) is
       dropped as the reference category.
    3. Replace ``Age`` with the integer code of its age band.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least ``id``, ``Vintage``, ``Age``,
        ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age``.
    age_bins : sequence of numbers, optional
        Monotonically increasing bin edges for ``Age``. Bands are
        right-inclusive, so the default yields (0, 29], (29, 35], (35, 50]
        and (50, 100], encoded as 0..3.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. Non-encoded columns keep their order; dummy
        columns are appended after them.

    Raises
    ------
    KeyError
        If any of the required columns is missing.

    Notes
    -----
    * Ages outside ``(age_bins[0], age_bins[-1]]`` and missing ages receive
      the code ``-1``.
    * Dummy columns are derived from the values present in ``data``, so two
      frames whose category sets differ will come back with different columns.
    """
    # `columns=` already implies the column axis; no `axis` argument is needed.
    data = data.drop(columns=['id', 'Vintage'])

    cols = ['Gender', 'Vehicle_Damage', 'Vehicle_Age']
    # Pin the dummy dtype: pandas >= 2.0 would otherwise emit bool columns,
    # while older versions emit uint8. An explicit int keeps the encoded
    # frame identical across versions.
    data = pd.get_dummies(data, columns=cols, drop_first=True, dtype=int)

    # Bin ages and keep only the ordinal band code (0 = youngest band).
    data['Age'] = pd.cut(data['Age'], bins=list(age_bins)).cat.codes
    return data

In [31]:
# Run the identical preprocessing over both raw frames (train first, then test).
train_dum, test_dum = (data_prep(frame) for frame in (train, test))

In [32]:
# Sanity-check the prepared training frame: Age should now be a bin code and
# the three categorical columns should be replaced by dummy columns.
train_dum.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Response,Gender_Male,Vehicle_Damage_Yes,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years
0,2,1,28.0,0,40454.0,26.0,1,1,1,0,1
1,3,1,3.0,0,33536.0,26.0,0,1,0,0,0
2,2,1,28.0,0,38294.0,26.0,1,1,1,0,1
3,0,1,11.0,1,28619.0,152.0,0,1,0,1,0
4,0,1,41.0,1,27496.0,152.0,0,0,0,1,0


In [33]:
# Sanity-check the prepared test frame; its feature columns should match the
# training frame's (minus Response).
test_dum.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Gender_Male,Vehicle_Damage_Yes,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years
0,0,1,11.0,1,35786.0,152.0,1,0,1,0
1,2,1,28.0,0,33762.0,7.0,1,1,0,0
2,2,1,28.0,0,40050.0,124.0,1,1,0,0
3,0,1,27.0,1,37356.0,152.0,1,1,1,0
4,0,1,28.0,1,59097.0,152.0,1,0,1,0


# Train-Validation Split

In [34]:
# Separate the predictors from the label column of the prepared training frame.
inputs = train_dum.drop(columns='Response')
targets = train_dum['Response']

# Train and test are one-hot encoded independently, so a category that is
# absent from one file would leave the two frames with different columns.
# Re-index the test features onto the training layout: columns unknown to
# training are dropped, and columns missing from test are added as 0
# ("category not present"). When the columns already match, this is just a copy.
# NOTE(review): fill_value=0 is only meaningful for dummy columns — a missing
# numeric column would also be zero-filled; confirm that cannot happen.
x_test = test_dum.reindex(columns=inputs.columns, fill_value=0)

In [35]:
from sklearn.model_selection import StratifiedKFold

In [41]:
# Hold out one stratified fold (about 10% of the rows) for validation.
# Shuffling with a fixed seed keeps the split reproducible and independent of
# the row order in train.csv.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The scaling step consumes a single x_train / x_val pair, so take one fold
# explicitly. Looping over all ten folds only overwrote these variables on
# every pass and kept whichever fold happened to run last.
train_idx, val_idx = next(skf.split(inputs, targets))
x_train, x_val = inputs.iloc[train_idx], inputs.iloc[val_idx]
y_train, y_val = targets.iloc[train_idx], targets.iloc[val_idx]

# Feature Scaling (Min-Max Normalization)

In [40]:
from sklearn.preprocessing import MinMaxScaler

In [42]:
# Learn each feature's min/max from the training fold only, then apply that
# same rescaling to every split so no validation/test statistics enter the fit.
scaler = MinMaxScaler().fit(x_train)

x_train_scaled, x_val_scaled, x_test_scaled = (
    scaler.transform(part) for part in (x_train, x_val, x_test)
)