# Feature Engineering

In [1]:
# Standard-library, third-party and project-local imports used by the notebook.
import sys, os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# Put ../utils on the import path so the local `utility` module can be resolved.
sys.path.append(os.path.abspath(os.path.join('..', 'utils')))
from utility import MyLabelEncoder

import warnings
# Silences every warning for the rest of the session; this also hides pandas
# warnings about assigning into frames, so problems of that kind go unnoticed.
warnings.filterwarnings('ignore')


In [2]:
# Read the raw train and test CSV files into DataFrames, then display the
# (rows, columns) shape of the training frame.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)
train_data.shape

(381109, 12)

In [3]:
# Display the (rows, columns) shape of the training frame; this repeats the
# check at the end of the loading cell.
train_data.shape

(381109, 12)

In [4]:
# Preview the first rows of the raw training data (before any encoding).
train_data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [5]:
# Remove the `id` column from both splits. The frames are rebound to the
# result of `drop` rather than being modified in place.
train_data = train_data.drop(columns='id')
test_data = test_data.drop(columns='id')

In [6]:
# Collect the object-dtype (string) columns of the training frame; these are
# the columns handed to the label encoder.
cat_attribs = [
    attr for attr in train_data.columns
    if train_data[attr].dtype == np.dtype('O')
]
enc = MyLabelEncoder(cat_attribs)

# Fit the encoder on the training split only and reuse the fitted encoder for
# the test split. Re-fitting on the test split would learn a second, independent
# mapping, so the same category could receive different codes in each split.
train_data = enc.fit_transform(train_data)
# NOTE(review): assumes MyLabelEncoder exposes a scikit-learn style `transform`
# that applies the mapping learned in `fit_transform` - confirm in utils/utility.py.
test_data = enc.transform(test_data)

In [7]:
# Preview the training data again after dropping `id` and label-encoding the
# string columns.
train_data.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,44,1,28.0,0,2,1,40454.0,26.0,217,1
1,1,76,1,3.0,0,0,0,33536.0,26.0,183,0
2,1,47,1,28.0,0,2,1,38294.0,26.0,27,1
3,1,21,1,11.0,1,1,0,28619.0,152.0,203,0
4,0,29,1,41.0,1,1,0,27496.0,152.0,39,0


In [8]:
!pip install imbalanced-learn



In [9]:
# Separate the predictor columns from the `Response` target.
X = train_data.drop(columns='Response')
y = train_data['Response']

# Hold out 20% of the rows for evaluation.
# - random_state fixes the shuffle, so Restart & Run All reproduces the same
#   split (and therefore the same exported CSV files).
# - stratify=y keeps the class proportions of `Response` the same in both
#   parts, which matters because the classes are resampled later on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=42
)

In [10]:
# Column groups for the feature matrix.
# `cat_attribs` rebinds the name used earlier for the label-encoder columns.
cat_attribs = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
]
# `num_attribs` lists the columns that get standardised before export.
num_attribs = [
    'Age',
    'Annual_Premium',
    'Vintage',
    'Region_Code',
    'Policy_Sales_Channel',
]

### Oversampling

In [11]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate minority-class rows of the training split until the
# classes are balanced. Only the training split is resampled.
# random_state is fixed so the resampled set is the same on every run.
over_sampler = RandomOverSampler(random_state=42)
X_train_rs, y_train_rs = over_sampler.fit_resample(X_train, y_train)

In [12]:
# --- Training table (oversampled) -------------------------------------------
# `assign` returns a new frame, so X_train_rs itself is left untouched and
# re-running this cell does not scale already-scaled values.
train_set_os = X_train_rs.assign(Response=y_train_rs)

# Standardise the numeric columns; the scaler is fitted on the training table only.
scaler = StandardScaler()
train_set_os[num_attribs] = scaler.fit_transform(train_set_os[num_attribs])
train_set_os.to_csv('../Data/ProcessedData/train_set_os.csv', index=False)

# --- Hold-out table ---------------------------------------------------------
# Select the hold-out rows afresh from the unscaled feature matrix `X` instead
# of aliasing `X_test`. Assigning into an alias of `X_test` would add a
# `Response` column to it and scale it in place, so any later cell that scales
# `X_test` again would produce doubly-scaled features.
test_set_os = X.loc[y_test.index].assign(Response=y_test)
test_set_os[num_attribs] = scaler.transform(test_set_os[num_attribs])
test_set_os.to_csv('../Data/ProcessedData/test_set_os.csv', index=False)


### Undersampling

In [13]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows of the training split until the classes
# are balanced. Only the training split is resampled.
# random_state is fixed so the same rows are kept on every run.
under_sampler = RandomUnderSampler(random_state=42)
X_train_rs, y_train_rs = under_sampler.fit_resample(X_train, y_train)

In [14]:
# --- Training table (undersampled) ------------------------------------------
# `assign` returns a new frame, so X_train_rs itself is left untouched and
# re-running this cell does not scale already-scaled values.
train_set_us = X_train_rs.assign(Response=y_train_rs)

# Standardise the numeric columns; the scaler is fitted on the training table only.
scaler = StandardScaler()
train_set_us[num_attribs] = scaler.fit_transform(train_set_us[num_attribs])
train_set_us.to_csv('../Data/ProcessedData/train_set_us.csv', index=False)

# --- Hold-out table ---------------------------------------------------------
# Select the hold-out rows afresh from the unscaled feature matrix `X` instead
# of aliasing `X_test`. `X_test` may already have been modified in place
# (extra `Response` column, scaled numeric columns) earlier in the session;
# scaling it a second time would write wrong feature values to disk.
test_set_us = X.loc[y_test.index].assign(Response=y_test)
test_set_us[num_attribs] = scaler.transform(test_set_us[num_attribs])
test_set_us.to_csv('../Data/ProcessedData/test_set_us.csv', index=False)


### SMOTE

In [15]:
from imblearn.over_sampling import SMOTE

# Balance the training split by generating synthetic minority-class rows.
# random_state is fixed so the synthetic rows are the same on every run.
# NOTE(review): SMOTE builds new rows by interpolating between neighbours, so
# the label-encoded categorical columns can receive non-integer values here;
# SMOTENC may be the better fit - confirm before relying on this dataset.
smote = SMOTE(random_state=42)
X_train_rs, y_train_rs = smote.fit_resample(X_train, y_train)

In [16]:
# --- Training table (SMOTE) -------------------------------------------------
# `assign` returns a new frame, so X_train_rs itself is left untouched and
# re-running this cell does not scale already-scaled values.
train_set_sm = X_train_rs.assign(Response=y_train_rs)

# Standardise the numeric columns; the scaler is fitted on the training table only.
scaler = StandardScaler()
train_set_sm[num_attribs] = scaler.fit_transform(train_set_sm[num_attribs])
train_set_sm.to_csv('../Data/ProcessedData/train_set_sm.csv', index=False)

# --- Hold-out table ---------------------------------------------------------
# Select the hold-out rows afresh from the unscaled feature matrix `X` instead
# of aliasing `X_test`. `X_test` may already have been modified in place
# (extra `Response` column, scaled numeric columns) earlier in the session;
# scaling it again would write wrong feature values to disk.
test_set_sm = X.loc[y_test.index].assign(Response=y_test)
test_set_sm[num_attribs] = scaler.transform(test_set_sm[num_attribs])
test_set_sm.to_csv('../Data/ProcessedData/test_set_sm.csv', index=False)