In [1]:
# load libraries

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Load the raw insurance-claims table from a CSV file resolved relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='insurance_claims.csv')

In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['CustomerID', 'State', 'ClaimAmount', 'Education', 'Income', 'Gender',
       'Policy', 'NumberOfPolicies', 'PolicyType', 'MonthsInstallment',
       'VehicleType', 'TotalClaimAmount'],
      dtype='object')

In [4]:
# Show the dtype of each column; the object-typed ones are the categorical
# candidates for one-hot encoding further down.
df.dtypes

CustomerID            object
State                 object
ClaimAmount            int64
Education             object
Income                 int64
Gender                object
Policy                object
NumberOfPolicies       int64
PolicyType            object
MonthsInstallment      int64
VehicleType           object
TotalClaimAmount     float64
dtype: object

In [5]:
# Peek at the first rows to sanity-check the parsed values.
df.head()

Unnamed: 0,CustomerID,State,ClaimAmount,Education,Income,Gender,Policy,NumberOfPolicies,PolicyType,MonthsInstallment,VehicleType,TotalClaimAmount
0,CID311,IA,59,High School,196419,M,Auto,4,Business,70,MUV,112.09
1,CID838,PH,71,Professional,171186,F,Auto,4,Business,31,Medium sized,113.47
2,CID607,PH,103,Graduate,285004,F,Auto,4,Self Auto,67,Sedan,113.86
3,CID436,TX,103,Professional,181780,M,Auto,4,Corporate Auto,78,Medium sized,114.16
4,CID018,PH,112,College,275022,F,Auto,3,Self Auto,21,Medium sized,116.19


In [6]:
# Summary statistics for every column: include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
df.describe(include='all')

Unnamed: 0,CustomerID,State,ClaimAmount,Education,Income,Gender,Policy,NumberOfPolicies,PolicyType,MonthsInstallment,VehicleType,TotalClaimAmount
count,1000,1000,1000.0,1000,1000.0,1000,1000,1000.0,1000,1000.0,1000,1000.0
unique,1000,5,,5,,2,1,,5,,4,
top,CID237,TX,,Professional,,M,Auto,,Self Auto,,Medium sized,
freq,1,222,,224,,513,1000,,274,,270,
mean,,,4973.94,,154488.31,,,2.544,,48.376,,5915.439893
std,,,2765.799312,,86622.613704,,,1.115934,,27.776396,,4870.86426
min,,,59.0,,681.0,,,1.0,,1.0,,0.0
25%,,,2700.5,,80608.75,,,2.0,,25.0,,2282.4325
50%,,,4962.5,,156377.5,,,3.0,,48.0,,4398.6135
75%,,,7217.0,,231740.25,,,4.0,,72.0,,7688.6975


## One-hot encoding

In [7]:
# Build the predictor frame: remove the per-row identifier and the column
# that is later used as the prediction target.
df_1 = df.drop(labels=['CustomerID', 'TotalClaimAmount'], axis='columns')

In [8]:
# One-hot encode the predictors with pandas' default settings: each categorical
# column is replaced by one indicator column per category value, while the
# numeric columns are carried over as they are.
one_hot_encoded_training_predictors = pd.get_dummies(df_1)
one_hot_encoded_training_predictors

Unnamed: 0,ClaimAmount,Income,NumberOfPolicies,MonthsInstallment,State_AL,State_IA,State_OH,State_PH,State_TX,Education_College,...,Policy_Auto,PolicyType_Business,PolicyType_Corporate Auto,PolicyType_International,PolicyType_Self Auto,PolicyType_Service,VehicleType_MUV,VehicleType_Medium sized,VehicleType_SUV,VehicleType_Sedan
0,59,196419,4,70,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
1,71,171186,4,31,0,0,0,1,0,0,...,1,1,0,0,0,0,0,1,0,0
2,103,285004,4,67,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
3,103,181780,4,78,0,0,0,0,1,0,...,1,0,1,0,0,0,0,1,0,0
4,112,275022,3,21,0,0,0,1,0,1,...,1,0,0,0,1,0,0,1,0,0
5,119,36835,3,27,1,0,0,0,0,1,...,1,0,0,0,1,0,1,0,0,0
6,121,34815,1,88,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,1,0
7,128,37797,4,72,0,0,0,1,0,1,...,1,0,0,0,1,0,0,0,1,0
8,130,217877,3,95,0,0,0,0,1,0,...,1,1,0,0,0,0,0,0,0,1
9,135,35277,1,1,1,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [9]:
# Mean Absolute Error (MAE)
def get_mae(X, y):
    """Return the cross-validated mean absolute error of a decision tree.

    A ``DecisionTreeRegressor`` with ``random_state=131`` is scored on
    ``(X, y)`` through ``cross_val_score`` using that function's default
    fold configuration (no ``cv`` argument is passed).

    Parameters
    ----------
    X : predictors, forwarded unchanged to ``cross_val_score``.
    y : target values, forwarded unchanged to ``cross_val_score``.

    Returns
    -------
    The mean of the per-fold MAE values, expressed as a non-negative number.
    """
    regressor = DecisionTreeRegressor(random_state=131)
    # sklearn's 'neg_mean_absolute_error' scorer yields the *negated* MAE
    # (so that higher is better); the sign is flipped back after averaging.
    neg_mae_per_fold = cross_val_score(regressor, X, y,
                                       scoring='neg_mean_absolute_error')
    return -1 * neg_mae_per_fold.mean()

In [10]:
# Baseline feature set: keep only the non-object (numeric) predictor columns.
# This relies on df_1 still being the un-encoded predictor frame at this point.
predictors_without_categoricals = df_1.select_dtypes(exclude=['object'])
predictors_without_categoricals

Unnamed: 0,ClaimAmount,Income,NumberOfPolicies,MonthsInstallment
0,59,196419,4,70
1,71,171186,4,31
2,103,285004,4,67
3,103,181780,4,78
4,112,275022,3,21
5,119,36835,3,27
6,121,34815,1,88
7,128,37797,4,72
8,130,217877,3,95
9,135,35277,1,1


In [11]:
# Prediction target, taken from the original frame (it was dropped from df_1).
target = df['TotalClaimAmount']

In [12]:
# Baseline score: cross-validated MAE using only the numeric predictors.
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_without_categoricals

2776.669619541697

In [13]:
# Same model and scoring as the baseline, but with the one-hot encoded
# categorical columns included among the predictors.
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)
mae_one_hot_encoded

2546.5719407251563

In [14]:
# Report the two cross-validated errors side by side. int() drops the
# fractional part (truncation, not rounding) to print whole numbers.
print('Mean Absolute Error when Dropping Categoricals: ' + str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' + str(int(mae_one_hot_encoded)))

Mean Absolute Error when Dropping Categoricals: 2776
Mean Abslute Error with One-Hot Encoding: 2546


In [15]:
# NOTE(review): this rebinds df_1 -- until now the un-encoded predictor frame --
# to the one-hot encoded frame. Re-running the earlier cells that read df_1
# after this one would therefore operate on different data; a distinct name
# would be safer if nothing downstream depends on the rebinding.
df_1 = one_hot_encoded_training_predictors
# Column labels of the encoded frame, as a plain Python list.
features = df_1.columns.tolist()
features

['ClaimAmount',
 'Income',
 'NumberOfPolicies',
 'MonthsInstallment',
 'State_AL',
 'State_IA',
 'State_OH',
 'State_PH',
 'State_TX',
 'Education_College',
 'Education_Graduate',
 'Education_High School',
 'Education_Not specified',
 'Education_Professional',
 'Gender_F',
 'Gender_M',
 'Policy_Auto',
 'PolicyType_Business',
 'PolicyType_Corporate Auto',
 'PolicyType_International',
 'PolicyType_Self Auto',
 'PolicyType_Service',
 'VehicleType_MUV',
 'VehicleType_Medium sized',
 'VehicleType_SUV',
 'VehicleType_Sedan']