In [1]:
import pandas as pd

df_train = pd.read_csv('/kaggle/input/allstate-claims-severity/train.csv')
df_test = pd.read_csv('/kaggle/input/allstate-claims-severity/test.csv')

print (df_train.shape)
print (df_test.shape)

(188318, 132)
(125546, 131)


In [2]:
from IPython.display import display
print (df_train.columns)

display(df_train.head())

Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=132)


Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


# Checking the categorical features

In [3]:
categorical_columns = df_train.select_dtypes(include=['object']).columns
print ("Categorical Column: ", categorical_columns)

Categorical Column:  Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10',
       ...
       'cat107', 'cat108', 'cat109', 'cat110', 'cat111', 'cat112', 'cat113',
       'cat114', 'cat115', 'cat116'],
      dtype='object', length=116)


# Finding duplicates and NaN values

In [4]:
df_train.duplicated().sum() #no duplicates
df_train.isnull().sum() #no NaN values

df_test.duplicated().sum()
df_test.isnull().sum()

id        0
cat1      0
cat2      0
cat3      0
cat4      0
         ..
cont10    0
cont11    0
cont12    0
cont13    0
cont14    0
Length: 131, dtype: int64

In [5]:
x_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [6]:
display (x_train.head())
display(y_train.head())

print (x_train.shape)
print (y_train.shape)

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14
0,1,A,B,A,B,A,A,A,A,B,...,0.310061,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843
1,2,A,B,A,A,A,A,A,A,B,...,0.885834,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496
2,5,A,B,A,A,B,A,A,A,B,...,0.397069,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425
3,10,B,B,A,B,A,A,A,A,B,...,0.422268,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642
4,11,A,B,A,B,A,A,A,A,B,...,0.704268,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606


0    2213.18
1    1283.60
2    3005.09
3     939.85
4    2763.85
Name: loss, dtype: float64

(188318, 131)
(188318,)


# Feature Engineering
### Categorical features

In [7]:
# column 1 to 117 all are categorical columns
# lets convert the categorical columns to one hot encoding

print (x_train.iloc[:,1:117].astype('category'))

       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ... cat107 cat108  \
0         A    B    A    B    A    A    A    A    B     A  ...      J      G   
1         A    B    A    A    A    A    A    A    B     B  ...      K      K   
2         A    B    A    A    B    A    A    A    B     B  ...      F      A   
3         B    B    A    B    A    A    A    A    B     A  ...      K      K   
4         A    B    A    B    A    A    A    A    B     B  ...      G      B   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...    ...    ...   
188313    A    B    A    A    A    A    A    A    B     A  ...      G      B   
188314    A    A    A    A    A    B    A    A    A     A  ...      F      B   
188315    A    B    A    A    A    A    A    B    B     A  ...      F      B   
188316    A    B    A    A    A    A    A    A    B     B  ...      J      K   
188317    B    A    A    B    A    A    A    A    A     A  ...      G      G   

       cat109 cat110 cat111 cat112 cat1

In [8]:
categorical_columns = x_train.select_dtypes(include = ['object','category']).columns
x_train = pd.get_dummies(x_train, columns = categorical_columns, drop_first = True)

test_categorical_columns = df_test.select_dtypes(include =['object', 'category']).columns
df_test = pd.get_dummies(df_test, columns = test_categorical_columns, drop_first = True)

In [9]:
train_columns = x_train.columns
print (train_columns)

# Align the test data columns to match the training data columns
df_test = df_test.reindex(columns = train_columns, fill_value = 0)

Index(['id', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9',
       ...
       'cat116_P', 'cat116_Q', 'cat116_R', 'cat116_S', 'cat116_T', 'cat116_U',
       'cat116_V', 'cat116_W', 'cat116_X', 'cat116_Y'],
      dtype='object', length=1038)


In [10]:
display (x_train.head())

display (df_test.head())

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y
0,1,0.7263,0.245921,0.187583,0.789639,0.310061,0.718367,0.33506,0.3026,0.67135,...,False,False,False,False,False,False,False,False,False,False
1,2,0.330514,0.737068,0.592681,0.614134,0.885834,0.438917,0.436585,0.60087,0.35127,...,False,False,False,False,False,False,False,False,False,False
2,5,0.261841,0.358319,0.484196,0.236924,0.397069,0.289648,0.315545,0.2732,0.26076,...,False,False,False,False,False,False,False,False,False,False
3,10,0.321594,0.555782,0.527991,0.373816,0.422268,0.440945,0.391128,0.31796,0.32128,...,False,False,False,False,False,False,False,False,False,False
4,11,0.273204,0.15999,0.527991,0.473202,0.704268,0.178193,0.247408,0.24564,0.22089,...,False,False,False,False,False,False,False,False,False,False


Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y
0,4,0.321594,0.299102,0.246911,0.402922,0.281143,0.466591,0.317681,0.61229,0.34365,...,0,False,False,False,False,False,0,0,0,False
1,6,0.634734,0.620805,0.65431,0.946616,0.836443,0.482425,0.44376,0.7133,0.5189,...,0,False,False,False,False,False,0,0,0,False
2,9,0.290813,0.737068,0.711159,0.412789,0.718531,0.212308,0.325779,0.29758,0.34365,...,0,False,False,False,False,False,0,0,0,False
3,12,0.268622,0.681761,0.592681,0.354893,0.397069,0.36993,0.342355,0.40028,0.33237,...,0,False,False,False,False,False,0,0,0,False
4,15,0.553846,0.299102,0.26357,0.696873,0.302678,0.398862,0.391833,0.23688,0.43731,...,0,False,False,False,False,False,0,0,0,False


In [11]:
print (x_train.dtypes.unique())
float_columns = x_train.select_dtypes(include = ['float64'])
print (float_columns.max())
print (float_columns.min())
# we can conclude the numerical values are already min max scaled 
# so don't need to pre-process this data

[dtype('int64') dtype('float64') dtype('bool')]
cont1     0.984975
cont2     0.862654
cont3     0.944251
cont4     0.954297
cont5     0.983674
cont6     0.997162
cont7     1.000000
cont8     0.980200
cont9     0.995400
cont10    0.994980
cont11    0.998742
cont12    0.998484
cont13    0.988494
cont14    0.844848
dtype: float64
cont1     0.000016
cont2     0.001149
cont3     0.002634
cont4     0.176921
cont5     0.281143
cont6     0.012683
cont7     0.069503
cont8     0.236880
cont9     0.000080
cont10    0.000000
cont11    0.035321
cont12    0.036232
cont13    0.000228
cont14    0.179722
dtype: float64


# Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split (x_train, y_train, test_size = 0.3, 
                                                    random_state = 42)

print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)

(131822, 1038)
(56496, 1038)
(131822,)
(56496,)


In [13]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [200],
    'max_depth': [5],
    'learning_rate': [0.2],
    'eval_metric' : ['rmse']
}
model = xgb.XGBRegressor()
grid_search_cv = GridSearchCV(estimator = model, param_grid = param_grid)
grid_search_cv.fit(x_train, y_train)

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
best_param = grid_search_cv.best_params_
print ("Best Parameters: ", best_param)

best_model = grid_search_cv.best_estimator_ 
print ("Best Estimator: ",best_model)


y_pred = best_model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print ("RMSE: ", rmse)
mae = mean_absolute_error(y_test, y_pred)
print ("MAE: ", mae)

Best Parameters:  {'eval_metric': 'rmse', 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Best Estimator:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric='rmse', feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
RMSE:  1926.7721239225061
MAE:  1199.976778913307


In [15]:
individual_mae = np.abs(y_test-y_pred)
print ("Individual_mae: ", individual_mae)

Individual_mae:  10168      291.949624
6936       403.581289
39511     2053.642158
40087     1205.983887
117312      47.754727
             ...     
17626      193.787617
10406     1627.806133
159789      40.384419
181583    2736.830742
10228      811.324990
Name: loss, Length: 56496, dtype: float64


In [16]:
display(df_test.head())

y_input_predict = best_model.predict(df_test)
print (y_input_predict.size)

Unnamed: 0,id,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,...,cat116_P,cat116_Q,cat116_R,cat116_S,cat116_T,cat116_U,cat116_V,cat116_W,cat116_X,cat116_Y
0,4,0.321594,0.299102,0.246911,0.402922,0.281143,0.466591,0.317681,0.61229,0.34365,...,0,False,False,False,False,False,0,0,0,False
1,6,0.634734,0.620805,0.65431,0.946616,0.836443,0.482425,0.44376,0.7133,0.5189,...,0,False,False,False,False,False,0,0,0,False
2,9,0.290813,0.737068,0.711159,0.412789,0.718531,0.212308,0.325779,0.29758,0.34365,...,0,False,False,False,False,False,0,0,0,False
3,12,0.268622,0.681761,0.592681,0.354893,0.397069,0.36993,0.342355,0.40028,0.33237,...,0,False,False,False,False,False,0,0,0,False
4,15,0.553846,0.299102,0.26357,0.696873,0.302678,0.398862,0.391833,0.23688,0.43731,...,0,False,False,False,False,False,0,0,0,False


125546


In [17]:
print (df_test.columns)

Index(['id', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9',
       ...
       'cat116_P', 'cat116_Q', 'cat116_R', 'cat116_S', 'cat116_T', 'cat116_U',
       'cat116_V', 'cat116_W', 'cat116_X', 'cat116_Y'],
      dtype='object', length=1038)


In [18]:
print (df_test.shape)

submission = pd.DataFrame({'id':df_test['id'],'Response':y_input_predict})
#Visualize the first 5 rows
submission.tail()

(125546, 1038)


Unnamed: 0,id,Response
125541,587617,2414.212402
125542,587621,2972.469238
125543,587627,3037.565186
125544,587629,1235.958374
125545,587634,3391.068604


In [19]:
filename = 'submission.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submission.csv
