In [1]:
import os
import xgboost
import numpy as np
import pandas as pd


## XGBoost

<br>
<hr>
Our task here is to predict the COST of an event, together with its direct and indirect injuries and deaths.
<hr> 

XGBoost is a model that works well on such problems, and it did so here!
Here are the results (mean absolute error, measured on the same data the models were trained on):
    
    mae_COST              : 701609.6580257782
    mae_INJURIES_DIRECT   : 0.10946037606956904
    mae_INJURIES_INDIRECT : 0.020507655614491187
    mae_DEATHS_DIRECT     : 0.018528018811389237
    mae_DEATHS_INDIRECT   : 0.003930450780809105

Of course, mae_COST (COST being the sum of DAMAGE_PROPERTY and DAMAGE_CROPS) is still very large, but all the other targets have really good scores!
<br> <br>

In [2]:
########################
##    Reading data    ##
########################

# Load the prepared DataFrame from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so 'cost_data.pkl'
# must come from a trusted source.
data = pd.read_pickle('cost_data.pkl')
# Report the (rows, columns) shape as a quick sanity check.
print('data :', data.shape)
# Last expression of the cell: shows the first rows as the cell output.
data.head()

data : (1301399, 129)


Unnamed: 0,Astronomical Low Tide,Avalanche,Blizzard,Coastal Flood,Cold/Wind Chill,Debris Flow,Dense Fog,Dense Smoke,Drought,Dust Devil,...,WISCONSIN,WYOMING,COST,INJURIES_DIRECT,INJURIES_INDIRECT,DEATHS_DIRECT,DEATHS_INDIRECT,MONTH,DAY,YEAR
302890,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,3,6,1997
302891,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,3,6,1997
302892,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,3,6,1997
307309,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,3,6,1997
307695,1,0,0,0,0,0,0,0,0,0,...,0,0,0.0,0,0,0,0,3,6,1997


In [3]:
##########################################
##    Separate model for each target    ##
##########################################

# One independent regressor per predicted column, all built with the same
# settings (n_jobs=4). The generator yields a fresh XGBRegressor for every
# name on the left-hand side, so no two names share an instance.
(model_COST,
 model_INJURIES_DIRECT,
 model_INJURIES_INDIRECT,
 model_DEATHS_DIRECT,
 model_DEATHS_INDIRECT) = (xgboost.XGBRegressor(n_jobs=4) for _ in range(5))

In [7]:
##############################################
##    Same input data for all the models    ##
##############################################

# Columns that the models predict; they must not appear among the inputs.
target_columns = ['COST', 'INJURIES_DIRECT', 'INJURIES_INDIRECT',
                  'DEATHS_DIRECT', 'DEATHS_INDIRECT']

# Everything except the targets, as a plain NumPy array shared by all models.
input_data = data.drop(columns=target_columns).values

In [12]:
############################
##    Training process    ##
############################

# (regressor, target column, progress message) in the order the fits run.
training_plan = (
    (model_COST, 'COST', 'COST done !'),
    (model_INJURIES_DIRECT, 'INJURIES_DIRECT', 'INJURIES_DIRECT done !'),
    (model_INJURIES_INDIRECT, 'INJURIES_INDIRECT', 'INJURIES_INDIRECT done !'),
    (model_DEATHS_DIRECT, 'DEATHS_DIRECT', 'DEATHS_DIRECT done !'),
    (model_DEATHS_INDIRECT, 'DEATHS_INDIRECT', 'DEATHS_INDIRECT done !'),
)

# Fit every regressor on the shared inputs against its own target column
# and report as soon as each one finishes.
for regressor, target_column, done_message in training_plan:
    regressor.fit(input_data, data[target_column].values)
    print(done_message)

COST done !
INJURIES_DIRECT done !
INJURIES_INDIRECT done !
DEATHS_DIRECT done !
DEATHS_INDIRECT done !


In [13]:
##############################
##    Predicting process    ##
##############################

def _predict_and_report(model, done_message):
    """Predict on the shared input matrix, print a progress line, return the predictions."""
    predictions = model.predict(input_data)
    print(done_message)
    return predictions


# NOTE(review): `input_data` is the same matrix the models were fitted on,
# so these are in-sample predictions rather than held-out ones.
pred_COST = _predict_and_report(model_COST, 'pred_COST done !')
pred_INJURIES_DIRECT = _predict_and_report(model_INJURIES_DIRECT,
                                           'pred_INJURIES_DIRECT done !')
pred_INJURIES_INDIRECT = _predict_and_report(model_INJURIES_INDIRECT,
                                             'pred_INJURIES_INDIRECT done !')
pred_DEATHS_DIRECT = _predict_and_report(model_DEATHS_DIRECT,
                                         'pred_DEATHS_DIRECT done !')
pred_DEATHS_INDIRECT = _predict_and_report(model_DEATHS_INDIRECT,
                                           'pred_DEATHS_INDIRECT done !')

pred_COST done !
pred_INJURIES_DIRECT done !
pred_INJURIES_INDIRECT done !
pred_DEATHS_DIRECT done !
pred_DEATHS_INDIRECT done !


In [17]:
###############################
##    Mean Absolute Error    ##
###############################

def _mean_abs_error(predicted, actual):
    """Return the sum of |predicted - actual| divided by the number of predictions."""
    absolute_errors = np.abs(predicted - actual)
    return np.sum(absolute_errors) / predicted.shape[0]


# One score per target: the model's predictions against the matching
# column of `data`.
mae_COST = _mean_abs_error(pred_COST, data.COST.values)
print('mae_COST              :', mae_COST)

mae_INJURIES_DIRECT = _mean_abs_error(pred_INJURIES_DIRECT,
                                      data.INJURIES_DIRECT.values)
print('mae_INJURIES_DIRECT   :', mae_INJURIES_DIRECT)

mae_INJURIES_INDIRECT = _mean_abs_error(pred_INJURIES_INDIRECT,
                                        data.INJURIES_INDIRECT.values)
print('mae_INJURIES_INDIRECT :', mae_INJURIES_INDIRECT)

mae_DEATHS_DIRECT = _mean_abs_error(pred_DEATHS_DIRECT,
                                    data.DEATHS_DIRECT.values)
print('mae_DEATHS_DIRECT     :', mae_DEATHS_DIRECT)

mae_DEATHS_INDIRECT = _mean_abs_error(pred_DEATHS_INDIRECT,
                                      data.DEATHS_INDIRECT.values)
print('mae_DEATHS_INDIRECT   :', mae_DEATHS_INDIRECT)

mae_COST              : 701609.6580257782
mae_INJURIES_DIRECT   : 0.10946037606956904
mae_INJURIES_INDIRECT : 0.020507655614491187
mae_DEATHS_DIRECT     : 0.018528018811389237
mae_DEATHS_INDIRECT   : 0.003930450780809105


In [21]:
#############################
##    Saving the models    ##
#############################

# (fitted model, output file) pairs, written in this order to the
# current working directory.
models_to_save = (
    (model_COST, 'model_COST.xgb'),
    (model_DEATHS_DIRECT, 'model_DEATHS_DIRECT.xgb'),
    (model_DEATHS_INDIRECT, 'model_DEATHS_INDIRECT.xgb'),
    (model_INJURIES_DIRECT, 'model_INJURIES_DIRECT.xgb'),
    (model_INJURIES_INDIRECT, 'model_INJURIES_INDIRECT.xgb'),
)

for fitted_model, output_path in models_to_save:
    fitted_model.save_model(output_path)