In [1]:
import xgboost as xgb     # extreme gradient boosting (XGB)
import pandas as pd
import shap
import geopandas as gpd

In [2]:
# Load the source table; later cells select the model features (X_var), the
# geospatial columns and the observed EVICTIONS column from this frame.
# NOTE(review): read_csv infers dtypes, so identifier-like columns such as
# STATEFP / COUNTYFP / TRACTCE / GEOID may be parsed as integers and lose any
# leading zeros — confirm, and pass dtype=str for them if they are joined on.
sepher_spatial = pd.read_csv('../data/geo_sepher.csv')

In [3]:
# Model input columns. Order matters: the prediction matrix is built
# positionally from these columns, so keep it in sync with the saved model.
X_var = [
    # EP_* columns
    'EP_UNEMP', 'EP_PCI', 'EP_NOVEH', 'EP_MOBILE', 'EP_CROWD',
    'EP_AGE65', 'EP_AGE17',
    # AFAM / WHITE / HISPANIC columns
    'AFAM', 'WHITE', 'HISPANIC',
    # *_EALT columns
    'WFIR_EALT', 'HRCN_EALT', 'CFLD_EALT', 'RFLD_EALT', 'HWAV_EALT',
    'POPULATION',
]

# Target column (kept as a one-element list).
Y_var = ['EVICTIONS']

# Identifier / geometry columns carried through to the combined output frame.
geospatial_columns = [
    'STATEFP', 'COUNTYFP', 'TRACTCE',
    'AFFGEOID', 'GEOID', 'NAME', 'LSAD',
    'ALAND', 'AWATER',
    'geometry',
]

In [4]:
# Feature frame: the X_var columns of the loaded table, in X_var order.
data_selected = sepher_spatial[X_var]
# Create an empty Booster and load the saved model file into it.
model = xgb.Booster()
model.load_model('../models/xgboost.model')
# NOTE(review): no null handling is performed in this cell — rows with missing
# feature values reach the model unchanged (the recorded combined_df output
# shows NaN feature rows that still have predictions). If filtering is added
# here, keep data_selected row-aligned with sepher_spatial: later cells
# concatenate them column-wise.


In [5]:
# Wrap the raw feature values (no column names) in a DMatrix, then score them
# with the loaded booster.
pred_dmatrix = xgb.DMatrix(data_selected.values)
preds = model.predict(pred_dmatrix)

In [6]:
# Build the tree explainer for the loaded booster.
t_explainer = shap.TreeExplainer(model)
# Explanation object returned by calling the explainer on the feature frame.
t_shap_values = t_explainer(data_selected)
# Raw (rows x features) contribution array. Previously this was recomputed with
# a second full `t_explainer.shap_values(data_selected)` pass; the Explanation
# above already holds the same array for this single-output model, so reuse it
# instead of running the SHAP computation twice.
shap_values = t_shap_values.values

In [32]:
# Display the raw SHAP array; its columns are later labelled from X_var.
shap_values

array([[  2.9241502 ,  -3.7430773 ,  17.584202  , ...,   0.2845081 ,
          0.7048376 ,  -6.752398  ],
       [  0.52940035,   0.54988945,   0.59495705, ...,  -0.5898261 ,
         -0.7290488 ,   5.1765842 ],
       [  0.0907316 ,  -0.4965466 ,  -3.4391942 , ...,  -0.9183389 ,
         -2.0502198 ,  -3.1834772 ],
       ...,
       [ -0.57916635,   3.3037276 ,   2.0033216 , ...,   2.9352438 ,
          1.0031408 , -10.455181  ],
       [ -0.57916635,   3.3037276 ,   2.0033216 , ...,   2.9352438 ,
          1.0031408 , -10.455181  ],
       [ -0.57916635,   3.3037276 ,   2.0033216 , ...,   2.9352438 ,
          1.0031408 , -10.455181  ]], dtype=float32)

In [9]:
# One SHAP column per model feature, named "<feature>_SHAP_VAL".
shap_cols = [f'{col}_SHAP_VAL' for col in X_var]
# Reuse the feature frame's index rather than a fresh RangeIndex: the next cell
# concatenates this frame with data_selected column-wise, which aligns on index
# labels. With the current unfiltered data the result is unchanged, but this
# keeps rows aligned if data_selected is ever filtered or re-indexed.
shap_df = pd.DataFrame(shap_values, columns=shap_cols, index=data_selected.index)
shap_df

Unnamed: 0,EP_UNEMP_SHAP_VAL,EP_PCI_SHAP_VAL,EP_NOVEH_SHAP_VAL,EP_MOBILE_SHAP_VAL,EP_CROWD_SHAP_VAL,EP_AGE65_SHAP_VAL,EP_AGE17_SHAP_VAL,AFAM_SHAP_VAL,WHITE_SHAP_VAL,HISPANIC_SHAP_VAL,WFIR_EALT_SHAP_VAL,HRCN_EALT_SHAP_VAL,CFLD_EALT_SHAP_VAL,RFLD_EALT_SHAP_VAL,HWAV_EALT_SHAP_VAL,POPULATION_SHAP_VAL
0,2.924150,-3.743077,17.584202,1.239864,0.467236,10.183839,0.763537,-3.480553,-6.359250,1.945370,0.796593,-3.001530,-0.298346,0.284508,0.704838,-6.752398
1,0.529400,0.549889,0.594957,-0.158317,4.895345,4.336635,-0.521036,4.334855,-4.529551,-6.363328,-0.527418,4.033363,-0.544785,-0.589826,-0.729049,5.176584
2,0.090732,-0.496547,-3.439194,-1.342887,0.777482,8.405905,0.647309,2.785877,-8.723932,-7.419757,-3.144144,0.954935,-1.499840,-0.918339,-2.050220,-3.183477
3,0.643969,-2.667338,8.090971,0.643146,0.127366,1.804226,1.117712,-2.902661,-3.970088,0.041370,0.344199,-1.693155,0.644381,-1.124984,-3.487870,-8.836326
4,1.557130,3.617598,1.008074,1.766586,3.081737,4.447956,-1.210735,11.380885,-7.348872,-3.556215,0.435775,-1.971547,-2.269426,-5.094233,-8.077492,3.414099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15034,-0.579166,3.303728,2.003322,-0.143376,-0.522630,-5.539412,-3.060316,-5.169247,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181
15035,-0.579166,3.303728,2.003322,-0.143376,-0.522630,-5.539412,-3.060316,-5.169247,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181
15036,-0.579166,3.303728,2.003322,-0.143376,-0.522630,-5.539412,-3.060316,-5.169247,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181
15037,-0.579166,3.303728,2.003322,-0.143376,-0.522630,-5.539412,-3.060316,-5.169247,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181


In [10]:
# Assemble the output table: features, per-feature SHAP values and geospatial
# columns side by side, followed by the model prediction and observed target.
combined_parts = [
    data_selected,
    shap_df,
    sepher_spatial[geospatial_columns],
]
combined_df = pd.concat(combined_parts, axis=1).assign(
    EVICTIONS_PRED=preds,
    EVICTIONS=sepher_spatial['EVICTIONS'],
)

In [45]:
# Display the combined frame (features, SHAP values, geospatial columns,
# EVICTIONS_PRED and EVICTIONS).
# NOTE(review): execution counts in this notebook are non-sequential; restart
# the kernel and run all cells to confirm the recorded output matches the cell
# that builds combined_df.
combined_df

Unnamed: 0,EP_UNEMP,EP_PCI,EP_NOVEH,EP_MOBILE,EP_CROWD,EP_AGE65,EP_AGE17,AFAM,WHITE,HISPANIC,...,WHITE_SHAP_VAL,HISPANIC_SHAP_VAL,WFIR_EALT_SHAP_VAL,HRCN_EALT_SHAP_VAL,CFLD_EALT_SHAP_VAL,RFLD_EALT_SHAP_VAL,HWAV_EALT_SHAP_VAL,POPULATION_SHAP_VAL,EVICTIONS_PRED,EVICTIONS
0,15.4,11500.0,30.5,0.0,0.5,5.2,24.9,0,10,0,...,-6.359250,1.945370,0.796593,-3.001530,-0.298346,0.284508,0.704838,-6.752398,34.820953,70
1,10.9,14301.0,9.1,5.8,9.6,10.4,25.1,16,72,51,...,-4.529551,-6.363328,-0.527418,4.033363,-0.544785,-0.589826,-0.729049,5.176584,32.049683,33
2,11.4,20636.0,6.8,29.9,3.4,6.3,19.6,35,216,167,...,-8.723932,-7.419757,-3.144144,0.954935,-1.499840,-0.918339,-2.050220,-3.183477,3.005868,11
3,7.8,51412.0,25.4,0.0,0.0,10.6,8.9,0,27,0,...,-3.970088,0.041370,0.344199,-1.693155,0.644381,-1.124984,-3.487870,-8.836326,10.336882,0
4,9.1,22143.0,9.3,0.5,9.8,8.3,23.9,10,74,35,...,-7.348872,-3.556215,0.435775,-1.971547,-2.269426,-5.094233,-8.077492,3.414099,22.743277,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15034,,,,,,,,0,0,0,...,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181,8.315022,0
15035,,,,,,,,0,0,0,...,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181,8.315022,0
15036,,,,,,,,0,0,0,...,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181,8.315022,0
15037,,,,,,,,0,0,0,...,10.672346,-2.594498,-3.705574,-2.766415,1.371101,2.935244,1.003141,-10.455181,8.315022,0
