In [1]:
import xgboost as xgb     # extreme gradient boosting (XGB)
import pandas as pd
import shap
import geopandas as gpd
import matplotlib.pyplot as plt
from joblib import dump, load
import numpy as np
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
# Load the dataset from '../data/all_data_clean.csv' (path is relative to the
# notebook's working directory) into a DataFrame with pandas' default index.
all_data = pd.read_csv('../data/all_data_clean.csv')

In [3]:
# Display the frame as loaded. This cell comes before transform_data() is
# applied, so the values shown here are the ones read from the CSV.
all_data

Unnamed: 0,FIPS,POPULATION,EP_UNEMP,EP_PCI,EP_POV,EP_NOVEH,EP_NOHSDP,EP_MOBILE,EP_MINRTY,EP_CROWD,...,EP_AGE17,AFAM,WHITE,HISPANIC,WFIR_AFREQ,HRCN_AFREQ,CFLD_AFREQ,RFLD_AFREQ,HWAV_AFREQ,EVICTIONS
0,39061025102GID,7692.0,5.0,32194.0,6.9,4.4,4.0,0.0,8.6,1.9,...,27.9,3,348,6,0.000009,0.005984,,5.458333,4.118616,5.0
1,42101027000GID,,11.1,24890.0,13.3,16.5,10.7,0.0,89.1,1.3,...,20.0,67,6,2,0.000012,0.047873,0.000000,0.000000,7.166392,
2,06031000300GID,,14.1,20563.0,10.9,0.7,2.8,0.0,51.3,0.0,...,28.9,0,0,0,0.000253,,,0.958333,2.306425,
3,24025301402GID,,3.4,30319.0,13.3,0.0,12.2,0.0,34.3,1.1,...,24.7,39,127,4,0.000047,0.071809,4.435936,2.708333,0.738127,
4,51097950500GID,2999.0,8.0,23307.0,15.2,4.4,14.8,15.7,21.2,3.4,...,15.0,22,91,0,0.000113,0.144508,4.435936,0.541667,0.741351,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72903,34025809902GID,,1.6,72165.0,7.4,3.7,4.5,0.0,11.5,0.0,...,25.4,0,189,2,0.000576,0.161570,4.431920,3.666667,0.988468,
72904,09009141100GID,,6.2,36837.0,4.3,8.3,3.1,0.0,55.5,4.5,...,23.3,33,67,8,0.000028,0.137634,0.000000,0.000000,0.576606,
72905,29095009200GID,,0.8,47961.0,4.8,3.7,3.6,1.0,16.5,0.0,...,19.2,3,215,3,0.000000,,,0.000000,7.578254,
72906,06029005700GID,,12.0,20857.0,9.6,0.7,0.8,0.0,32.0,1.0,...,42.9,0,0,0,0.001385,,,5.500000,2.078385,


In [7]:
# Columns passed to the model and to the SHAP explainer. The order is
# significant: the '<name>_SHAP_VAL' output columns are derived from this list
# position by position.
X_var = [
    # EP_* columns
    'EP_UNEMP', 'EP_PCI', 'EP_NOVEH', 'EP_MOBILE', 'EP_CROWD',
    'EP_AGE65', 'EP_AGE17',
    # columns that transform_data() rescales by POPULATION
    'AFAM', 'WHITE', 'HISPANIC',
    # *_AFREQ columns
    'WFIR_AFREQ', 'HRCN_AFREQ', 'CFLD_AFREQ', 'RFLD_AFREQ', 'HWAV_AFREQ',
]

# Target column name, kept as a one-element list.
Y_var = ['EVICTIONS']

In [8]:
def per_pop(population, feature):
    """Express ``feature`` as a rate per 1,000 of ``population``.

    Args:
        population: Denominator for the rate.
        feature: Value to rescale.

    Returns:
        ``1000 * feature / population`` when ``population`` is greater than
        zero, otherwise ``np.nan``.
    """
    # Written as `not (x > 0)` rather than `x <= 0` so that a NaN population,
    # for which every comparison is False, also ends up as NaN.
    if not population > 0:
        return np.nan
    return 1000 * feature / population

In [9]:
def transform_data(data, columns=('EVICTIONS', 'AFAM', 'WHITE', 'HISPANIC'), per=1000):
    """Rescale columns of ``data`` to rates per ``per`` of POPULATION, in place.

    Each listed column is overwritten with ``per * column / POPULATION``. Rows
    whose POPULATION is not greater than zero (zero, negative or missing)
    receive NaN, which is the same rule ``per_pop`` applies to a single value.
    The computation is vectorized over whole columns instead of calling a
    Python function once per row.

    Args:
        data: DataFrame holding a 'POPULATION' column and every column named
            in ``columns``. It is modified in place.
        columns: Names of the columns to rescale. Defaults to the four columns
            this notebook rescales.
        per: Scale of the resulting rate (default 1000).

    Returns:
        None. The result is written back into ``data``.

    Note:
        Not idempotent: calling it twice on the same frame divides the already
        rescaled columns by POPULATION again. Reload the data before re-running.
    """
    population = data['POPULATION']
    # NaN > 0 evaluates to False, so rows with a missing population are
    # excluded here as well.
    populated = population > 0
    for col in columns:
        # .where() keeps the rate where `populated` is True and puts NaN
        # elsewhere, which also discards the inf/NaN from dividing by zero.
        data[col] = (per * data[col] / population).where(populated)


In [10]:
# Rescales EVICTIONS, AFAM, WHITE and HISPANIC in all_data by POPULATION.
# The frame is modified in place, so re-running this cell without reloading
# all_data rescales values that have already been rescaled.
transform_data(all_data)

In [11]:
# Load the previously fitted model from disk with joblib.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load model files that come from a trusted source.
# The model class is not visible here; presumably it is a tree-based regressor,
# since it is handed to shap.TreeExplainer below. TODO confirm.
model = load('../models/PoissGBoost_Base.pickle') 

In [12]:
# Build a tree explainer for the loaded model and explain every row of the
# feature columns once.
t_explainer = shap.TreeExplainer(model)
t_shap_values = t_explainer(all_data[X_var])

# Take the raw SHAP array from the Explanation object instead of calling
# t_explainer.shap_values() on the same data, which would repeat the whole
# SHAP computation over the full dataset.
# NOTE(review): assumes a single-output model, for which Explanation.values is
# the same (n_rows, n_features) array that shap_values() returns. Confirm
# against the installed shap version.
shap_values = t_shap_values.values

In [13]:
# Model predictions for every row, using the same feature columns (X_var) that
# were passed to the SHAP explainer.
preds = model.predict(all_data[X_var])

In [14]:
# Display the SHAP array; the columns follow the order of X_var.
shap_values

array([[-0.11363354, -0.0380214 , -0.14132788, ...,  0.08419695,
        -0.01375593, -0.06267731],
       [ 0.13142788,  0.09420807,  0.85200355, ..., -0.11183595,
         0.02389029, -0.04246708],
       [ 0.19894716,  0.20543846, -0.90052367, ...,  0.04436124,
        -0.01780907, -0.06716894],
       ...,
       [-0.08549595, -0.22047084, -0.16836286, ...,  0.15063212,
        -0.05570401,  0.06271685],
       [ 0.21364042,  0.18236358, -0.89834367, ...,  0.05064218,
         0.03909157, -0.0629502 ],
       [ 0.03492131,  0.15080576,  0.7171194 , ...,  0.01583062,
         0.02066931,  0.02204892]])

In [15]:
# One '<feature>_SHAP_VAL' column per model input, in X_var order.
shap_cols = [f'{col}_SHAP_VAL' for col in X_var]

# shap_values has one row per row of all_data, in the same order. Reusing
# all_data's index (rather than a fresh 0..n-1 index) keeps a later
# index-aligned combination with all_data columns, e.g. pd.concat(axis=1),
# matched row for row even if all_data ever carries a non-default index.
shap_df = pd.DataFrame(shap_values, columns=shap_cols, index=all_data.index)
shap_df

Unnamed: 0,EP_UNEMP_SHAP_VAL,EP_PCI_SHAP_VAL,EP_NOVEH_SHAP_VAL,EP_MOBILE_SHAP_VAL,EP_CROWD_SHAP_VAL,EP_AGE65_SHAP_VAL,EP_AGE17_SHAP_VAL,AFAM_SHAP_VAL,WHITE_SHAP_VAL,HISPANIC_SHAP_VAL,WFIR_AFREQ_SHAP_VAL,HRCN_AFREQ_SHAP_VAL,CFLD_AFREQ_SHAP_VAL,RFLD_AFREQ_SHAP_VAL,HWAV_AFREQ_SHAP_VAL
0,-0.113634,-0.038021,-0.141328,0.056219,0.032724,0.029080,-0.117562,-0.139107,-0.101818,-0.022737,0.065751,-0.018757,0.084197,-0.013756,-0.062677
1,0.131428,0.094208,0.852004,0.073314,0.005227,0.161292,0.047510,0.020578,0.040350,0.046617,0.160277,0.073257,-0.111836,0.023890,-0.042467
2,0.198947,0.205438,-0.900524,0.058344,-0.088411,0.381882,-0.104653,0.000339,-0.056923,0.048894,-0.044865,-0.084022,0.044361,-0.017809,-0.067169
3,-0.077541,0.094092,-0.831242,0.050259,-0.040990,0.036797,-0.051479,-0.035674,-0.083701,0.036399,0.051528,0.096365,0.024873,-0.013469,0.042277
4,0.030069,0.048930,-0.139258,-0.133101,0.035634,-0.241525,0.100992,0.326405,-0.010235,-0.081246,-0.021789,0.043112,0.049746,0.042713,0.023986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72903,-0.120835,-0.757917,-0.226228,0.030664,-0.038057,0.008820,-0.159528,-0.006762,-0.085854,-0.000583,-0.010098,0.045975,0.019821,-0.045805,-0.026405
72904,-0.016317,-0.263256,0.255679,0.035969,0.056090,-0.014679,-0.050861,0.022849,-0.022087,0.023782,0.099139,0.015180,-0.133980,0.040266,0.102698
72905,-0.085496,-0.220471,-0.168363,0.031468,-0.034543,0.398387,0.120170,0.006152,-0.050625,0.003682,0.079956,-0.065246,0.150632,-0.055704,0.062717
72906,0.213640,0.182364,-0.898344,0.072629,-0.058845,0.316684,-0.227683,0.019926,-0.040111,0.044280,-0.251542,-0.105345,0.050642,0.039092,-0.062950


In [16]:
# Assemble the export table: identifier, per-feature SHAP values, the model's
# prediction and the EVICTIONS column from all_data.
combined_df = pd.concat([all_data['FIPS'], shap_df], axis=1).assign(
    EVICTIONS_PRED=preds,
    EVICTIONS=all_data['EVICTIONS'],
    # Drop the last three characters of each identifier (the trailing 'GID'
    # seen in the displayed data); the column keeps its leading position.
    FIPS=lambda frame: frame['FIPS'].astype(str).str[:-3],
)

In [17]:
# Display the assembled table before it is written to disk.
combined_df

Unnamed: 0,FIPS,EP_UNEMP_SHAP_VAL,EP_PCI_SHAP_VAL,EP_NOVEH_SHAP_VAL,EP_MOBILE_SHAP_VAL,EP_CROWD_SHAP_VAL,EP_AGE65_SHAP_VAL,EP_AGE17_SHAP_VAL,AFAM_SHAP_VAL,WHITE_SHAP_VAL,HISPANIC_SHAP_VAL,WFIR_AFREQ_SHAP_VAL,HRCN_AFREQ_SHAP_VAL,CFLD_AFREQ_SHAP_VAL,RFLD_AFREQ_SHAP_VAL,HWAV_AFREQ_SHAP_VAL,EVICTIONS_PRED,EVICTIONS
0,39061025102,-0.113634,-0.038021,-0.141328,0.056219,0.032724,0.029080,-0.117562,-0.139107,-0.101818,-0.022737,0.065751,-0.018757,0.084197,-0.013756,-0.062677,1.912854,0.650026
1,42101027000,0.131428,0.094208,0.852004,0.073314,0.005227,0.161292,0.047510,0.020578,0.040350,0.046617,0.160277,0.073257,-0.111836,0.023890,-0.042467,15.266658,
2,06031000300,0.198947,0.205438,-0.900524,0.058344,-0.088411,0.381882,-0.104653,0.000339,-0.056923,0.048894,-0.044865,-0.084022,0.044361,-0.017809,-0.067169,2.062359,
3,24025301402,-0.077541,0.094092,-0.831242,0.050259,-0.040990,0.036797,-0.051479,-0.035674,-0.083701,0.036399,0.051528,0.096365,0.024873,-0.013469,0.042277,1.565984,
4,51097950500,0.030069,0.048930,-0.139258,-0.133101,0.035634,-0.241525,0.100992,0.326405,-0.010235,-0.081246,-0.021789,0.043112,0.049746,0.042713,0.023986,3.402304,3.001000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72903,34025809902,-0.120835,-0.757917,-0.226228,0.030664,-0.038057,0.008820,-0.159528,-0.006762,-0.085854,-0.000583,-0.010098,0.045975,0.019821,-0.045805,-0.026405,0.800298,
72904,09009141100,-0.016317,-0.263256,0.255679,0.035969,0.056090,-0.014679,-0.050861,0.022849,-0.022087,0.023782,0.099139,0.015180,-0.133980,0.040266,0.102698,3.671112,
72905,29095009200,-0.085496,-0.220471,-0.168363,0.031468,-0.034543,0.398387,0.120170,0.006152,-0.050625,0.003682,0.079956,-0.065246,0.150632,-0.055704,0.062717,3.753685,
72906,06029005700,0.213640,0.182364,-0.898344,0.072629,-0.058845,0.316684,-0.227683,0.019926,-0.040111,0.044280,-0.251542,-0.105345,0.050642,0.039092,-0.062950,1.559647,


In [18]:
# Write the table to '../data/predictions_shap.csv'; index=False leaves the
# row index out of the file.
combined_df.to_csv('../data/predictions_shap.csv', index=False)