In [30]:
import classifiers as cl
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
import geopandas as gpd
%reload_ext autoreload
%autoreload 2

In [None]:
## Creating splits

In [2]:
data_2014 = pd.read_csv('clean_data/data_2014.csv')
data_2015 = pd.read_csv('clean_data/data_2015.csv')
data_2016 = pd.read_csv('clean_data/data_2016.csv')
data_2017 = pd.read_csv('clean_data/data_2017.csv')

data_2014 = data_2014.fillna(0).drop('S0101_C01_001E', axis=1)
data_2015 = data_2015.fillna(0).drop('S0101_C01_001E', axis=1)
data_2016 = data_2016.fillna(0).drop('S0101_C01_001E', axis=1)
data_2017 = data_2017.fillna(0).drop('S0101_C01_001E', axis=1)

In [3]:
splits={
    'split_1':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:]),
        'x_test':cl.standardrize(data_2015.iloc[:,1:]),
        'y_train':data_2014['risk'],
        'y_test':data_2015['risk']
    },
    'split_2':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:].append(data_2015.iloc[:,1:])),
        'x_test':cl.standardrize(data_2016.iloc[:,1:]),
        'y_train':data_2014['risk'].append(data_2015['risk']),
        'y_test':data_2016['risk']
    },
        'split_3':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:].append(data_2015.iloc[:,1:]).append(data_2016.iloc[:,1:])),
        'x_test':cl.standardrize(data_2017.iloc[:,1:]),
        'y_train':data_2014['risk'].append(data_2015['risk']).append(data_2016['risk']),
        'y_test':data_2016['risk']
    }
}

In [None]:
## Creating param grids

In [4]:
logistic_params = {
    'penalty':['l2'],
    'C':[0.1, 0.5, 1.0],
    'random_state':[42],
    'max_iter':[100, 200, 300, 500],
    'solver': ['lbfgs']
}

In [5]:
svm_params = {
    'C':[0.1, 0.5, 1.0],
    'kernel':['linear', 'poly', 'rbf'],
    'gamma':['auto', 'scale'],
    'max_iter':[100, 200, 300, 500, 1000],
    'random_state':[42]
}

In [6]:
boost_params ={
    'n_estimators':[30, 50, 100, 150, 200],
    'learning_rate':[0.1, 0.5, 0.7, 1.0],
    'random_state':[42]
}

In [7]:
rf_params = {
    'n_estimators':[30, 50, 100, 150, 200],
    'criterion':['gini', 'entropy'],
    'random_state':[42]
}

In [8]:
logistic_grid = list(ParameterGrid(logistic_params))
svm_grid = list(ParameterGrid(svm_params))
boost_grid = list(ParameterGrid(boost_params))
rf_grid = list(ParameterGrid(rf_params))

In [None]:
## Testing different models for different splits

In [9]:
diff_models = {
    'logistic':{'model':LogisticRegression, 'grid':logistic_grid},
    'boost':{'model':AdaBoostClassifier, 'grid':boost_grid},
    'rf':{'model':RandomForestClassifier, 'grid':rf_grid}
}

In [10]:
split1_results = cl.different_models(diff_models, splits['split_1'])

In [11]:
split2_results = cl.different_models(diff_models, splits['split_2'])

In [12]:
split3_results = cl.different_models(diff_models, splits['split_3'])

In [13]:
# average results considering the 3 different splits 
cl.average_df([split1_results, split2_results, split3_results], 'AUC')

Unnamed: 0,Accuracy,Precision,Recall,AUC,F1
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 200, random_state: 42",0.766082,0.84722,0.782407,0.760322,0.812546
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: gini, n_estimators: 200, random_state: 42",0.765664,0.837395,0.787547,0.758589,0.810606
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 150, random_state: 42",0.761487,0.842515,0.779462,0.755344,0.808717
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: gini, n_estimators: 100, random_state: 42",0.761905,0.835952,0.783256,0.754496,0.807827
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: gini, n_estimators: 150, random_state: 42",0.760234,0.8353,0.782067,0.753085,0.806658
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 50, random_state: 42",0.759398,0.834925,0.780882,0.752806,0.805843
"<class 'sklearn.linear_model.logistic.LogisticRegression'>.- C: 0.1, max_iter: 200, penalty: l2, random_state: 42, solver: lbfgs",0.756475,0.852606,0.766587,0.751663,0.806864
"<class 'sklearn.linear_model.logistic.LogisticRegression'>.- C: 0.1, max_iter: 100, penalty: l2, random_state: 42, solver: lbfgs",0.756475,0.852606,0.766587,0.751663,0.806864
"<class 'sklearn.linear_model.logistic.LogisticRegression'>.- C: 0.1, max_iter: 500, penalty: l2, random_state: 42, solver: lbfgs",0.756475,0.852606,0.766587,0.751663,0.806864
"<class 'sklearn.linear_model.logistic.LogisticRegression'>.- C: 0.1, max_iter: 300, penalty: l2, random_state: 42, solver: lbfgs",0.756475,0.852606,0.766587,0.751663,0.806864


In [15]:
rf_grid[9]

{'criterion': 'entropy', 'n_estimators': 200, 'random_state': 42}

In [16]:
# The best model params:
best_model_params = rf_grid[9]

In [17]:
# Training the best model with the 3 different splits
mod_split1 = cl.specific_model(RandomForestClassifier, splits['split_1'], best_model_params)
mod_split2 = cl.specific_model(RandomForestClassifier, splits['split_2'], best_model_params)
mod_split3 = cl.specific_model(RandomForestClassifier, splits['split_3'], best_model_params)

In [18]:
# Getting the most important features in the 3 versions of the best model
imp_1 = cl.important_features(mod_split1)
imp_2 = cl.important_features(mod_split2)
imp_3 = cl.important_features(mod_split3)

In [19]:
# Average most important features 
cl.average_df([imp_1, imp_2, imp_3], 'imp')[:10]

Unnamed: 0,imp
10,0.084644
17,0.072916
6,0.072435
26,0.054846
20,0.05213
16,0.050884
11,0.045986
25,0.044393
2,0.043865
3,0.039037


In [21]:
features = data_2014.columns[1:]

In [22]:
for i in [10, 17, 6, 26, 20, 16, 11, 25, 2, 3]:
    print(features[i])

garbage
S1401_C02_001E
weapons_violation
S2301_C03_001E
S1501_C02_008E
S1101_C01_002E
sanitation
S2201_C02_001E
vehicle_theft
burglary


In [None]:
'''
This are the most important features descriptions
'''
'garbage'
'Population 3 years and over enrolled in school'
'weapons_violation'
'Employment/Population Ratio!!Estimate!!Population 16 years and over'
'Population 25 years and over!!9th to 12th grade, no diploma'
'Average household size'
'sanitation'
'Households. FOOD STAMPS/Supplemental Nutrition Assistance Program (SNAP)'
'vehicle_theft'
'burglary'

In [None]:
## Predictions for 2019 considering data from 2017

In [25]:
inputs_2017 = pd.read_csv('clean_data/complete_2017.csv')
tracts = inputs_2017['tract'].reset_index()
inputs_2017.drop(['tract', 'S0101_C01_001E'], axis=1, inplace=True)
inputs_2017 = cl.standardrize(inputs_2017)

In [26]:
proba_1 = pd.DataFrame(mod_split1.predict_proba(inputs_2017))[[1]]
proba_2 = pd.DataFrame(mod_split2.predict_proba(inputs_2017))[[1]]
proba_3 = pd.DataFrame(mod_split3.predict_proba(inputs_2017))[[1]]

In [27]:
# Average predict proba 
proba_avg = cl.average_df([proba_1, proba_2, proba_3], 1)

In [28]:
# Average predict proba by tract
probs_2019 = proba_avg.reset_index()
probs_2019 = probs_2019.merge(tracts, on='index')
probs_2019.drop('index', axis=1, inplace = True)
probs_2019.columns = ['probability', 'tract']
probs_2019.head()

Unnamed: 0,probability,tract
0,0.996667,290900
1,0.996667,671500
2,0.996667,250800
3,0.995,691500
4,0.995,711000


In [31]:
# Building a Geopandas data frame to create a map
tracts_geo = gpd.read_file('raw_data\geo_export_fe9f2155-ba22-4697-91ff-daeee48c8d0b.shp')

In [32]:
tracts_geo = tracts_geo[['tractce10', 'geometry']]
tracts_geo['tractce10'] = tracts_geo['tractce10'].astype(int)
abandon_2019_pred = tracts_geo.merge(probs_2019, left_on='tractce10', right_on='tract', how='right')

In [33]:
abandon_2019_pred.to_file('clean_data/abandon_2019_pred.shp')