In [2]:
import classifiers as cl
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
#import geopandas as gpd
%reload_ext autoreload
%autoreload 2

## Creating splits

In [3]:
data_2014 = pd.read_csv('clean_data/data_2014.csv')
data_2015 = pd.read_csv('clean_data/data_2015.csv')
data_2016 = pd.read_csv('clean_data/data_2016.csv')
data_2017 = pd.read_csv('clean_data/data_2017.csv')

data_2014 = data_2014.fillna(0).drop('Unnamed: 0', axis=1)
data_2015 = data_2015.fillna(0).drop('Unnamed: 0', axis=1)
data_2016 = data_2016.fillna(0).drop('Unnamed: 0', axis=1)
data_2017 = data_2017.fillna(0).drop('Unnamed: 0', axis=1)

In [4]:
splits={
    'split_1':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:]),
        'x_test':cl.standardrize(data_2015.iloc[:,1:]),
        'y_train':data_2014['risk'],
        'y_test':data_2015['risk']
    },
    'split_2':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:].append(data_2015.iloc[:,1:])),
        'x_test':cl.standardrize(data_2016.iloc[:,1:]),
        'y_train':data_2014['risk'].append(data_2015['risk']),
        'y_test':data_2016['risk']
    },
        'split_3':{
        'x_train':cl.standardrize(data_2014.iloc[:,1:].append(data_2015.iloc[:,1:]).append(data_2016.iloc[:,1:])),
        'x_test':cl.standardrize(data_2017.iloc[:,1:]),
        'y_train':data_2014['risk'].append(data_2015['risk']).append(data_2016['risk']),
        'y_test':data_2016['risk']
    }
}

## Creating param grids

In [5]:
logistic_params = {
    'penalty':['l2'],
    'C':[0.1, 0.5, 1.0],
    'random_state':[42],
    'max_iter':[100, 200, 300, 500],
    'solver': ['lbfgs']
}

In [11]:
svm_params = {
    'C':[0.1, 0.5, 1.0],
    'kernel':['linear', 'poly', 'rbf'],
    'gamma':['auto', 'scale'],
    'max_iter':[100, 200, 300, 500, 1000],
    'random_state':[42]
}

In [7]:
boost_params ={
    'n_estimators':[30, 50, 100, 150, 200],
    'learning_rate':[0.1, 0.5, 0.7, 1.0],
    'random_state':[42]
}

In [8]:
rf_params = {
    'n_estimators':[30, 50, 100, 150, 200],
    'criterion':['gini', 'entropy'],
    'random_state':[42]
}

In [12]:
logistic_grid = list(ParameterGrid(logistic_params))
svm_grid = list(ParameterGrid(svm_params))
boost_grid = list(ParameterGrid(boost_params))
rf_grid = list(ParameterGrid(rf_params))

## Testing different models for different splits

In [13]:
diff_models = {
    'logistic':{'model':LogisticRegression, 'grid':logistic_grid},
    'SMV':{'model':SVC, 'grid':svm_grid},
    'boost':{'model':AdaBoostClassifier, 'grid':boost_grid},
    'rf':{'model':RandomForestClassifier, 'grid':rf_grid}
}

In [51]:
split1_results = cl.different_models(diff_models, splits['split_1'])

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)






In [52]:
split2_results = cl.different_models(diff_models, splits['split_2'])







In [53]:
split3_results = cl.different_models(diff_models, splits['split_3'])







In [54]:
cl.average_df([split1_results, split2_results, split3_results], 'AUC')

Unnamed: 0,Accuracy,Precision,Recall,AUC,F1
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 150, random_state: 42",0.688805,0.458022,0.559497,0.649886,0.503093
"<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>.- learning_rate: 0.1, n_estimators: 100, random_state: 42",0.687970,0.434872,0.558728,0.646779,0.489040
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 200, random_state: 42",0.685464,0.458022,0.551734,0.645393,0.500247
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: gini, n_estimators: 200, random_state: 42",0.683793,0.478580,0.545894,0.644663,0.509758
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: gini, n_estimators: 150, random_state: 42",0.683793,0.460329,0.548122,0.643610,0.500083
"<class 'sklearn.ensemble.forest.RandomForestClassifier'>.- criterion: entropy, n_estimators: 100, random_state: 42",0.683793,0.427666,0.552059,0.641889,0.481797
"<class 'sklearn.svm.classes.SVC'>.- C: 0.5, gamma: scale, kernel: rbf, max_iter: 1000, random_state: 42",0.684628,0.391353,0.559128,0.641720,0.460389
"<class 'sklearn.svm.classes.SVC'>.- C: 0.5, gamma: auto, kernel: rbf, max_iter: 1000, random_state: 42",0.684628,0.391353,0.559128,0.641720,0.460389
"<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>.- learning_rate: 0.5, n_estimators: 30, random_state: 42",0.680451,0.466856,0.543945,0.641514,0.501488
"<class 'sklearn.linear_model.logistic.LogisticRegression'>.- C: 0.1, max_iter: 300, penalty: l2, random_state: 42, solver: lbfgs",0.680869,0.448306,0.547876,0.641459,0.491345


In [29]:
best_model_params = rf_grid[8]

In [30]:
mod_split1 = cl.specific_model(RandomForestClassifier, splits['split_1'], best_model_params)
mod_split2 = cl.specific_model(RandomForestClassifier, splits['split_2'], best_model_params)
mod_split3 = cl.specific_model(RandomForestClassifier, splits['split_3'], best_model_params)

In [78]:
imp_1 = cl.important_features(mod_split1)
imp_2 = cl.important_features(mod_split2)
imp_3 = cl.important_features(mod_split3)

In [79]:
cl.average_df([imp_1, imp_2, imp_3], 'imp')[:10]

Unnamed: 0,imp
22,0.063846
17,0.04989
10,0.04618
18,0.04341
28,0.039206
27,0.039132
21,0.039024
15,0.038607
16,0.03858
3,0.038136


In [64]:
features = data_2014.columns[1:]

In [76]:
for i in [22, 17, 10, 18, 28, 27, 21, 15, 16, 3]:
    print(features[i])

S1501_C02_015E
S1101_C01_002E
garbage
S1401_C02_001E
S2301_C03_001E
S2201_C02_001E
S1501_C02_008E
street_lights
S0101_C01_001E
burglary


In [None]:
"Estimate!!Percent!!Population 25 years and over!!Percent bachelor's degree or higher"
'Estimate!!Total!!HOUSEHOLDS!!Average household size'
'garbage'
"percentage_enrolled_school"
'Employment/Population Ratio!!Estimate!!Population 16 years and over'
'Estimate!!Percent!!Households. FOOD STAMPS/SUPPLEMENTAL NUTRITION ASSISTANCE PROGRAM (SNAP)'
'Percent!!Estimate!!Population 25 years and over!!9th to 12th grade, no diploma'
'street_lights'
'Total!!Estimate!!Total population'
'burglary'

In [None]:
"Estimate!!Percent!!Population 25 years and over!!Percent bachelor's degree or higher"
'Estimate!!Total!!HOUSEHOLDS!!Average household size'
'Estimate!!Percent!!Population 3 years and over enrolled in school'
'Estimate!!Percent!!Households. FOOD STAMPS/SUPPLEMENTAL NUTRITION ASSISTANCE PROGRAM (SNAP)'
'Estimate!!Percent!!Population 25 years and over!!9th to 12th grade no diploma'
'Estimate!!Total!!Total population'
'Estimate!!Employment/Population Ratio!!Population 16 years and over'
'building_violations'
'garbage'
'weapons_violation'

In [80]:
for i in [19, 23, 8, 20, 7, 24, 5, 4, 12, 9]:
    print(features[i])

S1401_C02_030E
S1601_C02_003E
homicides
S1501_C02_002E
sexual_assault
S1701_C03_001E
public_peace_violation
robbery
abandon_vehicles
rodents


In [None]:
'Estimate!!Percent!!Population 18 to 24 years!!Enrolled in college or graduate school'
'Estimate!!Percent!!Population 5 years and over!!Speak a language other than English'
'homicides'
'Estimate!!Percent!!Population 18 to 24 years!!Less than high school graduate'
'sexual_assault'
'Estimate!!Percent below poverty level!!Population for whom poverty status is determined'
'public_peace_violation'
'robbery'
'abandon_vehicles'
'rodents'

In [84]:
inputs_2017 = pd.read_csv('clean_data/complete_2017.csv')
print(inputs_2017.columns)
tracts = inputs_2017['tract'].reset_index()
inputs_2017.drop('tract', axis=1, inplace=True)
inputs_2017 = cl.standardrize(inputs_2017)

Index(['tract', 'buss_licences', 'building_violations', 'vehicle_theft',
       'burglary', 'robbery', 'public_peace_violation', 'weapons_violation',
       'sexual_assault', 'homicides', 'rodents', 'garbage', 'sanitation',
       'abandon_vehicles', 'pot_holes', 'tree_trims', 'street_lights',
       'S0101_C01_001E', 'S1101_C01_002E', 'S1401_C02_001E', 'S1401_C02_030E',
       'S1501_C02_002E', 'S1501_C02_008E', 'S1501_C02_015E', 'S1601_C02_003E',
       'S1701_C03_001E', 'S1810_C03_001E', 'S1901_C01_012E', 'S2201_C02_001E',
       'S2301_C03_001E', 'S2701_C03_001E'],
      dtype='object')


In [97]:
proba_1 = pd.DataFrame(mod_split1.predict_proba(inputs_2017))[[1]]
proba_2 = pd.DataFrame(mod_split2.predict_proba(inputs_2017))[[1]]
proba_3 = pd.DataFrame(mod_split3.predict_proba(inputs_2017))[[1]]

In [98]:
proba_avg = cl.average_df([proba_1, proba_2, proba_3], 1)

In [99]:
probs_2019 = proba_avg.reset_index()
probs_2019 = probs_2019.merge(tracts, on='index')
probs_2019.drop('index', axis=1, inplace = True)
probs_2019.columns = ['probability', 'tract']
probs_2019

Unnamed: 0,probability,tract
0,0.762222,690400
1,0.711111,671100
2,0.711111,530501
3,0.708889,671900
4,0.704444,491300
5,0.700000,711400
6,0.688889,671800
7,0.684444,660400
8,0.680000,431301
9,0.677778,671500


In [110]:
tracts_geo = gpd.read_file('raw_data\geo_export_fe9f2155-ba22-4697-91ff-daeee48c8d0b.shp')

In [121]:
tracts_geo = tracts_geo[['tractce10', 'geometry']]
tracts_geo['tractce10'] = tracts_geo['tractce10'].astype(int)
abandon_2019_pred = tracts_geo.merge(probs_2019, left_on='tractce10', right_on='tract', how='right')

In [125]:
abandon_2019_pred.to_file('clean_data/abandon_2019_pred.shp')