In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.metrics import *
from sklearn.model_selection import ParameterGrid
from sklearn.calibration import CalibratedClassifierCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# warnings.filterwarnings('ignore')

In [2]:
########################################
##### Data processing 
########################################

# Load data 
raw_df = pd.read_csv('../data/block-groups_pa.csv')

# Limit to Philadelphia and 2002-2016
raw_df = raw_df[raw_df['parent-location'] == 'Philadelphia County, Pennsylvania']
raw_df = raw_df[raw_df['year'] >= 2002]

# Create evictions_t-1 feature  
evictions = raw_df[['GEOID', 'year', 'evictions']]
evictions_wide = evictions.pivot(index='GEOID', columns='year', values='evictions')
merged_df = pd.merge(evictions, evictions_wide, left_on='GEOID', right_index=True)
for year in range(2003, 2017): 
    previous = year - 1 
    merged_df.loc[merged_df['year'] == year, 'evictions_t-1'] = merged_df[previous]
df = merged_df[['GEOID', 'year', 'evictions', 'evictions_t-1']]
df = df[df['year'] > 2002]

# Create binary target 
df['evictions_over10'] = np.where(df['evictions'] > 10, 1, 0)

# Split data 
def single_split(df, target_col, feature_cols, year): 
    df['train'] = np.where(df['year'] < year, 1, 0)
    df['test'] = np.where(df['year'] == year, 1, 0)
    X_train = df.loc[df['train']==1, feature_cols]
    X_test = df.loc[df['test']==1, feature_cols]
    y_train = df.loc[df['train']==1, target_col]
    y_test = df.loc[df['test']==1, target_col]
    return X_train, X_test, y_train, y_test 

In [3]:
########################################
##### Classification 
########################################

seed = 1234 
clfs = {'LR':  LogisticRegression(solver='liblinear', random_state=seed),
        'KNN': KNeighborsClassifier(), 
        'DT':  DecisionTreeClassifier(random_state=seed), 
        'SVM': SVC(kernel='linear', probability=True, random_state=seed), 
        'RF':  RandomForestClassifier(random_state=seed), 
        'GB':  GradientBoostingClassifier(random_state=seed), 
        'AB':  AdaBoostClassifier(random_state=seed), 
        'NB':  GaussianNB(), 
        'ET':  ExtraTreesClassifier(random_state=seed),
        'BC':  BaggingClassifier(random_state=seed, bootstrap=True)} 

clf_small_grid = {'LR':  {'penalty': ['l1','l2'], 'C': [0.01,0.1]}, 
                  'KNN': {'n_neighbors': [5,10],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}, 
                  'DT':  {'criterion': ['gini', 'entropy'], 'max_depth': [5,50], 'max_features': [None],'min_samples_split': [5,10]}, 
                  'SVM': {'C' : [0.01,0.1]}, 
                  'RF':  {'n_estimators': [100,1000], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [5,10]}, 
                  'GB':  {'n_estimators': [100,1000], 'learning_rate' : [0.01,0.05],'subsample' : [0.1,0.5], 'max_depth': [5,10]}, 
                  'AB':  {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [100,1000]},
                  'NB':  {}, 
                  'ET':  {'n_estimators': [100,1000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [5,10], 'max_features': ['sqrt','log2'],'min_samples_split': [5,10]}, 
                  'BC':  {'n_estimators': [100,1000]}}

clf_large_grid = {'LR':  {'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]}, 
                  'KNN': {'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']}, 
                  'DT':  {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]}, 
                  'SVM': {'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10]}, 
                  'RF':  {'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]}, 
                  'GB':  {'n_estimators': [1,10,100,1000,10000], 'learning_rate' : [0.001,0.01,0.05,0.1,0.5],'subsample' : [0.1,0.5,1.0], 'max_depth': [1,3,5,10,20,50,100]}, 
                  'AB':  {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
                  'NB':  {}, 
                  'ET':  {'n_estimators': [1,10,100,1000,10000], 'criterion' : ['gini', 'entropy'] ,'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]}, 
                  'BC':  {'n_estimators': [1,10,100,1000,10000]}}

# Define model paramaters
target_col = 'evictions'
feature_cols = ['evictions_t-1']
X_train, X_test, y_train, y_test = single_split(df, 'evictions_over10', ['evictions_t-1'], 2016)

# Loop over classifiers and parameters 
table = []
for c, model in clfs.items(): 
    parameter_values = clf_small_grid[c]
    for p in ParameterGrid(parameter_values): 
        model.set_params(**p)
        model.fit(X_train, y_train)
        scores = model.predict_proba(X_test)
        pred_labels = [1 if x[1] > 0.5 else 0 for x in scores]
        row = [c, p, accuracy_score(y_test, pred_labels)]
        table.append(row)

pd.DataFrame(table, columns = ['classifier', 'parameters', 'accuracy'])

Unnamed: 0,classifier,parameters,accuracy
0,LR,"{'C': 0.01, 'penalty': 'l1'}",0.857784
1,LR,"{'C': 0.01, 'penalty': 'l2'}",0.857784
2,LR,"{'C': 0.1, 'penalty': 'l1'}",0.857784
3,LR,"{'C': 0.1, 'penalty': 'l2'}",0.857784
4,KNN,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.859281
5,KNN,"{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.859281
6,KNN,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.851048
7,KNN,"{'algorithm': 'auto', 'n_neighbors': 10, 'weig...",0.851048
8,KNN,"{'algorithm': 'ball_tree', 'n_neighbors': 5, '...",0.859281
9,KNN,"{'algorithm': 'ball_tree', 'n_neighbors': 5, '...",0.859281


In [4]:
########################################
##### Regression 
########################################

# Define model paramaters

regs = {'LR': LinearRegression(), 
        'SVR': LinearSVR(), 
        'DTR': DecisionTreeRegressor(), 
        'RFR': RandomForestRegressor()}

reg_small_grid = {'LR': {}, 
                  'SVR': {'C' :[0.01,0.1]},
                  'DTR': {'max_depth': [5,50], 'max_features': [None],'min_samples_split': [2,5,10]}, 
                  'RFR': {'n_estimators': [100,1000], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]}}

reg_large_grid = {'LR': {}, 
                  'SVR': {'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10]},
                  'DTR': {'max_depth': [1,5,10,20,50,100], 'max_features': [None],'min_samples_split': [2,5,10]}, 
                  'RFR': {'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]}}


target_col = 'evictions'
feature_cols = ['evictions_t-1']
X_train, X_test, y_train, y_test = single_split(df, 'evictions_over10', ['evictions_t-1'], 2016)

# Loop over classifiers and parameters 
table = []
for reg, model in regs.items(): 
    parameter_values = reg_small_grid[reg]
    for p in ParameterGrid(parameter_values): 
        model.set_params(**p)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)        
        row = [reg, 
               p, 
               mean_squared_error(y_test, y_pred), 
               explained_variance_score(y_test, y_pred), 
               r2_score(y_test, y_pred)]
        table.append(row)

pd.DataFrame(table, columns = ['classifier', 'parameters', 'mean_squared_error', 'explained_variance', 'r2_score'])



Unnamed: 0,classifier,parameters,mean_squared_error,explained_variance,r2_score
0,LR,{},0.114474,0.378168,0.378148
1,SVR,{'C': 0.01},0.13135,0.365229,0.286478
2,SVR,{'C': 0.1},0.133511,0.360108,0.274736
3,DTR,"{'max_depth': 5, 'max_features': None, 'min_sa...",0.101399,0.449177,0.449177
4,DTR,"{'max_depth': 5, 'max_features': None, 'min_sa...",0.101399,0.449177,0.449177
5,DTR,"{'max_depth': 5, 'max_features': None, 'min_sa...",0.101399,0.449177,0.449177
6,DTR,"{'max_depth': 50, 'max_features': None, 'min_s...",0.101354,0.449421,0.44942
7,DTR,"{'max_depth': 50, 'max_features': None, 'min_s...",0.101354,0.449421,0.44942
8,DTR,"{'max_depth': 50, 'max_features': None, 'min_s...",0.101354,0.449421,0.44942
9,RFR,"{'max_depth': 5, 'max_features': 'sqrt', 'min_...",0.101403,0.449159,0.449155
