In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.metrics import *
from sklearn.model_selection import ParameterGrid

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

In [2]:
########################################
##### Data processing 
########################################

# Load data 
raw_df = pd.read_csv('../data/block-groups_pa.csv')

# Limit to Philadelphia and 2002-2016
raw_df = raw_df[raw_df['parent-location'] == 'Philadelphia County, Pennsylvania']
raw_df = raw_df[raw_df['year'] >= 2002]

# Create evictions_t-1 feature  
evictions = raw_df[['GEOID', 'year', 'evictions']]
evictions_wide = evictions.pivot(index='GEOID', columns='year', values='evictions')
merged_df = pd.merge(evictions, evictions_wide, left_on='GEOID', right_index=True)
for year in range(2003, 2017): 
    previous = year - 1 
    merged_df.loc[merged_df['year'] == year, 'evictions_t-1'] = merged_df[previous]
df = merged_df[['GEOID', 'year', 'evictions', 'evictions_t-1']]
df = df[df['year'] > 2002]

# Create binary target 
df['evictions_over10'] = np.where(df['evictions'] > 10, 1, 0)

# Split data 
def single_split(df, target_col, feature_cols, year): 
    df['train'] = np.where(df['year'] < year, 1, 0)
    df['test'] = np.where(df['year'] == year, 1, 0)
    X_train = df.loc[df['train']==1, feature_cols]
    X_test = df.loc[df['test']==1, feature_cols]
    y_train = df.loc[df['train']==1, target_col]
    y_test = df.loc[df['test']==1, target_col]
    return X_train, X_test, y_train, y_test 

In [3]:
########################################
##### Classification 
########################################

# Define model paramaters
target_col = 'evictions'
feature_cols = ['evictions_t-1']
classifiers = {'LR': LogisticRegression(solver='liblinear'), 'DT': DecisionTreeClassifier()}
parameters = {'LR': {'penalty': ['l1','l2'], 'C': [0.001,0.1]}, 
              'DT': {'max_depth': [5,10], 'min_samples_split': [5,10]}}
X_train, X_test, y_train, y_test = single_split(df, 'evictions_over10', ['evictions_t-1'], 2016)

# Loop over classifiers and parameters 
table = []
for clf, model in classifiers.items(): 
    parameter_values = parameters[clf]
    for p in ParameterGrid(parameter_values): 
        model.set_params(**p)
        model.fit(X_train, y_train)
        scores = model.predict_proba(X_test)
        pred_labels = [1 if x[1] > 0.5 else 0 for x in scores]
        row = [clf, p, accuracy_score(y_test, pred_labels)]
        table.append(row)

pd.DataFrame(table, columns = ['classifier', 'parameters', 'accuracy'])

Unnamed: 0,classifier,parameters,accuracy
0,LR,"{'C': 0.001, 'penalty': 'l1'}",0.853293
1,LR,"{'C': 0.001, 'penalty': 'l2'}",0.853293
2,LR,"{'C': 0.1, 'penalty': 'l1'}",0.857784
3,LR,"{'C': 0.1, 'penalty': 'l2'}",0.857784
4,DT,"{'max_depth': 5, 'min_samples_split': 5}",0.857784
5,DT,"{'max_depth': 5, 'min_samples_split': 10}",0.857784
6,DT,"{'max_depth': 10, 'min_samples_split': 5}",0.857784
7,DT,"{'max_depth': 10, 'min_samples_split': 10}",0.857784


In [4]:
########################################
##### Regression 
########################################

# Define model paramaters
target_col = 'evictions'
feature_cols = ['evictions_t-1']
regressions = {'LR': LinearRegression(), 
               'SVR': LinearSVR(), 
               'DTR': DecisionTreeRegressor(), 
               'RFR': RandomForestRegressor()}
parameters = {'LR': {'fit_intercept': [True, False]}, 
              'SVR': {'tol': [0.0001, 0.001], 'C': [0.001,0.1]},
              'DTR': {'max_depth': [5,10], 'min_samples_split': [5,10]}, 
              'RFR': {'n_estimators': [10,100], 'max_depth': [5,10], 'min_samples_split': [5,10]}}
X_train, X_test, y_train, y_test = single_split(df, 'evictions_over10', ['evictions_t-1'], 2016)

# Loop over classifiers and parameters 
table = []
for clf, model in regressions.items(): 
    parameter_values = parameters[clf]
    for p in ParameterGrid(parameter_values): 
        model.set_params(**p)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)        
        row = [clf, 
               p, 
               mean_squared_error(y_test, y_pred), 
               explained_variance_score(y_test, y_pred), 
               r2_score(y_test, y_pred)]
        table.append(row)

pd.DataFrame(table, columns = ['classifier', 'parameters', 'mean_squared_error', 'explained_variance', 'r2_score'])

Unnamed: 0,classifier,parameters,mean_squared_error,explained_variance,r2_score
0,LR,{'fit_intercept': True},0.114474,0.378168,0.378148
1,LR,{'fit_intercept': False},0.114463,0.378248,0.37821
2,SVR,"{'C': 0.001, 'tol': 0.0001}",0.13183,0.364122,0.283871
3,SVR,"{'C': 0.001, 'tol': 0.001}",0.131909,0.363938,0.283441
4,SVR,"{'C': 0.1, 'tol': 0.0001}",0.134609,0.357387,0.268772
5,SVR,"{'C': 0.1, 'tol': 0.001}",0.130991,0.366045,0.288429
6,DTR,"{'max_depth': 5, 'min_samples_split': 5}",0.101399,0.449177,0.449177
7,DTR,"{'max_depth': 5, 'min_samples_split': 10}",0.101399,0.449177,0.449177
8,DTR,"{'max_depth': 10, 'min_samples_split': 5}",0.101354,0.44942,0.44942
9,DTR,"{'max_depth': 10, 'min_samples_split': 10}",0.101354,0.44942,0.44942
