In [1]:
import pandas as pd
import numpy as np
import pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVR

from sklearn.svm import LinearSVC

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [2]:
def top_k_blocks_clf(model, X_train, y_train, X_test, k, df=None):
    """Return the top ``k`` fraction of test rows ranked by classifier score.

    The score is column 1 of ``model.predict_proba(X_test)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_train, y_train : unused; kept so existing call sites keep working.
    X_test : features handed to ``model.predict_proba``.
    k : float
        Fraction of rows to keep; ``int(n_rows * k)`` rows are returned.
    df : pandas.DataFrame, optional
        Frame holding the ``GEOID`` and ``label`` columns for the rows of
        ``X_test``. Defaults to the notebook-level ``test_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``GEOID``, ``label`` and ``pred_scores``, highest score first.

    Notes
    -----
    The source frame is no longer modified. Previously it was sorted in place,
    so a second call attached the scores to the wrong rows because ``X_test``
    kept its original order.
    """
    if df is None:
        df = test_df

    scores = model.predict_proba(X_test)[:, 1]
    # Align by index label rather than by position so the scores stay attached
    # to the right rows even when `df` and `X_test` are ordered differently.
    if hasattr(X_test, 'index'):
        scores = pd.Series(scores, index=X_test.index)

    ranked = df.assign(pred_scores=scores).sort_values(by='pred_scores', ascending=False)
    n_top = int(ranked.shape[0] * k)
    return ranked.head(n_top)[['GEOID', 'label', 'pred_scores']]

def top_k_blocks_reg(model, X_train, y_train, X_test, k, df=None):
    """Return the top ``k`` fraction of test rows ranked by regression prediction.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : unused; kept so existing call sites keep working.
    X_test : features handed to ``model.predict``.
    k : float
        Fraction of rows to keep; ``int(n_rows * k)`` rows are returned.
    df : pandas.DataFrame, optional
        Frame holding the ``GEOID`` and ``evictions`` columns for the rows of
        ``X_test``. Defaults to the notebook-level ``test_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``GEOID``, ``evictions`` and ``preds``, highest prediction first.

    Notes
    -----
    The source frame is no longer modified. Previously it was sorted in place,
    so a second call attached the predictions to the wrong rows because
    ``X_test`` kept its original order.
    """
    if df is None:
        df = test_df

    preds = model.predict(X_test)
    # Align by index label rather than by position so predictions stay attached
    # to the right rows even when `df` and `X_test` are ordered differently.
    if hasattr(X_test, 'index'):
        preds = pd.Series(preds, index=X_test.index)

    ranked = df.assign(preds=preds).sort_values(by='preds', ascending=False)
    n_top = int(ranked.shape[0] * k)
    return ranked.head(n_top)[['GEOID', 'evictions', 'preds']]

def feature_importance_clf(model, feature_names=None):
    """Tabulate ``model.feature_importances_`` by feature, largest first.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : iterable of str, optional
        Names matching the importances in order. When omitted, the model's own
        ``feature_names_in_`` is used if present; otherwise the columns of the
        notebook-level ``X_test`` are used (the original behavior).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance``
        descending. The index keeps each feature's original position.
    """
    if feature_names is None:
        # Prefer the names recorded on the model: the global X_test is rebound
        # later in the notebook and may then describe a different feature set.
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_test.columns

    table = pd.DataFrame(list(zip(feature_names, model.feature_importances_)),
                         columns=['feature', 'importance'])
    return table.sort_values(by='importance', ascending=False)

def feature_importance_reg(model, feature_names=None):
    """Tabulate ``model.coef_`` by feature, largest absolute coefficient first.

    Parameters
    ----------
    model : fitted linear estimator exposing ``coef_``.
    feature_names : iterable of str, optional
        Names matching the coefficients in order. When omitted, the model's own
        ``feature_names_in_`` is used if present; otherwise the columns of the
        notebook-level ``X_test`` are used (the original behavior).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``coef`` and ``absv_coef``, sorted by ``absv_coef``
        descending. The index keeps each feature's original position.
    """
    if feature_names is None:
        # Prefer the names recorded on the model: the global X_test is rebound
        # later in the notebook and may then describe a different feature set.
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X_test.columns

    coefs = np.asarray(model.coef_)
    # Binary linear classifiers store coef_ with shape (1, n_features);
    # flatten that single row so it pairs one-to-one with the feature names.
    if coefs.ndim == 2 and coefs.shape[0] == 1:
        coefs = coefs[0]

    table = pd.DataFrame(list(zip(feature_names, coefs)), columns=['feature', 'coef'])
    table['absv_coef'] = table['coef'].abs()
    return table.sort_values(by='absv_coef', ascending=False)

In [3]:
# setup: read the merged CSV, split it on `year_evictions`, then clean and label each split
df = pd.read_csv('data/final_merged_df.csv')

splits = pipeline.split_by_year(df, colname='year_evictions')
# Return value is not used here — presumably reports the split boundaries; verify in `pipeline`.
pipeline.split_boundaries(splits, colname='year_evictions')

cleaned_splits = list(map(pipeline.clean_split, splits))
labeled_splits = [
    pipeline.label(cleaned, lower_bound=14, drop_column=True)
    for cleaned in cleaned_splits
]

In [4]:
# classifiers
# Train/test pair taken from split index 5 of the labeled splits.
train_df = labeled_splits[5][0]
test_df = labeled_splits[5][1]

# Drop the identifier, the split year and the target so only features remain.
X_train = train_df.drop(columns=['GEOID', 'year_evictions', 'label'])
X_test = test_df.drop(columns=['GEOID', 'year_evictions', 'label'])
y_train = train_df['label']
y_test = test_df['label']

# Fix the seed: the tree permutes features at each split, so an unseeded fit
# can differ between runs and break Restart & Run All reproducibility.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [5]:
# Rank the test rows by classifier score, keep the top 16%, and show the first ten.
clf_top_rows = top_k_blocks_clf(model, X_train, y_train, X_test, 0.16)
clf_top_rows.head(10)

Unnamed: 0,GEOID,label,pred_scores
2841,421010114005,1,1.0
6446,421010279021,1,1.0
8315,421010346001,1,1.0
2288,421010093005,0,1.0
8301,421010345022,1,1.0
3254,421010139002,1,1.0
3268,421010140001,1,1.0
6467,421010280002,1,1.0
2253,421010092002,0,1.0
1147,421010041021,1,1.0


In [6]:
# Ten features with the largest importance in the fitted classifier.
clf_importances = feature_importance_clf(model)
clf_importances.head(10)

Unnamed: 0,feature,importance
22,evictions_t-1_scaled,0.482991
23,evictions_t-2_scaled,0.094106
24,evictions_t-5_scaled,0.037501
31,median_gross_rent_scaled,0.035409
33,units_scaled,0.027941
30,renter_occupied_household_size_scaled,0.022452
32,median_household_income_scaled,0.021867
8,total_renter_households_percent,0.021075
9,vacant_units_percent,0.019868
11,num_af_am_alone_percent,0.018127


In [7]:
# regressions
# Train/test pair taken from split index 5 of the cleaned (unlabeled) splits.
# Note: this rebinds train_df, test_df, X_*, y_* and model from the classifier section.
train_df = cleaned_splits[5][0]
test_df = cleaned_splits[5][1]

# Drop the identifier, the split year and the target so only features remain.
X_train = train_df.drop(columns=['GEOID', 'year_evictions', 'evictions'])
X_test = test_df.drop(columns=['GEOID', 'year_evictions', 'evictions'])
y_train = train_df['evictions']
y_test = test_df['evictions']

# Fix the seed: the solver shuffles the data with a pseudo-random generator,
# so an unseeded fit can differ between runs.
model = LinearSVR(random_state=0).fit(X_train, y_train)

In [8]:
# Rank the test rows by predicted value, keep the top 16%, and show the first ten.
reg_top_rows = top_k_blocks_reg(model, X_train, y_train, X_test, 0.16)
reg_top_rows.head(10)

Unnamed: 0,GEOID,evictions,preds
6215,421010273003,128.0,75.099426
6978,421010301002,83.0,71.154279
8511,421010353021,49.0,52.60428
8315,421010346001,58.0,52.074319
8938,421010373003,31.0,47.997882
5389,421010243002,58.0,47.552069
6103,421010268001,58.0,46.394706
8980,421010377002,38.0,41.319368
5711,421010257001,51.0,39.834526
5291,421010238004,25.0,39.812737


In [9]:
# Ten features with the largest absolute coefficient in the fitted regression.
reg_coefficients = feature_importance_reg(model)
reg_coefficients.head(10)

Unnamed: 0,feature,coef,absv_coef
22,evictions_t-1_scaled,3.56042,3.56042
23,evictions_t-2_scaled,2.303391,2.303391
24,evictions_t-5_scaled,1.536054,1.536054
40,num_with_high_school_degree_scaled,-1.407211,1.407211
42,num_unemployed_scaled,-1.183417,1.183417
12,num_hisp_percent,1.137029,1.137029
2,subbed,0.945203,0.945203
44,violations_count_percent_binary,0.945203,0.945203
41,num_with_ged_scaled,-0.860658,0.860658
10,for_rent_units_percent,-0.789889,0.789889
