In [1]:
import pipeline

from sklearn.metrics import *

import pandas as pd
import numpy as np
from pipeline import analyze_best_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by later cells will not be shown either.
warnings.filterwarnings(action='ignore')

# Remove pandas' display limits so frames are rendered without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Setup: load the merged dataset and derive the train/test frames used below.
df = pd.read_csv('data/final_merged_df.csv')

# Split on the 'year_evictions' column (presumably one split per year — see
# pipeline.split_all_years), then clean every split before labeling any of them.
splits = pipeline.split_all_years(df, colname='year_evictions')
cleaned_splits = list(map(pipeline.clean_split, splits))
labeled_splits = [
    pipeline.label(cleaned, lower_bound=15, drop_column=True)
    for cleaned in cleaned_splits
]

# Only the last labeled split is used by the classifier cells.
train_df, test_df = labeled_splits[-1]

In [None]:
# Best classifier: run the 'GB' model with its chosen hyperparameters on the
# labeled train/test frames.
# NOTE(review): no random seed is supplied here; with subsample < 1 the fit is
# presumably stochastic — confirm whether pipeline.run_one_clf fixes a seed.
params = dict(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=1000,
    subsample=0.5,
)
best_clf, clf_scores = pipeline.run_one_clf(
    train_df,
    test_df,
    'GB',
    params,
    # presumably columns to keep out of the feature set — TODO confirm
    col_blacklist=['GEOID', 'year_evictions'],
)

In [None]:
# Select blocks from the classifier scores and write them to disk.
# NOTE(review): 0.16 is an unexplained constant (presumably the share of blocks
# to select); the same value is passed to clf_reg_comparison later on.
selected_blocks = analyze_best_model.select_k_blocks(
    clf_scores,
    0.16,
    ['score'],
    ['GEOID'],
)
selected_blocks.to_csv('results/selected_blocks.csv', index=False)

In [None]:
# Precision-recall curve for the classifier scores; the second argument is the
# path of the PNG (presumably where the figure is saved — TODO confirm).
pipeline.plot_precision_recall_n(
    clf_scores,
    'results/pr_curve.png',
)

In [None]:
# Classifier feature importance: save the full table, then display the first
# ten rows rounded to two decimals.
# NOTE(review): test_df.columns may still include the columns named in
# col_blacklist when the model was run — confirm that feature_importance lines
# the names up with the features the model actually used.
clf_importance = analyze_best_model.feature_importance(
    best_clf,
    test_df.columns,
)
clf_importance.to_csv('results/clf_feature_importance.csv', index=False)
clf_importance.head(10).round(2)

In [None]:
# Best regressor: run the 'DTR' model on the last split taken from
# cleaned_splits (not labeled_splits).
# Note: this rebinds train_df / test_df, so any cell executed after this one
# sees these frames instead of the labeled ones from the setup cell.
train_df, test_df = cleaned_splits[-1]
params = dict(
    max_depth=50,
    max_features=None,
    min_samples_split=10,
)
best_reg, reg_scores = pipeline.run_one_reg(
    train_df,
    test_df,
    'DTR',
    params,
    # presumably columns to keep out of the feature set — TODO confirm
    col_blacklist=['GEOID', 'year_evictions'],
)

In [None]:
# Regressor feature importance: save the full table, then display the first
# ten rows rounded to two decimals.
# NOTE(review): test_df.columns may still include the columns named in
# col_blacklist when the model was run — confirm the names line up with the
# features the regressor actually used.
reg_importance = analyze_best_model.feature_importance(
    best_reg,
    test_df.columns,
)
reg_importance.to_csv('results/reg_feature_importance.csv', index=False)
reg_importance.head(10).round(2)

In [None]:
# Compare the classifier against the regressor, save the result, and display
# the first ten rows rounded to two decimals.
# Note: when cells run top to bottom, test_df here is the frame bound in the
# regressor cell (from cleaned_splits), not the labeled one.
# NOTE(review): 0.16 repeats the constant passed to select_k_blocks earlier.
comparison = analyze_best_model.clf_reg_comparison(
    best_clf,
    clf_scores,
    best_reg,
    reg_scores,
    test_df,
    0.16,
)
comparison.to_csv('results/clf_reg_comparison.csv', index=False)
comparison.head(10).round(2)