In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from collections import Counter
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassificationReport
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.random.seed(42)



In [3]:
df = pd.read_csv('../data/household_national.csv')

In [4]:
# columns to keep from df, removing weighted columns, etc.
mini_col_list = ['CONTROL', 'TOTROOMS', 'PERPOVLVL', 'COMTYPE', 'COMCOST','DBEVICLK', 'DBEVICNOTE', 'DBEVICTHT', 'DBFORCWHR', 'DBLVEFORC', 'RENT', 'BATHEXCLU', 'DISHWASH', 'SOLAR', 'GARAGE', 'DINING', 'LAUNDY', 'STORIES', 'NOSTEP', 'GUTREHB', 'CONDO', 'SEARCHFAM', 'SEARCHLIST', 'SEARCHNET', 'SEARCHOTH', 'SEARCHPUB', 'SEARCHREA', 'SEARCHSIGN', 'RMCHANGE', 'RMCOMMUTE', 'RMCOSTS', 'RMFAMILY', 'RMHOME', 'RMHOOD', 'RMJOB', 'RMOWNHH', 'RMOTHER', 'HMRACCESS', 'HMRENEFF', 'HMRSALE', 'BIKE', 'WALK', 'TRANAMT', 'COMDAYS', 'DIST', 'POOLAMT', 'RATINGHS', 'RATINGNH', 'NHQPCRIME', 'NHQPUBTRN', 'NHQRISK', 'NHQSCHOOL', 'NHQSCRIME', 'NORC', 'SUBDIV', 'NOWIRE', 'TENURE', 'MHWIDE', 'PLUGS', 'CELLPHONE', 'PORCH', 'LEAKO', 'LEAKI', 'NOTOIL', 'PAINTPEEL', 'HOWBUY', 'OCCJANUR', 'OCCFEBRU', 'OCCMARCH', 'OCCAPRIL', 'OCCMAY', 'OCCJUNE', 'OCCJULY', 'OCCAUGUST', 'OCCSEPTEM', 'OCCOCTOB', 'OCCNOVEM', 'OCCDECEM', 'OCCYRRND', 'LEADINSP', 'MONLSTOCC', 'SUITYRRND', 'TIMESHARE', 'VACRESDAYS', 'VACRNTDAYS', 'PERMNEAR', 'VACPRIRES', 'VACREC', 'VACINVEST', 'VACSELL', 'VACINHER', 'VACOTH', 'OWNLOT', 'RENTCNTRL', 'RENTSUB', 'NEARABAND', 'NEARBARCL', 'NEARTRASH', 'BUS', 'SUBWAY', 'VAN', 'DBMISSMORT', 'DBMISSRENT', 'DBUTBILL', 'DBEVICWHERE', 'INTLANG', 'DIVISION', 'OMB13CBSA', 'WEIGHT', 'SP1WEIGHT', 'SP2WEIGHT', 'INTMODE', 'INTSTATUS', 'INTMONTH', 'BLD', 'ENTRYSYS', 'HHSEX', 'HHMAR', 'HHSPAN', 'HHCITSHP', 'MILHH', 'HHAGE', 'HHMOVE', 'HHINUSYR', 'HHRACE', 'HHRACEAS', 'HHRACEPI', 'HHGRAD', 'HHNATVTY', 'HHENROLL', 'PARTNER', 'HSHLDTYPE', 'SAMEHHLD', 'NUMELDERS', 'NUMADULTS', 'NUMNONREL', 'HHYNGKIDS', 'HHOLDKIDS', 'NUMVETS', 'NUMYNGKIDS', 'NUMOLDKIDS', 'NUMSUBFAM', 'NUMSECFAM', 'NUMPEOPLE', 'HHADLTKIDS', 'GRANDHH', 'MULTIGEN', 'UFINROOMS', 'MHANCHOR', 'LOTSIZE', 'FINROOMS', 'YRBUILT', 'HOA', 'FOUNDTYPE', 'UNITFLOORS', 'UNITSIZE', 'BEDROOMS', 'KITCHENS', 'MONOXIDE', 'WATSOURCE', 'SEWUSERS', 'KITEXCLU', 'FRIDGE', 'KITCHSINK', 'WASHER', 'COOKTYPE', 'COOKFUEL', 'DRYER', 'SEWTYPE', 'BATHROOMS', 'HOTWATER', 'HEATFUEL', 'FIREPLACE', 'ACPRIMARY', 'ACSECNDRY', 'HEATTYPE', 'SUPP1HEAT', 'SUPP2HEAT', 'COLD', 'COLDEQ', 'NOWAT', 'COLDUTIL', 'COLDHTCAP', 'COLDINSUL', 'COLDCOST', 'COLDOTHER', 'LEAKOROOF', 'LEAKOBASE', 'LEAKOWALL', 'LEAKOOTH', 'LEAKIPLUM', 'LEAKIPIPE', 'LEAKIWATH', 'LEAKIOTH', 'LEAKIDK', 'WALLCRACK', 'FLOORHOLE', 'FNDCRUMB', 'ROOFSHIN', 'ROOFHOLE', 'ROOFSAG', 'WALLSIDE', 'WALLSLOPE', 'WINBOARD', 'WINBROKE', 'WINBARS', 'COLDEQFREQ', 'NOWATFREQ', 'NOTOILFREQ', 'MOLDKITCH', 'MOLDBATH', 'MOLDBEDRM', 'MOLDBASEM', 'MOLDLROOM', 'MOLDOTHER', 'RODENT', 'ROACH', 'SEWBREAK', 'FUSEBLOW', 'VACMONTHS', 'DWNPAYPCT', 'MHMOVE', 'ELECAMT', 'GASAMT', 'OILAMT', 'OTHERAMT', 'TRASHAMT', 'WATERAMT', 'UTILAMT', 'REMODJOBS', 'FS', 'HHHEAR', 'HHSEE', 'HHMEMRY', 'HHWALK', 'HHCARE', 'HHERRND', 'NUMHEAR', 'NUMSEE', 'NUMMEMRY', 'NUMWALK', 'NUMCARE', 'NUMERRND', 'DISHH', 'MVG1TYPE', 'MVG2TYPE', 'MVG3TYPE', 'MVG1TEN', 'MVG2TEN', 'MVG3TEN', 'MVG1PER', 'MVG2PER', 'MVG3PER', 'MVG1COST', 'MVG2COST', 'MVG3COST', 'MVG1STAT', 'MVG2STAT', 'MVG3STAT', 'MVG1LOC', 'MVG2LOC', 'MVG3LOC', 'SEARCHSTOP', 'MOVFORCE', 'MOVWHY', 'HRATE', 'NRATE', 'CARPOOL', 'TAXI', 'FERRY', 'DRIVEALL', 'COMPANYCAR', 'DRIVEPART', 'PARKING', 'TOLL', 'SUBSIDY', 'DPALTCOM', 'DPGENERT', 'DPDRFOOD', 'DPEMWATER', 'DPEVFIN', 'DPEVINFO', 'DPEVKIT', 'DPEVLOC', 'DPEVVEHIC', 'DPGETINFO', 'DPSHELTR', 'DPEVSEP', 'DPEVACPETS', 'DPFLDINS', 'DPMAJDIS', 'MGRONSITE', 'ADEQUACY', 'POVLVLINC', 'HUDSUB', 'UPKEEP', 'SPLITSAMP', 'VACANCY','FIRSTHOME', 'MARKETVAL', 'TOTBALAMT', 'PROTAXAMT', 'INSURAMT', 'HOAAMT', 'LOTAMT', 'MAINTAMT', 'MORTAMT', 'HINCP', 'FINCP', 'REMODAMT', 'TOTHCAMT']

In [5]:
# saving only the pertinent columns
clean_df = df[mini_col_list]

In [6]:
# removing quotation marks from df
for column in clean_df.columns:
    clean_df[column] = clean_df[column].astype(str).str.replace("'","")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# converting all values to numeric values
clean_df = clean_df.apply(pd.to_numeric, errors='ignore')

In [8]:
# recreate ADEQUACY column to make it binary
clean_df['ADEQUACY_BIN'] = clean_df.ADEQUACY.map(lambda x: 0 if x>1 else 1)

In [9]:
nh_cols = ['SUBDIV',
 'NORC',
 'NEARBARCL',
 'NEARABAND',
 'NEARTRASH',
 'RATINGNH',
 'NHQSCHOOL',
 'NHQPCRIME',
 'NHQSCRIME',
 'NHQPUBTRN',
 'NHQRISK', 'ADEQUACY_BIN']

In [10]:
clean_df = clean_df[nh_cols]

In [11]:
clean_df.to_csv('../data/ahs.csv', index=False)

In [13]:
clean_df.head(20)

Unnamed: 0,SUBDIV,NORC,NEARBARCL,NEARABAND,NEARTRASH,RATINGNH,NHQSCHOOL,NHQPCRIME,NHQSCRIME,NHQPUBTRN,NHQRISK,ADEQUACY_BIN
0,1,-6,3,3,3,10,1,2,2,2,2,1
1,2,-6,3,3,3,10,1,2,2,2,2,1
2,1,2,3,3,3,8,1,2,2,2,2,1
3,1,-6,3,3,3,10,1,2,2,1,2,1
4,1,-6,3,3,3,7,2,2,2,2,2,1
5,-6,1,3,3,3,7,-9,-9,-9,-9,2,1
6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,1
7,1,2,3,3,3,8,1,1,1,1,2,1
8,-6,-6,3,3,3,2,1,1,1,2,2,1
9,1,1,3,3,3,8,1,2,2,2,2,1


In [12]:
X = clean_df.drop(['ADEQUACY_BIN'], axis=1)
y = clean_df.ADEQUACY_BIN

X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
X_labels = list(X.columns)
y.shape

In [None]:
# always predicts the most frequent label in the training set
dummy0 = DummyClassifier(strategy='most_frequent').fit(X_train_0, y_train_0)
dummy0.fit(X_train_0, y_train_0)
dummy0.score(X_test_0, y_test_0)

In [None]:
cm = ConfusionMatrix(dummy0);
cm.score(X_test_0, y_test_0);
cm.poof();

In [None]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_res))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42, stratify=y_res)

In [None]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# clf = LogisticRegression(penalty='l1', random_state=42, solver='saga', max_iter=1000).fit(X_train, y_train)
# clf.predict(X_test)
# clf.predict_proba(X_test)
# clf.score(X_test, y_test)

In [None]:
# X = sm.add_constant(X)
# logit_model = sm.Logit(y, X)
# result = logit_model.fit_regularized(method='l1')
# result.summary()

In [None]:
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train, y_train)

rf_est_list = [estimator.get_depth() for estimator in clf_rf.estimators_]

In [None]:
rf_est_list = np.array(rf_est_list)

In [None]:
print(rf_est_list.mean(), rf_est_list.min(), rf_est_list.max(), rf_est_list.shape)

In [None]:
feats = {}
for feature, importance in zip(X_labels, clf_rf.feature_importances_):
    feats[feature] = importance

importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'importance'})
importances.sort_values(by='importance', inplace=True, ascending=False)
importances.head(20)

In [None]:
y_train_hat = clf_rf.predict(X_train)
print(classification_report(y_train, y_train_hat))

In [None]:
y_test_hat = clf_rf.predict(X_test)
clf_rf.score(X_test, y_test)

In [None]:
cm = ConfusionMatrix(clf_rf);
cm.score(X_test, y_test);
cm.poof();

In [None]:
print(classification_report(y_test, y_test_hat))