In [18]:
from sklearn.tree import export_graphviz, DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.inspection import permutation_importance, plot_partial_dependence

from scipy.special import comb
from scipy import stats


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline


In [6]:
# Read the pre-built feature table from the working directory and show it.
df = pd.read_csv(filepath_or_buffer='feature_df.csv')
df

Unnamed: 0.1,Unnamed: 0,census_tract,council_district_code,district,primary_type,unique_key,zipcode,severe,North_South,month,day,day_name,season,cluster
0,0,6.03,9.0,B,Theft,2.015505e+10,78705.0,False,South,12.0,13.0,Sunday,4.0,7.0
1,1,24.21,5.0,F,Theft,2.015246e+10,78748.0,False,South,9.0,3.0,Thursday,3.0,6.0
2,4,3.02,9.0,B,Theft: Shoplifting,2.014941e+09,78751.0,False,North,4.0,4.0,Friday,2.0,7.0
3,5,24.03,3.0,D,Theft,2.015209e+08,78745.0,False,South,1.0,2.0,Friday,1.0,0.0
4,6,24.13,2.0,F,Theft,2.015209e+10,78744.0,False,South,7.0,28.0,Tuesday,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78228,159453,18.39,7.0,E,Theft: All Other Larceny,2.014851e+09,78753.0,False,North,3.0,26.0,Wednesday,1.0,2.0
78229,159456,17.54,7.0,A,Aggravated Assault,2.014362e+10,78759.0,True,North,12.0,28.0,Sunday,4.0,1.0
78230,159459,24.31,2.0,F,Theft: All Other Larceny,2.014502e+10,78744.0,False,South,6.0,3.0,Tuesday,2.0,8.0
78231,159460,17.22,10.0,A,Theft,2.015506e+10,78759.0,False,North,12.0,30.0,Wednesday,4.0,1.0


In [13]:
# Derive the pre-feature frame directly from the raw load `df`:
#   1. drop the `unique_key` and `census_tract` columns,
#   2. rename `Unnamed: 0` (presumably an index saved into the CSV) to
#      `original_idx`.
# Both steps are chained from `df` so the cell runs on a fresh kernel. The
# previous version renamed `feature_df`, which is not defined until a later
# cell, and in doing so discarded the result of the drop.
pre_feature_df = (
    df
    .drop(columns=['unique_key', 'census_tract'])
    .rename(columns={'Unnamed: 0': 'original_idx'})
)
pre_feature_df

Unnamed: 0,original_idx,council_district_code,district,primary_type,zipcode,severe,North_South,month,day,day_name,season,cluster
0,0,9.0,B,Theft,78705.0,False,South,12.0,13.0,Sunday,4.0,7.0
1,1,5.0,F,Theft,78748.0,False,South,9.0,3.0,Thursday,3.0,6.0
2,4,9.0,B,Theft: Shoplifting,78751.0,False,North,4.0,4.0,Friday,2.0,7.0
3,5,3.0,D,Theft,78745.0,False,South,1.0,2.0,Friday,1.0,0.0
4,6,2.0,F,Theft,78744.0,False,South,7.0,28.0,Tuesday,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
78228,159453,7.0,E,Theft: All Other Larceny,78753.0,False,North,3.0,26.0,Wednesday,1.0,2.0
78229,159456,7.0,A,Aggravated Assault,78759.0,True,North,12.0,28.0,Sunday,4.0,1.0
78230,159459,2.0,F,Theft: All Other Larceny,78744.0,False,South,6.0,3.0,Tuesday,2.0,8.0
78231,159460,10.0,A,Theft,78759.0,False,North,12.0,30.0,Wednesday,4.0,1.0


In [14]:
# Modelling frame: pre_feature_df without the `original_idx` column.
# `drop` returns a new frame, so pre_feature_df itself is left untouched.
feature_df = pre_feature_df.drop(columns=['original_idx'])
feature_df

Unnamed: 0,council_district_code,district,primary_type,zipcode,severe,North_South,month,day,day_name,season,cluster
0,9.0,B,Theft,78705.0,False,South,12.0,13.0,Sunday,4.0,7.0
1,5.0,F,Theft,78748.0,False,South,9.0,3.0,Thursday,3.0,6.0
2,9.0,B,Theft: Shoplifting,78751.0,False,North,4.0,4.0,Friday,2.0,7.0
3,3.0,D,Theft,78745.0,False,South,1.0,2.0,Friday,1.0,0.0
4,2.0,F,Theft,78744.0,False,South,7.0,28.0,Tuesday,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
78228,7.0,E,Theft: All Other Larceny,78753.0,False,North,3.0,26.0,Wednesday,1.0,2.0
78229,7.0,A,Aggravated Assault,78759.0,True,North,12.0,28.0,Sunday,4.0,1.0
78230,2.0,F,Theft: All Other Larceny,78744.0,False,South,6.0,3.0,Tuesday,2.0,8.0
78231,10.0,A,Theft,78759.0,False,North,12.0,30.0,Wednesday,4.0,1.0


In [23]:
# Set up the train/test split with one-hot encoding.
# Every column except the target `severe` is one-hot encoded, including
# `primary_type`.
one_hot = OneHotEncoder()
y0 = feature_df.severe

X0 = one_hot.fit_transform(feature_df.drop(columns='severe'))

# 80/20 split with a fixed seed so the partition (and therefore the reported
# score) is reproducible under Restart & Run All.
X0_train, X0_test, y0_train, y0_test = train_test_split(
    X0, y0, test_size=.2, random_state=0
)

In [26]:
# Train and fit the model on the one-hot encoded training split.
rf0 = RandomForestClassifier(random_state=0, n_jobs=-1)
rf0.fit(X0_train, y0_train)

# Test prediction: predict once on the held-out split and score those same
# predictions. `rf0.score(X0_test, y0_test)` reports the same mean accuracy
# but would run a second, redundant prediction pass over X0_test.
pred = rf0.predict(X0_test)
print('Accuracy score: {:.3}'.format(accuracy_score(y0_test, pred)))

Accuracy score: 1.0


In [27]:
### Try taking out primary_type, since severe is derived from it
y1 = feature_df.severe

# NOTE: this re-fits the shared `one_hot` encoder on the reduced column set,
# so after this cell its learned categories describe X1, not X0.
X1 = one_hot.fit_transform(feature_df.drop(columns=['severe', 'primary_type']))

# 80/20 split with a fixed seed so the partition is reproducible on re-run.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=.2, random_state=0
)


In [28]:
# Train and fit the model on the split that excludes primary_type.
rf1 = RandomForestClassifier(random_state=0, n_jobs=-1)
rf1.fit(X1_train, y1_train)

# Test prediction: predict once on the held-out split and score those same
# predictions. `rf1.score(X1_test, y1_test)` reports the same mean accuracy
# but would run a second, redundant prediction pass over X1_test.
pred = rf1.predict(X1_test)
print('Accuracy score: {:.3}'.format(accuracy_score(y1_test, pred)))

Accuracy score: 0.776


In [29]:
# Class balance of the target column, as a reference point for the accuracy scores.
feature_df['severe'].value_counts()

False    57572
True     20661
Name: severe, dtype: int64

In [None]:
# TODO: Confusion matrix, Gini plot
# TODO: Group the data and predict by group