### Dataset: training, validation and test 
k-fold cross-validation (CV) is used to estimate performance on held-out data and to reduce the risk of overfitting.

In [226]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression, mutual_info_regression
import matplotlib.pyplot as plt

import time

import warnings
warnings.filterwarnings("ignore")

### Pre-process dataset
`crime_prep.csv` contains many missing values in columns 2 (county) and 3 (community).
How missing data is handled depends on why it is missing:
1) if the values are missing at random, we can drop the affected data;
2) if the values are not missing at random (e.g., the missingness depends on other variables), we can use imputation to reduce bias.


In [227]:
# Load the pre-processed crime dataset; only the CSV read itself is timed.
start_time = time.time()
df_prep = pd.read_csv('crime_prep.csv')
print("--- %s seconds ---" % (time.time() - start_time))

# Remove non-predictive attributes, such as state, county, community,
# community name and fold.
df_prep = df_prep.drop(['v_cont_0', 'v_cat_0', 'v_cat_1', 'v_cat_2', 'v_cat_3'], axis=1)

# Keep an untouched copy of the data as it was before imputation.
df_impute = df_prep.copy()

# Fill every missing value (NaN) with the mean of its column.
# This is done with pandas rather than `sklearn.preprocessing.Imputer`: that
# class is not available in newer scikit-learn releases, and staying in pandas
# keeps the column labels without a round trip through a bare NumPy array.
# The float64 cast mirrors the all-float result the imputer used to return.
df = df_impute.astype("float64")
df = df.fillna(df.mean())

# A column with no observed values has no mean and would stay NaN; fail here
# with a clear message instead of deep inside a later model fit.
assert not df.isna().any().any(), "mean imputation left NaN values (column with no observed data?)"

# Build features and target dataframes.
features = df.drop(["target"], axis=1)
target = df[["target"]]

--- 0.05208706855773926 seconds ---


### Feature Selection: Decision Tree

In [217]:
# Number of top-ranked features to report.
KEEP_FEATURES = 5

# Fit a regression tree on all features. The seed is fixed because the tree
# randomly permutes candidate features at each split, so an unseeded fit can
# produce a different importance ranking on every run.
DT = tree.DecisionTreeRegressor(random_state=0)
DT = DT.fit(features, target)

# In-sample predictions; not used further in this cell.
DT_predict = DT.predict(features)

# 10-fold cross-validation. No `scoring` argument is passed, so each fold
# reports the regressor's default score (R^2) -- that is what the "Accuracy"
# label printed below actually shows.
scores = cross_val_score(DT, features, target, cv=10)
print("cross validation score: ", scores)
print("Accuracy: %0.2f(+/- %0.2f)"%(scores.mean(), scores.std()*2))

# (feature name, importance) pairs from the fitted tree, most important first.
importantFeatures = sorted(zip(features.columns, DT.feature_importances_), key=lambda x: x[1], reverse=True)

# The same ranking as a table, truncated to the top KEEP_FEATURES rows.
df_DT = pd.DataFrame(
    DT.feature_importances_, columns=["Importance"], index=features.columns
).sort_values(['Importance'], ascending=False)[:KEEP_FEATURES]
print(df_DT)

cross validation score:  [0.34237567 0.19621919 0.33110611 0.20768963 0.22935667 0.35320383
 0.3367789  0.08079969 0.2004463  0.23229297]
Accuracy: 0.25(+/- 0.17)
           Importance
v_cont_55    0.423790
v_cont_49    0.078133
v_cont_48    0.056849
v_cont_8     0.031982
v_cont_95    0.021413


### Feature Selection: SelectKBest

In [228]:
# Univariate feature selection: score every feature against the target with
# f_regression and keep the KEEP_FEATURES highest-scoring ones.
# (mutual_info_regression is an alternative scoring function worth comparing.)
KEEP_FEATURES = 5
importantFeatures = SelectKBest(score_func=f_regression, k=KEEP_FEATURES)

# fit() returns the selector itself, so fitting and transforming are chained.
X_new = importantFeatures.fit(features, target).transform(features)

# Positional indices of the selected columns within `features`.
print(importantFeatures.get_support(indices=True))

# Score table for the KEEP_FEATURES best features; being the last expression
# of the cell, it is what the notebook displays.
score_table = pd.DataFrame(
    {"feature_names": features.columns, "scores": importantFeatures.scores_}
)
score_table.sort_values("scores", ascending=False).head(KEEP_FEATURES)


[ 3 43 44 45 50]


Unnamed: 0,feature_names,scores
44,v_cont_49,2388.618688
50,v_cont_55,2381.9832
43,v_cont_48,1987.05731
3,v_cont_8,1758.772273
45,v_cont_50,1588.37986


### Training models

In [230]:
# Cross-validate `dt_estimator` on (X_train, y_train) using 10 folds, then
# report the per-fold scores and their mean with a +/- two-standard-deviation band.
# NOTE(review): dt_estimator, X_train and y_train are not defined in any cell
# visible here -- confirm they are created earlier so a fresh-kernel run works.
scores = cross_val_score(estimator=dt_estimator, X=X_train, y=y_train, cv=10)
print("cross validation score: ", scores)
print("Accuracy: %0.2f(+/- %0.2f)" % (np.mean(scores), 2 * np.std(scores)))

cross validation score:  [0.31708349 0.26977677 0.36592821 0.12890328 0.25495498 0.30962938
 0.44268113 0.14334469 0.29081643 0.22252838]
Accuracy: 0.27(+/- 0.18)


In [225]:
%reset out

Once deleted, variables cannot be recovered. Proceed (y/[n])? y
Flushing output cache (14 entries)
