### Dataset: training, validation and test 
k-fold cross-validation (CV) is used to estimate how well the model generalizes and to reduce the risk of overfitting to a single train/test split.

In [None]:
import time
import warnings

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree, metrics
from sklearn import model_selection
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

try:
    # Legacy imputer: removed in scikit-learn 0.22 in favour of
    # sklearn.impute.SimpleImputer. Import it only where it still exists so
    # this cell keeps running on current releases.
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = None

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.filterwarnings("ignore")

### Pre-process dataset
crime_prep.csv contains many missing values in columns 2 (county) and 3 (community). 
How missing data are handled depends on why they are missing:
1) if the values are missing at random, we can drop the affected data;
2) if the values are not missing at random (e.g., the missingness depends on other variables), we can use imputation to reduce bias.


In [None]:
# Load the prepared crime data and report how long the read took.
start_time = time.time()
df_prep = pd.read_csv('crime_prep.csv')
print("--- %s seconds ---" % (time.time() - start_time))

# Remove non predictive attributes, such as state, county, community, communitynames, fold
df_prep = df_prep.drop(['v_cont_0', 'v_cat_0', 'v_cat_1', 'v_cat_2', 'v_cat_3'], axis=1)

# Keep the pre-imputation frame: the imputer returns a bare NumPy array, so the
# column labels have to be restored from this copy afterwards.
df_impute = df_prep.copy()

# Fill missing values (NaN) with the mean of their column.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed in
# scikit-learn 0.22. It always imputes column-wise (the old axis=0) and expects
# np.nan rather than the string "NaN" as the missing-value marker.
impute = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the per-column means, then apply them to the same data.
impute = impute.fit(df_impute)
df_prep = impute.transform(df_impute)

# Convert the imputed array back to a DataFrame with the original column names.
df = pd.DataFrame(df_prep, columns=df_impute.columns)

### Training models

In [None]:

# The first column is the target; every remaining column is used as a feature.
features = list(df.columns[1:])
target   = df.columns[0]

X_train = df[features]
y_train = df[target]

# Fixed random_state so repeated runs (and the cross-validation that reuses
# this estimator) give the same tree.
dt_estimator = tree.DecisionTreeRegressor(random_state=0)
dt_estimator.fit(X_train, y_train)

# Predictions on the data the tree was fitted on: this measures how well the
# model reproduces the training set, not how well it generalizes.
Y_pred = dt_estimator.predict(X_train)

# Goodness of fit on the training data.
# accuracy_score is a classification metric and rejects continuous targets, so
# a regressor is scored with R^2 instead (the same metric a regressor's
# .score() reports).
r2_train_DT = metrics.r2_score(y_train, Y_pred)
accuracy_DT = r2_train_DT  # old name kept so existing references keep working
print("Training R^2: ", r2_train_DT)

In [99]:
# 10-fold cross-validation of the decision-tree regressor.
# For a regressor the per-fold score is R^2, not classification accuracy;
# scoring="r2" states that explicitly and matches the estimator's default scorer.
scores = cross_val_score(dt_estimator, X_train, y_train, cv=10, scoring="r2")
print("cross validation score: ", scores)
# Mean R^2 across folds, with a spread of two standard deviations.
print("R^2: %0.2f(+/- %0.2f)"%(scores.mean(), scores.std()*2))

cross validation score:  [0.29744852 0.19388773 0.24329192 0.21022078 0.16834397 0.35078451
 0.36103725 0.09408191 0.2940436  0.15487129]
Accuracy: 0.24(+/- 0.17)
