### Dataset: training, validation and test 
k-fold cross-validation (CV) is used to reduce the risk of overfitting.

In [33]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import tree, metrics
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder

import time

import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation warnings raised by the imported libraries.
warnings.filterwarnings("ignore")

### Pre-process dataset
crime_prep.csv contains many missing values in columns 2 (county) and 3 (community).
For missing data:
1) if it is missing at random, we can drop the data;
2) if it is missing not at random (e.g., the missingness depends on other variables), we can use imputation to reduce bias.


In [47]:
# Load the pre-processed crime dataset and report how long the read took.
start_time = time.time()
df_prep = pd.read_csv('crime_prep.csv')
print("--- %s seconds ---" % (time.time() - start_time))

# Remove non-predictive attributes, such as state, county, community, communitynames, fold
df_prep = df_prep.drop(['v_cont_0', 'v_cat_0', 'v_cat_1', 'v_cat_2', 'v_cat_3'], axis=1)

# Keep the pre-imputation frame: its column labels and index are reused below.
df_impute = df_prep.copy()

# Missing values (NaN) are filled with the mean of their column.
# `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22, so prefer
# its replacement `SimpleImputer` and fall back to the legacy class only on
# installs that predate it.
# NOTE(review): the notebook-level `from sklearn.preprocessing import Imputer`
# import still has to be removed before this notebook can run on
# scikit-learn >= 0.22.
try:
    from sklearn.impute import SimpleImputer
    impute = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    impute = Imputer(missing_values="NaN", strategy='mean', axis=0)

# Learn the per-column means, then fill the gaps. `transform` returns a NumPy
# array, so the column labels have to be re-attached afterwards.
impute = impute.fit(df_impute)
df_prep = impute.transform(df_impute)

# A column that is entirely NaN has no mean; the imputer drops it from the
# output. Select only the surviving labels so the frame is built without a
# length mismatch (when nothing is dropped this is every original column).
kept_columns = df_impute.columns[~np.isnan(impute.statistics_)]
df = pd.DataFrame(df_prep, columns=kept_columns, index=df_impute.index)

--- 0.05905008316040039 seconds ---


In [48]:
# Display the imputed DataFrame built in the previous cell.
df

Unnamed: 0,target,v_cont_5,v_cont_6,v_cont_7,v_cont_8,v_cont_9,v_cont_10,v_cont_11,v_cont_12,v_cont_13,...,v_cont_117,v_cont_118,v_cont_119,v_cont_120,v_cont_121,v_cont_122,v_cont_123,v_cont_124,v_cont_125,v_cont_126
0,0.20,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,...,0.290000,0.12,0.26,0.20,0.060000,0.040000,0.900000,0.500000,0.32,0.140000
1,0.67,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,...,0.305987,0.02,0.12,0.45,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
2,0.43,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,...,0.305987,0.01,0.21,0.02,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
3,0.12,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,...,0.305987,0.02,0.39,0.28,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
4,0.03,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,...,0.305987,0.04,0.09,0.02,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
5,0.14,0.02,0.28,0.06,0.54,1.00,0.25,0.31,0.48,0.27,...,0.305987,0.01,0.58,0.10,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
6,0.03,0.01,0.39,0.00,0.98,0.06,0.02,0.30,0.37,0.23,...,0.305987,0.05,0.08,0.06,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
7,0.55,0.01,0.74,0.03,0.46,0.20,1.00,0.52,0.55,0.36,...,0.305987,0.01,0.33,0.00,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
8,0.53,0.03,0.34,0.20,0.84,0.02,0.00,0.38,0.45,0.28,...,0.305987,0.04,0.17,0.04,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
9,0.15,0.01,0.40,0.06,0.87,0.30,0.03,0.90,0.82,0.80,...,0.305987,0.00,0.47,0.11,0.163103,0.076708,0.698589,0.440439,0.00,0.195078
