### Dataset: training, validation and test 
k-fold CV (cross-validation) is used to reduce the risk of overfitting

In [52]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.svm import SVR
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_selection import f_regression, mutual_info_regression
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import SelectKBest, RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

import time

import warnings
warnings.filterwarnings("ignore")

### Pre-process dataset
crime_prep.csv contains many missing data in column 2 (county) and 3 (community). 
For missing data:
1) if it is missing at random, we can drop the data;
2) if it is missing not at random (e.g., depend on other variables), we can use imputation to reduce bias.


In [31]:
start_time = time.time()
df_prep = pd.read_csv('crime_prep.csv')
print("--- %s seconds ---" % (time.time() - start_time))
#df_prep.shape

# Remove non predictive attributes, such as state, county, community, communitynames, fold
df_prep = df_prep.drop(['v_cont_0', 'v_cat_0', 'v_cat_1', 'v_cat_2', 'v_cat_3'], axis=1)

# Use impute function to fill the missing values (NaN)
# Missing value is filled with mean of the column data
df_impute = df_prep.copy()
impute = Imputer(missing_values="NaN", strategy='mean', axis=0)

# Fit the dataframe into imputer
impute = impute.fit(df_prep)
# Fit the new values into df and transformed
df_prep = impute.transform(df_impute)

# Convert to dataframe and add columns
df = pd.DataFrame(df_prep)
df.columns = df_impute.columns 
#df

# Build features and target dataframe
features = df.drop(["target"], axis=1)
target = pd.DataFrame(df, columns=["target"])

##############################
# Config for feature selection
##############################
KEEP_FEATURES = 5

--- 0.06949400901794434 seconds ---


## Feature Selection
1. Filter Method, e.g., pearson correlation
2. Wrapper Method
    1) backward elimination: feed all the possible features to the model at first and iteratively remove the worst performing features one by one till the overall performance of the model comes in acceptable range.
    2) recursive feature elimination: recursively removing attributes and building a model on those attributes that remain.
3. Embedded Method

### Feature Importance: Decision Tree 

In [3]:
DT = tree.DecisionTreeRegressor()
DT = DT.fit(features, target)

DT_predict = DT.predict(features)
scores = cross_val_score(DT, features, target, cv=10)
print("cross validation score: ", scores)
print("Accuracy: %0.2f(+/- %0.2f)"%(scores.mean(), scores.std()*2))

#features_new = zip(features.columns, (SelectKBest(f_regression, k=5).fit_transform(features, target)))
importantFeatures = sorted(zip(features.columns, DT.feature_importances_), key=lambda x: x[1], reverse=True)

# fit, we can choose f_regression OR mutual_info_regression, we can test both and see the result
importantFeatures[:KEEP_FEATURES]
df_DT=pd.DataFrame(DT.feature_importances_, columns = ["Importance"], index = features.columns).sort_values(['Importance'], ascending = False)[:KEEP_FEATURES]
print(df_DT)
# plt.figure()
# df_DT.plot.bar()
# plt.xlabel('Top '+str(KEEP_FEATURES)+' Predictive Features')
# plt.ylabel('Feature Coefficients')
# plt.show()

cross validation score:  [0.29467502 0.25902291 0.32186714 0.16407065 0.21186619 0.27545545
 0.34413949 0.17908807 0.19815482 0.25089548]
Accuracy: 0.25(+/- 0.12)
           Importance
v_cont_55    0.424154
v_cont_49    0.078636
v_cont_48    0.057069
v_cont_8     0.031986
v_cont_95    0.021408


### Feature Selection: 

In [51]:
# By default, kernel RBF, gamma auto deprecated,C = 1.0
SVR = svm.SVR(kernel='linear')
SVR = SVR.fit(features, target)

SVR_predict = SVR.predict(features)
scores = cross_val_score(SVR, features, target, cv=10)
print("cross validation score: ", scores)
print("Accuracy: %0.2f(+/- %0.2f)"%(scores.mean(), scores.std()*2))

#features_new = zip(features.columns, (SelectKBest(f_regression, k=5).fit_transform(features, target)))
importantFeatures = sorted(zip(features.columns, SVR.coef_), key=lambda x: x[1], reverse=True)

# fit, we can choose f_regression OR mutual_info_regression, we can test both and see the result
importantFeatures[:KEEP_FEATURES]
df_SVR=pd.DataFrame(SVR.coef_, columns = ["Importance"], index = features.columns).sort_values(['Importance'], ascending = False)[:KEEP_FEATURES]
print(df_SVR)
# plt.figure()
# df_DT.plot.bar()
# plt.xlabel('Top '+str(KEEP_FEATURES)+' Predictive Features')
# plt.ylabel('Feature Coefficients')
# plt.show()

cross validation score:  [0.59980061 0.64389414 0.60847216 0.64421334 0.68573405 0.72489733
 0.71248086 0.62853597 0.65695183 0.59031333]
Accuracy: 0.65(+/- 0.09)


ValueError: Shape of passed values is (122, 1), indices imply (1, 122)

### Feature Selection: SelectKBest

In [5]:
# df_prep = pd.read_csv("./crime_prep.csv")
# df_prep = df_prep.drop(['v_cont_0', 'v_cat_0', 'v_cat_1', 'v_cat_2', 'v_cat_3'], axis=1)
# df_impute = df_prep.copy()
# impute = Imputer(missing_values="NaN", strategy='mean', axis=0)

# # Fit the dataframe into imputer
# impute = impute.fit(df_prep)
# # Fit the new values into df and transformed
# df_prep = impute.transform(df_impute)
# df = pd.DataFrame(df_prep)
# df.columns = df_impute.columns 

# # Build features and target dataframe
# features = df.drop(["target"], axis=1)
# target = pd.DataFrame(df, columns=["target"])

# # fit, we can choose f_regression OR mutual_info_regression, we can test both and see the result
importantFeatures = SelectKBest(f_regression, k=KEEP_FEATURES)
importantFeatures.fit(features,target)

X_new = importantFeatures.transform(features)
print(importantFeatures.get_support(indices=True))

pd.DataFrame(dict(feature_names= features.columns , scores = importantFeatures.scores_))\
    .sort_values('scores',ascending = False)[:KEEP_FEATURES]
# plt.figure()
# df_DT.plot.bar()
# plt.xlabel('Top '+str(KEEP_FEATURES)+' Predictive Features')
# plt.ylabel('Feature Coefficients')
# plt.show()


[ 3 43 44 45 50]


Unnamed: 0,feature_names,scores
44,v_cont_49,2388.618688
50,v_cont_55,2381.9832
43,v_cont_48,1987.05731
3,v_cont_8,1758.772273
45,v_cont_50,1588.37986


### Feature Selection: Recursive Feature Elimination (RFE)


In [63]:
# create a base classifier used to evaluate a subset of attributes
estimator = SVR(kernel='linear')
# create the RFE model and select X attributes, X=KEEP_FATURES
#selector = RFE(estimator, KEEP_FEATURES, step=1)

# Without specifing the number of features, RFECV finds the optimal 
# number of features
selector = RFECV(estimator, step=1, cv=5)

# Fit the model
selector = selector.fit(features, target)

# summarize the selection of the attributes
print(selector.support_)
print(selector.ranking_)
rfe_features = sorted(zip(features.columns, selector.ranking_), key=lambda x: x[1], reverse=False)[:KEEP_FEATURES]
print(rfe_features)

[False  True  True False  True False  True  True  True False  True  True
  True  True  True  True  True False  True  True  True  True  True  True
 False  True  True  True  True  True  True False False  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False False False  True  True False  True  True
  True  True  True False  True  True  True  True False  True  True False
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True False False  True  True  True  True  True
  True  True]
[10  1  1 11  1  2  1  1  1 15  1  1  1  1  1  1  1  3  1  1  1  1  1  1
  9  1  1  1  1  1  1  4 17  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1 12  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1 16 19  5  1  1  7  1  1 

### Dimensionality Reduction: PCA
LDA ("supervised"): use information of classes to find new features to maximize its separability
PCA ("unsupervised"): use variance of each feature to maximize its separability; it actually constructs a new set of features

PCA is concerned with covariance of predictor matrix X (input features); in regression, we also need to concern with the covariance of X (features) and y (target). 

PCR, PLS

In [59]:
pca = PCA(n_components=5)
pca.fit(features)  
print(pca.explained_variance_ratio_)  
print(pca.singular_values_)  

[0.25854019 0.18166128 0.07971167 0.06863376 0.0447857 ]
[46.24373896 38.76324226 25.67733016 23.82636609 19.24680277]


### Training models

In [231]:
scores = cross_val_score(dt_estimator, X_train, y_train, cv=10)
print("cross validation score: ", scores)
print("Accuracy: %0.2f(+/- %0.2f)"%(scores.mean(), scores.std()*2))

ValueError: Found input variables with inconsistent numbers of samples: [122, 6]

In [None]:
%reset