In [3]:
import os
import json
import datetime as dt
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2, RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [8]:
pd.options.display.max_columns = 80
pd.options.display.float_format = lambda x : "{:,.2f}".format(x)

sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12,8)

In [13]:
train = pd.read_csv('clean_data/train.csv', dtype={'Id':str})
test = pd.read_csv('clean_data/test.csv', dtype={'Id':str})

ytrain = train[target_col]

train.shape, test.shape

((15120, 54), (565892, 55))

In [12]:
all_train_df = pd.read_csv('clean_data/all_train_df.csv', dtype={'Id':str})
all_test_df = pd.read_csv('clean_data/all_test_df.csv', dtype={'Id':str})

train.shape, test.shape

((565892, 115), (565892, 115))

In [4]:
with open('clean_data/train_cols.json', 'r') as f:
    train_cols = json.load(f)

In [6]:
id_col = train_cols['id']
cat_cols = train_cols['cat_cols']
int_cols = train_cols['int_cols']
target_col = train_cols['target_col']

ftr_cols = int_cols + cat_cols

In [82]:
N = 35 # nfeatures

### Goal

Build a function that performs various kinds of features selection on input dataset.

This function will be used in modeling notebook.

Methods:
    - Correlations
    - Chi2
    - RFE
    - Lasso

___Correlation___

In [68]:
def top_corr(x,y,n):
    """
    x Dataframe
    y Series or array
    
    Returns series of ftr, score for top n most correlated features
    Score is calculated the sum of abs correlations for all classes
    """
    
    class_dummies = pd.get_dummies(y).add_prefix('class_')
    corrs = []
    for col in x.columns:
        corrs.append(dummied.apply(lambda c: c.corr(x[col])).abs().sum())
        
    return pd.DataFrame([corrs],columns=x.columns).T.squeeze().nlargest(n)

In [83]:
top_corrs = top_corr(all_train_df, ytrain, N)
top_corrs

Elevation                            2.35
Wilderness_Area4                     2.17
PCA_K4                               2.17
K3                                   2.15
svd1                                 1.98
nb_prob_2                            1.97
nb_prob_1                            1.96
nb_prob_0                            1.95
pca0                                 1.94
nb_prob_6                            1.83
nb_prob_3                            1.80
nb_prob_5                            1.79
K4                                   1.79
SVD_K3                               1.65
Horizontal_Distance_To_Roadways      1.59
nb_prob_4                            1.59
Wilderness_Area1                     1.37
pca1                                 1.34
Soil_Type10                          1.31
PCA_K11                              1.30
svd2                                 1.27
Horizontal_Distance_To_Fire_Points   1.26
K7                                   1.20
svd0                              

___Chi2___

In [94]:

def top_chi2(x, y, n):
    """
    x Dataframe
    y Series/Array -- class labels
    n Int
    """

    features = x.columns

    # all features must be positive
    x_norm = MinMaxScaler().fit_transform(x)

    selector = SelectKBest(chi2, k=n)
    selector.fit(x_norm, y)
    # bool index on selected columns
    selected = selector.get_support()

    chi2_scores = pd.DataFrame(list(zip(features, selector.scores_)), columns=['ftr', 'chi2_score'])
    chi2_ftrs = chi2_scores.loc[selected]

    return chi2_ftrs.sort_values('chi2_score', ascending=False).head(n).set_index('ftr').squeeze()

In [95]:
top_chi2(all_train_df, ytrain, N)

ftr
Wilderness_Area4   6,979.31
nb_prob_6          6,537.16
nb_prob_3          6,066.18
Soil_Type3         3,815.45
K4                 3,664.66
Soil_Type10        3,629.01
Soil_Type38        3,284.79
Wilderness_Area1   3,104.40
PCA_K4             2,990.29
nb_prob_4          2,971.73
Soil_Type39        2,932.23
SVD_K3             2,897.73
nb_prob_0          2,835.58
K3                 2,746.07
nb_prob_5          2,741.51
nb_prob_2          2,547.31
svd1               2,140.41
Soil_Type40        2,103.99
nb_prob_1          1,987.66
pca0               1,960.25
Soil_Type30        1,766.97
Soil_Type29        1,652.45
Wilderness_Area3   1,316.81
Elevation          1,297.24
Soil_Type4         1,281.22
Soil_Type22        1,253.78
Soil_Type17        1,087.83
Soil_Type13        1,064.32
Soil_Type12        1,061.52
Soil_Type23        1,031.55
SVD_K6               953.56
pca1                 939.84
Wilderness_Area2     912.52
Soil_Type6           899.58
svd2                 865.08
Name: chi2_score

RFE