In [35]:
import os
import json
import datetime as dt
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2, RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [2]:
pd.options.display.max_columns = 80
pd.options.display.float_format = lambda x : "{:,.2f}".format(x)

sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12,8)

In [3]:
train = pd.read_csv('clean_data/train.csv', dtype={'Id':str})
test = pd.read_csv('clean_data/test.csv', dtype={'Id':str})

train.shape, test.shape

((15120, 54), (565892, 55))

In [4]:
all_train_df = pd.read_csv('clean_data/all_train_df.csv', dtype={'Id':str})
all_test_df = pd.read_csv('clean_data/all_test_df.csv', dtype={'Id':str})

train.shape, test.shape

((15120, 54), (565892, 55))

In [5]:
with open('clean_data/train_cols.json', 'r') as f:
    train_cols = json.load(f)

In [6]:
id_col = train_cols['id']
cat_cols = train_cols['cat_cols']
int_cols = train_cols['int_cols']
target_col = train_cols['target_col']

ftr_cols = int_cols + cat_cols

In [7]:
ytrain = train[target_col]

In [8]:
N = 35 # nfeatures

### Goal

Build a function that performs various kinds of features selection on input dataset.

This function will be used in modeling notebook.

Methods:
    - Correlations
    - Chi2
    - RFE
    - Lasso

___Correlation___

In [9]:
def top_corr(x,y,n):
    """
    x Dataframe
    y Series or array
    
    Returns series of ftr, score for top n most correlated features
    Score is calculated the sum of abs correlations for all classes
    """
    
    class_dummies = pd.get_dummies(y).add_prefix('class_')
    corrs = []
    for col in x.columns:
        corrs.append(class_dummies.apply(lambda c: c.corr(x[col])).abs().sum())
        
    return pd.DataFrame([corrs],columns=x.columns).T.squeeze().nlargest(n)

In [10]:
top_corrs = top_corr(all_train_df, ytrain, N)
top_corrs

Elevation                            2.35
Wilderness_Area4                     2.17
PCA_K4                               2.17
K3                                   2.15
svd1                                 1.98
nb_prob_2                            1.97
nb_prob_1                            1.96
nb_prob_0                            1.95
pca0                                 1.94
nb_prob_6                            1.83
nb_prob_3                            1.80
nb_prob_5                            1.79
K4                                   1.79
SVD_K3                               1.65
Horizontal_Distance_To_Roadways      1.59
nb_prob_4                            1.59
Wilderness_Area1                     1.37
pca1                                 1.34
Soil_Type10                          1.31
PCA_K11                              1.30
svd2                                 1.27
Horizontal_Distance_To_Fire_Points   1.26
K7                                   1.20
svd0                              

___Chi2___

In [11]:

def top_chi2(x, y, n):
    """
    x Dataframe
    y Series/Array -- class labels
    n Int
    """

    features = x.columns

    # all features must be positive
    x_norm = MinMaxScaler().fit_transform(x)

    selector = SelectKBest(chi2, k=n)
    selector.fit(x_norm, y)
    # bool index on selected columns
    selected = selector.get_support()

    chi2_scores = pd.DataFrame(list(zip(features, selector.scores_)), columns=['ftr', 'chi2_score'])
    chi2_ftrs = chi2_scores.loc[selected]

    return chi2_ftrs.sort_values('chi2_score', ascending=False).head(n).set_index('ftr').squeeze()

In [12]:
topchi2 = top_chi2(all_train_df, ytrain, N)
topchi2

ftr
Wilderness_Area4   6,979.31
nb_prob_6          6,537.16
nb_prob_3          6,066.18
Soil_Type3         3,815.45
K4                 3,664.66
Soil_Type10        3,629.01
Soil_Type38        3,284.79
Wilderness_Area1   3,104.40
PCA_K4             2,990.29
nb_prob_4          2,971.73
Soil_Type39        2,932.23
SVD_K3             2,897.73
nb_prob_0          2,835.58
K3                 2,746.07
nb_prob_5          2,741.51
nb_prob_2          2,547.31
svd1               2,140.41
Soil_Type40        2,103.99
nb_prob_1          1,987.66
pca0               1,960.25
Soil_Type30        1,766.97
Soil_Type29        1,652.45
Wilderness_Area3   1,316.81
Elevation          1,297.24
Soil_Type4         1,281.22
Soil_Type22        1,253.78
Soil_Type17        1,087.83
Soil_Type13        1,064.32
Soil_Type12        1,061.52
Soil_Type23        1,031.55
SVD_K6               953.56
pca1                 939.84
Wilderness_Area2     912.52
Soil_Type6           899.58
svd2                 865.08
Name: chi2_score

___RFE___

In [13]:
def top_rfe(mod, x, y, n, step=0.05, **params):
    selector = RFE(mod(**params), n, step, 1)
    selector.fit(x, y)
    selected = selector.get_support()
    
    rfe_ftrs = np.asarray(x.columns)[selected]
    rfe_ftrs = pd.Series(1, index = rfe_ftrs)
    return rfe_ftrs

In [14]:
rfe_log_reg_ftrs = top_rfe(LogisticRegression, all_train_df, ytrain, N)
rfe_log_reg_ftrs

Fitting estimator with 115 features.
Fitting estimator with 110 features.
Fitting estimator with 105 features.
Fitting estimator with 100 features.
Fitting estimator with 95 features.
Fitting estimator with 90 features.
Fitting estimator with 85 features.
Fitting estimator with 80 features.
Fitting estimator with 75 features.
Fitting estimator with 70 features.
Fitting estimator with 65 features.
Fitting estimator with 60 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.


Elevation                             1
Slope                                 1
Horizontal_Distance_To_Hydrology      1
Vertical_Distance_To_Hydrology        1
Horizontal_Distance_To_Roadways       1
Hillshade_9am                         1
Hillshade_Noon                        1
Hillshade_3pm                         1
Horizontal_Distance_To_Fire_Points    1
Soil_Type2                            1
Soil_Type4                            1
Soil_Type6                            1
Soil_Type11                           1
Soil_Type12                           1
Soil_Type13                           1
Soil_Type17                           1
Soil_Type33                           1
K3                                    1
K4                                    1
svd0                                  1
svd1                                  1
svd2                                  1
svd9                                  1
pca0                                  1
pca1                                  1


___Lasso___

In [46]:

def top_lasso(x,y,n,step=0.1, verbose=0):
    xscaled = MinMaxScaler().fit_transform(x.values)
    C = n/xscaled.shape[1]

    direction='down'
    num_non_zero = xscaled.shape[1]
    while num_non_zero != N:
        if verbose:
            print("Fitting Lasso with C =",C)
        l = LogisticRegression(penalty='l1', C=C)
        l.fit(xscaled, y)
        mask = l.coef_.mean(0)!=0
        num_non_zero = mask.sum()
        if verbose:
            print('Num Non-Zero Features:', num_non_zero)
        if num_non_zero == N:
            return pd.Series(np.abs(l.coef_).mean(0)[mask], index = x.columns[mask])
        elif num_non_zero > N:
            C *= 1-step
            new_direction = 'down'
        else:
            C *= 1+step
            new_direction = 'up'
        
        # if we change direction (overshot) lower step
        if new_direction != direction:
            step*=0.5
            direction=new_direction
            if verbose:
                print('New Step:', step)


In [47]:
l1_ftrs = top_lasso(all_train_df, ytrain, N, 0.5, verbose=1)
l1_ftrs

Fitting Lasso with C = 0.30434782608695654
Num Non-Zero Features: 86
Fitting Lasso with C = 0.15217391304347827
Num Non-Zero Features: 79
Fitting Lasso with C = 0.07608695652173914
Num Non-Zero Features: 71
Fitting Lasso with C = 0.03804347826086957
Num Non-Zero Features: 59
Fitting Lasso with C = 0.019021739130434784
Num Non-Zero Features: 46
Fitting Lasso with C = 0.009510869565217392
Num Non-Zero Features: 25
New Step: 0.25
Fitting Lasso with C = 0.014266304347826088
Num Non-Zero Features: 34
Fitting Lasso with C = 0.017832880434782608
Num Non-Zero Features: 47
New Step: 0.125
Fitting Lasso with C = 0.013374660326086956
Num Non-Zero Features: 32
New Step: 0.0625
Fitting Lasso with C = 0.015046492866847826
Num Non-Zero Features: 36
New Step: 0.03125
Fitting Lasso with C = 0.014106087062669836
Num Non-Zero Features: 32
New Step: 0.015625
Fitting Lasso with C = 0.01454690228337827
Num Non-Zero Features: 35


Elevation          0.58
Hillshade_9am      0.07
Hillshade_Noon     0.24
Wilderness_Area3   0.10
Wilderness_Area4   0.46
Soil_Type1         0.01
Soil_Type32        0.00
Soil_Type33        0.00
K3                 0.31
K5                 0.08
K7                 0.00
K8                 0.03
K9                 0.01
K13                0.01
svd1               0.02
pca0               0.09
pca8               0.05
SVD_K4             0.11
SVD_K6             0.16
SVD_K9             0.03
SVD_K10            0.06
SVD_K13            0.01
PCA_K4             0.13
PCA_K6             0.03
PCA_K7             0.08
PCA_K11            0.11
PCA_K13            0.01
PCA_K14            0.04
nb_prob_0          0.76
nb_prob_1          0.64
nb_prob_2          0.62
nb_prob_3          0.78
nb_prob_4          0.85
nb_prob_5          0.60
nb_prob_6          1.07
dtype: float64

33