In [1]:
import os
import json
import datetime as dt
import time

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2, RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [2]:
pd.options.display.max_columns = 80
pd.options.display.float_format = lambda x : "{:,.2f}".format(x)

sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12,8)

In [4]:
train = pd.read_csv('clean_data/train.csv', dtype={'Id':str})
test = pd.read_csv('clean_data/test.csv', dtype={'Id':str})

train.shape, test.shape

((15120, 54), (565892, 55))

In [5]:
all_train_df = pd.read_csv('clean_data/all_train_df.csv', dtype={'Id':str})
all_test_df = pd.read_csv('clean_data/all_test_df.csv', dtype={'Id':str})

train.shape, test.shape

((15120, 54), (565892, 55))

In [6]:
with open('clean_data/train_cols.json', 'r') as f:
    train_cols = json.load(f)

In [13]:
id_col = train_cols['id']
cat_cols = train_cols['cat_cols']
int_cols = train_cols['int_cols']
target_col = train_cols['target_col']

ftr_cols = int_cols + cat_cols

In [14]:
ytrain = train[target_col]

In [15]:
N = 35 # nfeatures

### Goal

Build a function that performs various kinds of features selection on input dataset.

This function will be used in modeling notebook.

Methods:
    - Correlations
    - Chi2
    - RFE
    - Lasso

___Correlation___

In [18]:
def top_corr(x,y,n):
    """
    x Dataframe
    y Series or array
    
    Returns series of ftr, score for top n most correlated features
    Score is calculated the sum of abs correlations for all classes
    """
    
    class_dummies = pd.get_dummies(y).add_prefix('class_')
    corrs = []
    for col in x.columns:
        corrs.append(class_dummies.apply(lambda c: c.corr(x[col])).abs().sum())
        
    return pd.DataFrame([corrs],columns=x.columns).T.squeeze().nlargest(n)

In [19]:
top_corrs = top_corr(all_train_df, ytrain, N)
top_corrs

Elevation                            2.35
Wilderness_Area4                     2.17
PCA_K4                               2.17
K3                                   2.15
svd1                                 1.98
nb_prob_2                            1.97
nb_prob_1                            1.96
nb_prob_0                            1.95
pca0                                 1.94
nb_prob_6                            1.83
nb_prob_3                            1.80
nb_prob_5                            1.79
K4                                   1.79
SVD_K3                               1.65
Horizontal_Distance_To_Roadways      1.59
nb_prob_4                            1.59
Wilderness_Area1                     1.37
pca1                                 1.34
Soil_Type10                          1.31
PCA_K11                              1.30
svd2                                 1.27
Horizontal_Distance_To_Fire_Points   1.26
K7                                   1.20
svd0                              

___Chi2___

In [20]:

def top_chi2(x, y, n):
    """
    x Dataframe
    y Series/Array -- class labels
    n Int
    """

    features = x.columns

    # all features must be positive
    x_norm = MinMaxScaler().fit_transform(x)

    selector = SelectKBest(chi2, k=n)
    selector.fit(x_norm, y)
    # bool index on selected columns
    selected = selector.get_support()

    chi2_scores = pd.DataFrame(list(zip(features, selector.scores_)), columns=['ftr', 'chi2_score'])
    chi2_ftrs = chi2_scores.loc[selected]

    return chi2_ftrs.sort_values('chi2_score', ascending=False).head(n).set_index('ftr').squeeze()

In [21]:
topchi2 = top_chi2(all_train_df, ytrain, N)
topchi2

ftr
Wilderness_Area4   6,979.31
nb_prob_6          6,537.16
nb_prob_3          6,066.18
Soil_Type3         3,815.45
K4                 3,664.66
Soil_Type10        3,629.01
Soil_Type38        3,284.79
Wilderness_Area1   3,104.40
PCA_K4             2,990.29
nb_prob_4          2,971.73
Soil_Type39        2,932.23
SVD_K3             2,897.73
nb_prob_0          2,835.58
K3                 2,746.07
nb_prob_5          2,741.51
nb_prob_2          2,547.31
svd1               2,140.41
Soil_Type40        2,103.99
nb_prob_1          1,987.66
pca0               1,960.25
Soil_Type30        1,766.97
Soil_Type29        1,652.45
Wilderness_Area3   1,316.81
Elevation          1,297.24
Soil_Type4         1,281.22
Soil_Type22        1,253.78
Soil_Type17        1,087.83
Soil_Type13        1,064.32
Soil_Type12        1,061.52
Soil_Type23        1,031.55
SVD_K6               953.56
pca1                 939.84
Wilderness_Area2     912.52
Soil_Type6           899.58
svd2                 865.08
Name: chi2_score

___RFE___

In [29]:
def top_rfe(mod, x, y, n, step=0.05, **params):
    selector = RFE(mod(**params), n, step, 1)
    selector.fit(x, y)
    selected = selector.get_support()
    
    rfe_ftrs = np.asarray(x.columns)[selected]
    rfe_ftrs = pd.Series(1, index = rfe_ftrs)
    return rfe_ftrs

In [30]:
rfe_ftrs = top_rfe(LogisticRegression, all_train_df, ytrain, N)
rfe_ftrs

Fitting estimator with 115 features.
Fitting estimator with 110 features.
Fitting estimator with 105 features.
Fitting estimator with 100 features.
Fitting estimator with 95 features.
Fitting estimator with 90 features.
Fitting estimator with 85 features.
Fitting estimator with 80 features.
Fitting estimator with 75 features.
Fitting estimator with 70 features.
Fitting estimator with 65 features.
Fitting estimator with 60 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.


Elevation                             1
Slope                                 1
Horizontal_Distance_To_Hydrology      1
Vertical_Distance_To_Hydrology        1
Horizontal_Distance_To_Roadways       1
Hillshade_9am                         1
Hillshade_Noon                        1
Hillshade_3pm                         1
Horizontal_Distance_To_Fire_Points    1
Soil_Type2                            1
Soil_Type6                            1
Soil_Type12                           1
Soil_Type13                           1
Soil_Type17                           1
Soil_Type20                           1
Soil_Type23                           1
Soil_Type33                           1
K3                                    1
svd0                                  1
svd1                                  1
svd2                                  1
svd9                                  1
pca0                                  1
pca1                                  1
pca4                                  1


___Lasso___

In [None]:
def top_lasso(x,y,n):
    
    l = LogisticRegression(penalty='l1', C=C)

In [38]:
np.abs(l.coef_).mean(0)

array([4.46794924e+00, 1.69607163e-01, 7.18685496e-01, 1.04217201e+00,
       3.53229305e-01, 8.65355520e-01, 2.52883310e-01, 1.08849719e+00,
       2.89437809e-01, 1.99888861e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.35963416e-01, 5.66523108e-01,
       1.25370609e-01, 2.68261886e-01, 1.33351480e-01, 3.17836846e-01,
       0.00000000e+00, 3.53118769e-02, 0.00000000e+00, 4.56179389e-01,
       7.31366347e-01, 1.07555484e-01, 3.17866664e-02, 2.31252451e-01,
       4.95347138e-01, 2.30914805e-01, 8.14708086e-02, 3.38305229e-01,
       0.00000000e+00, 2.48668587e-02, 2.23033409e-01, 3.25547690e-01,
       0.00000000e+00, 1.77075928e-02, 0.00000000e+00, 4.13324020e-02,
       0.00000000e+00, 0.00000000e+00, 1.98926703e-01, 4.92475593e-01,
       6.48617102e-01, 3.84221806e-03, 1.69284277e-01, 0.00000000e+00,
       1.00481291e-01, 6.99113705e-02, 6.91592949e-02, 1.74786061e-01,
       2.80323804e-01, 1.06467541e-01, 0.00000000e+00, 1.87589355e-01,
      

In [33]:
l = LogisticRegression(penalty='l1', C=0.3)
l.fit(all_train_df, ytrain)
mask = l.coef_!=0

list(zip(all_train_df.columns[mask], l.coef_[mask]))

IndexError: too many indices for array

In [86]:
from sklearn.svm import SVC
from sklearn.datasets import make_classification

# Create a dataset with only 3 informative features
X, y = make_classification(
    n_samples=1000, n_features=11, n_informative=5, n_redundant=2,
    n_repeated=0, n_classes=8, n_clusters_per_class=1, random_state=0
)

X.shape

(1000, 11)

In [87]:
lr = LogisticRegression()
lr.fit(X,y)
np.abs(lr.coef_).mean(0)

array([0.09086308, 0.46207665, 0.1529445 , 0.06592885, 0.59719337,
       0.12248879, 0.36488127, 0.32503965, 0.66566022, 0.31595705,
       0.0965713 ])

In [88]:
np.abs(lr.coef_.sum(0))

array([0.13455466, 0.2215584 , 0.2076402 , 0.03977634, 0.07105863,
       0.24173489, 0.13082656, 0.0328983 , 0.27016   , 0.1630461 ,
       0.05594337])

In [89]:
np.abs(lr.coef_.sum(0)).min()

0.032898296083423983

In [90]:
r = RFE(LogisticRegression(), 10, 1, 1)
r.fit(X,y)
r.get_support()

Fitting estimator with 11 features.


array([ True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True])