# Feature Selection
* Filter method
* Wrapper method
* Embedded method

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import normaltest

import math
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression


from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

# using feature engine library
from feature_engine.selection import (
    DropCorrelatedFeatures, 
    SmartCorrelatedSelection,
    DropConstantFeatures, 
    DropDuplicateFeatures,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SelectByShuffling,
    RecursiveFeatureElimination,
    RecursiveFeatureAddition
)

# to obtain the mutual information values
from sklearn.feature_selection import (
    f_classif,
    f_regression,
    mutual_info_classif, 
    mutual_info_regression,
    VarianceThreshold
)
# to select the features
from sklearn.feature_selection import SelectKBest, SelectPercentile

# wrapper methods
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, r2_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# embedded methods
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

### Loading dataset

In [2]:
# Read the pre-transformed bank personal-loan dataset from an Excel workbook
# (path is relative to the notebook's working directory) and show its shape.
data = pd.read_excel('../dataset/Bank_Personal_Loan_Modelling_transformed.xlsx')
data.shape

(5000, 14)

In [3]:
# Name of the label column; it is dropped from the features and used as y in the split below.
target='Personal Loan'

### Splitting in to train and test set
* It is good practice to select features by examining only the training set; this helps avoid overfitting.

In [4]:
# separate dataset into train and test

# Hold out 10% of the rows for testing; random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified on the target — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=[target], axis=1),  # drop the target
    data[target],  # just the target
    test_size=0.1,
    random_state=0)

X_train.shape, X_test.shape

((4500, 13), (500, 13))

In [5]:
import time
from functools import wraps

# Timing registry: maps a function name to [call_count, [elapsed_seconds, ...]].
PROF_DATA = {}

def profile(fn):
    """Decorator that records how long each successful call to ``fn`` takes.

    Every completed call increments the call counter and appends the elapsed
    time (in seconds) to ``PROF_DATA[fn.__name__]``.  Calls that raise are not
    recorded, because the exception propagates before the bookkeeping runs.

    Parameters
    ----------
    fn : callable
        The function to time.  Entries are keyed by ``fn.__name__``, so two
        functions sharing a name share one entry.

    Returns
    -------
    callable
        A wrapper with the same signature and metadata as ``fn`` that returns
        whatever ``fn`` returns.
    """
    @wraps(fn)
    def with_profiling(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() is a wall clock that can be coarse
        # or jump, which makes short calls show up as 0.000 s.
        start_time = time.perf_counter()

        ret = fn(*args, **kwargs)

        elapsed_time = time.perf_counter() - start_time

        # Create the [count, durations] record on first use, then update it.
        record = PROF_DATA.setdefault(fn.__name__, [0, []])
        record[0] += 1
        record[1].append(elapsed_time)

        return ret

    return with_profiling

def print_prof_data():
    """Print a timing summary for every function recorded in ``PROF_DATA``.

    For each entry this writes the function name with its call count, then the
    maximum and average elapsed time (three decimals), then an empty line.
    """
    for name, record in PROF_DATA.items():
        durations = record[1]
        slowest = max(durations)
        mean = sum(durations) / len(durations)
        print("Function %s called %d times. " % (name, record[0]))
        print('Execution time max: %.3f, average: %.3f' % (slowest, mean))
        print('')

def clear_prof_data():
    """Discard all recorded timings by rebinding ``PROF_DATA`` to a new, empty dict.

    The previous dict object is left untouched; only the module-level name is
    pointed at a fresh mapping.
    """
    global PROF_DATA
    PROF_DATA = dict()

## 1. Filter methods

**Remove constant, quasi-constant and duplicate features**

In [6]:
@profile
def remove_constant_and_quasi_constant_features(df):
    """Drop constant and quasi-constant columns from ``df``.

    Uses Feature-engine's ``DropConstantFeatures`` with ``tol=0.998`` over all
    variables (``variables=None``).  ``missing_values='raise'`` makes the
    selector fail instead of silently handling missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to filter.

    Returns
    -------
    The frame returned by the selector, with the detected columns removed.
    """
    sel = DropConstantFeatures(tol=0.998, variables=None, missing_values='raise')
    # Fit and transform in a single pass. Previously the fit_transform result
    # was thrown away and the data was transformed a second time.
    return sel.fit_transform(df)

In [7]:
# Run the constant/quasi-constant filter on the training features and show the resulting shape.
df = remove_constant_and_quasi_constant_features(X_train)
df.shape

(4500, 13)

In [8]:
# Report the timings recorded so far by the @profile decorator.
print_prof_data()

Function remove_constant_and_quasi_constant_features called 1 times. 
Execution time max: 0.069, average: 0.069



In [9]:
@profile
def remove_duplicate_features(df):
    """Drop columns of ``df`` that duplicate another column.

    Uses Feature-engine's ``DropDuplicateFeatures`` over all variables
    (``variables=None``); ``missing_values='raise'`` makes the selector fail
    on missing values instead of ignoring them.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to filter.

    Returns
    -------
    The frame returned by the selector, without the duplicated columns.
    """
    # set up the selector
    sel = DropDuplicateFeatures(variables=None, missing_values='raise')
    # Finding duplicates can take a while, so fit only once. Previously the
    # selector was fitted with fit() and then fitted again by fit_transform().
    return sel.fit_transform(df)

In [10]:
# Continue from the previous filter's output (`df`) and drop duplicated columns.
df = remove_duplicate_features(df)
df.shape

(4500, 13)

In [11]:
# Report the timings recorded so far (now includes the duplicate-feature filter).
print_prof_data()

Function remove_constant_and_quasi_constant_features called 1 times. 
Execution time max: 0.069, average: 0.069

Function remove_duplicate_features called 1 times. 
Execution time max: 0.047, average: 0.047



**Remove Correlated features**
*  "Good feature subsets contain features highly correlated with the target, yet uncorrelated to each other".

In [12]:
@profile
def remove_corr_features_brute_force(df, print_res=False):
    """Drop features that are highly correlated with another feature.

    Uses Feature-engine's ``DropCorrelatedFeatures`` with Pearson correlation
    and a threshold of 0.8; missing values are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to filter.
    print_res : bool, default False
        When true, print the groups of correlated features that were found.

    Returns
    -------
    The frame returned by the selector, without the dropped features.
    """
    sel = DropCorrelatedFeatures(
        threshold=0.8,
        method='pearson',
        missing_values='ignore'
    )
    # find correlated features
    sel.fit(df)
    if print_res:
        # A bare expression displays nothing inside a function, so print explicitly.
        print(sel.correlated_feature_sets_)
    return sel.transform(df)

In [13]:
# NOTE(review): this starts again from X_train rather than the `df` produced by the
# duplicate filter above, so the earlier filter results are not carried forward — confirm intent.
df = remove_corr_features_brute_force(X_train)

In [14]:
# smart correlation selection
@profile
def remove_corr_features_smart(x_train, y_train, print_res=False):
    """Keep one feature from each group of correlated features, chosen by model performance.

    Groups are formed with Pearson correlation at a threshold of 0.8.  Within a
    group, Feature-engine's ``SmartCorrelatedSelection`` uses a small random
    forest scored by ROC-AUC with 3-fold cross-validation
    (``selection_method="model_performance"``) to decide which feature stays.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    print_res : bool, default False
        When true, print the groups of correlated features that were found.

    Returns
    -------
    The frame returned by the selector, without the dropped features.
    """
    # random forest
    rf = RandomForestClassifier(
        n_estimators=10,
        random_state=20,
        n_jobs=4,
    )

    # correlation selector
    sel = SmartCorrelatedSelection(
        variables=None,  # if None, the selector examines all numerical variables
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",  # "variance" would keep the feature with the most variance
        estimator=rf,
        scoring="roc_auc",
        cv=3,
    )

    # This may take a while because a random forest is trained per correlation
    # group, so fit exactly once. Previously fit() was followed by
    # fit_transform(), which repeated all of that training.
    sel.fit(x_train, y_train)

    if print_res:
        # A bare expression displays nothing inside a function, so print explicitly.
        print(sel.correlated_feature_sets_)
    return sel.transform(x_train)

In [15]:
# Apply the model-based correlation selector to the output of the brute-force step.
df = remove_corr_features_smart(df, y_train)
df.shape

(4500, 12)

**Statistical Techniques and Ranking Methods**


**Mutual Information**

In [16]:
@profile
def selectkbest_mi_clf(x_train, y_train, k=10, print_res=False):
    """Select the ``k`` features with the highest mutual information with the target.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (``.columns`` is read when ``print_res`` is true).
    y_train : array-like
        Training target (classification labels).
    k : int, default 10
        Number of top-scoring features to keep.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The output of ``SelectKBest.transform`` for ``x_train`` (a NumPy array
    under scikit-learn's default output configuration).
    """
    sel = SelectKBest(mutual_info_classif, k=k).fit(x_train, y_train)

    if print_res:
        # A bare expression displays nothing inside a function, so print explicitly.
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_train)

In [17]:
# Keep the top-k (default k=10) training features ranked by mutual information with the target.
df = selectkbest_mi_clf(X_train, y_train)
df.shape

(4500, 10)

**Chi-Square Test**
NOTE: use only with categorical variables (scikit-learn's `chi2` also requires non-negative feature values).

In [18]:
# to determine the chi2 value
from sklearn.feature_selection import chi2

@profile
def chi_square_test(x_train, y_train, k=1, print_res=False):
    """Select the ``k`` features with the highest chi-squared statistic against the target.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.  scikit-learn's ``chi2`` expects non-negative
        values (e.g. counts or encoded categories).
    y_train : array-like
        Training target (classification labels).
    k : int, default 1
        Number of top-scoring features to keep.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The output of ``SelectKBest.transform`` for ``x_train``.
    """
    # Honour the caller's k. It used to be hard-coded to 1, so the parameter was ignored.
    sel = SelectKBest(chi2, k=k).fit(x_train, y_train)

    if print_res:
        # A bare expression displays nothing inside a function, so print explicitly.
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_train)

In [19]:
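# TODO: `chi_square_test` above raised an error when it was run (cause not recorded here); it is defined but not called in the cells shown.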

**ANOVA**
* ANOVA assumes a linear relationship between the feature and the target and that the variables follow a Gaussian distribution. If these assumptions do not hold, the result of this test may not be useful.

In [20]:
@profile
def annova_clf(x_train, y_train, k=10, print_res=False):
    """Select the ``k`` best features by the ANOVA F-statistic (``f_classif``).

    ``f_classif`` scores every feature against the target (it yields one array
    of F-scores and one of p-values); ``SelectKBest`` keeps the ``k`` highest.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (``.columns`` is read when ``print_res`` is true).
    y_train : array-like
        Training target (classification labels).
    k : int, default 10
        Number of top-scoring features to keep.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The output of ``SelectKBest.transform`` for ``x_train``.
    """
    selector = SelectKBest(f_classif, k=k)
    selector.fit(x_train, y_train)

    if print_res:
        keep_mask = selector.get_support()
        print(x_train.columns[keep_mask])

    return selector.transform(x_train)

In [21]:
# Keep the top-k (default k=10) training features ranked by the ANOVA F-test.
df = annova_clf(X_train, y_train)
print(df.shape)

(4500, 10)


**Feature Selection with ML models**
* The idea is to build a model on each feature individually and then rank the features by the performance of their single-feature models.

In [22]:
@profile
def select_by_single_feature_perf_clf(x_train, y_train, print_res=False):
    """Select features by the performance of a model trained on each feature alone.

    Feature-engine's ``SelectBySingleFeaturePerformance`` trains the random
    forest on one feature at a time (3-fold CV, ROC-AUC) and uses
    ``threshold=0.5`` to decide which features are kept.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    print_res : bool, default False
        When true, print the per-feature performance found by the selector.

    Returns
    -------
    The frame returned by the selector, restricted to the selected features.
    """
    # set up a machine learning model
    rf = RandomForestClassifier(
        n_estimators=10, random_state=1, n_jobs=4)

    # set up the selector
    sel = SelectBySingleFeaturePerformance(
        variables=None,
        estimator=rf,
        scoring="roc_auc",
        cv=3,
        threshold=0.5)

    # Fit on the data that was passed in. This previously used the notebook
    # global X_train, silently ignoring the x_train argument.
    sel.fit(x_train, y_train)

    if print_res:
        print(sel.feature_performance_)

    return sel.transform(x_train)

In [23]:
# Keep the features selected by the single-feature performance selector.
df = select_by_single_feature_perf_clf(X_train, y_train)
print(df.shape)

(4500, 9)


## 2. Wrapper methods

In [24]:
@profile
def step_forward_selection_clf(x_train, y_train, k=10, print_res=False):
    """Pick ``k`` features with mlxtend's sequential *forward* selection.

    A small random forest is scored by ROC-AUC with 2-fold cross-validation at
    every step; progress is logged because ``verbose=2``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; passed to the selector as a NumPy array.
    y_train : array-like
        Training target.
    k : int, default 10
        Number of features to select (more features means a longer run).
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the selected columns.
    """
    # TODO (carried over from the original): review whether n_estimators should be increased.
    estimator = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)
    selector = SFS(
        estimator,
        k_features=k,
        forward=True,
        floating=False,  # see the mlxtend docs for the floating variants
        verbose=2,
        scoring='roc_auc',
        cv=2,
    )

    selector = selector.fit(np.array(x_train), y_train)

    # k_feature_idx_ holds positional indices; map them back to column labels.
    chosen_cols = x_train.columns[list(selector.k_feature_idx_)]
    if print_res:
        print(chosen_cols)

    return x_train[chosen_cols]

In [25]:
# Run forward selection with the default k=10; the selector logs its progress (verbose=2).
df = step_forward_selection_clf(X_train, y_train)
print(df.shape)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    4.9s finished

[2021-06-03 18:57:25] Features: 1/10 -- score: 0.8813161971181842[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    4.6s finished

[2021-06-03 18:57:30] Features: 2/10 -- score: 0.9096032051152344[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    3.7s finished

[2021-06-03 18:57:33] Features: 3/10 -- score: 0.9394566862040203[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

(4500, 10)


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.5s finished

[2021-06-03 18:57:52] Features: 10/10 -- score: 0.9890779673058461

In [26]:
@profile
def step_backward_selection_clf(x_train, y_train, k=10, print_res=False):
    """Pick ``k`` features with mlxtend's sequential *backward* selection.

    A small random forest is scored by ROC-AUC with 2-fold cross-validation at
    every step; progress is logged because ``verbose=2``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; passed to the selector as a NumPy array.
    y_train : array-like
        Training target.
    k : int, default 10
        Number of features to keep.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the selected columns.
    """
    # TODO (carried over from the original): review whether n_estimators should be increased.
    estimator = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)
    selector = SFS(
        estimator,
        k_features=k,
        forward=False,
        floating=False,  # see the mlxtend docs for the floating variants
        verbose=2,
        scoring='roc_auc',
        cv=2,
    )

    selector = selector.fit(np.array(x_train), y_train)

    # k_feature_idx_ holds positional indices; map them back to column labels.
    chosen_cols = x_train.columns[list(selector.k_feature_idx_)]
    if print_res:
        print(chosen_cols)

    return x_train[chosen_cols]

In [27]:
# Run backward selection with the default k=10; the selector logs its progress (verbose=2).
df = step_backward_selection_clf(X_train, y_train)
print(df.shape)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    5.2s finished

[2021-06-03 18:57:58] Features: 12/10 -- score: 0.988945320140561[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    4.2s finished

[2021-06-03 18:58:03] Features: 11/10 -- score: 0.9921873067904108[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


(4500, 10)


[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    3.6s finished

[2021-06-03 18:58:06] Features: 10/10 -- score: 0.9915173261928689

In [28]:
@profile
def exhaustive_selection_clf(x_train, y_train, min_features=1, max_features=2, print_res=False):
    """Find the best feature subset with mlxtend's exhaustive search.

    Feature combinations of size ``min_features`` to ``max_features`` are
    scored by ROC-AUC (2-fold CV) with a very small, shallow random forest.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; passed to the selector as a NumPy array.
    y_train : array-like
        Training target.
    min_features, max_features : int, default 1 and 2
        Bounds on the subset size to evaluate.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the best-scoring subset.
    """
    # TODO (carried over from the original): review whether n_estimators should be increased.
    estimator = RandomForestClassifier(n_estimators=5, n_jobs=4, random_state=0, max_depth=2)
    searcher = EFS(
        estimator,
        min_features=min_features,
        max_features=max_features,
        scoring='roc_auc',
        print_progress=True,
        cv=2,
    )

    # search features
    searcher = searcher.fit(np.array(x_train), y_train)

    # best_idx_ holds positional indices; map them back to column labels.
    best_cols = x_train.columns[list(searcher.best_idx_)]
    if print_res:
        print(best_cols)

    return x_train[best_cols]

In [29]:
# Exhaustive search over subsets of 1 to 2 features (the function defaults).
df = exhaustive_selection_clf(X_train, y_train)
print(df.shape)

Features: 91/91

(4500, 2)


## Embedded Methods

In [30]:
@profile
def log_reg_selection(x_train, y_train, print_res=False):
    """Select features with ``SelectFromModel`` over an L2 logistic regression.

    The estimator uses ``C=1000``, ``penalty='l2'``, ``max_iter=300`` and
    ``random_state=10``; ``SelectFromModel`` is used with its default settings.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the selected columns.
    """
    estimator = LogisticRegression(C=1000, penalty='l2', max_iter=300, random_state=10)
    selector = SelectFromModel(estimator)
    selector.fit(x_train, y_train)

    keep_mask = selector.get_support()
    kept_cols = x_train.columns[keep_mask]

    if print_res:
        print(kept_cols)

    return x_train[kept_cols]

In [31]:
# Select features using the L2 logistic-regression selector.
df = log_reg_selection(X_train, y_train)
print(df.shape)

(4500, 4)


In [32]:
@profile
def log_reg_lasso_selection(x_train, y_train, print_res=True):
    """Select features with ``SelectFromModel`` over an L1 (lasso) logistic regression.

    The estimator uses ``C=0.5``, ``penalty='l1'``, the ``liblinear`` solver
    and ``random_state=10``; ``SelectFromModel`` is used with its default settings.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    print_res : bool, default True
        When true, print the names of the selected columns.  Note that this
        default differs from the other selectors in this notebook.

    Returns
    -------
    pandas.DataFrame
        ``x_train`` restricted to the selected columns.
    """
    estimator = LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10)
    selector = SelectFromModel(estimator)
    selector.fit(x_train, y_train)

    keep_mask = selector.get_support()
    kept_cols = x_train.columns[keep_mask]

    if print_res:
        print(kept_cols)

    return x_train[kept_cols]

In [33]:
# Select features using the L1 logistic-regression selector; it prints the kept columns by default.
df = log_reg_lasso_selection(X_train, y_train)
print(df.shape)

Index(['ID', 'Age', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Securities Account', 'CD Account', 'Online', 'CreditCard'],
      dtype='object')
(4500, 12)


In [34]:
@profile
def random_forest_selection_clf(x_train, y_train, print_res=False):
    """Select features with ``SelectFromModel`` over a random forest.

    The forest uses ``n_estimators=10`` and ``random_state=10``;
    ``SelectFromModel`` is used with its default settings.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (``.columns`` is always read).
    y_train : array-like
        Training target.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The output of ``SelectFromModel.transform`` for ``x_train``.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=10)
    selector = SelectFromModel(forest)
    selector.fit(x_train, y_train)

    kept_cols = x_train.columns[selector.get_support()]
    if print_res:
        print(kept_cols)

    return selector.transform(x_train)

In [35]:
# Select features using the random-forest SelectFromModel selector.
df = random_forest_selection_clf(X_train, y_train)
print(df.shape)

(4500, 4)


In [36]:
@profile
def random_forest_selection_clf_rfe(x_train, y_train, print_res=False, n_features_to_select=27):
    """Select features with scikit-learn's recursive feature elimination over a random forest.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features (``.columns`` is read when ``print_res`` is true).
    y_train : array-like
        Training target.
    print_res : bool, default False
        When true, print the names of the selected columns.
    n_features_to_select : int, default 27
        Number of features RFE should keep.  The default preserves the value
        that used to be hard-coded.  It is larger than the 13 columns of the
        training frame used in this notebook, which is why that run keeps
        every column; pass a smaller value to actually eliminate features.

    Returns
    -------
    The output of ``RFE.transform`` for ``x_train``.
    """
    sel = RFE(
        RandomForestClassifier(n_estimators=10, random_state=10),
        n_features_to_select=n_features_to_select,
    )

    sel.fit(x_train, y_train)
    if print_res:
        print(x_train.columns[sel.get_support()])
    return sel.transform(x_train)

In [37]:
# RFE is configured to keep 27 features, more than the 13 available here,
# so the shape printed below still has all 13 columns.
df = random_forest_selection_clf_rfe(X_train, y_train)
print(df.shape)

(4500, 13)


## Hybrid Methods

**Feature Shuffling**

In [38]:
# Create a classifier; any classifier can be used — a random forest is chosen as a good default.
# This object is bound as the default `model` argument of feature_shuffling_selection_clf,
# so it must exist before that function is defined.
rf = RandomForestClassifier(
    n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

In [39]:
@profile
def feature_shuffling_selection_clf(x_train, y_train, model=rf,print_res=False):
    """Select features by the performance drop caused by shuffling each one.

    Uses Feature-engine's ``SelectByShuffling`` with ROC-AUC, 3-fold
    cross-validation, ``threshold=0`` and a fixed ``random_state``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    model : estimator, default ``rf``
        The model whose performance is evaluated.  The default is bound once,
        when this function is defined.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The frame returned by the selector, restricted to the selected features.
    """
    sel = SelectByShuffling(
        variables=None,  # automatically examine all numerical variables
        estimator=model,  # the ML model
        scoring='roc_auc',  # the metric to evaluate
        threshold=0,  # performance-drop threshold used to decide which features are kept
        cv=3,  # cross validation
        random_state=1  # seed
    )

    # Fit on the data that was passed in. This previously used the notebook
    # global X_train, silently ignoring the x_train argument.
    sel.fit(x_train, y_train)

    df = sel.transform(x_train)

    if print_res:
        print(df.columns)

    return df

In [40]:
# Select features by shuffling, using the default `rf` model defined above.
df = feature_shuffling_selection_clf(X_train, y_train)
print(df.shape)

(4500, 2)


**Recursive Feature Elimination**

In [41]:
# the ML model for which we want to select features
# (bound as the default `model` argument of rfe_selection_clf when that function is defined)
model = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=10,
)

In [42]:
@profile
def rfe_selection_clf(x_train, y_train, model = model,print_res=False):
    """Select features with Feature-engine's ``RecursiveFeatureElimination``.

    The selector scores ``model`` by ROC-AUC with 2-fold cross-validation and
    uses ``threshold=0.0005`` as the maximum performance drop allowed when a
    feature is removed.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    model : estimator, default ``model``
        The model whose performance is evaluated.  The default is bound once,
        when this function is defined.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The frame returned by the selector, restricted to the selected features.
    """
    selector = RecursiveFeatureElimination(
        estimator=model,      # the ML model
        scoring='roc_auc',    # metric used for evaluation
        cv=2,                 # cross-validation folds
        threshold=0.0005,     # maximum performance drop allowed to remove a feature
        variables=None,       # evaluate all numerical variables
    )

    # Can be slow: many models are built with cross-validation.
    selector.fit(x_train, y_train)
    reduced = selector.transform(x_train)

    if print_res:
        print(reduced.columns)

    return reduced

In [43]:
# Recursive feature elimination with the default gradient-boosting `model`.
df = rfe_selection_clf(X_train, y_train)
print(df.shape)

(4500, 5)


**Recursive Feature Addition**

In [44]:
# the ML model for which we want to select features
# NOTE(review): same configuration as the `model` created before rfe_selection_clf;
# this new instance is bound as the default `model` argument of rfa_selection_clf.
model = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=10,
)

In [45]:
@profile
def rfa_selection_clf(x_train, y_train, model=model, print_res=False):
    """Select features with Feature-engine's ``RecursiveFeatureAddition``.

    The selector scores ``model`` by ROC-AUC with 2-fold cross-validation and
    uses ``threshold=0.0001`` as the minimum performance increase needed to
    keep an added feature.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target.
    model : estimator, default ``model``
        The model whose performance is evaluated.  The default is bound once,
        when this function is defined.
    print_res : bool, default False
        When true, print the names of the selected columns.

    Returns
    -------
    The frame returned by the selector, restricted to the selected features.
    """
    # Setup the RFA selector
    rfa = RecursiveFeatureAddition(
        variables=None,  # automatically evaluate all numerical variables
        estimator=model,  # the ML model
        scoring='roc_auc',  # the metric we want to evaluate
        threshold=0.0001,  # the minimum performance increase needed to select a feature
        cv=2,  # cross-validation
    )

    # Fit on the data that was passed in. This previously used the notebook
    # global X_train, silently ignoring the x_train argument.
    rfa.fit(x_train, y_train)
    df = rfa.transform(x_train)
    if print_res:
        print(df.columns)

    return df

In [46]:
# Recursive feature addition with the default gradient-boosting `model`.
df = rfa_selection_clf(X_train, y_train)
print(df.shape)

(4500, 5)


In [47]:
# Report the timings recorded for every profiled selector run above.
print_prof_data()

Function remove_constant_and_quasi_constant_features called 1 times. 
Execution time max: 0.069, average: 0.069

Function remove_duplicate_features called 1 times. 
Execution time max: 0.047, average: 0.047

Function remove_corr_features_brute_force called 1 times. 
Execution time max: 0.000, average: 0.000

Function remove_corr_features_smart called 1 times. 
Execution time max: 0.038, average: 0.038

Function selectkbest_mi_clf called 1 times. 
Execution time max: 0.479, average: 0.479

Function annova_clf called 1 times. 
Execution time max: 0.015, average: 0.015

Function select_by_single_feature_perf_clf called 1 times. 
Execution time max: 8.780, average: 8.780

Function step_forward_selection_clf called 1 times. 
Execution time max: 32.296, average: 32.296

Function step_backward_selection_clf called 1 times. 
Execution time max: 13.781, average: 13.781

Function exhaustive_selection_clf called 1 times. 
Execution time max: 29.085, average: 29.085

Function log_reg_selection cal

In [18]:
# One record per technique, with keys
# 'Method Name', 'Type of Method', 'Test DataSet Size', 'Execution Time'.
# NOTE(review): the size string and the timings are hard-coded rather than read
# from PROF_DATA, and the run shown earlier uses a 13-column training frame,
# not 201 columns — confirm which run these numbers belong to.
all_techniques = [
    dict(zip(
        ('Method Name', 'Type of Method', 'Test DataSet Size', 'Execution Time'),
        (method_name, method_type, '(4500, 201)', seconds),
    ))
    for method_name, method_type, seconds in (
        # Filter
        ('Drop Constant Feature', 'Filter', 0.069),
        ('Drop Duplicate Feature', 'Filter', 0.047),
        ('Drop Correlated Feature', 'Filter', 0.000),
        ('Smart correlated Selection', 'Filter', 0.038),
        # Statistical techniques and ranking methods
        ('SelectKBest Mutual Information', 'Statistical Techniques and Ranking Methods', 0.479),
        ('ANNOVA ', 'Statistical Techniques and Ranking Methods', 0.015),
        # Feature selection with ML models
        ('SelectBySingleFeaturePerformance', 'Feature Selection with ML Models', 8.780),
        # Wrapper
        ('Step Forward Feature Selection', 'Wrapper', 32.296),
        ('Step Backward Feature Selection', 'Wrapper', 13.781),
        ('Exhaustive Selection', 'Wrapper', 29.085),
        # Embedded
        ('Logistic Regression Selection', 'Embedded', 0.301),
        ('Logistic Regression Lasso Selection', 'Embedded', 0.053),
        ('Random Forest Selection', 'Embedded', 0.053),
        ('Random Forest Selection using RFE', 'Embedded', 0.031),
        # Hybrid
        ('Feature Shuffling Selection', 'Hybrid', 5.989),
        ('Recursive Feature Selection', 'Hybrid', 0.977),
        ('Recursive Feature Addition', 'Hybrid', 0.632),
    )
]

In [25]:
# NOTE(review): pandas is already imported as `pd` in the imports cell at the top,
# so this re-import is redundant on a full top-to-bottom run.
import pandas as pd
# Tabulate the recorded techniques, slowest first.
exec_res = pd.DataFrame(all_techniques).sort_values('Execution Time', ascending=False)

In [26]:
# Display the timing table (sorted by 'Execution Time', descending).
exec_res

Unnamed: 0,Method Name,Type of Method,Test DataSet Size,Execution Time
7,Step Forward Feature Selection,Wrapper,"(4500, 201)",32.296
9,Exhaustive Selection,Wrapper,"(4500, 201)",29.085
8,Step Backward Feature Selection,Wrapper,"(4500, 201)",13.781
6,SelectBySingleFeaturePerformance,Feature Selection with ML Models,"(4500, 201)",8.78
14,Feature Shuffling Selection,Hybrid,"(4500, 201)",5.989
15,Recursive Feature Selection,Hybrid,"(4500, 201)",0.977
16,Recursive Feature Addition,Hybrid,"(4500, 201)",0.632
4,SelectKBest Mutual Information,Statistical Techniques and Ranking Methods,"(4500, 201)",0.479
10,Logistic Regression Selection,Embedded,"(4500, 201)",0.301
0,Drop Constant Feature,Filter,"(4500, 201)",0.069


In [27]:
# Save the sorted timing table as an Excel workbook in the current working directory.
exec_res.to_excel('./feature_selection_techniques_execution_time.xlsx')