# **Feature Selection Tool Comparison**
Which feature selection tool is better? Let's compare two of the top tools in Python
1. Feature-Engine has 820 stars and 765K downloads (as of March 2022)
1. Featurewiz has 234 stars on Github and has 226K downloads (as of March 2022)
<a href="https://ibb.co/PmxS6SW"><img src="https://i.ibb.co/ZLdZMZg/featurewiz-logos.png" alt="featurewiz-logos" border="0"></a><br /><a target='_blank' href='https://imgbb.com/'>sun images free</a><br />

In [None]:
# Install Feature-engine (imported further down as `feature_engine`).
!pip install feature-engine

In [None]:
### Install featurewiz on its own: `--no-deps` skips its dependencies and
### `--ignore-installed` installs it even if a copy is already present.
### Per the original author, installing it with its dependencies errors out.
!pip install featurewiz --ignore-installed --no-deps

In [None]:
## xlrd is installed separately: featurewiz requires it, and featurewiz is
## installed in this notebook with --no-deps, which skips its dependencies.
!pip install xlrd
### Pin Pillow to 9.0.0: per the original author, Kaggle ships a wrong version. ##
!pip install Pillow==9.0.0

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# import selection classes from Feature-engine

from feature_engine.selection import (
    DropDuplicateFeatures,
    DropConstantFeatures,
    DropDuplicateFeatures,
    DropCorrelatedFeatures,
    SmartCorrelatedSelection,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    RecursiveFeatureElimination,
)

# from feature-engine
from feature_engine.imputation import MeanMedianImputer

In [None]:
# Read the Santander customer satisfaction training data from the Kaggle
# input directory.
train_csv_path = '/kaggle/input/santander-customer-satisfaction/train.csv'
data = pd.read_csv(train_csv_path)

# Report the (rows, columns) shape, then preview the first rows.
print(data.shape)
data.head()

# This is a highly imbalanced class problem. 

In [None]:
# Name of the label column; the later cells reuse it.
target = 'TARGET'

# Relative frequency of each class, followed by a histogram of the labels.
class_shares = data[target].value_counts(normalize=True)
print(class_shares)
data[target].hist()

# We must split the dataset into train and test sets before we do any feature engineering or selection! This is a must.

In [None]:
# Problem-type switch; the later cells compare it against 'Regression' to pick
# the split strategy, the estimator and the metrics.
modeltype = 'Classification'

In [None]:
# Hold out 20% of the rows as a test set. 'ID' and the target column are
# removed from the features. Classification runs stratify on the target;
# regression runs use a plain random split.
features_only = data.drop(labels=['ID', target], axis=1)
stratify_on = None if modeltype == 'Regression' else data[target]

X_train, X_test, y_train, y_test = train_test_split(
    features_only,
    data[target],
    test_size=0.2,
    random_state=0,
    stratify=stratify_on,
)

X_train.shape, X_test.shape

In [None]:
import copy

# Keep untouched deep copies of both feature splits: X_train / X_test are
# overwritten by the selection pipeline later on, and the featurewiz run
# starts from these copies instead.
X_train_copy, X_test_copy = (copy.deepcopy(split) for split in (X_train, X_test))

In [None]:
# List the feature columns that contain at least one missing value (for this
# dataset the list comes back empty).
# 'ID' and the target are excluded from the check: they are not part of
# X_train / X_test, so they must never end up in the imputer's `variables`
# list, which is built from `cols`.
feature_frame = data.drop(columns=['ID', target])
missing_per_column = feature_frame.isnull().sum()
cols = feature_frame.columns[missing_per_column > 0].tolist()
cols

## Feature Selection using Feature-Engine first using a pipeline

Now we will select features and train a machine learning model altogether in 1 pipeline.

In [None]:
# Problem type used to pick the estimator and metric below; swap the two lines
# to run the regression variant instead.
#modeltype = 'Regression'
modeltype = 'Classification'

In [None]:
import time
start_time = time.time()

# Choose the estimator and the scoring metric that match the problem type.
# The same estimator object is handed to the shuffling selector and is later
# fitted as the final model.
if modeltype != 'Regression':
    metric = "roc_auc"
    the_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
else:
    metric = 'neg_mean_squared_error'
    the_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=1)

# ======== FEATURE SELECTION =======
# The selection steps are the same whether or not imputation is needed, so
# they are declared once; the imputer is prepended only when some columns
# actually contain missing values.
selection_steps = [
    # drops constant and quasi-constant features in one go
    ('constant', DropConstantFeatures(tol=0.998)),
    # drops duplicated features
    ('duplicated', DropDuplicateFeatures()),
    # selects features by shuffling
    ('shuffle', SelectByShuffling(
        estimator=the_model,
        scoring=metric,  # the metric to determine model performance
        cv=5,  # the cross-validation fold
    )),
]

if cols:
    # Missing values present: fill the affected columns with their median
    # before any selection step runs.
    mdi = MeanMedianImputer(
        imputation_method='median',
        variables=cols,
    )
    selection_steps.insert(0, ('imputer', mdi))

pipe1 = Pipeline(selection_steps)

    
# Single-step pipeline wrapping the model. It is only declared here; this
# cell does not fit it.
pipe2 = Pipeline(steps=[
    # =====  the machine learning model ====
    ('random_forest', the_model),
])

# Fit the selection pipeline on the training split only, then apply the
# fitted selection to both splits. X_train / X_test are overwritten with the
# reduced versions.
print('Number of original variables: ', X_train.shape[1])

X_train = pipe1.fit_transform(X_train, y_train)
X_test = pipe1.transform(X_test)

n_selected = X_train.shape[1]
print('Number of variables after selection: ', n_selected)
elapsed = time.time() - start_time
print('Time taken: %0.2f seconds' % elapsed)

In [None]:
# Train the model on the training split as reduced by the selection pipeline.
the_model.fit(X_train, y_train)

# It took 120 seconds (2 mins) to run feature-engine to select 27 variables out of 369 (~90% reduction!)

In [None]:
# X_test was already reduced by the fitted selection pipeline in an earlier
# cell, so the model trained on the selected features is applied to it directly.

# obtain predictions for the test split; they are scored in the next cell
y_preds = the_model.predict(X_test)
y_preds

In [None]:
# Score the hold-out predictions of the feature-engine model.
if modeltype == 'Regression':
    from sklearn.metrics import r2_score, mean_squared_error
    r_squared = r2_score(y_test, y_preds)
    print('R-Squared = %0.0f%%' % (100 * r_squared))
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    print('RMSE = %0.2f' % rmse)
else:
    from sklearn.metrics import balanced_accuracy_score, classification_report
    # A target given as a plain string or as a one-element sequence is a
    # single-label problem and is scored in one go.
    if isinstance(target, str) or len(target) == 1:
        print('Bal accu %0.0f%%' % (100 * balanced_accuracy_score(y_test, y_preds)))
        print(classification_report(y_test, y_preds))
    else:
        # Several targets: score each target column on its own.
        for position, target_name in enumerate(target):
            print('For %s:' % target_name)
            true_col = y_test.values[:, position]
            pred_col = y_preds[:, position]
            print('    Bal accu %0.0f%%' % (100 * balanced_accuracy_score(true_col, pred_col)))
            print(classification_report(true_col, pred_col))

# The balanced accuracy score is unfortunately 50% which means that the selected features were somewhat worthless. 

# Now let's select features using Featurewiz and see how it performs

In [None]:
from featurewiz import FeatureWiz

In [None]:
import featurewiz as FW

In [None]:
import time
start_time = time.time()

# Start from the untouched copies so featurewiz sees every original feature.
print('Number of original variables: ', X_train_copy.shape[1])

features = FeatureWiz(
    corr_limit=0.70,
    feature_engg='',
    category_encoders='',
    dask_xgboost_flag=False,
    nrows=None,
    verbose=2,
)
# Fit the selector on the training split, then apply it to the test split.
X_train_selected = features.fit_transform(X_train_copy, y_train)
X_test_selected = features.transform(X_test_copy)

### report how many features were kept and how long the selection took ###
n_kept = X_train_selected.shape[1]
print('Number of variables after selection: ', n_kept)
elapsed = time.time() - start_time
print('Time taken: %0.2f seconds' % elapsed)

In [None]:
# Shape and first rows of the training split after featurewiz selection.
print(X_train_selected.shape)
X_train_selected.head()

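# Featurewiz took 184 seconds (3 mins) and selected 78 features. That's many more features than feature-engine's 27, and it took about one and a half times as long (184 vs 120 seconds). Let us now train a model on these features to compare results; note that the next cell uses featurewiz's XGBoost helper rather than a RandomForestClassifier, so keep that difference in mind when comparing.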

In [None]:
# Call featurewiz's XGBoost helper with the selected train and test features.
# NOTE(review): presumably this trains on the train split and predicts for the
# test split; the next cell reads the predictions from outputs[0] — confirm
# against the featurewiz documentation.
outputs = FW.complex_XGBoost_model(
    X_train_selected,
    y_train,
    X_test_selected,
    log_y=False,
    GPU_flag=False,
    scaler='',
    enc_method='label',
    n_splits=5,
    verbose=-1,
)

In [None]:
# Both problem types take the predictions from the first element of `outputs`,
# so no branch on `modeltype` is needed here.
y_preds = outputs[0]
y_preds[:4]

In [None]:
# Score the hold-out predictions obtained from the featurewiz-selected features.
if modeltype == 'Regression':
    from sklearn.metrics import r2_score, mean_squared_error
    r_squared = r2_score(y_test, y_preds)
    print('R-Squared = %0.0f%%' % (100 * r_squared))
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    print('RMSE = %0.2f' % rmse)
else:
    from sklearn.metrics import balanced_accuracy_score, classification_report
    # A target given as a plain string or as a one-element sequence is a
    # single-label problem and is scored in one go.
    if isinstance(target, str) or len(target) == 1:
        print('Bal accu %0.0f%%' % (100 * balanced_accuracy_score(y_test, y_preds)))
        print(classification_report(y_test, y_preds))
    else:
        # Several targets: score each target column on its own.
        for position, target_name in enumerate(target):
            print('For %s:' % target_name)
            true_col = y_test.values[:, position]
            pred_col = y_preds[:, position]
            print('    Bal accu %0.0f%%' % (100 * balanced_accuracy_score(true_col, pred_col)))
            print(classification_report(true_col, pred_col))

# Featurewiz shows very good promise in this dataset with a 74% balanced accuracy vs 50% for feature-engine. This means that in large datasets, using the SULOV algorithm and mRmR feature-selection techniques, featurewiz is able to provide superior performance.

# Let's see whether using all the features in the data set provides a better model

In [None]:
# Baseline random forest with the same hyper-parameters as the classifier used
# in the feature-engine section (100 trees, depth 5, random_state 1).
model = RandomForestClassifier(n_estimators=100,max_depth=5, random_state=1)
model

In [None]:
# Fit on the untouched copies of the splits. X_train / X_test were overwritten
# with the feature-engine selection in an earlier cell, so using them here
# would silently re-test the selected subset instead of the full feature set.
# NOTE(review): the raw copies are not imputed; this relies on the dataset
# having no missing values (see the `cols` check) — confirm if the data changes.
model.fit(X_train_copy, y_train)
y_preds = model.predict(X_test_copy)
y_preds

In [None]:
# Score the hold-out predictions of the baseline model.
if modeltype == 'Regression':
    from sklearn.metrics import r2_score, mean_squared_error
    r_squared = r2_score(y_test, y_preds)
    print('R-Squared = %0.0f%%' % (100 * r_squared))
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    print('RMSE = %0.2f' % rmse)
else:
    from sklearn.metrics import balanced_accuracy_score, classification_report
    # A target given as a plain string or as a one-element sequence is a
    # single-label problem and is scored in one go.
    if isinstance(target, str) or len(target) == 1:
        print('Bal accu %0.0f%%' % (100 * balanced_accuracy_score(y_test, y_preds)))
        print(classification_report(y_test, y_preds))
    else:
        # Several targets: score each target column on its own.
        for position, target_name in enumerate(target):
            print('For %s:' % target_name)
            true_col = y_test.values[:, position]
            pred_col = y_preds[:, position]
            print('    Bal accu %0.0f%%' % (100 * balanced_accuracy_score(true_col, pred_col)))
            print(classification_report(true_col, pred_col))

# Actually using all features does not work that well for this dataset. Hence you are better off using feature selection methods.