## Automatically select the number of features

It is also possible to automatically select the number of features chosen by RFE.

This can be achieved with the RFECV class, which performs a cross-validated evaluation of different numbers of features and
automatically selects the number of features that results in the best mean score.

RFECV is configured by specifying the minimum number of features via the “min_features_to_select” argument (defaults to 1).
We can also specify the type of cross-validation and the scoring to use via the “cv” (defaults to 5-fold)
and “scoring” arguments (defaults to accuracy for classification).

In [1]:
import pandas                      as      pd
import numpy                       as      np
import seaborn                     as      sns
import matplotlib.pyplot           as      plt
import statsmodels.api             as      sm
from   sklearn.preprocessing       import  OrdinalEncoder
from   sklearn.feature_selection   import  RFE
from   sklearn                     import  metrics
from   sklearn.pipeline            import  Pipeline
from   sklearn.model_selection     import  cross_val_score
from   sklearn.model_selection     import  RepeatedStratifiedKFold
import sys, os

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

In [3]:
"""
Function Name: compare_models

   Description: This **function** compares various algorithms on 
                 1) AUROC 2) Precision, 3) Recall
   
   Input: 1) splits for k fold 
          2) random seed number
          3) Training data for predictor variables
          4) Training data for target variable



   Output: Model comparison on these metrics 1) AUROC 2) Metrics - Precision, Recall
   
"""
def compare_models(n_splits, random_state, X_train, Y_train):
    """
    Compare various algorithms on 1) AUROC 2) Precision 3) Recall
    using k fold cross-validation.

    Input:  1) n_splits     - number of splits for k fold
            2) random_state - random seed number; it shuffles the folds and
                              seeds the tree based models so that repeated
                              runs give the same scores
            3) X_train      - training data for predictor variables
            4) Y_train      - training data for target variable

    Output: Nothing is returned. For each metric the mean and standard
            deviation of the fold scores are printed per model and a box
            plot of the fold scores is shown.
    """

    ### To compare algorithms

    from    matplotlib import pyplot
    from    sklearn.model_selection    import   KFold
    from    sklearn.model_selection    import   cross_val_score
    from    sklearn.linear_model       import   LogisticRegression
    from    sklearn.tree               import   DecisionTreeClassifier
    from    sklearn.neighbors          import   KNeighborsClassifier
    from    sklearn.naive_bayes        import   GaussianNB
    from    sklearn.ensemble           import   RandomForestClassifier

    ### Prepare models (the randomised ones are seeded with random_state)

    models = [
        ('LR', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier(random_state=random_state)),
        ('NB', GaussianNB()),
        ('RF', RandomForestClassifier(random_state=random_state)),
    ]

    ### One splitter for every model and metric, so all of them are scored on
    ### the same folds. KFold only accepts a random_state when shuffle is True.

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    ### Evaluate each model in turn, one metric at a time

    for scoring in ('roc_auc', 'precision', 'recall'):
        results = []
        names = []
        print(scoring)

        for name, model in models:
            print("\n n_splits %d" % (n_splits))

            cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
            results.append(cv_results)
            names.append(name)
            print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

        ### Box plot algorithm comparison

        fig = pyplot.figure()
        fig.suptitle('Algorithm Comparison using ' + scoring)
        ax = fig.add_subplot(111)
        ax.boxplot(results)
        ax.set_xticklabels(names)
        pyplot.show()

### ------------------------------------------------------------------------------------------


In [4]:
### Read the input CSV from a machine-specific local path and list its columns
inp_file_name =  r'D:\GL-DSE-Capstone-Projects\FT-C-JUL22-G3\Clean_kaggle_data.csv'
df           =  pd.read_csv(inp_file_name)
df.columns

Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_3_month',
       'forecast_6_month', 'forecast_9_month', 'sales_1_month',
       'sales_3_month', 'sales_6_month', 'sales_9_month', 'min_bank',
       'pieces_past_due', 'perf_6_month_avg', 'perf_12_month_avg',
       'local_bo_qty', 'potential_issue_Code', 'deck_risk_Code',
       'oe_constraint_Code', 'ppap_risk_Code', 'stop_auto_buy_Code',
       'rev_stop_Code', 'went_on_backorder_Code'],
      dtype='object')

from sklearn.model_selection import train_test_split

### Keep a 1% sample of df (test_size = 0.99 puts the other 99% in the discarded part),
### stratified on the target column. No random_state is passed, so the sample
### differs from run to run.
df1, _ = train_test_split(df, test_size = 0.99, stratify = df[['went_on_backorder_Code']])

In [5]:
### NOTE(review): this rebinds df1 to a copy of the full df, replacing any
### 1% sample previously assigned to df1 - confirm that is intended
df1   =  df.copy()

In [6]:
### Separate the target column (y) from the remaining predictor columns (X)
y             =    df1['went_on_backorder_Code']
X             =    df1.drop(['went_on_backorder_Code'], axis = 1)                       
### Report the dimensions of both parts
print('X dimension {}'. format(X.shape))
print('y dimension {}'. format(y.shape))

X dimension (1558379, 21)
y dimension (1558379,)


In [None]:
### create pipeline
### The RFE estimator is seeded like the final model, so the selected
### features and the reported scores are reproducible between runs
rfe      = RFE(estimator = RandomForestClassifier(random_state=42), n_features_to_select = 10)
model    = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
### evaluate model
### 10 folds x 10 repeats = 100 fits of the whole pipeline; error_score='raise'
### makes a failing fit stop the run instead of being recorded as a score
cv       = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 10, random_state = 1234)
n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [None]:
# Recursive feature elimination with 10-fold cross-validation, scored on accuracy.
# fit() returns the fitted selector, so the reduced matrix is taken in the same statement.
rfecv = RFECV(estimator = model, cv = 10, scoring='accuracy')
X_new = rfecv.fit(X, y).transform(X)
# Column counts before and after the selection
print("Num Features Before:", X.shape[1])
print("Num Features After:", X_new.shape[1])

In [None]:
### One row per predictor column: whether RFECV kept it (support_) and its rank (ranking_)
features_kept = pd.DataFrame({'columns': X.columns,
                             'Kept': rfecv.support_,
                             'Rank' : rfecv.ranking_})
features_kept

In [None]:
### Show only the retained features; 'Kept' is already a boolean mask,
### so it is used directly instead of being compared with True
features_kept.loc[features_kept['Kept']]

In [None]:
### Names of the retained features; 'Kept' is already a boolean mask,
### so it is used directly instead of being compared with True
reqd_cols =  features_kept.loc[features_kept['Kept'], 'columns']

In [None]:
### List the column names that were kept
print(reqd_cols)

In [None]:
### Restrict df1 to the columns listed in reqd_cols
df_             =  df1.loc[:, reqd_cols]

In [None]:
y_              =  df1['went_on_backorder_Code']
### Use only the features kept by RFECV (df_). Dropping just the target from
### df1 would pass every original predictor on and ignore the selection.
X_              =  df_

In [None]:
### Check the dimensions of the predictors and the target before comparing models
print("Shape of X_ {}". format(X_.shape))
print("Shape of y_ {}". format(y_.shape))

In [None]:
### Settings for the model comparison: number of cross-validation folds and the seed argument
n_splits     = 10
random_state = 123456

### Compare the candidate classifiers on X_ / y_
compare_models(n_splits, random_state, X_, y_)

Reference: https://lifewithdata.com/2022/03/20/feature-selection-with-recursive-feature-elimination-rfecv/