# Feature Selection
* Filter method
* Wrapper method
* Embedded method

### Importing Libraries

In [34]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import normaltest

import math
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression


from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)

# using feature engine library
from feature_engine.selection import (
    DropCorrelatedFeatures,
    SmartCorrelatedSelection,
    DropConstantFeatures,
    DropDuplicateFeatures,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SelectByShuffling,
    RecursiveFeatureElimination,
    RecursiveFeatureAddition
)

# to obtain the mutual information values and univariate test statistics
from sklearn.feature_selection import (
    chi2,
    f_classif,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
    VarianceThreshold
)
# to select the features
from sklearn.feature_selection import SelectKBest, SelectPercentile

# wrapper methods
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, r2_score

from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# embedded methods
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

### Loading dataset

In [2]:
# Read the demo dataset from a path relative to the notebook; the trailing
# expression displays its (rows, columns) shape.
data = pd.read_csv('../dataset/feature_selection/dataset_1.csv')
data.shape

(50000, 301)

In [None]:
# to-do: note that we need to use only 3-4 techniques
# to-do: we need to work on an ensemble technique that combines the features selected by the above techniques
# to-do: we need to put variables at the top, identu[...] (note is truncated in the original)

### Splitting in to train and test set
* It is good practice to select features by examining only the training set; this avoids overfitting the selection to the test data.

In [3]:
# separate dataset into train and test
# (30% of the rows are held out; random_state fixes the shuffle so the split is reproducible)

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['target'], axis=1),  # drop the target
    data['target'],  # just the target
    test_size=0.3,
    random_state=0)

# display the shapes of both feature partitions
X_train.shape, X_test.shape

((35000, 300), (15000, 300))

## 1. Filter methods

In [33]:
# !pip install feature_engine

**Remove constant, quasi-constant and duplicate features**

In [35]:
def remove_constant_and_quasi_constant_features(df):
    """Drop constant and quasi-constant columns using Feature-engine.

    Args:
        df: pandas DataFrame of features. The selector is configured with
            ``missing_values='raise'``, so missing values are rejected.

    Returns:
        The result of ``DropConstantFeatures.transform`` applied to ``df``.
    """
    # tol=0.998 is the quasi-constant cut-off handed to Feature-engine;
    # variables=None lets the selector decide which columns to examine.
    selector = DropConstantFeatures(
        tol=0.998,
        variables=None,
        missing_values='raise',
    )
    selector.fit(df)
    reduced = selector.transform(df)
    return reduced

In [36]:
# First filter step: drop constant / quasi-constant columns from the training
# features; the trailing expression displays the reduced shape.
df = remove_constant_and_quasi_constant_features(X_train)
df.shape

(35000, 158)

In [42]:
def remove_duplicate_features(df):
    """Drop duplicated feature columns using Feature-engine.

    Args:
        df: pandas DataFrame of features. The selector is configured with
            ``missing_values='raise'``, so missing values are rejected.

    Returns:
        The result of ``DropDuplicateFeatures.transform`` applied to ``df``.
    """
    # set up the selector
    sel = DropDuplicateFeatures(variables=None, missing_values='raise')
    # Finding the duplicate features might take a while, so fit exactly once
    # and reuse the fitted selector instead of fitting again via fit_transform.
    sel.fit(df)
    return sel.transform(df)

In [43]:
# Second filter step. `df` is rebound to the function's own output, so
# re-running this cell feeds the already-reduced frame back in.
df = remove_duplicate_features(df)
df.shape

(35000, 152)

**Remove Correlated features**
*  "Good feature subsets contain features highly correlated with the target, yet uncorrelated to each other".

In [49]:
def remove_corr_features_brute_force(df, print_res=False):
    """Drop correlated features with Feature-engine's ``DropCorrelatedFeatures``.

    Args:
        df: pandas DataFrame of features.
        print_res: when True, print the groups of correlated features that the
            fitted selector found.

    Returns:
        The result of ``DropCorrelatedFeatures.transform`` applied to ``df``.
    """
    sel = DropCorrelatedFeatures(
        threshold=0.8,       # correlation cut-off passed to the selector
        method='pearson',
        missing_values='ignore'
    )
    # find correlated features
    sel.fit(df)
    if print_res:
        # A bare expression is only echoed at notebook top level, never from
        # inside a function, so the groups must be printed explicitly.
        print(sel.correlated_feature_sets_)
    return sel.transform(df)

In [52]:
# smart correlation selection
def remove_corr_features_smart(x_train, y_train, print_res=False):
    """Drop correlated features, choosing the survivor of each group by model performance.

    Uses Feature-engine's ``SmartCorrelatedSelection`` with a small random
    forest scored by cross-validated ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features. The selector is
            configured with ``missing_values="raise"``.
        y_train: target aligned with ``x_train``.
        print_res: when True, print the groups of correlated features found.

    Returns:
        The result of ``SmartCorrelatedSelection.transform`` applied to ``x_train``.
    """
    # random forest used to score the candidate features
    rf = RandomForestClassifier(
        n_estimators=10,
        random_state=20,
        n_jobs=4,
    )

    # correlation selector
    sel = SmartCorrelatedSelection(
        variables=None,  # if None, the selector examines all numerical variables
        method="pearson",
        threshold=0.8,
        missing_values="raise",
        selection_method="model_performance",  # can be set to "variance" to keep the feature with most variance
        estimator=rf,
        scoring="roc_auc",
        cv=3,
    )

    # This may take a while because models are trained during fit, so fit only
    # once and reuse the fitted selector rather than calling fit_transform again.
    sel.fit(x_train, y_train)

    if print_res:
        # a bare expression prints nothing inside a function; print explicitly
        print(sel.correlated_feature_sets_)
    return sel.transform(x_train)

In [53]:
# Third filter step: correlated-feature removal on the current `df`, using the
# training target. `df` is rebound to the reduced frame.
df = remove_corr_features_smart(df, y_train)
df.shape

(35000, 78)

**Statistical Techniques and Ranking Methods**


**Mutual Information**

In [55]:
# load classification dataset
# (same CSV path as the one read into `data` earlier in the notebook)
data_clf = pd.read_csv('../dataset/feature_selection/dataset_1.csv')
data_clf.shape

(50000, 301)

In [58]:
# separate train and test sets
# (same test_size and random_state as the earlier split of `data`)
x_train_clf, x_test_clf, y_train_clf, y_test_clf = train_test_split(
    data_clf.drop(labels=['target'], axis=1),
    data_clf['target'],
    test_size=0.3,
    random_state=0)

# display the shapes of both feature partitions
x_train_clf.shape, x_test_clf.shape

((35000, 300), (15000, 300))

In [59]:
def selectkbest_mi_clf(x_train, y_train, k=10, print_res=False):
    """Keep the ``k`` features with the highest mutual information with a class target.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        k: number of features to keep.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectKBest.transform`` on ``x_train``.
    """
    sel = SelectKBest(mutual_info_classif, k=k).fit(x_train, y_train)

    if print_res:
        # A bare expression is not displayed from inside a function, so the
        # selected column names have to be printed explicitly.
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_train)

In [57]:
def selectkbest_mi_reg(x_train, y_train, percentile=10, print_res=False):
    """Keep the top ``percentile`` percent of features by mutual information with a continuous target.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        percentile: percentage of features to keep.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectPercentile.transform`` on ``x_train``.
    """
    # Fit on the frame that was passed in (not the notebook-global X_train) and
    # honour the caller's percentile instead of a hard-coded 10.
    sel = SelectPercentile(mutual_info_regression, percentile=percentile).fit(x_train, y_train)

    if print_res:
        # a bare expression prints nothing inside a function; print explicitly
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_train)

In [60]:
# Mutual-information selection on the classification split with the default k;
# `df` is rebound to the selector output.
df = selectkbest_mi_clf(x_train_clf, y_train_clf)
df.shape

(35000, 10)

**Chi-Square Test**
NOTE: to be used only with the categorical variables

In [61]:
def chi_square_test(x_train, y_train, k=1, print_res=False):
    """Keep the ``k`` features with the highest chi-square statistic against the target.

    Intended for categorical features only. scikit-learn's ``chi2`` rejects
    negative feature values, so encode categories as non-negative numbers first.

    Args:
        x_train: pandas DataFrame of (encoded) categorical training features.
        y_train: classification target aligned with ``x_train``.
        k: number of features to keep.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectKBest.transform`` on ``x_train``.
    """
    # Use the caller's k rather than a hard-coded 1. `chi2` comes from
    # sklearn.feature_selection in the notebook's import cell.
    sel = SelectKBest(chi2, k=k).fit(x_train, y_train)

    if print_res:
        # a bare expression prints nothing inside a function; print explicitly
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_train)

**ANOVA**
* ANOVA assumes a linear relationship between the feature and the target and that the variables follow a Gaussian distribution. If this is not true, the result of this test may not be useful.
* 

In [73]:
def annova_clf(x_train, y_train, k=10, print_res=False):
    """Select the ``k`` best features by ANOVA F-test against a class target.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        k: number of features to keep.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectKBest.transform`` on ``x_train``.
    """
    # f_classif computes a univariate statistic between every variable and the
    # target; as with chi2 it yields one array of F-scores and one of p-values.
    selector = SelectKBest(f_classif, k=k)
    selector = selector.fit(x_train, y_train)

    if print_res:
        # display features
        kept_columns = x_train.columns[selector.get_support()]
        print(kept_columns)

    reduced = selector.transform(x_train)
    return reduced

In [76]:
def annova_reg(x_train, y_train, k=10, print_res=False, percentile=10):
    """Select the top ``percentile`` percent of features by univariate F-test for regression.

    Missing values are replaced with 0 before both fitting and transforming, so
    the selector sees the same data in both steps.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        k: unused; kept only so existing callers do not break. Selection is
            percentile-based.
        print_res: when True, print the names of the selected columns.
        percentile: percentage of features to keep (default 10, the value that
            was previously hard-coded).

    Returns:
        The output of ``SelectPercentile.transform`` on the zero-filled ``x_train``.
    """
    # Work on the frame that was passed in, not the notebook-global X_train.
    x_filled = x_train.fillna(0)

    # f_regression yields one array of F-scores and one of p-values.
    sel = SelectPercentile(f_regression, percentile=percentile).fit(x_filled, y_train)

    if print_res:
        # display features
        print(x_train.columns[sel.get_support()])

    return sel.transform(x_filled)

In [75]:
# Run ANOVA selection on the full training set and print the shape before and
# after; `df` is rebound to the selector output.
print(X_train.shape)
df = annova_clf(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_4', 'var_15', 'var_49', 'var_58', 'var_110', 'var_114', 'var_132',
       'var_152', 'var_230', 'var_262'],
      dtype='object')
(35000, 10)


**Feature Selection with ML models**
* The idea is to train a model on each feature individually and then rank the features by the performance of their models.

In [83]:
def select_by_single_feature_perf_clf(x_train, y_train, print_res=False):
    """Select features whose single-feature classifier scores above a ROC-AUC threshold.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        print_res: when True, print the per-feature performance found during fit.

    Returns:
        The result of ``SelectBySingleFeaturePerformance.transform`` on ``x_train``.
    """
    # set up a machine learning model
    rf = RandomForestClassifier(
        n_estimators=10, random_state=1, n_jobs=4)

    # set up the selector
    sel = SelectBySingleFeaturePerformance(
        variables=None,
        estimator=rf,
        scoring="roc_auc",
        cv=3,
        threshold=0.5)  # performance cut-off handed to the selector

    # Find predictive features. Fit on the argument, not the notebook-global
    # X_train, so the function works for any frame that is passed in.
    sel.fit(x_train, y_train)

    if print_res:
        print(sel.feature_performance_)

    return sel.transform(x_train)

In [84]:
# Single-feature-performance selection on the full training set; prints the
# shape before and after and rebinds `df` to the result.
print(X_train.shape)
df = select_by_single_feature_perf_clf(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
{'var_1': 0.5001933142993099, 'var_2': 0.5, 'var_3': 0.5001784439685938, 'var_4': 0.6903562245974415, 'var_5': 0.501486791294873, 'var_6': 0.5, 'var_7': 0.5000594813228646, 'var_8': 0.49850669530475816, 'var_9': 0.5000594813228646, 'var_10': 0.5000148703307161, 'var_11': 0.5, 'var_12': 0.5, 'var_13': 0.4991720909314243, 'var_14': 0.5, 'var_15': 0.6674213182310633, 'var_16': 0.5003357973374138, 'var_17': 0.49557045024503094, 'var_18': 0.49809799867189103, 'var_19': 0.504767155890828, 'var_20': 0.5000594813228646, 'var_21': 0.6487429533835901, 'var_22': 0.5004418833556133, 'var_23': 0.5, 'var_24': 0.4997855951009161, 'var_25': 0.5011943413867188, 'var_26': 0.5013870201909277, 'var_27': 0.5033012134189864, 'var_28': 0.5000148703307161, 'var_29': 0.49609791811114484, 'var_30': 0.516004834198219, 'var_31': 0.5283737111160551, 'var_32': 0.5002911863452654, 'var_33': 0.5, 'var_34': 0.5, 'var_35': 0.6628156159398587, 'var_36': 0.5, 'var_37': 0.49632725205209066, 'var_38': 0.515148

In [79]:
def select_by_single_feature_perf_reg(x_train, y_train, print_res=False):
    """Select features whose single-feature regressor scores above an R2 threshold.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        print_res: when True, print the per-feature performance found during fit.

    Returns:
        The result of ``SelectBySingleFeaturePerformance.transform`` on ``x_train``.
    """
    # set up the machine learning model
    rf = RandomForestRegressor(
        n_estimators=10, max_depth=2, random_state=1, n_jobs=4)

    # set up the selector
    sel = SelectBySingleFeaturePerformance(
        variables=None,
        estimator=rf,
        scoring="r2",
        cv=3,
        threshold=0.5)  # performance cut-off handed to the selector

    # Find predictive features. Fit on the argument, not the notebook-global
    # X_train, so the function works for any frame that is passed in.
    sel.fit(x_train, y_train)

    if print_res:
        print(sel.feature_performance_)

    return sel.transform(x_train)

In [81]:
def select_by_target_mean_perf_reg(x_train, y_train, print_res=False, scoring="roc_auc_score"):
    """Select features by the performance of a target-mean encoding of each feature.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: target aligned with ``x_train``.
        print_res: when True, print the per-feature performance found during fit.
        scoring: metric name handed to ``SelectByTargetMeanPerformance``. The
            default keeps the previously hard-coded ``"roc_auc_score"``.
            NOTE(review): despite the ``_reg`` suffix, the default is a
            classification metric; accepted names depend on the installed
            Feature-engine version - confirm before passing a regression metric.

    Returns:
        The result of ``SelectByTargetMeanPerformance.transform`` on ``x_train``.
    """
    # Feature-engine automates the selection for both
    # categorical and numerical variables
    sel = SelectByTargetMeanPerformance(
        variables=None,  # automatically finds categorical and numerical variables
        scoring=scoring,  # the metric to evaluate performance
        threshold=0.6,  # the threshold for feature selection
        bins=3,  # the number of intervals to discretise the numerical variables
        strategy="equal_frequency",  # intervals of equal size or equal number of observations
        cv=2,  # cross validation
        random_state=1,  # seed for reproducibility
    )

    # Fit on the argument, not the notebook-global X_train.
    sel.fit(x_train, y_train)

    if print_res:
        print(sel.feature_performance_)

    return sel.transform(x_train)

## 2. Wrapper methods

In [85]:
!pip install mlxtend



In [87]:
def step_forward_selection_clf(x_train, y_train, k = 10, print_res=False):
    """Step-forward feature selection for classification, scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        k: number of features to select; larger values take longer to run.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the selected columns.
    """
    # review to increase the n_estimators
    forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)
    selector = SFS(
        forest,
        k_features=k,
        forward=True,      # add features one at a time
        floating=False,    # see the mlxtend docs for details on this parameter
        verbose=2,         # how much intermediate progress to print
        scoring='roc_auc',
        cv=2,
    )

    # mlxtend is fed a plain array, so results come back as positional indices
    selector = selector.fit(np.array(x_train), y_train)
    chosen_positions = list(selector.k_feature_idx_)
    chosen_columns = x_train.columns[chosen_positions]

    if print_res:
        print(chosen_columns)

    return x_train[chosen_columns]

In [88]:
def step_forward_selection_reg(x_train, y_train, k = 10, print_res=False):
    """Step-forward feature selection for regression, scored by R2.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        k: number of features to select. This is now honoured; the value was
            previously ignored in favour of a hard-coded 20.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the selected columns.
    """
    # review to increase the n_estimators
    sfs = SFS(RandomForestRegressor(n_estimators=10, n_jobs=4, random_state=10),
           k_features=k,  # the more features we want, the longer it will take to run
           forward=True,
           floating=False,
           verbose=2,
           scoring='r2',
           cv=2)

    # mlxtend is fed a plain array, so results come back as positional indices
    sfs = sfs.fit(np.array(x_train), y_train)
    selected_feat = x_train.columns[list(sfs.k_feature_idx_)]
    if print_res:
        print(selected_feat)

    return x_train[selected_feat]

In [89]:
def step_backward_selection_clf(x_train, y_train, k = 10, print_res=False):
    """Step-backward feature selection for classification, scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        k: number of features to keep.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the selected columns.
    """
    # review to increase the n_estimators
    forest = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0)
    selector = SFS(
        forest,
        k_features=k,
        forward=False,     # backward selection
        floating=False,    # see the mlxtend docs for details on this parameter
        verbose=2,         # how much intermediate progress to print
        scoring='roc_auc',
        cv=2,
    )

    # mlxtend is fed a plain array, so results come back as positional indices
    selector = selector.fit(np.array(x_train), y_train)
    chosen_positions = list(selector.k_feature_idx_)
    chosen_columns = x_train.columns[chosen_positions]

    if print_res:
        print(chosen_columns)

    return x_train[chosen_columns]

In [90]:
def step_backward_selection_reg(x_train, y_train, k = 10, print_res=False):
    """Step-backward feature selection for regression, scored by R2.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        k: number of features to keep. This is now honoured; the value was
            previously ignored in favour of a hard-coded 20.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the selected columns.
    """
    # review to increase the n_estimators
    sfs = SFS(RandomForestRegressor(n_estimators=10, n_jobs=4, random_state=10),
           k_features=k,
           forward=False,  # backward selection
           floating=False,
           verbose=2,
           scoring='r2',
           cv=2)

    # mlxtend is fed a plain array, so results come back as positional indices
    sfs = sfs.fit(np.array(x_train), y_train)
    selected_feat = x_train.columns[list(sfs.k_feature_idx_)]
    if print_res:
        print(selected_feat)

    return x_train[selected_feat]

In [92]:
def exhaustive_selection_clf(x_train, y_train, min_features=1, max_features=2, print_res=False):
    """Exhaustive feature-subset search for classification, scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        min_features: smallest subset size passed to the selector.
        max_features: largest subset size passed to the selector.
        print_res: when True, print the names of the best subset's columns.

    Returns:
        ``x_train`` restricted to the columns of the best subset.
    """
    # review to increase the n_estimators
    forest = RandomForestClassifier(n_estimators=5, n_jobs=4, random_state=0, max_depth=2)
    searcher = EFS(
        forest,
        min_features=min_features,
        max_features=max_features,
        scoring='roc_auc',
        print_progress=True,
        cv=2,
    )

    # search features; the array input means results are positional indices
    searcher = searcher.fit(np.array(x_train), y_train)
    best_positions = list(searcher.best_idx_)
    best_columns = x_train.columns[best_positions]

    if print_res:
        print(best_columns)

    return x_train[best_columns]

In [93]:
def exhaustive_selection_reg(x_train, y_train, min_features=1, max_features=2, print_res=False):
    """Exhaustive feature-subset search for regression, scored by R2.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        min_features: smallest subset size passed to the selector.
        max_features: largest subset size passed to the selector.
        print_res: when True, print the names of the best subset's columns.

    Returns:
        ``x_train`` restricted to the columns of the best subset.
    """
    # review to increase the n_estimators
    forest = RandomForestRegressor(n_estimators=5, n_jobs=4, random_state=0, max_depth=2)
    searcher = EFS(
        forest,
        min_features=min_features,
        max_features=max_features,
        scoring='r2',
        print_progress=True,
        cv=2,
    )

    # search features; the array input means results are positional indices
    searcher = searcher.fit(np.array(x_train), y_train)
    best_positions = list(searcher.best_idx_)
    best_columns = x_train.columns[best_positions]

    if print_res:
        print(best_columns)

    return x_train[best_columns]

## 3. Embedded methods

In [104]:
def log_reg_selection(x_train, y_train, print_res=False):
    """Embedded selection driven by an L2-penalised logistic regression.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the columns kept by ``SelectFromModel``.
    """
    logit = LogisticRegression(C=1000, penalty='l2', max_iter=300, random_state=10)
    selector = SelectFromModel(logit)
    selector.fit(x_train, y_train)

    keep_mask = selector.get_support()
    kept_columns = x_train.columns[keep_mask]

    if print_res:
        print(kept_columns)

    return x_train[kept_columns]

In [105]:
def lin_reg_selection(x_train, y_train, print_res=False):
    """Embedded selection driven by an ordinary linear regression.

    ``LinearRegression`` is imported from ``sklearn.linear_model`` in the
    notebook's import cell; without that import this function raises NameError.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.

    Returns:
        ``x_train`` restricted to the columns kept by ``SelectFromModel``.
    """
    sel = SelectFromModel(LinearRegression())

    sel.fit(x_train, y_train)

    # boolean mask over the input columns -> names of the kept columns
    selected_feat = x_train.columns[sel.get_support()]

    if print_res:
        print(selected_feat)

    return x_train[selected_feat]

In [106]:
def log_reg_lasso_selection(x_train, y_train, print_res=True):
    """Embedded selection driven by an L1-penalised (lasso) logistic regression.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        print_res: when True (the default for this helper), print the names of
            the selected columns.

    Returns:
        ``x_train`` restricted to the columns kept by ``SelectFromModel``.
    """
    lasso_logit = LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10)
    selector = SelectFromModel(lasso_logit)
    selector.fit(x_train, y_train)

    keep_mask = selector.get_support()
    kept_columns = x_train.columns[keep_mask]

    if print_res:
        print(kept_columns)

    return x_train[kept_columns]

In [107]:
# Logistic-regression based selection on the full training set; prints the
# shape before and after and rebinds `df` to the result.
print(X_train.shape)
df = log_reg_selection(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_6', 'var_14', 'var_21', 'var_29', 'var_30', 'var_34', 'var_35',
       'var_46', 'var_47', 'var_55', 'var_60', 'var_64', 'var_75', 'var_76',
       'var_77', 'var_79', 'var_86', 'var_98', 'var_108', 'var_111', 'var_124',
       'var_125', 'var_136', 'var_142', 'var_147', 'var_151', 'var_154',
       'var_183', 'var_213', 'var_216', 'var_217', 'var_221', 'var_222',
       'var_231', 'var_253', 'var_257', 'var_263', 'var_300'],
      dtype='object')
(35000, 38)


In [108]:
# Lasso logistic-regression selection on the full training set; prints the
# shape before and after and rebinds `df` to the result.
print(X_train.shape)
df = log_reg_lasso_selection(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_4', 'var_5', 'var_8', 'var_13', 'var_14', 'var_15', 'var_17',
       'var_18', 'var_21', 'var_22',
       ...
       'var_271', 'var_277', 'var_279', 'var_280', 'var_286', 'var_288',
       'var_292', 'var_296', 'var_299', 'var_300'],
      dtype='object', length=117)
(35000, 117)


In [11]:
def random_forest_selection_clf(x_train, y_train, print_res=False):
    """Embedded selection driven by a random forest classifier.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectFromModel.transform`` on ``x_train``.
    """
    forest = RandomForestClassifier(n_estimators=10, random_state=10)
    selector = SelectFromModel(forest)
    selector.fit(x_train, y_train)

    # boolean mask over the input columns -> names of the kept columns
    keep_mask = selector.get_support()
    kept_columns = x_train.columns[keep_mask]
    if print_res:
        print(kept_columns)

    return selector.transform(x_train)

In [12]:
# Random-forest based selection on the full training set; prints the shape
# before and after and rebinds `df` to the result.
print(X_train.shape)
df = random_forest_selection_clf(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_4', 'var_15', 'var_17', 'var_21', 'var_29', 'var_35', 'var_46',
       'var_49', 'var_50', 'var_55', 'var_57', 'var_74', 'var_75', 'var_76',
       'var_110', 'var_131', 'var_132', 'var_140', 'var_145', 'var_152',
       'var_157', 'var_161', 'var_166', 'var_173', 'var_185', 'var_190',
       'var_203', 'var_207', 'var_220', 'var_222', 'var_231', 'var_255',
       'var_261', 'var_262', 'var_272'],
      dtype='object')
(35000, 35)


In [5]:
def random_forest_selection_reg(x_train, y_train, print_res=False):
    """Embedded selection driven by a random forest regressor.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.

    Returns:
        The output of ``SelectFromModel.transform`` on ``x_train``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=10)
    selector = SelectFromModel(forest)
    selector.fit(x_train, y_train)

    # boolean mask over the input columns -> names of the kept columns
    keep_mask = selector.get_support()
    kept_columns = x_train.columns[keep_mask]
    if print_res:
        print(kept_columns)

    return selector.transform(x_train)

*The methods below handle correlated features well*

In [13]:
def random_forest_selection_clf_rfe(x_train, y_train, print_res=False, n_features_to_select=27):
    """Recursive feature elimination (scikit-learn ``RFE``) with a random forest classifier.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.
        n_features_to_select: number of features RFE should keep. Defaults to
            27, the value that was previously hard-coded.

    Returns:
        The output of ``RFE.transform`` on ``x_train``.
    """
    sel = RFE(
        RandomForestClassifier(n_estimators=10, random_state=10),
        n_features_to_select=n_features_to_select,
    )

    sel.fit(x_train, y_train)
    # boolean mask over the input columns -> names of the kept columns
    selected_feat = x_train.columns[sel.get_support()]
    if print_res:
        print(selected_feat)
    return sel.transform(x_train)

In [14]:
def random_forest_selection_reg_rfe(x_train, y_train, print_res=False, n_features_to_select=27):
    """Recursive feature elimination (scikit-learn ``RFE``) with a random forest regressor.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        print_res: when True, print the names of the selected columns.
        n_features_to_select: number of features RFE should keep. Defaults to
            27, the value that was previously hard-coded.

    Returns:
        The output of ``RFE.transform`` on ``x_train``.
    """
    sel = RFE(
        RandomForestRegressor(n_estimators=10, random_state=10),
        n_features_to_select=n_features_to_select,
    )

    sel.fit(x_train, y_train)
    # boolean mask over the input columns -> names of the kept columns
    selected_feat = x_train.columns[sel.get_support()]
    if print_res:
        print(selected_feat)
    return sel.transform(x_train)

In [17]:
# RFE with a random forest on the full training set; prints the shape before
# and after and rebinds `df` to the result.
print(X_train.shape)
df = random_forest_selection_clf_rfe(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_4', 'var_17', 'var_21', 'var_29', 'var_31', 'var_35', 'var_46',
       'var_49', 'var_50', 'var_55', 'var_74', 'var_75', 'var_76', 'var_91',
       'var_93', 'var_110', 'var_145', 'var_157', 'var_161', 'var_173',
       'var_185', 'var_190', 'var_203', 'var_207', 'var_222', 'var_231',
       'var_266'],
      dtype='object')
(35000, 27)


## 4. Hybrid methods

**Feature Shuffling**

In [21]:
# create a classifier; it can be any classifier, choosing RF as a good default.
# This module-level `rf` is used as the default `model` of
# feature_shuffling_selection_clf below.
rf = RandomForestClassifier(
    n_estimators=50, max_depth=2, random_state=2909, n_jobs=4)

In [22]:
def feature_shuffling_selection_clf(x_train, y_train, model=rf, print_res=False):
    """Select features by shuffling (Feature-engine ``SelectByShuffling``), scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``rf`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``SelectByShuffling.transform`` on ``x_train``.
    """
    sel = SelectByShuffling(
        variables=None,  # automatically examine all numerical variables
        estimator=model,  # the ML model
        scoring='roc_auc',  # the metric to evaluate
        threshold=0,  # the maximum performance drop allowed to select the feature
        cv=3,  # cross validation
        random_state=1  # seed
    )

    # Fit on the argument, not the notebook-global X_train, so the function
    # works for any frame that is passed in.
    sel.fit(x_train, y_train)

    df = sel.transform(x_train)

    if print_res:
        print(df.columns)

    return df

In [23]:
# Feature-shuffling selection on the full training set with the default model;
# prints the shape before and after and rebinds `df` to the result.
print(X_train.shape)
df = feature_shuffling_selection_clf(X_train, y_train, print_res=True)
print(df.shape)

(35000, 300)
Index(['var_4', 'var_21', 'var_30', 'var_35', 'var_46', 'var_55', 'var_75',
       'var_76', 'var_82', 'var_107', 'var_110', 'var_132', 'var_152',
       'var_161', 'var_203', 'var_205', 'var_222', 'var_230', 'var_231',
       'var_262'],
      dtype='object')
(35000, 20)


In [25]:
# here again we can choose any other regressor
# This module-level `rf_reg` is used as the default `model` of
# feature_shuffling_selection_reg below.
rf_reg = RandomForestRegressor(n_estimators=100,
                           max_depth=3,
                           random_state=2909,
                           n_jobs=4)

In [26]:
def feature_shuffling_selection_reg(x_train, y_train, model=rf_reg, print_res=False):
    """Select features by shuffling (Feature-engine ``SelectByShuffling``), scored by negative RMSE.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``rf_reg`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``SelectByShuffling.transform`` on ``x_train``.
    """
    sel = SelectByShuffling(
        variables=None,  # automatically examine all numerical variables
        estimator=model,  # the ML model
        scoring='neg_root_mean_squared_error',  # the metric to evaluate
        threshold=None,  # no explicit cut-off; the selector picks its own
        cv=3,  # cross validation
        random_state=1  # seed
    )

    # Fit on the argument, not the notebook-global X_train, so the function
    # works for any frame that is passed in.
    sel.fit(x_train, y_train)

    df = sel.transform(x_train)

    if print_res:
        print(df.columns)

    return df

**Recursive Feature Elimination**

In [30]:
# the ML model for which we want to select features
# NOTE: the global name `model` is rebound several times further down; each
# function keeps the value that was bound when its `def` was executed.

model = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=10,
)

In [31]:
def rfe_selection_clf(x_train, y_train, model = model,print_res=False):
    """Recursive feature elimination (Feature-engine) for classification, scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``model`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``RecursiveFeatureElimination.transform`` on ``x_train``.
    """
    # Configure the RFE selector
    selector = RecursiveFeatureElimination(
        estimator=model,     # the ML model
        scoring='roc_auc',   # the metric we want to evaluate
        threshold=0.0005,    # the maximum performance drop allowed to remove a feature
        cv=2,                # cross-validation
        variables=None,      # automatically evaluate all numerical variables
    )

    # This can be slow: many models are built with cross-validation.
    selector.fit(x_train, y_train)
    reduced = selector.transform(x_train)

    if print_res:
        print(reduced.columns)

    return reduced

In [32]:
# build initial model using all the features
# Rebinds the global `model`; functions defined before this cell keep the
# default that was bound when their `def` ran.
model = GradientBoostingRegressor(n_estimators=10, max_depth=4, random_state=10)

In [33]:
def rfe_selection_reg(x_train, y_train, model = model, print_res=False):
    """Recursive feature elimination (Feature-engine) for regression, scored by R2.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``model`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``RecursiveFeatureElimination.transform`` on ``x_train``.
    """
    # Setup the RFE selector
    sel = RecursiveFeatureElimination(
        variables=None,  # automatically evaluate all numerical variables
        estimator=model,  # the ML model
        scoring='r2',  # the metric we want to evaluate
        threshold=0.001,  # the maximum performance drop allowed to remove a feature
        cv=3,  # cross-validation
    )

    # This may take quite a while, because we are building a lot of models
    # with cross-validation. Fit on the argument, not the notebook-global X_train.
    sel.fit(x_train, y_train)

    df = sel.transform(x_train)

    if print_res:
        print(df.columns)

    return df

**Recursive Feature Addition**

In [35]:
# the ML model for which we want to select features
# Rebinds the global `model` again; it becomes the default `model` of the
# function defined in the next cell.
model = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=2,
    random_state=10,
)

In [38]:
def rfa_selection_clf(x_train, y_train, model=model, print_res=False):
    """Recursive feature addition (Feature-engine) for classification, scored by ROC-AUC.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: classification target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``model`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``RecursiveFeatureAddition.transform`` on ``x_train``.
    """
    # Setup the RFA selector
    rfa = RecursiveFeatureAddition(
        variables=None,  # automatically evaluate all numerical variables
        estimator=model,  # the ML model
        scoring='roc_auc',  # the metric we want to evaluate
        threshold=0.0001,  # the minimum performance increase needed to select a feature
        cv=2,  # cross-validation
    )

    # Fit on the argument, not the notebook-global X_train, so the function
    # works for any frame that is passed in.
    rfa.fit(x_train, y_train)
    df = rfa.transform(x_train)
    if print_res:
        print(df.columns)

    return df

In [39]:
# the model for which we want to select features
# Rebinds the global `model`; it becomes the default `model` of the function
# defined in the next cell.
model = GradientBoostingRegressor(
    n_estimators=10, max_depth=4, random_state=10)

In [40]:
def rfa_selection_reg(x_train, y_train, model=model, print_res=False):
    """Recursive feature addition (Feature-engine) for regression, scored by R2.

    Args:
        x_train: pandas DataFrame of training features.
        y_train: regression target aligned with ``x_train``.
        model: estimator handed to the selector. The default is the
            module-level ``model`` as it existed when this function was defined.
        print_res: when True, print the columns of the reduced frame.

    Returns:
        The result of ``RecursiveFeatureAddition.transform`` on ``x_train``.
    """
    # Setup the RFA selector
    rfa = RecursiveFeatureAddition(
        variables=None,  # automatically evaluate all numerical variables
        estimator=model,  # the ML model
        scoring='r2',  # the metric we want to evaluate
        threshold=0.001,  # the minimum performance increase needed to select a feature
        cv=2,  # cross-validation
    )

    # Fit on the argument, not the notebook-global X_train, so the function
    # works for any frame that is passed in.
    rfa.fit(x_train, y_train)
    df = rfa.transform(x_train)
    if print_res:
        print(df.columns)

    return df