## I. Filter Methods
1. Constant Features
1. Quasi-Constant Features
1. Duplicate Features
1. Correlation
1. Chi Square
1. ANOVA
1. Single Feature Model performance metrics
1. Target Mean Encoding

## II. Wrapper Methods
1. Step Forward
1. Step Backward
1. Exhaustive

## III. Embedded Methods
1. Regression Coefficients
1. Regularization
1. Trees

In [None]:
!pip install fast_ml --upgrade

In [None]:
from fast_ml.utilities import display_all
from fast_ml import eda
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 1000)

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

SEED = 2021

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
df = pd.read_csv('/kaggle/input/dataset-1/dataset_1.csv')
print('Shape of Dataframe : ', df.shape)
df.head()

In [None]:
df.memory_usage()/1024

In [None]:
display_all(eda.df_info(df))

In [None]:
df[['var_1','var_20']].info()

# 1. Constant Features

In [None]:
# A column is constant when it holds exactly one distinct value (NaN counts as a value).
distinct_counts = df.nunique(dropna=False)
constant_features = distinct_counts[distinct_counts == 1].index.to_list()
print(len(constant_features))
print(constant_features)

In [None]:
print (df['var_23'].nunique(dropna=False))
print (df['var_23'].unique())

In [None]:
print (df['var_294'].nunique(dropna=False))
print (df['var_294'].unique())

In [None]:
print (df['var_167'].nunique(dropna=False))
print (df['var_167'].unique())

# 2. Quasi-Constant Features

In [None]:
# Flag every column whose most frequent value (NaN included) covers at
# least `threshold` of the rows.
threshold = .99
quasi_constant_feats = [
    col
    for col in df.columns
    if df[col].value_counts(normalize=True, dropna=False).iloc[0] >= threshold
]

In [None]:
print(len(quasi_constant_feats))
print(quasi_constant_feats)

In [None]:
def get_constant_features(df, threshold=0.99, dropna=False):
    '''
    For a given dataframe, identify the constant and quasi constant features.

    For each column the share of its most frequent value is computed with
    ``value_counts(normalize=True, dropna=dropna)``:
      * share == 1          -> reported as 'Constant'
      * share >  threshold  -> reported as 'Quasi Constant'
    A column with no counted values at all (e.g. all NaN while dropna=True)
    is reported as 'Constant' with a NaN value instead of raising.

    To get all the constant & quasi constant features in a list - constant_features_df['Var'].to_list()

    Parameters:
    -----------
        df: 'dataframe'
        threshold: 'float'. default = 0.99
        dropna: 'bool'. default = False

    Returns:
    --------
        constant_features_df: 'dataframe'
            Columns 'Desc', 'Var', 'Value', 'Perc' (Perc in percent),
            sorted by 'Perc' in descending order.
    '''
    # Collect rows in a plain list and build the frame once; appending with
    # .loc row by row re-allocates the frame on every insert.
    rows = []
    for var in df.columns:
        shares = df[var].value_counts(normalize=True, dropna=dropna)

        if shares.empty:
            # Nothing was counted, so there is no "most frequent value" to index.
            rows.append(['Constant', var, float('nan'), 100.0])
            continue

        # value_counts sorts by frequency, so position 0 is the dominant value.
        value = shares.index[0]
        perc = shares.iloc[0]

        if perc == 1:
            rows.append(['Constant', var, value, 100 * perc])
        elif perc > threshold:
            rows.append(['Quasi Constant', var, value, 100 * perc])

    constant_features_df = pd.DataFrame(rows, columns=['Desc', 'Var', 'Value', 'Perc'])

    # Stable sort keeps the original column order among equal percentages.
    constant_features_df = constant_features_df.sort_values(
        by='Perc', ascending=False, ignore_index=True, kind='stable'
    )

    return constant_features_df

In [None]:
constant_features = get_constant_features(df, threshold=0.99, dropna=False)
constant_features

In [None]:
print(constant_features['Var'].to_list())

In [None]:
# Scripted version of the constant / quasi-constant scan: one row is added
# per flagged column, keyed by the column's position in the dataframe.
threshold = 0.99
constant_features = []
constant_features_df = pd.DataFrame(columns=['Desc', 'Var', 'Value', 'Perc'])
all_vars = list(df.columns)

for i, var in enumerate(all_vars):
    s = df[var].value_counts(normalize=True, dropna=False)
    # value_counts is frequency-sorted: position 0 is the dominant value.
    value, perc = s.index[0], s.iloc[0]

    if perc == 1:
        label = 'Constant'
    elif perc > threshold:
        label = 'Quasi Constant'
    else:
        continue

    constant_features_df.loc[i] = [label, var, value, 100*perc]

constant_features_df = constant_features_df.sort_values(by='Perc', ascending=False, ignore_index=True)


# 3. Duplicate Features

In [None]:
l1 = [1, 2, 3, 4, 5, 6, 'a']
l2 = list(l1)

# Print each unordered pair exactly once: after an element of l1 has been
# compared against everything still in l2, it is removed from l2 so later
# elements are never compared back against it.
for x in l1:
    for y in l2:
        if x != y:
            print(x, '---', y)
    l2.remove(x)

In [None]:
df['var_1'].drop_duplicates()

In [None]:
df['var_6'].drop_duplicates()

In [None]:
df['var_7'].drop_duplicates()

In [None]:
df[['var_6', 'var_7']].drop_duplicates()

In [None]:
# Scripted duplicate scan. Three results are built side by side:
#   duplicate_features_df - one row per detected pair and the kind of match
#   duplicate_features_   - flat list of columns that duplicate an earlier one
#   duplicate_pairs_      - {first column: [columns identical to it]}
duplicate_features_df = pd.DataFrame(columns = ['Desc', 'feature1', 'feature2'])
duplicate_features_ = []
duplicate_pairs_ = {}

ix = 0
for i, v1 in enumerate(df.columns):
    duplicate_feat = []
    # A column already known to duplicate an earlier one is not re-scanned.
    if v1 in duplicate_features_:
        continue

    for v2 in df.columns[i+1:]:
        # Different distinct-value counts rule out both kinds of match.
        if df[v1].nunique() != df[v2].nunique():
            continue

        if df[v1].equals(df[v2]):
            duplicate_features_.append(v2)
            duplicate_feat.append(v2)
            duplicate_features_df.loc[ix] = ['Duplicate Values', v1, v2]
            ix += 1
        elif df[[v1, v2]].drop_duplicates().shape[0] == df[v1].nunique():
            # As many distinct (v1, v2) pairs as distinct v1 values.
            duplicate_features_df.loc[ix] = ['Duplicate Index', v1, v2]
            ix += 1

    if duplicate_feat:
        duplicate_pairs_[v1] = duplicate_feat
        

In [None]:
def get_duplicate_features(df):
    '''
    For a given dataframe, identify the duplicate features.

    Two kinds of duplicates are reported for every pair of columns that have
    the same number of distinct values:
      * 'Duplicate Values' - the two columns are equal (``Series.equals``)
      * 'Duplicate Index'  - the columns differ, but the number of distinct
        (feature1, feature2) combinations equals the number of distinct
        feature1 values

    A column already reported as a 'Duplicate Values' copy of an earlier
    column is not used as ``feature1`` again.

    To get the duplicated columns in a list - duplicate_features_df['feature2'].to_list()

    Parameters:
    -----------
        df: 'dataframe'

    Returns:
    --------
        duplicate_features_df: 'dataframe'
            Columns 'Desc', 'feature1', 'feature2', sorted by 'Desc' in
            descending order ('Duplicate Values' rows first).

    '''
    columns = list(df.columns)
    # nunique is loop-invariant per column: compute it once, not per pair.
    n_unique = {col: df[col].nunique() for col in columns}

    exact_copies = set()
    rows = []
    for i, v1 in enumerate(columns):
        if v1 in exact_copies:
            continue

        for v2 in columns[i+1:]:
            if n_unique[v1] != n_unique[v2]:
                continue

            # This check for duplicate values
            if df[v1].equals(df[v2]):
                exact_copies.add(v2)
                rows.append(['Duplicate Values', v1, v2])

            # This check for duplicate index
            elif df[[v1, v2]].drop_duplicates().shape[0] == n_unique[v1]:
                rows.append(['Duplicate Index', v1, v2])

    duplicate_features_df = pd.DataFrame(rows, columns=['Desc', 'feature1', 'feature2'])
    # Stable sort keeps discovery order within each kind of duplicate.
    duplicate_features_df = duplicate_features_df.sort_values(
        by='Desc', ascending=False, ignore_index=True, kind='stable'
    )

    return duplicate_features_df

In [None]:
get_duplicate_features(df)

In [None]:
df['var_2'].equals(df['var_234'])

In [None]:
df[['var_2', 'var_234']].drop_duplicates()

In [None]:
df[['var_2', 'var_234']]

In [None]:
duplicate_features_df

In [None]:
print(duplicate_features_)

In [None]:
print(duplicate_pairs_)

In [None]:
[item for sub_list in list(duplicate_pairs_.values()) for item in sub_list]

In [None]:
duplicate_df

In [None]:
def get_duplicate_features(df):
    '''
    Return the columns that are exact copies of an earlier column.

    Columns are scanned left to right; each one is compared (``Series.equals``)
    with every column to its right, and the matching right-hand columns are
    collected in discovery order. A column already collected is not scanned.
    '''
    columns = list(df.columns)
    found = []

    for pos, base in enumerate(columns):
        if base in found:
            continue
        found.extend(
            other for other in columns[pos + 1:] if df[base].equals(df[other])
        )

    return found

def get_duplicate_pairs(df):
    '''
    Map each column to the list of later columns that are exact copies of it.

    Only columns with at least one copy appear as keys, and a column that is
    itself a copy of an earlier one is never used as a key.

    To get list of duplicate features from this dictionary run this command
    [item for sub_list in list(duplicate_pairs_.values()) for item in sub_list]

    '''
    columns = list(df.columns)
    copies_seen = set()
    pairs = {}

    for pos, base in enumerate(columns):
        if base in copies_seen:
            continue
        twins = [other for other in columns[pos + 1:] if df[base].equals(df[other])]
        if twins:
            pairs[base] = twins
            copies_seen.update(twins)

    return pairs


In [None]:
duplicate_df.duplicate_features.to_list()

In [None]:
# Build one row per (feature, list of its duplicates). Passing the dict itself
# together with `columns=` makes pandas look for dict keys named 'feature1' /
# 'duplicate_features', which do not exist, and the result is an empty frame.
pd.DataFrame(list(duplicate_pairs_.items()), columns=['feature1', 'duplicate_features'])

In [None]:
def get_duplicate_features(df):
    

# 4. Correlated Features
Find groups of correlated features (for example, groups of 3 or 4 features that are all correlated with the same feature).

In [None]:
# Pairwise correlations in long format: one row per (feature1, feature2).
corr_matrix = df.corr()
df_corr = corr_matrix.unstack().reset_index()
df_corr.columns = ['feature1', 'feature2', 'corr']
df_corr['abs_corr'] = df_corr['corr'].abs()

print('original corr dataframe Shape', df_corr.shape)

corr_thresh = 0.8
# Removing correlation below the threshold
is_strong = df_corr['abs_corr'] >= corr_thresh

# Removing correlations within the same features, and cases where v1 was
# compared with v2 and then later v2 compared with v1: keep a pair only when
# feature1 comes before feature2 in the column order. (Dropping mirrored rows
# in a nested loop can remove BOTH orderings of a pair when three or more
# features are mutually correlated.)
col_position = {name: pos for pos, name in enumerate(corr_matrix.columns)}
is_first_ordering = df_corr['feature1'].map(col_position) < df_corr['feature2'].map(col_position)

df_corr = df_corr[is_strong & is_first_ordering].copy()

# Creating correlation groups: a new group starts at the first row of each feature1
df_corr['corr_group'] = (df_corr.groupby(by='feature1').cumcount()==0).astype('int')
df_corr['corr_group'] = df_corr['corr_group'].cumsum()

# Formating changes (rows are already ordered by corr_group)
df_corr = df_corr.reset_index(drop=True)
df_corr = df_corr[[ 'corr_group', 'feature1', 'feature2', 'corr', 'abs_corr']]
print('Final corr dataframe Shape', df_corr.shape)

In [None]:
def get_correlated_pairs(df, threshold=0.9):
    '''
    List the pairs of features whose absolute correlation reaches a threshold.

    The correlation matrix from ``df.corr()`` is flattened to one row per
    pair. Each unordered pair is reported once, with ``feature1`` being the
    feature that comes first in the column order; self-correlations are
    excluded. Pairs sharing the same ``feature1`` get the same ``corr_group``
    number (1, 2, ... in column order).

    Parameters:
    -----------
        df: 'dataframe'
        threshold: 'float'. default = 0.9
            Minimum absolute correlation for a pair to be reported.

    Returns:
    --------
        df_corr: 'dataframe'
            Columns 'corr_group', 'feature1', 'feature2', 'corr', 'abs_corr'.
    '''
    corr_matrix = df.corr()

    df_corr = corr_matrix.unstack().reset_index()
    df_corr.columns = ['feature1', 'feature2', 'corr']
    df_corr['abs_corr'] = df_corr['corr'].abs()

    # Removing correlation below the threshold
    is_strong = df_corr['abs_corr'] >= threshold

    # Keep only the ordering where feature1 precedes feature2 in the column
    # order. This removes self-correlations and mirrored (v2, v1) rows in one
    # step. Dropping mirrored rows in a nested loop can delete both orderings
    # of a pair when three or more features are mutually correlated.
    col_position = {name: pos for pos, name in enumerate(corr_matrix.columns)}
    is_first_ordering = (
        df_corr['feature1'].map(col_position) < df_corr['feature2'].map(col_position)
    )

    # .copy() so the column assignments below act on an independent frame.
    df_corr = df_corr[is_strong & is_first_ordering].copy()

    # Creating correlation groups: a new group starts at the first row of each feature1
    df_corr['corr_group'] = (df_corr.groupby(by='feature1').cumcount()==0).astype('int')
    df_corr['corr_group'] = df_corr['corr_group'].cumsum()

    # Formating changes (rows are already ordered by corr_group)
    df_corr = df_corr.reset_index(drop=True)
    df_corr = df_corr[[ 'corr_group', 'feature1', 'feature2', 'corr', 'abs_corr']]

    return df_corr

In [None]:
df_corr.corr_group.unique()

In [None]:
df_corr.corr_group.value_counts()

In [None]:
df_corr.query('corr_group==4')

## 5. Mutual Information

The higher the mutual information (MI) value, the higher the importance of the feature.

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
X_train = df.drop(columns='target').copy()
y_train = df['target']

In [None]:
mi = mutual_info_classif(X_train.fillna(0), y_train, random_state=2021)

In [None]:
mi

In [None]:
mi = pd.Series(mi)
mi

In [None]:
mi.index = X_train.columns
mi = mi.sort_values(ascending=False)
mi

In [None]:
pd.DataFrame(mi, columns = ['mi_value'])

## 6. Chi Square

The smaller the p-value, the higher the importance of the feature.

In [None]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
train.shape

In [None]:
train.head()

In [None]:
from sklearn.feature_selection import chi2

In [None]:
train.Sex.value_counts()

In [None]:
train.Embarked.value_counts()

In [None]:
train['Sex'] = train['Sex'].map({'male': 1, 'female':0})
train['Embarked'] = train['Embarked'].map({'S': 0, 'C':1, 'Q':2})

In [None]:
cat_vars = ['Pclass', 'Sex', 'SibSp','Parch', 'Embarked']

In [None]:
X_train = train[cat_vars]
y_train = train['Survived']

In [None]:
chi_score = chi2(X_train.fillna(99), y_train)
chi_score

In [None]:
d = {'chi_score': chi_score[0], 'p_value': chi_score[1]}
pd.DataFrame(data =d, index=X_train.columns)

## 7. ANOVA

In [None]:
from sklearn.feature_selection import f_classif

In [None]:
X_train = df.drop(columns='target').copy()
y_train = df['target']

In [None]:
anova_score = f_classif(X_train.fillna(0), y_train)
anova_score

In [None]:
d = {'anova_score': anova_score[0], 'p_value': anova_score[1]}
pd.DataFrame(data =d, index=X_train.columns)

## 8. Univariate Model Performance Metrics

In [None]:
X_train = df.drop(columns='target').copy()
y_train = df['target']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Fit a one-feature decision tree per column and record its ROC AUC.
# NOTE(review): the score is computed on the same rows the tree was fitted on,
# so it is presumably optimistic for high-cardinality features - confirm
# whether a held-out split is wanted here.
roc_values = []
for feat in X_train.columns:
    X = X_train[[feat]].fillna(0)
    model = DecisionTreeClassifier().fit(X, y_train)
    roc_values.append(roc_auc_score(y_train, model.predict_proba(X)[:, 1]))
len(roc_values)

In [None]:
roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
roc_values.sort_values(ascending=False, inplace=True)
roc_values

In [None]:
roc_values[roc_values>0.51]

## 9. Target Mean Encoding

In [None]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
print(train.shape)

test = pd.read_csv('/kaggle/input/titanic/test.csv')
print(test.shape)

In [None]:
train.head()

In [None]:
cat_vars = ['Pclass', 'Sex', 'SibSp','Parch', 'Embarked']

In [None]:
from sklearn.model_selection import train_test_split

train.fillna('Missing', inplace=True)
df_train, df_test = train_test_split(train, train_size=0.8, random_state=2021)

y_train = df_train['Survived'].copy()
X_train = df_train.drop(columns = 'Survived').copy()

y_test = df_test['Survived'].copy()
X_test = df_test.drop(columns = 'Survived').copy()

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Replace every category by the mean of the target observed for it in the
# training split. A category that occurs only in the held-out split has no
# entry in the mapper and would become NaN (which roc_auc_score rejects),
# so such values fall back to the overall training target mean.
prior = df_train['Survived'].mean()

for v in cat_vars:

    mapper = df_train.groupby(by=v)['Survived'].mean().to_dict()

    X_train[v] = X_train[v].map(mapper)
    X_test[v] = X_test[v].map(mapper).fillna(prior)

In [None]:
X_test.head()

In [None]:
# The target-mean encoded column is used directly as the score: ROC AUC of
# each encoded feature against the held-out target.
roc_values = pd.Series(
    [roc_auc_score(y_test, X_test[v]) for v in cat_vars],
    index=cat_vars,
)
roc_values.sort_values(ascending=False)

# II. Wrapper Methods

## 1. Step Forward

http://rasbt.github.io/mlxtend/user_guide/feature_selection/SequentialFeatureSelector/#example-1-a-simple-sequential-forward-selection-example

In [None]:
X_train = df.drop(columns='target').copy()
y_train = df['target']

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Step-forward selection: the selector is configured with forward=True and
# floating=False, targets 10 features (k_features), and scores candidates by
# ROC AUC with 3-fold cross-validation. verbose=2 prints progress.
sfs1 = SFS(RandomForestClassifier(n_jobs=-1),
           k_features=10,
           forward=True,
           floating=False,
           verbose=2,
           scoring='roc_auc',
           cv=3
          )

# Missing values are replaced by 0 before fitting.
sfs1.fit(X_train.fillna(0), y_train)

## 2. Backward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Step-backward selection: forward=False with floating=True, targeting 10
# features (k_features) and scoring candidates by ROC AUC with 3-fold
# cross-validation. verbose=2 prints progress.
# (A stray `ExhaustiveFeatureSelector` token pasted into the argument list
# made this cell a syntax error; it has been removed.)
sfs2 = SFS(RandomForestClassifier(),
           k_features=10,
           forward=False,
           floating=True,
           verbose=2,
           scoring='roc_auc',
           cv=3
          )

# Missing values are replaced by 0 before fitting.
sfs2.fit(X_train.fillna(0), y_train)

## 3. Exhaustive Search 

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Exhaustive search configured for feature subsets of size 10 to 20
# (min_features / max_features), scored by ROC AUC with 2-fold
# cross-validation, using all cores (n_jobs=-1).
# NOTE(review): the number of subsets in that size range presumably grows
# combinatorially with the column count - confirm this is tractable for
# X_train before running.
efs = EFS(RandomForestClassifier(),
          min_features=10,
          max_features=20, 
          print_progress=True, 
          scoring ='roc_auc',
          cv=2,
          n_jobs=-1
         )

# Missing values are replaced by 0 before fitting.
efs.fit(X_train.fillna(0), y_train)

# III. Embedded Methods

# IV. Hybrid Methods

## 1. Feature Shuffling

In [None]:
df = pd.read_csv('/kaggle/input/dataset-1/dataset_2.csv')
print(df.shape)
df.head()

In [None]:
# Separate the target, replace missing values with 0 and hold out a test split.
y = df['target'].copy()
X = df.drop(columns='target').fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

# Baseline model whose ROC AUC is the reference for the shuffling experiment.
model = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=SEED)
model.fit(X_train, y_train)
print(model)

# ROC AUC on the training rows
y_train_pred = model.predict_proba(X_train)[:, 1]
train_roc = roc_auc_score(y_train, y_train_pred)
print('Train ROC Score:', train_roc)

# ROC AUC on the held-out rows
y_test_pred = model.predict_proba(X_test)[:, 1]
test_roc = roc_auc_score(y_test, y_test_pred)
print('Test ROC Score:', test_roc)

Shuffle the features one at a time and recompute the ROC AUC score after each shuffle.

In [None]:
X_train_c = X_train.copy()
print(X_train_c['var_1'])

# Assign the shuffled values positionally. X_train keeps the row labels of the
# original dataframe after train_test_split, so assigning a Series re-indexed
# to 0..n-1 would be aligned by label: rows whose label is >= n would become
# NaN and the column would not be a permutation of itself.
X_train_c['var_1'] = X_train_c['var_1'].sample(frac=1, random_state=SEED).to_numpy()
print(X_train_c['var_1'])

In [None]:
X_train_c['var_1'].isna().sum()

In [None]:
# For each feature: shuffle that single column (values reassigned by position,
# every other column untouched), re-score the already fitted model on the
# training rows and record how much the ROC AUC falls below train_roc.
performance_shift = []
for f in X_train.columns:
    X_train_c = X_train.copy(deep=True)
    X_train_c[f] = X_train[f].sample(frac=1, random_state=SEED).to_numpy()

    shuffle_roc = roc_auc_score(y_train, model.predict_proba(X_train_c)[:, 1])
    performance_shift.append(train_roc - shuffle_roc)

In [None]:
performance_shift = pd.Series(performance_shift)
performance_shift.index = X_train.columns
performance_shift.sort_values(ascending=False, inplace=True)
performance_shift

## 2. Recursive Feature Elimination

In [None]:
df = pd.read_csv('/kaggle/input/dataset-1/dataset_2.csv')
print('Dataframe shape', df.shape)

y=df['target'].copy()
X = df.drop(columns = 'target').copy()
X.fillna(0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)



In [None]:
def recursive_feature_elimination(model, X_train, y_train, X_valid, y_valid):
    '''
    Repeatedly refit the model, dropping the least important feature each round.

    Round 0 uses every column of X_train. After each fit the feature with the
    lowest ``model.feature_importances_`` is scheduled for removal, and the
    next round is fitted without it. This continues until one feature is left,
    so the result has one row per column of X_train.

    The model passed in is refitted in place on every round.

    Parameters:
    -----------
        model: estimator with ``fit``, ``predict_proba`` and ``feature_importances_``
        X_train: 'dataframe'
        y_train: 'series'
        X_valid: 'dataframe' with the same columns as X_train
        y_valid: 'series'

    Returns:
    --------
        rfe_df: 'dataframe'
            One row per round with
            'dropped_feature' - feature removed to reach this round's feature
                                set ('None' for the first round),
            'num_features'    - number of features used in this round,
            'train_roc', 'valid_roc' - ROC AUC on the train / validation data,
            'train_roc_rank', 'valid_roc_rank' - rank of the round (1 = best).
    '''
    rfe_df = pd.DataFrame(columns = ['dropped_feature', 'num_features', 'train_roc', 'valid_roc'])
    features_to_drop = []

    for i in range(len(X_train.columns)):
        # drop() returns a new frame, so the caller's data is left untouched.
        X_train_c = X_train.drop(columns = features_to_drop)
        X_valid_c = X_valid.drop(columns = features_to_drop)

        model.fit(X_train_c, y_train)

        #train
        y_train_pred = model.predict_proba(X_train_c)[:,1]
        train_roc = roc_auc_score(y_train, y_train_pred)

        #validation: score against the y_valid argument, not a global y_test
        y_valid_pred = model.predict_proba(X_valid_c)[:,1]
        valid_roc = roc_auc_score(y_valid, y_valid_pred)

        # Label the round with the feature whose removal produced this feature
        # set. This is read BEFORE the next candidate is appended, otherwise
        # every label is shifted by one round and the first dropped feature is
        # never reported.
        drop_f = features_to_drop[-1] if features_to_drop else 'None'
        rfe_df.loc[i] = [drop_f, len(X_train_c.columns), train_roc, valid_roc]

        # Least important feature of this round is removed before the next one.
        data = {'feature': X_train_c.columns, 'fi': model.feature_importances_}
        fi = pd.DataFrame(data)
        fi.sort_values(by = 'fi', ascending=False, inplace=True)
        features_to_drop.append(list(fi['feature'])[-1])

    print('Done')
    rfe_df['train_roc_rank'] =rfe_df['train_roc'].rank(method='min', ascending=False).astype('int')
    rfe_df['valid_roc_rank'] =rfe_df['valid_roc'].rank(method='min', ascending=False).astype('int')

    return rfe_df

In [None]:
model = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=SEED)
rfe_df = recursive_feature_elimination(model, X_train, y_train, X_test, y_test)

In [None]:
display_all(rfe_df)

In [None]:
# The elimination results live in rfe_df (returned by
# recursive_feature_elimination); `roc_df` is never defined in this notebook.
list(rfe_df['dropped_feature'])[1:97]