# Methods used:

- Variable Importance
- Information Gain
- Zero Proportion
- Mean Absolute Difference (MAD)
- Backward Feature Elimination <span style="color: red;">(off)</span>
- Exhaustive Feature Selection <span style="color: red;">(off)</span>
- Lasso Regularization (L1)
- Recursive Feature Elimination (RFE) 
- Boruta 
- Random Feature

# Load dependencies

In [None]:
import pandas as pd
import numpy as np

from random import uniform

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import VarianceThreshold
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV
from boruta import BorutaPy

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
import optuna.integration.lightgbm as lgb

In [None]:
# Read the three competition CSV files: training set, test set and the
# sample submission that is later used as the output template.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jun-2021/train.csv',
        '../input/tabular-playground-series-jun-2021/test.csv',
        '../input/tabular-playground-series-jun-2021/sample_submission.csv',
    )
)

In [None]:
# Encode the string labels "Class_1" ... "Class_9" as the integers 0 ... 8.
conditions = [train.target == f"Class_{k}" for k in range(1, 10)]
choices = list(range(9))

# np.select() falls back to its default (0) for rows that match none of the
# conditions. An unexpected label -- or re-running this cell once the column
# is already numeric -- would therefore silently turn those rows into class 0.
# Check that every row matches a condition *before* overwriting the column.
if not np.any(conditions, axis=0).all():
    raise ValueError(
        "train.target contains values other than 'Class_1'..'Class_9' "
        "(has this cell already been run?)"
    )
train["target"] = np.select(conditions, choices)

# Separate the label from the model inputs; the 'id' column is dropped from
# both the training and the test features.
y = train['target']
X = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

In [None]:
# Hold out 15% of the rows as a validation set; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.15,
)

In [None]:
# Seeded 30% row sample of X, kept for quick development runs (it is only
# referenced by the commented-out "dev" fit calls of the slower selectors).
X_sample = X.sample(random_state=314, frac=0.3)

# Final LightGBM 

See the accompanying LightGBMTunerCV notebook: https://www.kaggle.com/gomes555/tps-jun2021-lightgbmtunercv?scriptVersionId=64532708

In [None]:
# LightGBM configuration for the 9-class baseline model.
# NOTE(review): the regularisation / tree-shape values look like the output of
# the tuning notebook linked in the markdown cell above -- confirm.
final_params = dict(
    # task definition
    objective='multiclass',
    num_class=9,
    metric='multi_logloss',
    verbosity=-1,
    boosting_type='gbdt',
    # optimisation
    learning_rate=0.03,
    random_state=314,
    feature_pre_filter=False,
    # regularisation and tree shape
    lambda_l1=5.620817633003194,
    lambda_l2=1.4316945406619173e-08,
    num_leaves=19,
    feature_fraction=0.4,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=5,
    # training length
    num_iterations=1000,
    early_stopping_round=100,
)

# Scikit-learn style classifier built from the parameters above.
final_lgb = LGBMClassifier(**final_params)

In [None]:
%%time

# 5-fold cross-validation of the baseline model on the training split.
# Every fold is fitted with early stopping monitored on (X_val, y_val).
lgb_cv = cross_val_score(
    final_lgb,
    X_train,
    y_train,
    scoring='neg_log_loss',
    cv=5,
    n_jobs=-1,
    error_score='raise',
    fit_params=dict(
        eval_set=(X_val, y_val),
        early_stopping_rounds=final_params['early_stopping_round'],
        verbose=0,
        eval_metric="multi_logloss",
    ),
)

# The scorer returns the negated log loss, so the mean is sign-flipped here.
print('lgb with all features Mean:', -lgb_cv.mean())
print('lgb with all features Std:', lgb_cv.std())

# Variable Importance

In [None]:
%%time

# Fit the baseline model whose feature importances are tabulated in the next
# cell.
# Train on the training split only: X_val is a subset of X, so fitting on X
# while early-stopping on (X_val, y_val) would evaluate the stopping criterion
# on rows the model has already been trained on (an in-sample, leaked score).
final_lgb.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=final_params['early_stopping_round'],
    verbose=0,
    eval_metric="multi_logloss",
);

In [None]:
# One row per feature with the importance reported by the fitted LightGBM model.
res_var_imp = pd.DataFrame(
    dict(
        feature=final_lgb.feature_name_,
        var_imp=final_lgb.feature_importances_,
    )
)

# Most important features first (displayed as the cell output).
res_var_imp.sort_values(by='var_imp', ascending=False)

# Information Gain

In [None]:
%%time

# Estimated mutual information between every feature and the target.
# The estimator is randomised unless seeded; use the notebook's usual seed
# (314) so the scores -- and the ranking built from them -- are repeatable.
mutual_info = mutual_info_classif(X, y, random_state=314)

In [None]:
# One row per feature with its mutual-information score.
res_mutual_info = pd.DataFrame(
    dict(
        feature=X.columns,
        mutual_info=mutual_info,
    )
)

# Highest scores first (displayed as the cell output).
res_mutual_info.sort_values(by='mutual_info', ascending=False)

# Zero proportion

In [None]:
# Percentage of rows in which each feature equals zero.
res_pzeros = pd.DataFrame(
    dict(
        feature=X.columns,
        pzeros=X.eq(0).sum(axis=0) / len(X) * 100,
    )
)

# Sparsest features first (displayed as the cell output).
res_pzeros.sort_values(by='pzeros', ascending=False)

# Mean Absolute Difference (MAD)

In [None]:
# Calculate the MAD: for every feature, the average absolute distance of its
# values from that feature's own mean.
mean_abs_diff = (X - X.mean(axis=0)).abs().sum(axis=0) / len(X)

In [None]:
# One row per feature with its mean absolute difference.
res_mad = pd.DataFrame(
    dict(
        feature=X.columns,
        mad=mean_abs_diff,
    )
)
# Largest spread first (displayed as the cell output).
res_mad.sort_values(by='mad', ascending=False)

# Backward Feature Elimination

In [None]:
#%%time
#lasso_newton = LogisticRegression(C=1, penalty="l2", solver='sag', tol = 0.1, random_state=314)
#bfs=SequentialFeatureSelector(lasso_newton,
#                              k_features='best',
#                              forward=False,
#                              floating=False, 
#                              scoring='neg_log_loss',
#                              cv=0,
#                              verbose=2,
#                              n_jobs=1)
#bfs.fit(X, y);

In [None]:
#res_bfs = pd.DataFrame({
#    "feature": X.columns,
#    "bfs": np.where(X.columns.isin(bfs.k_feature_names_), "to_keep", "to_remove")
#})
#res_bfs.sort_values('bfs', ascending=True)

# Exhaustive Feature Selection

In [None]:
# %%time
# 
# efs = ExhaustiveFeatureSelector(LGBMClassifier(),
#                                 min_features=10,
#                                 max_features=75,
#                                 scoring='neg_log_loss',
#                                 print_progress=True,
#                                 cv=5)
# 
# efs.fit(X, y);

In [None]:
# res_efs = pd.DataFrame({
#     "feature": X.columns,
#     "efs": np.where(X.columns.isin(efs.k_feature_names_), "to_keep", "to_remove")
# })
# res_efs.sort_values('efs', ascending=True)

# Lasso Regularization (L1)

In [None]:
%%time

# L1-penalised logistic regression fitted on all rows.
lasso = LogisticRegression(penalty="l1", solver="liblinear", C=1, random_state=314)
lasso.fit(X, y)

# Wrap the already-fitted model in a selector that uses the median
# importance as its cut-off.
lasso_selector = SelectFromModel(lasso, threshold="median", prefit=True)

In [None]:
# One row per feature, flagged according to the selector's support mask.
res_lasso = pd.DataFrame(
    dict(
        feature=X.columns,
        lasso=np.where(lasso_selector.get_support(), "to_keep", "to_remove"),
    )
)
# "to_keep" rows sort before "to_remove" rows (displayed as the cell output).
res_lasso.sort_values(by='lasso', ascending=True)

# Recursive Feature Elimination (RFE)

In [None]:
%%time

# Cross-validated recursive feature elimination around a shallow random
# forest: one feature is removed per step and at least 20 are always kept.
rf = RandomForestClassifier(max_depth=4, n_jobs=-1)
rfe_selector = RFECV(
    rf,
    step=1,
    min_features_to_select=20,
    verbose=1,
    n_jobs=1,
)
# Development shortcut (fit on the row sample instead of the full data):
# rfe_selector.fit(X_sample.values, y[X_sample.index])
rfe_selector.fit(X.values, y)

In [None]:
# One row per feature, flagged according to the RFECV support mask.
res_rfe = pd.DataFrame(
    dict(
        feature=X.columns,
        rfe=np.where(rfe_selector.support_, "to_keep", "to_remove"),
    )
)
# "to_keep" rows sort before "to_remove" rows (displayed as the cell output).
res_rfe.sort_values(by='rfe', ascending=True)

# Boruta

In [None]:
%%time

# Boruta selection driven by a shallow random forest; the number of trees is
# left to Boruta ('auto') and the selector itself is seeded.
rf = RandomForestClassifier(max_depth=4, n_jobs=-1)
boruta_selector = BorutaPy(
    rf,
    n_estimators='auto',
    random_state=314,
    verbose=2,
)
boruta_selector.fit(X.values, y)
# Development shortcut (fit on the row sample instead of the full data):
# boruta_selector.fit(X_sample.values, y[X_sample.index])

In [None]:
# One row per feature, flagged according to the Boruta support mask.
res_boruta = pd.DataFrame(
    dict(
        feature=X.columns,
        boruta=np.where(boruta_selector.support_, "to_keep", "to_remove"),
    )
)
# "to_keep" rows sort before "to_remove" rows (displayed as the cell output).
res_boruta.sort_values(by='boruta', ascending=True)

# Random Feature

In [None]:
# Copy of X with one extra pure-noise column, "random", drawn uniformly from
# [0, 100); it serves as the importance baseline in the cells below.
# - A seeded NumPy generator makes the column (and the keep/remove verdicts
#   derived from it) repeatable and avoids a Python-level loop over all rows.
# - assign() attaches the values by position, so the result does not depend
#   on X carrying a default 0..n-1 index (pd.concat aligns on the index).
X_random = X.assign(
    random=np.random.default_rng(314).uniform(0.0, 100.0, size=len(X))
)

In [None]:
%%time
# Shallow random forest fitted on the features plus the noise column.
rf = RandomForestClassifier(max_depth=3, n_jobs=-1)
rf.fit(X_random, y);

In [None]:
# Importance the forest gave to the synthetic "random" column; real features
# are kept only if they score strictly higher than this noise baseline.
# The deprecated `np.float` alias was removed in NumPy 1.24, so pick the single
# matching element explicitly and convert it with the built-in float().
varip_random = float(rf.feature_importances_[X_random.columns == "random"][0])
print("Random VarImp:", varip_random)

# One row per column of X_random (including "random" itself) with its
# importance and the resulting keep/remove flag.
res_rand_var_imp = pd.DataFrame({
    "feature": X_random.columns,
    "rand_var_imp": rf.feature_importances_,
    "rand_var": np.where(rf.feature_importances_ > varip_random, "to_keep", "to_remove")
})
# Most important columns first (displayed as the cell output).
res_rand_var_imp.sort_values('rand_var_imp', ascending=False)

# Combine Results

In [None]:
# Join the per-method result tables into one table. merge() is called without
# `on`, so each join uses the columns the two frames share.
feature_selection = (
    res_var_imp
    .merge(res_mutual_info)
    .merge(res_pzeros)
    .merge(res_mad)
    .merge(res_lasso)
    .merge(res_boruta)
    .merge(res_rfe)
    # only the keep/remove flag of the random-column check is carried over
    .merge(res_rand_var_imp.drop(columns='rand_var_imp'))
)

# Persist the combined table next to the notebook.
feature_selection.to_csv('feature_selection.csv', index=False)

In [None]:
# Styled view of the combined table: in-cell bars for the numeric scores, a
# colour gradient for the zero percentage, and a red background on every cell
# whose value is "to_remove".
(
    feature_selection.style
    .bar(subset=['var_imp'], color='#205ff2')
    .bar(subset=['mutual_info'], color='#205ff2')
    .bar(subset=['mad'], color='#205ff2')
    .background_gradient(subset=['pzeros'], cmap='coolwarm')
    .apply(
        lambda row: ["background: red" if cell == "to_remove" else "" for cell in row],
        axis=1,
    )
)

In [None]:
# Numeric suffixes of the features to discard.
# NOTE(review): presumably picked by hand from the combined table -- confirm.
to_drop = [6, 27, 36, 47, 73, 74]

# Names of the remaining feature columns.
fs = X.columns.drop([f'feature_{j}' for j in to_drop])

# Tuner LightGBM with feature selection

In [None]:
%%time

# Datasets restricted to the selected features.
dval = lgb.Dataset(X_val[fs], label=y_val)
dtrain = lgb.Dataset(X_train[fs], label=y_train)

# Fixed part of the configuration; the remaining hyper-parameters are not set.
params = dict(
    objective="multiclass",
    num_class=9,
    metric="multi_logloss",
    verbosity=-1,
    boosting_type="gbdt",
    learning_rate=0.03,
    random_state=314,
)

# `lgb` is optuna.integration.lightgbm (see the import cell), so this is
# Optuna's train() wrapper -- presumably it tunes the unset hyper-parameters.
booster = lgb.train(
    params,
    dtrain,
    valid_sets=dval,
    early_stopping_rounds=100,
    verbose_eval=0,
)

In [None]:
# Display the parameters stored on the trained booster.
booster.params

In [None]:
# NOTE(review): 1.7487904400322265 is presumably a validation log loss recorded
# from an earlier run -- confirm.
# Rebuild the scikit-learn style classifier from the booster's parameters.
final_lgb = LGBMClassifier(**booster.params)

In [None]:
# Display the best score recorded on the booster.
booster.best_score

In [None]:
# Predict on the test rows, restricted to the selected feature columns.
lgb_pred = booster.predict(test.loc[:, fs])

# Submission

In [None]:
# Overwrite every column after the first one in the sample submission with
# the predictions, then write the submission file.
sub.iloc[:, 1:] = lgb_pred
sub.to_csv("sub_lgb_feature_selection.csv", index=False)

# References

- https://www.analyticsvidhya.com/blog/2020/10/feature-selection-techniques-in-machine-learning/
- https://www.machinelearningplus.com/machine-learning/feature-selection/