A popular trick is to add a random feature to the data and see which features end up with higher feature importance than it and which with lower. The results in this competition are quite interesting, however.

In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Number of folds used by the cross-validation loop.
N_SPLITS = 5


def seed(seed=42):
    """Seed NumPy's global RNG and store the same value in PYTHONHASHSEED.

    Args:
        seed: Value passed to ``np.random.seed`` and written, as a string,
            to the ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))


# Apply the default seed once.
seed()

In [None]:
# Load the competition tables; the 'id' column is dropped from train and test.
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
train = train.drop(columns=['id'])

test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
test = test.drop(columns=['id'])

sample_sub = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

In [None]:
# Split the training frame into the feature matrix and the target column.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
# Replace the target column of `train` with integer codes from a LabelEncoder.
# NOTE(review): `X` and `y` were taken from `train` in an earlier cell, and
# `assign` builds a new frame, so they presumably still hold the original
# labels - confirm that this is intended.
le = LabelEncoder()
encoded_target = le.fit_transform(train['target'])
train = train.assign(target=encoded_target)
train.head()

In [None]:
def train_and_eval_lgb(model_fn):
    """Cross-validate a classifier and average its test-set predictions.

    Runs a stratified, shuffled ``N_SPLITS``-fold split over the module-level
    ``X`` / ``y``. For every fold a fresh model from ``model_fn`` is fitted
    with the held-out part as evaluation set, and the fold's log loss is
    printed. The overall out-of-fold log loss is printed at the end.

    Args:
        model_fn: Zero-argument callable returning a new, unfitted model that
            exposes ``fit``, ``predict_proba``, ``feature_name_`` and
            ``feature_importances_``.

    Returns:
        A ``(test_preds, feature_importance)`` tuple. ``test_preds`` is the
        mean over folds of ``predict_proba`` on the module-level ``test``
        frame. ``feature_importance`` is a DataFrame with the columns
        ``feature`` and ``importance``, one row per feature per fold.
    """
    # Size the prediction buffers from the data rather than hard-coding the
    # class count; rows follow X because the fold indices come from X.
    n_classes = y.nunique()
    oof = np.zeros((len(X), n_classes))
    test_preds = np.zeros((len(test), n_classes))
    fold_importances = []

    cv = StratifiedKFold(N_SPLITS, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = model_fn()
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit keywords
        # are reported as removed in LightGBM 4.0 in favour of callbacks -
        # confirm against the installed version before upgrading.
        model = model.fit(
            X.iloc[train_idx],
            y.iloc[train_idx],
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=100,
            verbose=250,
        )

        # Each row belongs to exactly one validation fold, so plain
        # assignment fills the out-of-fold matrix.
        tmp_oof = model.predict_proba(X_val.values)
        oof[val_idx] = tmp_oof
        test_preds += model.predict_proba(test.values) / N_SPLITS

        fold_importances.append(pd.DataFrame({
            'feature': model.feature_name_,
            'importance': model.feature_importances_,
        }))
        print(f'fold {fold + 1} logloss = {log_loss(y_val, tmp_oof)}')

    print(f'oof logloss = {log_loss(y.values, oof)}')

    # DataFrame.append was removed in pandas 2.0; concatenate the per-fold
    # frames once instead of growing a frame inside the loop.
    feature_importance = pd.concat(fold_importances)
    return test_preds, feature_importance

# Original features

In [None]:
# Hyper-parameters forwarded as keyword arguments to LGBMClassifier.
params = {
    'num_iterations': 20_000,
    'learning_rate': 0.05,
    'max_depth': 10,
    'num_leaves' : 63,
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    # NOTE(review): no bagging fraction/frequency is set here - confirm this
    # seed has any effect.
    'bagging_seed': 42,
    'boosting_type': 'gbdt',
    # NOTE(review): LightGBM documents `is_unbalance` as used only by the
    # binary and multiclassova objectives - confirm it matters here.
    'is_unbalance': True
}


def model_fn():
    """Return a new, unfitted LGBMClassifier configured with ``params``."""
    return LGBMClassifier(**params)

In [None]:
# Cross-validate on the original features and collect per-fold importances.
test_preds, feature_importace = train_and_eval_lgb(model_fn)

# Order features by their mean importance across folds, highest first.
order = (
    feature_importace
    .groupby('feature')
    .mean()
    .sort_values('importance', ascending=False)
    .index
    .tolist()
)

# Bar plot of importance per feature, using the order computed above.
plt.figure(figsize=(16,8))
p = sns.barplot(data=feature_importace, x='feature', y='importance', order=order)
plt.title("LGM Classifier importance")
plt.tight_layout()
_ = p.set_xticklabels(p.get_xticklabels(), rotation=45)

# Add random feature

In [None]:
# Append a column of uniform random numbers to both feature frames.
# X is filled first, then test, so the draws are consumed in that order.
X['random'] = np.random.random(size=len(X))
test['random'] = np.random.random(size=len(test))

In [None]:
# Repeat the cross-validation now that X and test carry the 'random' column.
test_preds, feature_importace = train_and_eval_lgb(model_fn)

# Rank every feature, the random one included, by mean importance over folds.
order = (
    feature_importace
    .groupby('feature')
    .mean()
    .sort_values('importance', ascending=False)
    .index
    .tolist()
)

# Same bar plot as for the original features.
plt.figure(figsize=(16,8))
p = sns.barplot(data=feature_importace, x='feature', y='importance', order=order)
plt.title("LGM Classifier importance")
plt.tight_layout()
_ = p.set_xticklabels(p.get_xticklabels(), rotation=45)