In [None]:
import numpy as np
import pandas as pd
import cudf

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

%matplotlib inline
import plotly.express as px
import matplotlib.pyplot as plt

# Load Data

In [None]:
%%time
train = cudf.read_csv('../input/tabular-playground-series-oct-2021/train.csv', index_col=0)
test = cudf.read_csv('../input/tabular-playground-series-oct-2021/test.csv', index_col=0)

sample_submission = cudf.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv").to_pandas()

memory_usage = train.memory_usage(deep=True) / 1024 ** 2
start_mem = memory_usage.sum()

In [None]:
feature_cols = test.columns.tolist()

cnt_features = []
cat_features = []

# Floating-point columns are treated as continuous, everything else as
# categorical. The check matches any float dtype instead of 'float64' only:
# after this cell has run once the floats are float32, and an exact-match
# test would push them into the uint8 cast below on a re-run.
for col in feature_cols:
    if pd.api.types.is_float_dtype(train[col].dtype):
        cnt_features.append(col)
    else:
        cat_features.append(col)

# Downcast to narrower dtypes to reduce memory.
# NOTE(review): uint8 assumes every non-float feature only holds values in
# 0..255 — confirm against the data.
train[cnt_features] = train[cnt_features].astype('float32')
train[cat_features] = train[cat_features].astype('uint8')

test[cnt_features] = test[cnt_features].astype('float32')
test[cat_features] = test[cat_features].astype('uint8')

memory_usage = train.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()
# `start_mem` comes from the data-loading cell; report the effect of the downcast.
print(f"Train memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB")

# Convert to pandas for the rest of the notebook. The guard skips frames that
# no longer expose `to_pandas` (i.e. were already converted by an earlier run).
if hasattr(train, "to_pandas"):
    train = train.to_pandas()
if hasattr(test, "to_pandas"):
    test = test.to_pandas()

# Classic Feature Importance

The easiest way to **determine the magnitude of importance of each feature** is to remove each feature in turn and then train a model to see how much accuracy drops without that feature. A big shortcoming of this approach is that you have to train a separate model for each feature, which is a resource- and time-consuming task. 

However, I have provided the results of training a simple **XGBoost** model (with default parameter values) with each feature removed in turn. You can find the results [here](https://www.kaggle.com/kavehshahhosseini/tpsoctclassicfeatureimportance).

The results may differ from those of other methods such as Shapley values, permutation importance, the model's built-in feature importance, etc.

I've provided sample code showing how it was done. You can see it by clicking on "Show Hidden Cell".

In [None]:
# %%time
# # Sample code: train a simple XGBoost model with each feature removed, one at a time.
# x_train, x_valid, y_train, y_valid = train_test_split(train[feature_cols], train["target"], test_size=0.2, random_state=42)
# scores = {}
# feature_cols.insert(0,"all")

# for col in feature_cols:
#     feat = feature_cols.copy()
#     feat.remove(col)
#     if "all" in feat:
#         feat.remove("all")
#     x_t = x_train[feat]
#     x_v = x_valid[feat]

#     xgb_params = {
#         'eval_metric': 'auc',
#         'objective': 'binary:logistic', 
#         'tree_method': 'gpu_hist', 
#         'predictor': 'gpu_predictor', 
#         'seed': 42, 
#         'use_label_encoder': False,
#     }
    
#     xgb_model = XGBClassifier(**xgb_params)
#     xgb_model.fit(x_t, y_train, eval_set=[(x_v, y_valid)], verbose=False)
    
#     preds_valid = xgb_model.predict_proba(x_v)[:,1]
#     auc = roc_auc_score(y_valid, preds_valid)
#     print(f"{col},{auc}", end="\t")
#     scores.update({col:auc})
    
# df = pd.Series(scores, name="xgb_scores")
# df.to_csv("xgboost.csv", index_label="feature")
# print("AVG AUC:",np.mean(df.values))

In [None]:
# Load the precomputed drop-one-feature scores and key them by feature name.
raw_scores = pd.read_csv("../input/tpsoctclassicfeatureimportance/xgboost.csv")
fi = raw_scores.set_index("feature")

# Score stored under the "all" row (presumably the model trained with every
# feature kept — confirm against how the CSV was produced).
baseline_score = fi.loc["all", "xgb_scores"]

# Importance = how far the score falls below the baseline when the feature is removed.
fi["importance"] = baseline_score - fi["xgb_scores"]

# Largest importance first.
fi = fi.sort_values("importance", ascending=False)
fi.head(10)

# Results
* We can see that some features have a **negative effect** on the model and some of them have no effect.  

In [None]:
# Bar chart with one bar per row of `fi`: feature name on x, importance on y.
fig = px.bar(fi, x=fi.index, y="importance")
fig.update_layout(
    title="Feature Importance",
    xaxis_title="Features",
    yaxis_title="Importance",
    # `categoryorder` applies to categorical axes only, so it is set on the
    # feature (x) axis; on the numeric importance axis it had no effect.
    xaxis_categoryorder="total descending",
    colorway=["blue"],
)
fig.show()

Which features made accuracy worse?

In [None]:
# Index labels of the rows whose importance is below zero.
neg_features = fi.loc[fi["importance"] < 0].index
print(neg_features.to_list())

In [None]:
# Keep only the rows with importance below zero; filter once and plot that
# frame directly instead of passing the full frame plus two filtered arrays.
neg_fi = fi[fi["importance"] < 0]

fig = px.bar(neg_fi, x=neg_fi.index, y="importance")
fig.update_layout(
    title="Feature Importance",
    xaxis_title="Features",
    yaxis_title="Importance",
    # `categoryorder` applies to categorical axes only, so it is set on the
    # feature (x) axis; on the numeric importance axis it had no effect.
    xaxis_categoryorder="total descending",
    colorway=["blue"],
)
fig.show()

Which features made accuracy better?

In [None]:
# Index labels of the rows whose importance is above zero.
pos_features = fi.loc[fi["importance"] > 0].index
print(pos_features.to_list())

In [None]:
# Keep only the rows with importance above zero; filter once and plot that
# frame directly instead of passing the full frame plus two filtered arrays.
pos_fi = fi[fi["importance"] > 0]

fig = px.bar(pos_fi, x=pos_fi.index, y="importance")
fig.update_layout(
    title="Feature Importance",
    xaxis_title="Features",
    yaxis_title="Importance",
    # `categoryorder` applies to categorical axes only, so it is set on the
    # feature (x) axis; on the numeric importance axis it had no effect.
    xaxis_categoryorder="total descending",
    colorway=["blue"],
)
fig.show()

Note that the order of importance may remain the same across different models, but the magnitude may change. For example, with a good model configuration, all features that had a negative effect may end up with a positive effect, but their impact will still be smaller than that of the positive features. 