In [None]:
!pip install lofo-importance

<a id='top'></a>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/26480/logos/header.png?t=2021-04-09-00-57-05)

<div class="list-group" id="list-tab" role="tablist">
<h3 data-toggle="list"  role="tab" aria-controls="home"><p style="font-size : 30px"><font color="darkgrey">Content</font></p></h3>
    
1. [<font color="darkgrey"><p style="font-size : 18px"> Absolute Correlations</p></font>](#1)   
2. [<font color="darkgrey"><p style="font-size : 18px"> LOFO Importances</p></font>](#2)  
3. [<font color="darkgrey"><p style="font-size : 18px"> SHAP Values</p></font>](#3) 

In [None]:
import numpy as np 
import pandas as pd 

import plotly.express as px
import plotly.graph_objects as go
from lofo import LOFOImportance, Dataset, plot_importance
import shap 
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split as split
from catboost import CatBoostClassifier, Pool

import warnings
warnings.filterwarnings("ignore")

In [None]:
class NBConfig:
    """Central place for every tunable setting used in this notebook.

    The three dictionaries are read as class attributes (no instance is
    created): ``general`` holds the random seed, ``cv`` the fold settings and
    ``model`` the keyword values forwarded to the CatBoost classifier.
    """

    # Seed shared by the K-fold splitter and the train/validation split.
    general = dict(seed=123)

    # Number of folds (also determines the hold-out fraction, 1 / folds)
    # and whether rows are shuffled before splitting.
    cv = dict(folds=5, shuffle=True)

    # Values handed to CatBoostClassifier; "n_jobs" is passed on as
    # ``thread_count`` when the model is built.
    model = dict(
        iterations=1500,
        verbose=True,
        loss_function="CrossEntropy",
        rsm=0.55,
        subsample=0.75,
        use_best_model=True,
        metric_period=250,
        eval_metric="AUC",
        n_jobs=4,
    )

In [None]:
# Load the competition files; the "id" column becomes the index of each frame
# so it is never treated as a feature.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv", index_col="id")
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv", index_col="id")
samSub = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv", index_col="id")

<font size="+3" color="grey"><b>1. Absolute Correlations </b></font><br><a id="1"></a>
<a href="#top" class="btn-xs btn-danger" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go back to the TOP</a>

<p style="font-size : 16px">
Absolute correlations help us discover important bivariate relationships between variables. Note that a correlation coefficient only captures pairwise, linear relationships. You can zoom in by dragging with the left mouse button, and use the buttons above the plot to mark the 10% largest correlations.
</p>

In [None]:
# Absolute pairwise correlations between all columns of `train`.
correlations = train.corr().abs()

# The matrix is symmetric with a diagonal of 1.0, so keep only the strict
# lower triangle (row index > column index); everything else becomes NaN and
# is left blank in the heatmap.
correlations = correlations.where(
    np.tril(np.ones(correlations.shape, dtype=bool), k=-1)
)

# Threshold for the "10% largest correlations" button. It is computed AFTER
# masking so that the trivial self-correlations on the diagonal do not inflate
# the percentile; NaN cells are ignored by nanpercentile.
p = np.round(np.nanpercentile(correlations.to_numpy(), 90), decimals=3)

# Trace 0: the heatmap itself.
cor = px.imshow(
    correlations,
    color_continuous_scale='cividis'
)

# Trace 1: a contour line at level `p`, hidden until the "Mark" button is used.
cor.add_trace(
    go.Contour(
        z=correlations,
        showscale=False,
        contours=dict(
            start=p,
            end=1,
            size=100,  # step larger than (end - start), so only the level `p` is drawn
            coloring='lines',
            operation="="
        ),
        line_width=2,
        visible=False
    )
)

cor.update_traces(
    hovertemplate=" feature 1: %{x} <br> feature 2: %{y} <br> correlation: %{z}"
)

cor.layout.coloraxis.showscale = False

# Two buttons toggling the visibility of [heatmap, contour].
cor.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=[
                dict(
                    args=[{"visible": [True, False]}],
                    label="Clean",
                    method="update"
                ),
                dict(
                    args=[{"visible": [True, True]}],
                    label="Mark >= " + str(p),
                    method="update"
                )
            ]
        )
    ],
    height=900, width=900,
    title="Absolute Correlations",
    template="simple_white"
)

cor.show()

<font size="+3" color="grey"><b>2. LOFO Importances </b></font><br><a id="2"></a>
<a href="#top" class="btn-xs btn-danger" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go back to the TOP</a>

<p style="font-size : 16px">
In order to uncover nonlinear relationships, LOFO (Leave One Feature Out) importance is particularly useful. The model is evaluated repeatedly, each time with one feature omitted, and the resulting scores are compared with the score obtained using all features.
</p>

In [None]:
# Name of the label column and the list of feature columns (every column whose
# name does not contain the label name).
Y = "target"
X = [column for column in train.columns if "target" not in column]

# LOFO wrapper bundling the frame with its target and feature names.
dataset = Dataset(df=train, target=Y, features=X)

In [None]:
# K-fold splitter used by LOFO; fold count, shuffling and seed all come from
# NBConfig so the splits are reproducible.
CV = KFold(
    random_state=NBConfig.general["seed"],
    shuffle=NBConfig.cv["shuffle"],
    n_splits=NBConfig.cv["folds"],
)

<p style="font-size : 16px">
We use the default model, which is LightGBM (LGBM).
</p>

In [None]:
# LOFO importance estimator: no model is passed, so the library default is
# used; features are scored by ROC AUC on the folds produced by `CV`.
Imp = LOFOImportance(dataset, cv=CV, scoring="roc_auc")

In [None]:
# Compute the LOFO importance table for the features of `dataset`, using the
# CV splitter and scorer configured on `Imp`.
imp_result = Imp.get_importance()

In [None]:
# Plot at most the first 101 rows of the importance table; the tall figure
# keeps one readable bar per feature.
plot_importance(imp_result.head(101), figsize=(10, 60), kind="default")

<p style="font-size : 16px">
As can be seen, the correlations and LOFO importance are similar. This suggests that the most important relationships are at least approximately linear. 
</p>

In [None]:
# Persist the importance table (without the DataFrame index) for later reuse.
imp_result.to_csv("importances.csv", index=False)

<font size="+3" color="grey"><b>3. SHAP Values </b></font><br><a id="3"></a>
<a href="#top" class="btn-xs btn-danger" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go back to the TOP</a>

<p style="font-size : 16px">
SHAP values use the so-called "power set" of the features to calculate the marginal contribution of each feature. The calculation starts from the mean value, which is taken as the baseline. Features are then added one at a time until all possible combinations have been evaluated. With 100 features this amounts to 2^100 combinations, which is why an approximation is used in practice.
</p>

In [None]:
# Hold out one fold's worth of rows (1 / folds) for validation.
# Upper-case names are the training part, lower-case names the validation part.
# Note: this rebinds X and Y, which earlier held the feature-name list and the
# target column name.
X, x, Y, y = split(
    train.drop(columns=["target"]),
    train.target,
    shuffle=NBConfig.cv["shuffle"],
    random_state=NBConfig.general["seed"],
    test_size=1/NBConfig.cv["folds"],
)

<p style="font-size : 16px">
Due to the very well chosen default parameters we use CatBoost.
</p>

In [None]:
# CatBoost classifier configured entirely from NBConfig.
# `random_seed` is set from the shared notebook seed so the model uses the same
# explicit seed as the CV splitter and the train/validation split instead of
# relying on the library default.
CatBoost = CatBoostClassifier(
    iterations=NBConfig.model["iterations"],
    loss_function=NBConfig.model["loss_function"],
    eval_metric=NBConfig.model["eval_metric"],
    rsm=NBConfig.model["rsm"],
    subsample=NBConfig.model["subsample"],
    # Requires an eval_set to be supplied when fitting.
    use_best_model=NBConfig.model["use_best_model"],
    metric_period=NBConfig.model["metric_period"],
    verbose=NBConfig.model["verbose"],
    thread_count=NBConfig.model["n_jobs"],
    random_seed=NBConfig.general["seed"],
)

In [None]:
# Train on the training part and evaluate on the held-out rows.
CatBoost.fit(X, Y, eval_set=Pool(x, y))

In [None]:
# The model is tuned and evaluated on AUC, which ranks observations by score,
# so export the positive-class probability (second column of predict_proba)
# instead of the hard class labels returned by predict().
samSub["target"] = CatBoost.predict_proba(test)[:, 1]

In [None]:
# Write the submission file; the "id" index set when the sample submission was
# loaded is written as the first column.
samSub.to_csv("submission.csv")

In [None]:
%%time
explainer = shap.TreeExplainer(CatBoost)
shap_values = explainer(X)
pd.DataFrame(
    shap_values.values, 
    columns=train.columns[:-1]
).to_csv("SHAP.csv", index=False)

<p style="font-size : 16px">
The following plot shows the marginal contributions calculated for each observation. Red means that the value of the feature is high, while for blue it is low. The position of each dot shows how strongly the feature influences the prediction.
</p>

In [None]:
# summary_plot resizes the current figure itself (plot_size defaults to
# "auto"), which overrides a size given to plt.figure(). Pass the intended
# 10 x 60 inch size through `plot_size` so it actually takes effect.
plt.figure()
shap.summary_plot(
    shap_values,
    X,
    max_display=100,
    plot_size=(10, 60)
)

<p style="font-size : 16px">
The next plot, a so-called waterfall plot, also starts from the average. The marginal contributions (given the average of the respective feature) are then added up step by step: the plot starts without any features and includes them one by one until all features are accounted for. Note that the feature values are shown in gray to the left of the feature names.
</p>

In [None]:
# Waterfall plot for the first explained observation, limited to 25 rows.
shap.plots.waterfall(shap_values[0], max_display=25)