In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
%%time
# Read the competition's train and test tables.
train_path = '../input/cat-in-the-dat/train.csv'
test_path = '../input/cat-in-the-dat/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Pull the label and the id columns out of the frames; `pop` returns the
# column and removes it from the frame in place, leaving only features.
target = train.pop('target')
train_id = train.pop('id')
test_id = test.pop('id')

In [None]:
# Stack train on top of test so every encoding below is applied to both at once.
# The original row labels are kept, so index values repeat across the two parts.
frames = [train, test]
df = pd.concat(frames, axis=0, sort=False)

In [None]:
# Translate the letter-coded binary columns into 0/1.
bin_dict = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}
for bin_col in ('bin_3', 'bin_4'):
    df[bin_col] = df[bin_col].map(bin_dict)

In [None]:
# One-hot encode the five nominal columns, dropping the first level of each.
one_hot_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']

print(f'Shape before dummy transformation: {df.shape}')
df = pd.get_dummies(
    df,
    columns=one_hot_cols,
    prefix=one_hot_cols,
    drop_first=True,
)
print(f'Shape after dummy transformation: {df.shape}')

In [None]:
from pandas.api.types import CategoricalDtype 

# Ordered categorical dtypes: the position of a level in `categories`
# becomes its integer code.
ord_1 = CategoricalDtype(
    categories=['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    ordered=True,
)
ord_2 = CategoricalDtype(
    categories=['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
    ordered=True,
)
ord_3 = CategoricalDtype(
    categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
                'i', 'j', 'k', 'l', 'm', 'n', 'o'],
    ordered=True,
)
ord_4 = CategoricalDtype(
    categories=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
                'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'],
    ordered=True,
)

# Cast each column to its ordered dtype and keep only the integer codes.
# Values outside the declared categories (and missing values) get code -1.
for ord_col, ord_dtype in (('ord_1', ord_1), ('ord_2', ord_2),
                           ('ord_3', ord_3), ('ord_4', ord_4)):
    df[ord_col] = df[ord_col].astype(ord_dtype).cat.codes

In [None]:
def date_cyc_enc(df, col, max_vals):
    """Add sine/cosine encodings of a cyclic column to ``df``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``df``
    itself (the frame is modified in place) and the same frame is returned.

    Parameters
    ----------
    df : DataFrame containing ``col``.
    col : name of the cyclic column to encode.
    max_vals : length of one full cycle (e.g. 7 for a day-of-week column).
    """
    # Position on the unit circle, in radians.
    angle = 2 * np.pi * df[col] / max_vals
    df[col + '_sin'] = np.sin(angle)
    df[col + '_cos'] = np.cos(angle)
    return df

# Encode day (period 7) and month (period 12) as points on a circle.
for date_col, period in (('day', 7), ('month', 12)):
    df = date_cyc_enc(df, date_col, period)

In [None]:
%%time
from sklearn.preprocessing import LabelEncoder

# Label-encode the remaining string columns into new `le_<name>` columns;
# a fresh encoder is fitted per column on the combined train+test values.
label_encoded_cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_5']
for f in label_encoded_cols:
    lbl = LabelEncoder()
    df[f'le_{f}'] = lbl.fit_transform(df[f])

In [None]:
# The raw string columns are no longer needed once their `le_*` versions exist.
raw_encoded_cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_5']
df.drop(columns=raw_encoded_cols, inplace=True)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/
    float64 are touched; every other column is left as is. The frame is
    modified in place and also returned.

    Integer columns become the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive, so a column
    spanning exactly [-128, 127] is stored as int8). Float columns become
    float16 or float32 when their range fits, otherwise float64.

    NOTE: the float downcast is chosen by range only, so values are rounded
    to the precision of the target dtype (float16 keeps roughly 3 significant
    decimal digits).

    Parameters
    ----------
    df : DataFrame to shrink.
    verbose : when true, print the final memory usage and the reduction.

    Returns
    -------
    The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        is_int = str(col_type)[:3] == 'int'
        if is_int:
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        # First (narrowest) candidate whose inclusive range holds the data.
        new_type = None
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                new_type = candidate
                break

        # Floats that fit neither float16 nor float32 (including columns whose
        # min/max are NaN or infinite) are stored as float64. Integers that
        # match no candidate (e.g. an empty column) are left unchanged.
        if new_type is None and not is_int:
            new_type = np.float64
        if new_type is not None:
            df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the numeric columns and report the memory saved.
df = reduce_mem_usage(df, verbose=True)

# SHAP

In [None]:
#https://slundberg.github.io/shap/notebooks/plots/decision_plot.html (Good !!)

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
from matplotlib import pyplot as plt
from pdpbox import pdp, get_dataset, info_plots
import shap

In [None]:
# Cut the combined, encoded frame back into its train and test parts:
# the first `n_train` rows came from train, the remainder from test.
n_train = train.shape[0]
train = df.iloc[:n_train]
test = df.iloc[n_train:]

train.shape

In [None]:
# Hold out 20% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# LightGBM gradient-boosted classifier
Model = LGBMClassifier(max_depth=10, n_estimators=1000, n_jobs=-1, num_leaves=45, learning_rate=0.01)
Model.fit(X_train, y_train)

# Hard class predictions for the per-class report.
y_pred = Model.predict(X_val)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print(classification_report(y_val, y_pred))

# ROC AUC ranks observations by score, so it needs the predicted probability
# of the positive class rather than the thresholded 0/1 labels.
y_proba = Model.predict_proba(X_val)[:, 1]
print('Roc_Auc is ', roc_auc_score(y_val, y_proba))

* boosting_type (string, optional (default='gbdt')) – ‘gbdt’, traditional Gradient Boosting Decision Tree. ‘dart’, Dropouts meet Multiple Additive Regression Trees. ‘goss’, Gradient-based One-Side Sampling. ‘rf’, Random Forest.
* num_leaves (int, optional (default=31)) – Maximum tree leaves for base learners.
* max_depth (int, optional (default=-1)) – Maximum tree depth for base learners, <=0 means no limit.
* learning_rate (float, optional (default=0.1)) – Boosting learning rate. You can use callbacks parameter of fit method to shrink/adapt learning rate in training using reset_parameter callback. Note, that this will ignore the learning_rate argument in training.
* n_estimators (int, optional (default=100)) – Number of boosted trees to fit.
* subsample_for_bin (int, optional (default=200000)) – Number of samples for constructing bins.
* objective (string, callable or None, optional (default=None)) – Specify the learning task and the corresponding learning objective or a custom objective function to be used (see note below). Default: ‘regression’ for LGBMRegressor, ‘binary’ or ‘multiclass’ for LGBMClassifier, ‘lambdarank’ for LGBMRanker.
* class_weight (dict, 'balanced' or None, optional (default=None)) – Weights associated with classes in the form {class_label: weight}. Use this parameter only for multi-class classification task; for binary classification task you may use is_unbalance or scale_pos_weight parameters. Note, that the usage of all these parameters will result in poor estimates of the individual class probabilities. You may want to consider performing probability calibration (https://scikit-learn.org/stable/modules/calibration.html) of your model. The ‘balanced’ mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). If None, all classes are supposed to have weight one. Note, that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.
* min_split_gain (float, optional (default=0.)) – Minimum loss reduction required to make a further partition on a leaf node of the tree.
* min_child_weight (float, optional (default=1e-3)) – Minimum sum of instance weight (hessian) needed in a child (leaf).
* min_child_samples (int, optional (default=20)) – Minimum number of data needed in a child (leaf).
* subsample (float, optional (default=1.)) – Subsample ratio of the training instance.
* subsample_freq (int, optional (default=0)) – Frequency of subsampling; <=0 means disabled.
* colsample_bytree (float, optional (default=1.)) – Subsample ratio of columns when constructing each tree.
* reg_alpha (float, optional (default=0.)) – L1 regularization term on weights.
* reg_lambda (float, optional (default=0.)) – L2 regularization term on weights.
* random_state (int or None, optional (default=None)) – Random number seed. If None, default seeds in C++ code will be used.
* n_jobs (int, optional (default=-1)) – Number of parallel threads.
* silent (bool, optional (default=True)) – Whether to print messages while running boosting.
* importance_type (string, optional (default='split')) – The type of feature importance to be filled into feature_importances_. If ‘split’, result contains numbers of times the feature is used in a model. If ‘gain’, result contains total gains of splits which use the feature.
* **kwargs –
* Other parameters for the model. Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.

In [None]:
# Permutation importance of every feature, measured on the validation split.
perm = PermutationImportance(Model, random_state=1)
perm.fit(X_val, y_val)
eli5.show_weights(perm, feature_names=list(X_val.columns), top=42)

https://eli5.readthedocs.io/en/latest/autodocs/eli5.html

* As you move down from the top of the table, the importance of the features decreases.
* The features that are shown in green indicate that they have a positive impact on our prediction
* The features that are shown in white indicate that they have no effect on our prediction
* The features shown in red indicate that they have a negative impact on our prediction

In [None]:
explainer = shap.TreeExplainer(Model)

# Depending on the SHAP version, a binary classifier yields either one result
# or a list with one entry per class. When a list comes back, keep the entry
# for the positive class (index 1). The same guard is applied to all three
# outputs so they always refer to the same class.
expected_value = explainer.expected_value
if isinstance(expected_value, list):
    expected_value = expected_value[1]
print(f"Explainer expected value: {expected_value}")

# Explain the first 20 validation observations.
select = range(20)
features = X_val.iloc[select]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    shap_values = explainer.shap_values(features)
    shap_interaction_values = explainer.shap_interaction_values(features)
# Indexing with [1] unconditionally would pick observation row 1 whenever a
# single array is returned, so only index when a per-class list was produced.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
if isinstance(shap_interaction_values, list):
    shap_interaction_values = shap_interaction_values[1]

Refer to the decision plot of the 20 test observations below. _Note: This plot isn't informative by itself; we use it only to illustrate the primary concepts._

* The x-axis represents the model's output. In this case, the units are log odds.
* The plot is centered on the x-axis at explainer.expected_value. All SHAP values are relative to the model's expected value like a linear model's effects are relative to the intercept.
* The y-axis lists the model's features. By default, the features are ordered by descending importance. The importance is calculated over the observations plotted. _This is usually different than the importance ordering for the entire dataset._ In addition to feature importance ordering, the decision plot also supports hierarchical cluster feature ordering and user-defined feature ordering.
* Each observation's prediction is represented by a colored line. At the top of the plot, each line strikes the x-axis at its corresponding observation's predicted value. This value determines the color of the line on a spectrum.
* Moving from the bottom of the plot to the top, SHAP values for each feature are added to the model's base value. This shows how each feature contributes to the overall prediction.
* At the bottom of the plot, the observations converge at explainer.expected_value.

In [None]:
# Decision plot in the model's raw output units.
shap.decision_plot(
    base_value=expected_value,
    shap_values=shap_values,
    features=features,
)

Like the force plot, the decision plot supports link='logit' to transform log odds to probabilities.

In [None]:
# Same plot with the x-axis passed through the logit link.
shap.decision_plot(
    base_value=expected_value,
    shap_values=shap_values,
    features=features,
    link='logit',
)

Observations can be highlighted using a dotted line style. Here, we highlight a misclassified observation.

In [None]:
# Rebuild each observation's log odds from its SHAP values plus the base value,
# and classify with a naive cutoff at zero log odds (probability 0.5).
log_odds = shap_values.sum(1) + expected_value
y_pred = log_odds > 0
# Flag the plotted observations whose predicted class differs from the label.
misclassified = y_pred != y_val.iloc[select]
shap.decision_plot(
    expected_value,
    shap_values,
    features,
    link='logit',
    highlight=misclassified,
)

In [None]:
# Plot only the misclassified observations and highlight the one at position 1.
missed_shap_values = shap_values[misclassified]
missed_features = features[misclassified]
shap.decision_plot(expected_value, missed_shap_values, missed_features,
                   highlight=1, link='logit')

In [None]:
# Load SHAP's JavaScript so the interactive force plot can render.
shap.initjs()
# Force plot restricted to the misclassified observations.
shap.force_plot(
    expected_value,
    shap_values[misclassified],
    features[misclassified],
)

# PDP

Here is the code to create the Partial Dependence Plot using the PDPBox library.

In [None]:
# Partial dependence of the model's prediction on `le_ord_5`,
# computed over the validation split.
pdp_goals = pdp.pdp_isolate(
    model=Model,
    dataset=X_val,
    model_features=X_val.columns,
    feature='le_ord_5',
)

# Draw the curve.
pdp.pdp_plot(pdp_goals, 'le_ord_5')
plt.show()

A few items are worth pointing out as you interpret this plot

* The y axis is interpreted as change in the prediction from what it would be predicted at the baseline or leftmost value.
* A blue shaded area indicates level of confidence

In short, the larger the value of le_ord_5, the more likely the model is to predict the target, so this plot needs little further interpretation.

In [None]:
# Partial dependence of the model's prediction on `ord_4`,
# computed over the validation split.
pdp_goals = pdp.pdp_isolate(
    model=Model,
    dataset=X_val,
    model_features=X_val.columns,
    feature='ord_4',
)

# Draw the curve.
pdp.pdp_plot(pdp_goals, 'ord_4')
plt.show()

## 2D PDP

In [None]:
# Two-feature version of the plots above: pdp_interact replaces pdp_isolate
# and pdp_interact_plot replaces pdp_plot.
features_to_plot = ['le_ord_5', 'ord_4']
inter1 = pdp.pdp_interact(
    model=Model,
    dataset=X_val,
    model_features=X_val.columns,
    features=features_to_plot,
)

pdp.pdp_interact_plot(
    pdp_interact_out=inter1,
    feature_names=features_to_plot,
    plot_type='contour',
    plot_pdp=True,
)
plt.show()

How should we interpret this plot for a model built on categorical variables? <br>
I would appreciate it if you could share your knowledge.

**To be continued**