This notebook builds on [tunguz](https://www.kaggle.com/tunguz)'s notebook [tps-02-21-feature-importance-with-xgboost-and-shap](https://www.kaggle.com/tunguz/tps-02-21-feature-importance-with-xgboost-and-shap).


In [None]:
import lightgbm as lgbm
import xgboost as xgb
import catboost as cb
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import category_encoders as ce
import shap
import matplotlib.pyplot as plt
# NOTE: a cell only echoes its last expression, so this version lookup displays nothing here.
xgb.__version__
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load SHAP's JavaScript so its interactive plots can render in the notebook.
shap.initjs()


In [None]:
# Read the competition's training set, test set and submission template.
train, test, sample_sub = (
    pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv"),
    pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv"),
    pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv"),
)

In [None]:
target = 'target'

# Column groups are recognised purely by substring of the column name.
cat_features = [name for name in train.columns if 'cat' in name]
cont_features = [name for name in train.columns if 'cont' in name]

# Keep the labels aside before the frames are reshaped into feature matrices.
y_train = train[target]

# The row identifier is not a feature; remove it from both frames in place.
for frame in (train, test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Encode every categorical column three ways (frequency, target mean, integer label)
# and store the results as new `fe_*`, `te_*` and `le_*` columns on both frames.
#
# NOTE: all encodings are fitted on the complete training frame, i.e. before the
# K-fold split used for validation. For the target encoding this means each
# validation row's own target contributes to its feature value, so out-of-fold
# scores computed later are likely optimistic.

#frequency_encoding
for variable in cat_features:
    # Share of training rows carrying each category value (sums to 1).
    category_share = train[variable].value_counts(normalize=True)
    # Categories absent from train map to NaN.
    train[f'fe_{variable}'] = train[variable].map(category_share)
    test[f'fe_{variable}'] = test[variable].map(category_share)

#target_encoding
for variable in cat_features:
    # Mean target per category. Deliberately NOT named `dict`: rebinding the builtin
    # would break any later `dict(...)` call in the same kernel session.
    target_mean_by_category = train.groupby([variable])[target].mean().to_dict()
    # apply the encoding to the train and test sets.
    train[f'te_{variable}'] = train[variable].map(target_mean_by_category)
    test[f'te_{variable}'] = test[variable].map(target_mean_by_category)

#label_encoding
for variable in cat_features:
    le = LabelEncoder()
    # Fit on train and test values together: LabelEncoder.transform raises on labels
    # it has not seen, so a category occurring only in test would otherwise crash.
    le.fit(pd.concat([train[variable], test[variable]], axis=0))
    train[f'le_{variable}'] = le.transform(train[variable])
    test[f'le_{variable}'] = le.transform(test[variable])

In [None]:
# Leave only model inputs: remove the label from train and the raw (string)
# categorical columns from both frames; their encoded versions stay.
train.drop(columns=[target] + cat_features, inplace=True)
test.drop(columns=cat_features, inplace=True)

In [None]:
# Aliases only (no copy): the feature matrices are the prepared frames themselves.
X_train, X_test = train, test

In [None]:
# Booster settings shared by every xgb.train call in this notebook.
xgb_parameters = {
    'objective': 'reg:squarederror',  # squared-error regression
    'max_depth': 10,
    'learning_rate': 0.01,
    'colsample_bytree': 0.5,
    'subsample': 0.5,
    'reg_alpha': 6,
    'min_child_weight': 100,
    'n_jobs': 8,
    'seed': 22,
    # GPU training on device 0.
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
}


Keep in mind that these parameters are only baseline values; they have not been tuned for the best score.

In [None]:
# 5-fold cross-validation: train one booster per fold, collect out-of-fold
# predictions for scoring and average the per-fold test predictions.
kf = KFold(n_splits=5, shuffle=True, random_state=22)
oof = np.zeros(len(X_train))
score_list = []
test_preds = []
test_df = xgb.DMatrix(X_test)

for fold, (fit_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    fit_dmatrix = xgb.DMatrix(X_fit, label=y_fit)
    val_dmatrix = xgb.DMatrix(X_val, label=y_val)

    # Fixed 3000 boosting rounds; no early stopping is used.
    model = xgb.train(xgb_parameters, fit_dmatrix, 3000)

    val_pred = model.predict(val_dmatrix)
    test_preds.append(model.predict(test_df))

    # Out-of-fold predictions land at the positions of this fold's validation rows.
    oof[val_idx] = val_pred
    score = np.sqrt(mean_squared_error(y_val, val_pred))
    score_list.append(score)
    print(f"RMSE Seed 22 Fold-{fold} : {score}")

print(f"Seed 22 folds average = {np.mean(score_list)} ({np.std(score_list)})")
# Final test prediction = mean over the five fold models.
test_pred = np.mean(test_preds, axis=0)
test_pred_df = pd.DataFrame({'target': test_pred})

In [None]:
# Per-row SHAP contributions for the test set. `model` is whatever the CV loop assigned
# last, i.e. the booster trained on the final fold only (not an average over folds).
shap_preds = model.predict(test_df, pred_contribs=True)

In [None]:
# Display (rows, feature columns) of the test feature matrix.
X_test.shape

In [None]:
# Summary plot of the contributions; `[:, :-1]` drops the last column of the
# contribution matrix (XGBoost stores the bias term there) so the remaining columns
# line up with X_test's columns.
shap.summary_plot(shap_preds[:,:-1], X_test, max_display=50)

In [None]:
# Same contributions as a bar chart (plot_type="bar"), again without the last (bias) column.
shap.summary_plot(shap_preds[:,:-1], X_test, plot_type="bar",  max_display=50)

In [None]:
%%time
# Pairwise SHAP interaction values for every test row, computed with the same
# last-fold `model` as above.
shap_interactions = model.predict(test_df, pred_interactions=True)

In [None]:
# The raw cat* columns were dropped from X_test earlier, so this substring match picks up
# the encoded columns derived from them (the fe_*, te_* and le_* variants), in X_test order.
cat_columns = [col for col in X_test.columns if 'cat' in col]

I want to focus on interactions between categorical features.

In [None]:
def plot_top_k_interactions(feature_names, shap_interactions, k):
    """Print and bar-plot the ``k`` strongest pairwise SHAP interactions.

    Parameters
    ----------
    feature_names : sequence of str
        Label for each feature axis of ``shap_interactions``: entry ``i`` names
        row/column ``i``. It must be in the same order as the matrix axes; a length
        mismatch triggers a warning because labels may then be misaligned.
    shap_interactions : array of shape (n_rows, n_features + 1, n_features + 1)
        Interaction values. The last row and column are dropped before aggregating
        (XGBoost's ``pred_interactions=True`` output stores the bias term there).
    k : int
        Number of top pairs to print and plot.

    Returns
    -------
    None
        The function only prints and plots. If fewer than two labelled features are
        available there is nothing to pair, so a message is printed and no plot is drawn.
    """
    import warnings

    # Get the mean absolute contribution for each feature interaction
    aggregate_interactions = np.mean(np.abs(shap_interactions[:, :-1, :-1]), axis=0)
    n_features = aggregate_interactions.shape[0]

    if len(feature_names) != n_features:
        # Previously an out-of-range label lookup was swallowed by a bare `except`,
        # which hid exactly this situation. Keep going, but say so.
        warnings.warn(
            f"feature_names has {len(feature_names)} entries but the interaction "
            f"matrix has {n_features} features; labels are assigned by position and "
            "may not match the plotted values.",
            stacklevel=2,
        )
    n_labelled = min(len(feature_names), n_features)

    # Visit each unordered pair once (j < i). The value is doubled, presumably because
    # the matrix is symmetric and the pair's effect is split between [i][j] and [j][i].
    interactions = [
        (feature_names[i] + "-" + feature_names[j], aggregate_interactions[i][j] * 2)
        for i in range(n_labelled)
        for j in range(i)
    ]
    if not interactions:
        print("No feature pairs to plot.")
        return

    # sort by magnitude
    interactions.sort(key=lambda pair: pair[1], reverse=True)
    interaction_features, interaction_values = map(tuple, zip(*interactions))
    print(interaction_features[:k])
    plt.bar(interaction_features[:k], interaction_values[:k])
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()
    
# The interaction array has one row/column per X_test column (in column order) plus a
# trailing bias entry, whereas `cat_columns` is only a subset of those columns. Passing
# the full array with the shorter name list would attach the names to the wrong
# rows/columns, so first cut out the sub-matrix that belongs to `cat_columns`.
cat_positions = X_test.columns.get_indexer(cat_columns)
# Keep the bias slot as the last entry: plot_top_k_interactions strips the final row/column.
keep = np.append(cat_positions, shap_interactions.shape[1] - 1)
cat_shap_interactions = shap_interactions[:, keep[:, None], keep]
plot_top_k_interactions(cat_columns, cat_shap_interactions, 20)

The five categorical feature pairs with the largest interaction contributions are ("cat8", "cat0"), ("cat9", "cat8"), ("cat9", "cat5"), ("cat8", "cat5"), ("cat9", "cat0").

We now start again from the raw data and add these interactions as new features.

In [None]:
# Re-read the raw files so the second experiment starts from untouched frames.
train, test, sample_sub = (
    pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv"),
    pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv"),
    pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv"),
)

In [None]:
target = 'target'

# Column groups are recognised purely by substring of the column name.
cat_features = [name for name in train.columns if 'cat' in name]
cont_features = [name for name in train.columns if 'cont' in name]

# Keep the labels aside before the frames are reshaped into feature matrices.
y_train = train[target]

# The row identifier is not a feature; remove it from both frames in place.
for frame in (train, test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Pairs of categorical columns to combine into new interaction features.
cat_pairs = [("cat8", "cat0"), ("cat9", "cat8"), ("cat9", "cat5"), ("cat8", "cat5"), ("cat9", "cat0")]
# Each new column is named "<first>_<second>".
cat_interactions = [f'{first}_{second}' for first, second in cat_pairs]

for (first, second), combined in zip(cat_pairs, cat_interactions):
    # `+` on the two columns joins their values row by row; the result is stored
    # with pandas' "category" dtype.
    train[combined] = (train[first] + train[second]).astype("category")
    test[combined] = (test[first] + test[second]).astype("category")

In [None]:
# Encode the original categorical columns by frequency, target mean and integer label;
# the interaction columns receive the integer-label encoding only.
#
# NOTE: the encodings are fitted on the complete training frame, i.e. before the
# K-fold split used for validation. For the target encoding this means each
# validation row's own target contributes to its feature value, so out-of-fold
# scores computed later are likely optimistic.

#frequency_encoding
for variable in cat_features:
    # Share of training rows carrying each category value (sums to 1).
    category_share = train[variable].value_counts(normalize=True)
    # Categories absent from train map to NaN.
    train[f'fe_{variable}'] = train[variable].map(category_share)
    test[f'fe_{variable}'] = test[variable].map(category_share)

#target_encoding
for variable in cat_features:
    # Mean target per category. Deliberately NOT named `dict`: rebinding the builtin
    # would break any later `dict(...)` call in the same kernel session.
    target_mean_by_category = train.groupby([variable])[target].mean().to_dict()
    # apply the encoding to the train and test sets.
    train[f'te_{variable}'] = train[variable].map(target_mean_by_category)
    test[f'te_{variable}'] = test[variable].map(target_mean_by_category)

#label_encoding
# Fit each encoder on train and test values together so that a value occurring in
# only one of the two frames still has a label when that frame is transformed.
full_data = pd.concat([train,test], axis=0)
for variable in cat_features + cat_interactions:
    le = LabelEncoder()
    le.fit(full_data[variable])
    train[f'le_{variable}'] = le.transform(train[variable])
    test[f'le_{variable}'] = le.transform(test[variable])

In [None]:
# Leave only model inputs: remove the label from train, and the raw categorical and
# raw interaction columns from both frames; their encoded versions stay.
raw_columns = cat_features + cat_interactions
train.drop(columns=[target] + raw_columns, inplace=True)
test.drop(columns=raw_columns, inplace=True)

# Aliases only (no copy): the feature matrices are the prepared frames themselves.
X_train, X_test = train, test

In [None]:
# Same 5-fold procedure as the baseline run, now on the feature set that includes the
# label-encoded interaction columns. Every DMatrix is built with enable_categorical=True.
kf = KFold(n_splits=5, shuffle=True, random_state=22)
oof = np.zeros(len(X_train))
score_list = []
test_preds = []
test_df = xgb.DMatrix(X_test, enable_categorical=True)

for fold, (fit_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    X_fit, y_fit = X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    fit_dmatrix = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)
    val_dmatrix = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)

    # Fixed 3000 boosting rounds; no early stopping is used.
    model = xgb.train(xgb_parameters, fit_dmatrix, 3000)

    val_pred = model.predict(val_dmatrix)
    test_preds.append(model.predict(test_df))

    # Out-of-fold predictions land at the positions of this fold's validation rows.
    oof[val_idx] = val_pred
    score = np.sqrt(mean_squared_error(y_val, val_pred))
    score_list.append(score)
    print(f"RMSE Seed 22 Fold-{fold} : {score}")

print(f"Seed 22 folds average = {np.mean(score_list)} ({np.std(score_list)})")
# Final test prediction = mean over the five fold models.
test_pred = np.mean(test_preds, axis=0)
test_pred_df = pd.DataFrame({'target': test_pred})

The fold-average RMSE improved from 0.8441617327137406 (standard deviation 0.0008574847655632837) to 0.8440450262413028 (standard deviation 0.0008286076422571708).