In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# custom installs
!pip uninstall matplotlib -y
!pip install matplotlib==3.2.2
!pip install mplcyberpunk

In [None]:
import matplotlib
print(matplotlib.__version__)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import gridspec
import seaborn as sns
from IPython.display import display
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from lightgbm import LGBMClassifier
import shap
import datatable
import mplcyberpunk
plt.style.use("cyberpunk")

In [None]:
# custom imports
import datatable
import mplcyberpunk
plt.style.use("cyberpunk")

# helpers
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    The frame is modified in place and also returned. For every numeric
    (including boolean) column:

    * missing values are replaced by ``column minimum - 1`` and the column
      name is recorded, because NumPy integer dtypes cannot hold NaN;
    * if every value is a whole number, the column is cast to an unsigned or
      signed integer dtype chosen from its (post-fill) minimum and maximum;
    * otherwise the column is cast to ``float32``.

    Non-numeric columns (e.g. strings) are left untouched.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    tuple of (pandas.DataFrame, list)
        The same frame, and the names of the columns whose missing values
        were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",round(start_mem_usg,2)," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        # Exclude strings and any other non-numeric dtype.
        if not pd.api.types.is_numeric_dtype(props[col].dtype):
            continue

        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled.
        if props[col].isna().any():
            NAlist.append(col)
            # Assign back instead of a chained inplace fillna, which is not
            # guaranteed to modify the frame.
            props[col] = props[col].fillna(mn - 1)
            # The sentinel is the new minimum; the dtype choice below must see
            # it, otherwise e.g. -1 would wrap around in an unsigned dtype.
            mn = mn - 1

        # Test if the column can be converted to an integer without loss.
        # The comparison is element-wise: summing the differences would let
        # positive and negative fractional parts cancel each other out.
        if np.isinf(props[col]).any():
            IsInt = False  # +/-inf cannot be represented by an integer dtype
        else:
            asint = props[col].fillna(0).astype(np.int64)
            IsInt = bool((props[col] == asint).all())

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                    props[col] = props[col].astype(np.int8)
                elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                    props[col] = props[col].astype(np.int16)
                elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                    props[col] = props[col].astype(np.int32)
                elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                    props[col] = props[col].astype(np.int64)

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",round(mem_usg,2)," MB")
    print("This is ",round(100*mem_usg/start_mem_usg,2),"% of the initial size")
    return props, NAlist

In [None]:
%%time
train_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/train.csv").to_pandas().set_index('id') # faster but it sometimes converts the target variable from int type to boolean type.
# train_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv").set_index('id')
train_df,_ = reduce_mem_usage(train_df)
train_df

In [None]:
%%time
# Read the test set once. datatable is used for consistency with the training
# cell; the pandas reader is kept as a commented-out alternative. Previously
# both readers ran back to back and the first result was thrown away.
test_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/test.csv").to_pandas().set_index('id')
# test_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv").set_index('id')
test_df,_ = reduce_mem_usage(test_df)
test_df

In [None]:
train_df.info()

0.9 Million samples in training data, 0.7 million samples in test data.  
There are 16 float features, 14 int features and 1 object feature

In [None]:
train_df.target.value_counts(normalize=True).round(2)

51% of the samples are negative class, 49% of the samples are positive class

In [None]:
# Sampling 20 percent of the training data for faster experimentation (0.18 million).
# A fixed random_state makes the subset reproducible across kernel restarts.
train_df_sampled = train_df.sample(int(len(train_df) * 0.2), random_state=42)
train_df_sampled.info()

# String Feature

9,00,000 training samples contain 7,41,354 unique values with the most occurring string sequence appearing 12 times

In [None]:
train_df["f_27"].value_counts()

The 10 values shown above all have a length of 10. Let's check whether every string sequence has a length of 10.

In [None]:
# (min, max) length of f_27 in train, followed by (min, max) length in test.
# The last element previously repeated the test minimum instead of the maximum.
train_f_27_len = train_df.f_27.str.len()
test_f_27_len = test_df.f_27.str.len()
(train_f_27_len.min(), train_f_27_len.max(), test_f_27_len.min(), test_f_27_len.max())

9,00,000 training samples contain 7,41,354 unique string sequence in f_27 (most of the values are different). 
Let's check whether the string sequences in the test data are the same as in the training data. Train and test combined contain 11,81,880 unique sequences, so 11,81,880 - 7,41,354 = 4,40,526 string sequences occur only in the test data and never in the training data. So, this feature should not be treated as a categorical feature: the model would learn to rely on the exact strings seen in training, while a large share of the test strings is unseen.

In [None]:
train_df.f_27.value_counts()

In [None]:
pd.concat([train_df,test_df]).f_27.value_counts()

The letters at positions 0, 2 and 5 take only 2 distinct values in the training data; the other positions take more than 2 values. Every position gives some information about the target variable.

In [None]:
for pos in range(10):
    g_df = train_df.groupby(train_df.f_27.str.get(pos))
    print(f"Position -> {pos}")
    display(pd.DataFrame({"size":g_df.target.size(),"mean_target":g_df.target.mean().round(2)}))

Lets find which of the 26 alphabets appears in this string sequence, along with their counts

In [None]:
from collections import Counter
c = Counter([letter for seq_list in train_df.f_27.to_list() for letter in seq_list])
for key in sorted(c.keys()):
    print(key, c[key])

In [None]:
f_27_len_unique_chars = train_df.f_27.apply(lambda x : len(set(x))).rename("f_27_len_unique_chars")
g_df = train_df.groupby(f_27_len_unique_chars)
display(pd.DataFrame({"size":g_df.target.size(),"mean_target":g_df.target.mean().round(2)}))

Insights:
1. f_27 should be split into 10 characters since the character in each position gives some information about the target variable
2. the number of unique characters in f_27 can be considered as a separate feature

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
# Replace the string feature f_27 by numeric features:
#   ch0..ch9              -> letter at each position, encoded as 0 for 'A', 1 for 'B', ...
#   f_27_len_unique_chars -> number of distinct letters in the string
# The frames are modified in place, so the guard keeps the cell re-runnable:
# a frame whose f_27 column has already been consumed is skipped.
for frame in (train_df, test_df):
    if "f_27" not in frame.columns:
        continue
    for i in range(10):
        frame[f"ch{i}"] = frame.f_27.str.get(i).apply(ord) - ord('A')
    frame["f_27_len_unique_chars"] = frame.f_27.apply(lambda x : len(set(x)))
    frame.drop(columns=["f_27"], inplace=True)

In [None]:
train_df.columns

In [None]:
test_df.columns

# Float Features

There are 16 float features

In [None]:
float_features_train = [col for col in train_df.columns if train_df[col].dtype == "float32"]
print(float_features_train)
print(f"len = {len(float_features_train)}")

Looking at the histogram of all the 16 float features show that all of them are normally distributed with mean/centre at 0.  
Features f_00 to f_06 has std of 1.0  
Features f_19 to f_26 has std b/w 2.3 and 2.5  
Features f_28 has std of 238  

In [None]:
fig, axs = plt.subplots(4,4, figsize=(16,16))

for feature, ax in zip(float_features_train,axs.ravel()):
    ax.hist(train_df[feature], bins=100)
    ax.set_title(f"Train {feature}, mean={train_df[feature].mean():.1f}, std_dev={train_df[feature].std():.1f}")

plt.suptitle("Histogram of the 16 float features", fontsize=20, y=0.93)
mplcyberpunk.add_glow_effects()
plt.show()

Test data also follows similar distribution

In [None]:
float_features_test = [col for col in test_df.columns if test_df[col].dtype == "float32"]
print(float_features_test)
print(f"len = {len(float_features_test)}")

In [None]:
fig, axs = plt.subplots(4,4, figsize=(16,16))

for feature, ax in zip(float_features_test,axs.ravel()):
    ax.hist(test_df[feature], bins=100)
    ax.set_title(f"Test {feature}, mean={test_df[feature].mean():.1f}, std_dev={test_df[feature].std():.1f}")

plt.suptitle("Histogram of the 16 float features", fontsize=20, y=0.93)
mplcyberpunk.add_glow_effects()
plt.show()

the correlation method used is pearson correlation coefficient.  
features f_00 to f_06 are correlated with f_28, but not with each other.  
features f_19 to f_26 are slightly correlated with each other.  
feature f_21 is slightly correlated with the target variable. No feature is strongly correlated with the target.  

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train_df[float_features_train + ["target"]].corr(), annot=True, fmt=".2f", center=0);

The correlation matrix shows only linear dependencies.  
To see non-linear dependencies, we can plot a rolling mean of the target probability for every feature

Trying out the rolling mean on a small subset of data

In [None]:
#float_features_train[0] #f_00
feature = "f_00"
temp = pd.DataFrame({feature:train_df[feature].values,
                            'state':train_df.target.values})
temp = temp.head(10)
temp.head(10)

In [None]:
temp = temp.sort_values(feature)
temp.head(10)

In [None]:
temp.state.rolling(5, min_periods=1, center=True).mean() #center : Rolling sum with the result assigned to the center of the window index.

In [None]:
def get_nrows(features, ncols=4):
    """Return how many grid rows are needed to place one plot per feature.

    This is ``len(features) / ncols`` rounded up, computed with integer
    arithmetic only.
    """
    n_features = len(features)
    padded = n_features + (ncols - 1)  # push any partial row over the next multiple
    return padded // ncols

assert get_nrows([1]*16, ncols=4)==4
assert get_nrows([1]*17, ncols=4)==5
assert get_nrows([1]*19, ncols=4)==5
assert get_nrows([1]*20, ncols=4)==5
assert get_nrows([1]*21, ncols=4)==6

A horizontal line means that the target does not depend on the feature (e.g., f_03, f_04, f_06),   

a line with low minimum and high maximum shows a high mutual information between feature and target (e.g., f_21, f_22, f_26, f_28).

When the window size for rolling mean is small, we can see noise in the plot. As we increase it, it becomes less noisy but we also start losing more information.

In [None]:
%%time
# Plot dependence b/w every feature and the target
def plot_mutual_info_diagram(df, features, window=15000, ncols=4, by_quantile=True, mutual_info=True, title="How the target probability depends on single features"):
    """Plot how the rolling mean of the target varies along each feature.

    For every feature the rows are sorted by the feature value and a centred
    rolling mean of ``df.target`` is drawn as a scatter plot, together with a
    green line at its maximum and a red line at its minimum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a ``target`` column.
    features : list of str
        Columns to plot, one subplot each.
    window : int
        Number of neighbouring rows (in sorted order) averaged per point.
    ncols : int
        Number of subplot columns; the number of rows is derived from it.
    by_quantile : bool
        If True the x-axis is the rank of the row after sorting, otherwise it
        is the feature value itself.
    mutual_info : bool
        If True (and ``by_quantile`` is True) the x-label also shows an
        estimate of the mutual information between feature and target:
        entropy of the overall target mean minus the mean entropy of the
        rolling means.
    title : str
        Figure title.
    """
    def H(p):
        """Entropy of a binary random variable in nat(natural unit of mutual information - based on natural logarithm)"""
        # Clip away exact 0 and 1 so that 0 * log(0) evaluates to ~0, not NaN.
        eps = np.finfo(float).eps
        p = np.clip(p, eps, 1 - eps)
        return -np.log(p) * p - np.log(1-p) * (1-p)

    # Honour the caller's ncols (it used to be ignored when computing nrows).
    nrows = get_nrows(features, ncols)
    # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
    fig, axs = plt.subplots(nrows, ncols, figsize=(4*ncols, 4*nrows), sharey=True, squeeze=False)
    flat_axes = axs.ravel()
    for feature, ax in zip(features, flat_axes):
        temp = pd.DataFrame({feature:df[feature].values,
                            'state':df.target.values})
        # After sorting and re-indexing, the index is the rank of each row.
        temp = temp.sort_values(feature).reset_index(drop=True)
        rolling_mean = temp.state.rolling(window, center=True, min_periods=1).mean()

        values = temp.index.values if by_quantile else temp[feature].values
        ax.scatter(values, rolling_mean, s=2)
        ax.axhline(y=rolling_mean.max(), color='g', linestyle='-')
        ax.axhline(y=rolling_mean.min(), color='r', linestyle='-')

        if mutual_info and by_quantile:
            ax.set_xlabel(f'{feature} mi={H(temp.state.mean()) - H(rolling_mean[~rolling_mean.isna()].values).mean():.5f}')
        else:
            ax.set_xlabel(f'{feature}')

    # Hide the unused axes of a partially filled last row.
    for ax in flat_axes[len(features):]:
        ax.set_visible(False)

    plt.suptitle(title, y=0.93, fontsize=20)
    plt.show()

plot_mutual_info_diagram(train_df, float_features_train,window=100,
                         title='How the target probability depends on the float features')
    

In [None]:
plot_mutual_info_diagram(train_df, float_features_train,window=1000,
                         title='How the target probability depends on the float features')

In [None]:
plot_mutual_info_diagram(train_df, float_features_train,window=10000,
                         title='How the target probability depends on the float features')

Before we plot the rolling mean we sort the feature values, so that the rolling mean at a feature values x approximates the target probability in an environment x-e, x+e of this feature value.    

The plots basically show the relationship between the rank x of a feature value and the target averaged over the rows ranked from x - window/2 to x + window/2 (x-7500 to x+7500 for the default window of 15000).  

Insight:  
There are many non-linear (and some non-monotonic) relationships between individual features and the target. Linear classifiers would not be able to do a good job.

Side Note:  
Notice how in the heatmap plotter earlier, the pearson correlation between f_00 and target shows 0.06 meaning little linearly correlated, but in the above plot - it seems like there is a relation. The correlation (0.06) is positive meaning the line goes upward. This is clear when the window size for the rolling mean is 10000 but not when window size is small(say 100) in which case the upward slope of the line disappears in the noise. When the window size is large, it hides the noise.

# Integer features

In [None]:
int_features_train = [feature for feature in train_df.columns if train_df[feature].dtype == 'uint8' and feature not in ["id","target"]]
print(int_features_train)
print(f"len -> {len(int_features_train)}")

f_29 is a binary feature.  
f_30 is a ternary feature.  
All the other features have a right skewed distribution.

In [None]:
nrows = get_nrows(int_features_train)
fig, axs = plt.subplots(nrows, 4, figsize=(16,nrows*4))
for feature, ax in zip(int_features_train, axs.ravel()):
    vc = train_df[feature].value_counts()
    ax.bar(vc.index, vc)
    ax.set_xlabel(f"Train {feature}")
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))

plt.suptitle("Histogram of the integer features", fontsize=20, y=0.93)
plt.show()

In [None]:
plot_mutual_info_diagram(train_df, int_features_train, window=10000,
                         title='How the target probability depends on the integer features')

# Analysing Interactions with SHAP

To use the SHAP package we first need to train a model, since SHAP is a model-agnostic approach designed to explain any given black-box model. If you are applying SHAP to a real-world problem you should follow best practices. Specifically, you should ensure your model performs well on both a training and a validation set. The better your model, the more reliable your results will be. As a quick check on this model, I have calculated the AUC on the validation set, which is 99%. The model should be fine to demonstrate the SHAP package.

In [None]:
%%time
X = train_df[[feature for feature in train_df.columns if feature != "target"]]
y = train_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

lgbm_model = LGBMClassifier(n_estimators=5000, min_child_samples=80, random_state=42)

lgbm_model.fit(X_train.values, y_train)
y_pred = lgbm_model.predict_proba(X_test.values)[:,1]

score = roc_auc_score(y_test, y_pred)
print(f"Validation AUC:{score:.3f}")

In [None]:
# X_test.shape # 3,60,000

# Using a random sample of the dataframe for better time computation
X_test_sampled = X_test.sample(20000, random_state=1307)

In [None]:
%%time
# explain the model's predictions using SHAP values
explainer = shap.TreeExplainer(lgbm_model)

In [None]:
%%time
shap_values = explainer.shap_values(X_test_sampled)

In [None]:
%%time
# Get SHAP interaction values. Beware it is time consuming to calculate the interaction values.
# shap_interaction_20 = explainer.shap_interaction_values(X_test_sampled)

In [None]:
%%time
# Get SHAP interaction values. Beware it is time consuming to calculate the interaction values.
# X_test_sampled = X_test.sample(40, random_state=42)
# shap_interaction_40 = explainer.shap_interaction_values(X_test_sampled)

In [None]:
# Load the pre-computed SHAP interaction values from a 2-D text file.
# NOTE(review): presumably each row is one sample's (n_features x n_features)
# interaction matrix flattened, with features in the order listed below - confirm
# against the notebook that produced the file.
loaded_arr = np.loadtxt('../input/shap-interaction/shap_interaction_20k.txt')
features_list_shap_dataset = ["f_00","f_01","f_02","f_03","f_04","f_05","f_06","f_07","f_08","f_09","f_10","f_11","f_12","f_13","f_14","f_15","f_16","f_17","f_18","f_19","f_20","f_21","f_22","f_23","f_24","f_25","f_26","f_28","f_29","f_30","ch0","f_27_len_unique_chars",'ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9']

# Derive the matrix size from the feature list instead of hard-coding 41.
n_features = len(features_list_shap_dataset)
assert loaded_arr.shape[1] % n_features == 0, "row length is not a multiple of the number of features"
load_original_arr = loaded_arr.reshape(
    loaded_arr.shape[0], loaded_arr.shape[1] // n_features, n_features)

shap_interaction_dataset = load_original_arr

Step1: Absolute mean plot

Individual contribution matrices allows us to explain individual model predictions at local level. But what if we want to explain how the model makes predictions at a global level? To do this we can aggregate the values in the contribution matrices by taking the absolute mean. Presenting the results in a heatmap can be effective to highlight important main effects and interaction effects.

In [None]:
# Get absolute mean of matrices (one value per feature pair, averaged over all samples)
mean_shap = np.abs(shap_interaction_dataset).mean(axis=0)
df = pd.DataFrame(mean_shap, index=features_list_shap_dataset, columns=features_list_shap_dataset)

# times off diagonal by 2
# An explicit identity mask selects the off-diagonal cells by position, rather
# than by comparing values against the diagonal.
off_diagonal = ~np.eye(len(df), dtype=bool)
df = df.mask(off_diagonal, df * 2)

# display
fig = plt.figure(figsize=(35, 20), facecolor='#002637', edgecolor='r')
ax = fig.add_subplot()
sns.heatmap(df.round(decimals=3), cmap='coolwarm', annot=True, fmt='.6g', cbar=False, ax=ax)
ax.tick_params(axis='x', colors='w', labelsize=15, rotation=90)
ax.tick_params(axis='y', colors='w', labelsize=15)

plt.suptitle("SHAP interaction values", color="white", fontsize=60, y=0.97)
plt.yticks(rotation=0)
plt.show()

The diagonal elements are the main effects of features at global level(absolute mean of each of the 20k predictions) and off diagonal elements are the interaction effect of features at global level

Insights:  

For instance, we can see that the main effect is large for the features f_21 (0.539), f_26 (0.612) and f_27_len_unique_chars (0.712). This tells us that these features tend to have large positive or negative main effects. In other words, these features tend to have a significant impact on the model's predictions.  

Similarly, we can see that interaction effects for (f_00, f_26) --> (0.513) and (f_02, f_21) --> (0.482) are significant. These are just some examples.

Step2: Feature interaction analysis

So, now that we have calculated the average (main/interaction) effect of the features at the global level, it is interesting to take a deep dive into the features that show large interaction effects. We can use the dependence plot to better understand the nature of the interactions. For the sake of demonstration I will be focusing on the f_02 & f_21 and the f_24 & f_30 interaction effects at the local level.

In [None]:
# %%time
# # dependency up until this point
# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# !pip uninstall matplotlib -y
# !pip install matplotlib==3.2.2
# !pip install mplcyberpunk
# import matplotlib
# print(matplotlib.__version__)

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib.ticker import MaxNLocator
# from matplotlib import gridspec
# import seaborn as sns
# from IPython.display import display
# from sklearn.model_selection import KFold, cross_val_score, train_test_split
# from sklearn.metrics import roc_auc_score, roc_curve
# from lightgbm import LGBMClassifier
# import shap
# import datatable
# import mplcyberpunk
# plt.style.use("cyberpunk")
# # train_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/train.csv").to_pandas().set_index('id')
# train_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv").set_index('id')
# # test_df = datatable.fread("/kaggle/input/tabular-playground-series-may-2022/test.csv").to_pandas().set_index('id')
# test_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv").set_index('id')
# for df in [train_df, test_df]:
#     for i in range(10):
#         df[f"ch{i}"] = df.f_27.str.get(i).apply(ord) - ord('A')
#     df["f_27_len_unique_chars"] = df.f_27.apply(lambda x : len(set(x)))
#     df.drop(columns=["f_27"], inplace=True)
# X = train_df[[feature for feature in train_df.columns if feature != "target"]]
# y = train_df["target"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# lgbm_model = LGBMClassifier(n_estimators=5000, min_child_samples=80, random_state=42)

# lgbm_model.fit(X_train.values, y_train)
# y_pred = lgbm_model.predict_proba(X_test.values)[:,1]

# score = roc_auc_score(y_test, y_pred)
# print(f"Validation AUC:{score:.3f}")
# X_test_sampled = X_test.sample(20000, random_state=1307)
# explainer = shap.TreeExplainer(lgbm_model)
# shap_values = explainer.shap_values(X_test_sampled)
# loaded_arr = np.loadtxt('../input/shap-interaction/shap_interaction_20k.txt')
# features_list_shap_dataset = ["f_00","f_01","f_02","f_03","f_04","f_05","f_06","f_07","f_08","f_09","f_10","f_11","f_12","f_13","f_14","f_15","f_16","f_17","f_18","f_19","f_20","f_21","f_22","f_23","f_24","f_25","f_26","f_28","f_29","f_30","ch0","f_27_len_unique_chars",'ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9']
# load_original_arr = loaded_arr.reshape(
#     #loaded_arr.shape[0], loaded_arr.shape[1] // shap_interaction.shape[2], shap_interaction.shape[2])
#     loaded_arr.shape[0], loaded_arr.shape[1] // 41, 41)

# shap_interaction_dataset = load_original_arr

In [None]:
# Get absolute mean of matrices (one value per feature pair, averaged over all samples)
mean_shap = np.abs(shap_interaction_dataset).mean(axis=0)
df = pd.DataFrame(mean_shap, index=features_list_shap_dataset, columns=features_list_shap_dataset)

# times off diagonal by 2
# An explicit identity mask selects the off-diagonal cells by position, rather
# than by comparing values against the diagonal.
off_diagonal = ~np.eye(len(df), dtype=bool)
df = df.mask(off_diagonal, df * 2)

# display
fig = plt.figure(figsize=(35, 20), facecolor='#002637', edgecolor='r')
ax = fig.add_subplot()
sns.heatmap(df.round(decimals=3), cmap='coolwarm', annot=True, fmt='.6g', cbar=False, ax=ax)
ax.tick_params(axis='x', colors='w', labelsize=15, rotation=90)
ax.tick_params(axis='y', colors='w', labelsize=15)

plt.suptitle("SHAP interaction values", color="white", fontsize=60, y=0.97)
plt.yticks(rotation=0)
plt.show()

The following feature interactions (f1, f2) have significantly higher SHAP interaction values than the corresponding main effect (f1, f1):  

Involving f_30 (ternary feature):  
f_24, f_30  
f_25, f_30  

Both feature belonging to real numbers:  
f_02, f_21  
f_05, f_22  
f_00, f_26  
f_01, f_26  

Following main interactions has high shap values:  
f_21  
f_26    
f_27_len_unique_chars  

For example:  
Based on SHAP feature interactions heatmap f_02 & f_21 interactions effect which is 0.482 is larger than the main effect of f_02 which is 0.27. Lets deep dive into the local level effects.

In [None]:
# A dependence plot is a scatter plot that shows the effect a single feature has on the predictions made by the model.
# Each dot is a single prediction (row) from the dataset.
# The x-axis is the value of the feature (from the X matrix).
# The y-axis is the SHAP value for that feature, which represents how much knowing that feature's value changes the output of the model for that sample's prediction.
# The color corresponds to a second feature that may have an interaction effect with the feature we are plotting (by default this second feature is chosen automatically)

# The first argument is the index of the feature we want to plot
# The second argument is the matrix of SHAP values (it is the same shape as the data matrix)
# The third argument is the data matrix (a pandas dataframe or numpy array)
# The interaction_index argument can be used to explicitly set which feature gets used for coloring

# Sample
# shap.dependence_plot(0, shap_values, X, interaction_index="Education-Num")

# Reference:  
# https://slundberg.github.io/shap/notebooks/plots/dependence_plot.html

# Plot feature interaction
def _set_axis_text_white(ax):
    """Colour the axis labels and tick labels of ``ax`` white."""
    ax.yaxis.label.set_color('white')
    ax.xaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')


def _plot_rolling_target_mean(ax, frame, feature, title, window=15_000):
    """Scatter the centred rolling mean of ``frame.target`` over rows sorted by ``feature``."""
    temp = pd.DataFrame({feature: frame[feature].values,
                         "target": frame["target"].values})
    temp = temp.sort_values(feature).reset_index(drop=True)

    # No min_periods here: the first and last window/2 points are NaN.
    rolling_mean = temp.target.rolling(window, center=True).mean()
    sns.scatterplot(x=temp[feature], y=rolling_mean, s=2, ax=ax)
    # Series.max()/min() skip the NaN edges. The builtin max()/min() return NaN
    # when the first element is NaN, so the reference lines were never drawn.
    ax.axhline(y=rolling_mean.max(), color='g', linestyle='-')
    ax.axhline(y=rolling_mean.min(), color='r', linestyle='-')
    ax.set_title(title)


# Plot feature interaction
def plot_feature_interaction_with_f2_being_ternary_feature(f1, f2, f2_condition=2):
    """Draw a 2x3 figure analysing the interaction between ``f1`` and ``f2``.

    Top row:
      * SHAP dependence plot of ``f1`` coloured by ``f2`` (main effect),
      * SHAP dependence plot of the ``(f1, f2)`` interaction values.
    Bottom row: rolling mean of the target over rows sorted by ``f1`` for
      * all training rows,
      * training rows with ``f2 == f2_condition``,
      * training rows with ``f2 != f2_condition``.

    Reads the notebook-level variables ``shap_values``,
    ``shap_interaction_dataset``, ``X_test_sampled`` and ``train_df``.

    Parameters
    ----------
    f1 : str
        Feature shown on the x-axis of every panel.
    f2 : str
        Feature used for colouring and for splitting the bottom-row panels.
    f2_condition : int
        Value of ``f2`` used to split the training rows.
    """
    fig = plt.figure(figsize=(30,15))
    spec = gridspec.GridSpec(nrows=2, ncols=3, figure=fig)

    # SHAP main effect {f1}
    ax = fig.add_subplot(spec[0,0])
    shap.dependence_plot(f1, shap_values[1], X_test_sampled, display_features=X_test_sampled, interaction_index=f2, ax=ax, show=False)
    _set_axis_text_white(ax)
    ax.set_title(f'SHAP main effect ({f1})', fontsize=10)

    # SHAP interaction effect
    ax = fig.add_subplot(spec[0,1])
    shap.dependence_plot((f1,f2), shap_interaction_dataset, X_test_sampled, display_features=X_test_sampled, ax=ax, show=False)
    _set_axis_text_white(ax)
    ax.set_title(f'SHAP interaction effect', fontsize=10)

    # How the target probability depends on {f1}: overall, and split on f2
    matches_condition = train_df[f2] == f2_condition
    _plot_rolling_target_mean(
        fig.add_subplot(spec[1,0]), train_df, f1,
        f"How the target probability depends on {f1}")
    _plot_rolling_target_mean(
        fig.add_subplot(spec[1,1]), train_df[matches_condition], f1,
        f"How the target probability depends on {f1} & {f2}=={f2_condition}")
    _plot_rolling_target_mean(
        fig.add_subplot(spec[1,2]), train_df[~matches_condition], f1,
        f"How the target probability depends on {f1} & {f2}!={f2_condition}")

    plt.suptitle(f"Feature Interaction Analysis\n {f1} and {f2}", fontsize=25, y=1.15)
    plt.tight_layout()
    plt.show()

In [None]:
f1='f_24'
f2='f_30'
plot_feature_interaction_with_f2_being_ternary_feature(f1, f2, f2_condition=2)

From the first plot one can conclude that for f_30 == 2 main SHAP effect of f_24 decreases for the predictions as f_24 gets larger.  
From the second plot one can conclude that for f_30 == 2 the interaction effect between f_30 & f_24 decreases as f_24 gets larger. The opposite is true for f_30 != 2.  
The third plot indicates that as f_24 gets larger the probability for state==1 increases.  
The fourth and fifth plot are the same as third plot but they are conditioned on f_30==2 and f_30!=2. The results are clearly different.  

In [None]:
f1='f_25'
f2='f_30'
plot_feature_interaction_with_f2_being_ternary_feature(f1, f2, f2_condition=1)

From the first plot, although noisy, we can see that for f_30==1, main SHAP effect of f_25 increases rapidly for the predictions as f_25 gets larger, for f_30==0, main SHAP effect of f_25 reduces slightly for the predictions as f_25 gets larger, for f_30==2, main SHAP effect of f_25 reduces rapidly for the predictions as f_25 gets larger.  
From the second plot, we can conclude that for f_30==1, the interaction effect between f_25 and f_30 increases as f_25 gets larger.  
The third plot indicates that as f_25 gets larger, the probability for state==1 reduces.  
The fourth and fifth plot are the same as third plot but they are conditioned on f_30==1 and f_30!=1. The results are clearly different.  



In [None]:
# A dependence plot is a scatter plot that shows the effect a single feature has on the predictions made by the model.
# Each dot is a single prediction (row) from the dataset.
# The x-axis is the value of the feature (from the X matrix).
# The y-axis is the SHAP value for that feature, which represents how much knowing that feature's value changes the output of the model for that sample's prediction.
# The color corresponds to a second feature that may have an interaction effect with the feature we are plotting (by default this second feature is chosen automatically)

# The first argument is the index of the feature we want to plot
# The second argument is the matrix of SHAP values (it is the same shape as the data matrix)
# The third argument is the data matrix (a pandas dataframe or numpy array)
# The interaction_index argument can be used to explicitly set which feature gets used for coloring

# Sample
# shap.dependence_plot(0, shap_values, X, interaction_index="Education-Num")

def _plot_shap_dependence(ind, shap_matrix, title, ax):
    """Draw one SHAP dependence panel on ``ax`` and give it a title.

    ind         -- feature name, or a (f1, f2) tuple for an interaction panel
    shap_matrix -- SHAP values (or SHAP interaction values) passed to shap
    title       -- panel title
    ax          -- matplotlib Axes to draw on
    """
    shap.dependence_plot(ind, shap_matrix, X_test_sampled, display_features=X_test_sampled,
                         interaction_index=None, ax=ax, show=False)
    # Force white labels/ticks -- presumably so they stay readable on the dark
    # plotting style used by this notebook.
    ax.yaxis.label.set_color('white')
    ax.xaxis.label.set_color('white')
    ax.tick_params(axis='x', colors='white')
    ax.tick_params(axis='y', colors='white')
    ax.set_title(title, fontsize=10)


def _plot_target_rolling_mean(feature, ax, window):
    """Plot the centred rolling mean of ``target`` over rows sorted by ``feature``."""
    ordered = (
        pd.DataFrame({feature: train_df[feature].values,
                      "target": train_df.target.values})
        .sort_values(feature)
        .reset_index(drop=True)
    )
    rolling_mean = ordered["target"].rolling(window, center=True).mean()
    sns.scatterplot(x=ordered[feature], y=rolling_mean, data=ordered, s=2, ax=ax)
    # A centred rolling mean starts (and ends) with NaN. The builtin max()/min()
    # return NaN when the first element is NaN, which made these reference lines
    # invisible; Series.max()/min() skip NaN.
    ax.axhline(y=rolling_mean.max(), color='g', linestyle='-')
    ax.axhline(y=rolling_mean.min(), color='r', linestyle='-')
    ax.set_title(f"How the target probability depends on {feature}")


def plot_feature_interaction(f1, f2, rolling_window=15_000):
    """Draw a 2x3 figure analysing the interaction between features f1 and f2.

    Top row:    SHAP main effect of f1, SHAP main effect of f2, and the SHAP
                interaction effect of (f1, f2).
    Bottom row: rolling mean of the target against f1, the same against f2,
                and a scatter plot of f1 vs f2 coloured by target.

    Parameters
    ----------
    f1, f2 : str
        Column names present in both ``X_test_sampled`` and ``train_df``.
    rolling_window : int, default 15_000
        Number of rows in the centred rolling mean of the target.

    Reads the notebook globals ``shap_values``, ``shap_interaction_dataset``,
    ``X_test_sampled`` and ``train_df``. Shows the figure; returns nothing.
    """
    fig = plt.figure(figsize=(20,10), tight_layout=True)
    spec = gridspec.GridSpec(2, 3, figure=fig)

    # Top row: SHAP panels.
    # shap_values[1] is presumably the SHAP matrix for class 1 -- TODO confirm.
    _plot_shap_dependence(f1, shap_values[1], f'SHAP main effect ({f1})',
                          fig.add_subplot(spec[0,0]))
    _plot_shap_dependence(f2, shap_values[1], f'SHAP main effect ({f2})',
                          fig.add_subplot(spec[0,1]))
    _plot_shap_dependence((f1,f2), shap_interaction_dataset,
                          f'SHAP interaction effect ({f1} and {f2})',
                          fig.add_subplot(spec[0,2]))

    # Bottom row: how the target probability depends on each feature.
    _plot_target_rolling_mean(f1, fig.add_subplot(spec[1,0]), rolling_window)
    _plot_target_rolling_mean(f2, fig.add_subplot(spec[1,1]), rolling_window)

    # Scatter plot of the two features, coloured by target.
    ax = fig.add_subplot(spec[1,2])
    sns.scatterplot(x=f1, y=f2, data=train_df, hue="target", ax=ax, s=2)

    # Region numbers referenced in the accompanying text; the coordinates are
    # specific to the f_02 / f_21 pair.
    if f1=="f_02" and f2=="f_21":
        for x_pos, y_pos, region in ((-1.5, -5, "1"), (0, 1, "2"), (1, 6.7, "3")):
            ax.text(x_pos, y_pos, region, fontsize=18, verticalalignment='top',
                    rotation="horizontal", color="k", fontproperties="smallcaps")

    ax.set_title('scatter plot', fontsize=10)

    plt.suptitle(f"Feature Interaction Analysis \n {f1} and {f2}", fontsize=25, y=1.15)
    plt.show()

In [None]:
# Interaction analysis for the f_02 / f_21 pair.
f1, f2 = "f_02", "f_21"
plot_feature_interaction(f1, f2)

Here, plot 2, 5 and 6 are interesting.  
In plot 2, we can see that the main SHAP effect of f_21 increases monotonically with f_21. It drops sharply once f_21 falls below roughly -3.8 and rises sharply once f_21 exceeds roughly 3.8.  
In plot 5, we can see the same thing from a different angle (f_21 values vs. probability for state==1).  
In plot 6, we draw a simple scatter plot of f_02 against f_21 with target as hue. We can clearly see how these two features interact to affect the target probability.  

We can see that the projection onto f_02 and f_21 is partitioned into 3 regions, each one having a different target probability.

In the region labelled as 1 : the probability for target==1 is low  
In the region labelled as 2 : the probability for target==1 is medium  
In the region labelled as 3 : the probability for target==1 is high  

We can now either hope that our classifier finds these borders by itself, or we can help the classifier by creating a ternary categorical feature that indicates which region a sample belongs to.  

Bottom left region (low probability for target==1) -> -1  
Middle region (medium probability for target==1) -> 0  
Top right region (high probability for target==1) -> 1  

We can get the coordinates of the 2 borders which divides the region into 3 regions by plotting the distribution of f_02 + f_21 with hue as target variable.  


In [None]:
def scatter_plot_f1_f2(f1, f2):
    """Plot f1 against f2 twice, side by side, using ``train_df``.

    Left panel ("Correlation"): the plain scatter plot.
    Right panel ("Interaction"): the same scatter plot coloured by ``target``.
    For the f_02 / f_21 pair the right panel also gets the region numbers 1-3.
    """
    fig, (ax_plain, ax_hue) = plt.subplots(1, 2, figsize=(24,12))

    sns.scatterplot(x=f1, y=f2, data=train_df, ax=ax_plain, s=2)
    ax_plain.set_title('Correlation', fontsize=10)

    sns.scatterplot(x=f1, y=f2, data=train_df, hue="target", ax=ax_hue, s=2)
    ax_hue.set_title('Interaction', fontsize=10)

    if (f1, f2) == ("f_02", "f_21"):
        # (x, y, text) of each region number
        region_labels = ((-1.2, -5, "1"), (0, 1, "2"), (1, 5, "3"))
        for x_pos, y_pos, region in region_labels:
            ax_hue.text(x_pos, y_pos, region, fontsize=20, verticalalignment='top',
                        rotation="horizontal", color="k", fontproperties="smallcaps")

    plt.suptitle("Correlation vs Interaction", fontsize=20, y=0.97)
    plt.show()

In [None]:
# Plain vs. target-coloured scatter plot for the f_02 / f_21 pair.
scatter_plot_f1_f2(f1="f_02", f2="f_21")

The shape of the first plot implies that there is no correlation between the two features. The second plot is the same scatter plot, but with the observations colored by target value. Here we can clearly observe three distinct regions in which f_02 and f_21 interact to give a different target distribution per region.

We can see that the projection onto f_02 and f_21 is partitioned into 3 regions, each one having a different target probability.

In the region labelled as 1 : the probability for target==1 is low  
In the region labelled as 2 : the probability for target==1 is medium  
In the region labelled as 3 : the probability for target==1 is high  

We can now either hope that our classifier finds these borders by itself, or we can help the classifier by creating a ternary categorical feature that indicates which region a sample belongs to.  

Bottom left region (low probability for target==1) -> -1  
Middle region (medium probability for target==1) -> 0  
Top right region (high probability for target==1) -> 1  

We can get the coordinates of the 2 borders which divides the region into 3 regions by plotting the distribution of f_02 + f_21 with hue as target variable.  

In [None]:
# Histogram of f_02 + f_21 split by target. The yellow lines sit at -5.3 and 5.2,
# the thresholds used to build i_f02_f21.
temp = train_df[["f_02", "f_21", "target"]].copy()
temp["f_02 + f_21"] = temp["f_02"] + temp["f_21"]

fig, ax = plt.subplots(figsize=(30,10))
sns.histplot(data=temp, x="f_02 + f_21", hue="target", bins=300, ax=ax)

line_style = dict(color='y', linestyle='-')
text_style = dict(fontsize=15, fontweight='bold')

# Region borders and their value annotations.
ax.axvline(x=-5.3, label="-5.3", **line_style)
ax.text(x=-6.1, y=5000, s="-5.3", color='y', **text_style)
ax.axvline(x=5.2, **line_style)
ax.text(x=5.4, y=5000, s="5.2", color='y', **text_style)

# Value of the engineered feature in each of the three regions.
for x_pos, caption in ((-10, "<equation> = -1"), (-1.5, "<equation> = 0"), (7, "<equation> = 1")):
    ax.text(x=x_pos, y=3500, s=caption, color='w', **text_style)

plt.suptitle("f_02 + f_21 distribution", fontsize=20, fontweight='bold')
plt.show()

In [None]:
# Ternary region feature: +1 above 5.2, -1 below -5.3, 0 in between.
f02_f21_sum = train_df["f_02"] + train_df["f_21"]
train_df["i_f02_f21"] = (f02_f21_sum > 5.2).astype(int) - (f02_f21_sum < -5.3).astype(int)

In [None]:
# Interaction analysis for the f_05 / f_22 pair.
f1, f2 = "f_05", "f_22"
plot_feature_interaction(f1, f2)

Here, plot 2, 5 and 6 are interesting.  
In plot 2, we can see that the main SHAP effect of f_22 increases monotonically with f_22. It drops sharply once f_22 falls below roughly -3.8 and rises sharply once f_22 exceeds roughly 3.8.  
In plot 5, we can see the same thing from a different angle (f_22 values vs. probability for state==1).  
In plot 6, we draw a simple scatter plot of f_05 against f_22 with target as hue. We can clearly see how these two features interact to affect the target probability.  

We can see that the projection onto f_05 and f_22 is partitioned into 3 regions, each one having a different target probability.

In the region labelled as 1 : the probability for target==1 is low  
In the region labelled as 2 : the probability for target==1 is medium  
In the region labelled as 3 : the probability for target==1 is high  

We can now either hope that our classifier finds these borders by itself, or we can help the classifier by creating a ternary categorical feature that indicates which region a sample belongs to.  

Bottom left region (low probability for target==1) -> -1  
Middle region (medium probability for target==1) -> 0  
Top right region (high probability for target==1) -> 1  

We can get the coordinates of the 2 borders which divides the region into 3 regions by plotting the distribution of f_05 + f_22 with hue as target variable.  


In [None]:
# Plain vs. target-coloured scatter plot for the f_05 / f_22 pair.
scatter_plot_f1_f2(f1="f_05", f2="f_22")

In [None]:
# Histogram of f_05 + f_22 split by target. The yellow lines mark the two
# thresholds used to build i_f05_f22.
lower_bound, upper_bound = -5.4, 5.1

temp = train_df[["f_05","f_22","target"]].copy()
temp["f_05 + f_22"] = temp["f_05"] + temp["f_22"]

fig = plt.figure(figsize=(30,10))
ax = fig.add_subplot()
sns.histplot(data=temp, x="f_05 + f_22", hue="target", bins=300, ax=ax)
# The line label is derived from the value: it was previously a stale
# copy-pasted "-5.3" although the line is drawn at -5.4.
ax.axvline(x=lower_bound, color='y', linestyle='-', label=str(lower_bound))
ax.text(x=-6.1,y=5000,s=str(lower_bound),fontsize=15,fontweight='bold',color='y')
ax.axvline(x=upper_bound, color='y', linestyle='-')
ax.text(x=5.4,y=5000,s=str(upper_bound),fontsize=15,fontweight='bold',color='y')

# Value of the engineered feature in each of the three regions.
ax.text(x=-10,y=3500,s="<equation> = -1",fontsize=15,fontweight='bold',color='w')
ax.text(x=-1.5,y=3500,s="<equation> = 0",fontsize=15,fontweight='bold',color='w')
ax.text(x=7,y=3500,s="<equation> = 1",fontsize=15,fontweight='bold',color='w')
plt.suptitle("f_05 + f_22 distribution",fontsize=20,fontweight='bold')
plt.show()

In [None]:
# Ternary region feature: +1 above 5.1, -1 below -5.4, 0 in between.
f05_f22_sum = train_df["f_05"] + train_df["f_22"]
train_df["i_f05_f22"] = (f05_f22_sum > 5.1).astype(int) - (f05_f22_sum < -5.4).astype(int)

In [None]:
# Interaction analysis for the f_00 / f_26 pair.
f1, f2 = 'f_00', 'f_26'
plot_feature_interaction(f1, f2)

The interaction plot shows some partitioning of the feature space into regions, but the borders of the regions are not very distinct because of the noise.

In [None]:
# Interaction analysis for the f_01 / f_26 pair.
f1, f2 = 'f_01', 'f_26'
plot_feature_interaction(f1, f2)

The interaction plot shows some partitioning of the feature space into regions, but the borders of the regions are not very distinct because of the noise. Since f_26 appeared alongside both f_00 and f_01, let's try plotting it against f_00 + f_01 (combined using the sum operation).

In [None]:
def scatter_plot_f1comb_f2(f1_comb_tuple, f2):
    """Plot the sum of two features against a third feature, twice, side by side.

    Parameters
    ----------
    f1_comb_tuple : sequence of str
        The first two entries are the ``train_df`` columns that are summed to
        form the x-axis.
    f2 : str
        The ``train_df`` column shown on the y-axis.

    Left panel ("Correlation"): the plain scatter plot.
    Right panel ("Interaction"): the same scatter plot coloured by ``target``.

    Depends only on its arguments and ``train_df``. An earlier version also
    read a variable named ``f1`` that is not a parameter, so it silently
    depended on whatever an earlier cell had left in the notebook namespace
    and raised NameError otherwise. That branch only placed region numbers
    meant for the plain f_02 / f_21 scatter plot, so it has been removed.
    """
    first, second = f1_comb_tuple[0], f1_comb_tuple[1]
    combined = f"{first} + {second}"

    temp = train_df[[first, second, f2, "target"]].copy()
    temp[combined] = temp[first] + temp[second]

    fig, axs = plt.subplots(1, 2, figsize=(24,12))
    sns.scatterplot(x=combined, y=f2, data=temp, ax=axs[0], s=2)
    axs[0].set_title('Correlation', fontsize=10)

    sns.scatterplot(x=combined, y=f2, data=temp, hue="target", ax=axs[1], s=2)
    axs[1].set_title('Interaction', fontsize=10)

    plt.suptitle("Correlation vs Interaction", fontsize=20, y=0.97)
    plt.show()

In [None]:
# Scatter plot of f_00 + f_01 against f_26, plain and coloured by target.
scatter_plot_f1comb_f2(f1_comb_tuple=("f_00", "f_01"), f2="f_26")

In [None]:
# Histogram of f_00 + f_01 + f_26 split by target. The yellow lines mark the two
# thresholds used to build i_f00_f01_f26.
lower_bound, upper_bound = -5.0, 5.0

temp = train_df[["f_00","f_01","f_26","target"]].copy()
temp["f_00 + f_01 + f_26"] = temp["f_00"] + temp["f_01"] + temp["f_26"]

fig = plt.figure(figsize=(30,10))
ax = fig.add_subplot()
sns.histplot(data=temp, x="f_00 + f_01 + f_26", hue="target", bins=300, ax=ax)
# The line label is derived from the value: it was previously a stale
# copy-pasted "-5.3" although the line is drawn at -5.0.
ax.axvline(x=lower_bound, color='y', linestyle='-', label=str(lower_bound))
ax.text(x=-6.1,y=5000,s=str(lower_bound),fontsize=15,fontweight='bold',color='y')
ax.axvline(x=upper_bound, color='y', linestyle='-')
ax.text(x=5.4,y=5000,s=str(upper_bound),fontsize=15,fontweight='bold',color='y')

# Value of the engineered feature in each of the three regions.
ax.text(x=-10,y=3500,s="<equation> = -1",fontsize=15,fontweight='bold',color='w')
ax.text(x=-1.5,y=3500,s="<equation> = 0",fontsize=15,fontweight='bold',color='w')
ax.text(x=7,y=3500,s="<equation> = 1",fontsize=15,fontweight='bold',color='w')
plt.suptitle("f_00 + f_01 + f_26 distribution",fontsize=20,fontweight='bold')
plt.show()

In [None]:
# Ternary region feature: +1 above 5.0, -1 below -5.0, 0 in between.
f00_f01_f26_sum = train_df["f_00"] + train_df["f_01"] + train_df["f_26"]
train_df["i_f00_f01_f26"] = (f00_f01_f26_sum > 5.0).astype(int) - (f00_f01_f26_sum < -5.0).astype(int)

Thus using the SHAP interaction analysis, we were able to engineer 3 additional features :  
1. i_f02_f21
2. i_f05_f22
3. i_f00_f01_f26

Credits:

[TPSMAY22 EDA which makes sense](https://www.kaggle.com/code/ambrosm/tpsmay22-eda-which-makes-sense/notebook)  

[Analysing Interactions with SHAP](https://www.kaggle.com/code/wti200/analysing-interactions-with-shap/notebook)