# Visualising Feature Interactions - Target

The outputs of the visualisations have been hidden to keep the notebook uncluttered - click **"Show hidden output"** to view all interactions with a given feature.

We are only viewing interactions by using the target value. We could have also used a trained model to visualise interactions.

We visualise three types of interactions in this notebook:
- Float-Float (continuous-continuous) interactions (e.g. interactions between f_00 and f_01)
- Int-Int (discrete-discrete) interactions (e.g. interactions between f_29 and f_30)
- Float-Int (continuous-discrete) interactions (e.g. interactions between f_00 and f_29)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

In [None]:
train_df = pd.read_csv("../input/tabular-playground-series-may-2022/train.csv")

In [None]:
def feature_engineer(df):
    """Return a copy of ``df`` with the ``f_27`` string expanded into numeric columns.

    Adds ``unique_characters`` (the number of distinct characters in ``f_27``)
    and ``f_27_0`` .. ``f_27_9`` (the offset of each of the first ten
    characters from ``"A"``), then drops the ``f_27`` and ``id`` columns.
    The input frame itself is not modified.
    """
    engineered = df.copy()
    codes = engineered["f_27"]
    base = ord("A")

    engineered["unique_characters"] = codes.apply(lambda text: len(set(text)))

    for position in range(10):
        letters = codes.str[position]
        engineered["f_27_" + str(position)] = letters.apply(lambda ch: ord(ch) - base)

    return engineered.drop(columns=["f_27", "id"])

In [None]:
%%time
train_df = feature_engineer(train_df)

In [None]:
# Split the columns by kind while they still carry the dtypes pandas inferred
# when loading the data. The dtype predicates are used instead of comparing
# with the strings "float"/"int": those strings resolve to the platform's
# default widths, so an int64 column would be missed where the default
# integer is 32-bit.
float_cols = [col for col in train_df.columns if pd.api.types.is_float_dtype(train_df[col])]
int_cols = [col for col in train_df.columns if pd.api.types.is_integer_dtype(train_df[col])]

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are converted to the smallest of int8/int16/int32/int64
    whose range contains the column's min and max; all other numeric columns
    are converted to float16, float32 or float64 by the same rule. Columns
    whose dtype is not one of the listed numeric dtypes are left untouched.

    Args:
        df: Frame to shrink. Its columns are reassigned in place.
        verbose: When true, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object, after downcasting.

    Note:
        float16 keeps only about three significant decimal digits, so float
        values are rounded by this step.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Bounds are inclusive: a column whose extreme value is exactly
            # the limit of a dtype (e.g. 127 for int8) still fits that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits neither narrower type
            # (this is also the outcome when min/max are NaN or infinite).
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [None]:
# Downcast the numeric columns, then keep a random subset of the rows.
train_df = reduce_memory_usage(train_df)
# Cap the sample size at the number of available rows: DataFrame.sample
# raises a ValueError when n exceeds the population and replace is False.
train_df = train_df.sample(min(700000, len(train_df)))

# Float (Continuous) Features

In [None]:
# Add a column of standard-normal noise, one draw per row, for use as a plot axis.
train_df["random"] = np.random.standard_normal(size=len(train_df))

In [None]:
def scatter_feature_random():
    """Scatter every float feature against the ``random`` column, coloured by ``target``.

    Reads the notebook-level ``float_cols`` and ``train_df`` and draws one
    panel per float feature on a 10x2 grid in pyplot figure number 1.
    """
    # Reusing figure number 1 and clearing it avoids running out of RAM when
    # many plotting cells are executed. plt.figure is used rather than
    # plt.subplots so that no full-figure Axes is created underneath the grid;
    # recent Matplotlib versions no longer remove such an overlapping Axes.
    fig = plt.figure(num=1, clear=True, figsize=(20, 80))
    for panel, feature in enumerate(float_cols, start=1):
        ax = fig.add_subplot(10, 2, panel)
        sns.scatterplot(data=train_df, x="random", y=feature, hue="target", s=2, ax=ax)
    fig.tight_layout()

In [None]:
scatter_feature_random()

# Float Float (Continuous-Continuous) Interactions

In [None]:
def scatter_feature(main_feature):
    """Scatter ``main_feature`` against every other float feature, coloured by ``target``.

    Reads the notebook-level ``float_cols`` and ``train_df``. Panels sit on a
    10x2 grid in pyplot figure number 1; the grid slot that belongs to
    ``main_feature`` itself is left empty.

    Args:
        main_feature: Name of the ``train_df`` column plotted on the x axis.
    """
    # Reusing figure number 1 and clearing it avoids running out of RAM when
    # many plotting cells are executed. plt.figure is used rather than
    # plt.subplots so that no full-figure Axes is created underneath the grid;
    # recent Matplotlib versions no longer remove such an overlapping Axes.
    fig = plt.figure(num=1, clear=True, figsize=(20, 80))
    for panel, feature in enumerate(float_cols, start=1):
        if feature == main_feature:
            continue
        ax = fig.add_subplot(10, 2, panel)
        sns.scatterplot(data=train_df, x=main_feature, y=feature, hue="target", s=2, ax=ax)
    fig.tight_layout()

## f_00

In [None]:
scatter_feature("f_00")

## f_01

In [None]:
scatter_feature("f_01")

## f_02

In [None]:
scatter_feature("f_02")

## f_03

In [None]:
scatter_feature("f_03")

## f_04

In [None]:
scatter_feature("f_04")

## f_05

In [None]:
scatter_feature("f_05")

## f_06

In [None]:
scatter_feature("f_06")

## f_19

In [None]:
scatter_feature("f_19")

## f_20

In [None]:
scatter_feature("f_20")

## f_21

In [None]:
scatter_feature("f_21")

## f_22

In [None]:
scatter_feature("f_22")

## f_23

In [None]:
scatter_feature("f_23")

## f_24

In [None]:
scatter_feature("f_24")

## f_25

In [None]:
scatter_feature("f_25")

## f_26

In [None]:
scatter_feature("f_26")

## f_28

In [None]:
scatter_feature("f_28")

# Float Float (Continuous-Continuous) Sum Interactions

In [None]:
def scatter_feature_3(main_feature):
    """Plot three-way sums ``main_feature + f1 + f2`` against ``random``, coloured by ``target``.

    ``f1`` and ``f2`` range over the float16 columns of the notebook-level
    ``train_df`` other than ``main_feature``. One figure is produced per
    ``f1`` (numbered by its position in that column list) holding one panel
    per ``f2`` on a 10x2 grid; each unordered pair is drawn once, including
    the pair of a feature with itself. Only the first 200,000 rows are plotted.

    The summed column is built on a temporary frame, so ``train_df`` is not
    modified by this function.

    Args:
        main_feature: Name of the ``train_df`` column added to every pair.
    """
    sum_col = "i_sum_f1_f2"
    # The scratch column name is excluded defensively in case it already
    # exists in train_df.
    candidates = [
        col
        for col in train_df.columns
        if train_df[col].dtype == "float16" and col not in (main_feature, sum_col)
    ]

    subset = train_df.iloc[:200000]
    base = subset[["random", "target"]]
    main_values = subset[main_feature]

    remaining = list(candidates)
    for n1, f1 in enumerate(candidates):
        # plt.figure rather than plt.subplots: no full-figure Axes should sit
        # underneath the grid of panels.
        fig = plt.figure(num=n1, clear=True, figsize=(20, 80))
        for n2, f2 in enumerate(remaining):
            plot_df = base.assign(**{sum_col: subset[f1] + subset[f2] + main_values})
            ax = fig.add_subplot(10, 2, n2 + 1)
            sns.scatterplot(data=plot_df, x="random", y=sum_col, hue="target", s=2, ax=ax)
            ax.set_ylabel(str(main_feature) + " + " + str(f1) + " + " + str(f2))
        fig.tight_layout()
        # Once f1 has been paired with everything left, drop it so that later
        # figures do not repeat the same pair in the other order.
        remaining.remove(f1)

## f_00

In [None]:
scatter_feature_3("f_00")

## f_01

In [None]:
scatter_feature_3("f_01")

## f_02

In [None]:
scatter_feature_3("f_02")

## f_03

In [None]:
scatter_feature_3("f_03")

## f_04

In [None]:
scatter_feature_3("f_04")

## f_05

In [None]:
scatter_feature_3("f_05")

## f_06

In [None]:
scatter_feature_3("f_06")

## f_19

In [None]:
scatter_feature_3("f_19")

## f_20

In [None]:
scatter_feature_3("f_20")

## f_21

In [None]:
scatter_feature_3("f_21")

## f_22

In [None]:
scatter_feature_3("f_22")

## f_23

In [None]:
scatter_feature_3("f_23")

## f_24

In [None]:
scatter_feature_3("f_24")

## f_25

In [None]:
scatter_feature_3("f_25")

## f_26

In [None]:
scatter_feature_3("f_26")

## f_28

In [None]:
scatter_feature_3("f_28")

# Int-Int (Discrete-Discrete) Interactions

In [None]:
# First we collapse feature values with low value counts, as their results
# would be biased and would clutter the graphs.

df_temp = train_df.copy()
for col in int_cols:
    if col == "target":
        continue

    temp_f1 = train_df[col].value_counts()  # number of rows per distinct value
    temp_vals = temp_f1[temp_f1 < 4000].index  # values seen in fewer than 4000 rows
    if len(temp_vals) == 0:
        continue  # no rare values in this column, nothing to collapse

    if col == "unique_characters":
        # Rare values of this column are blanked out rather than clamped.
        # The column is cast to float explicitly because an integer column
        # cannot hold NaN; relying on implicit upcasting during assignment
        # is deprecated in recent pandas.
        df_temp[col] = df_temp[col].astype("float64").mask(df_temp[col].isin(temp_vals))
    else:
        # In all other columns higher vals = less common, so every value at or
        # above the smallest rare value is clamped to that value.
        df_temp.loc[df_temp[col] >= temp_vals.min(), col] = temp_vals.min()

In [None]:
# Downcast the plotting copy again: assigning NaN to "unique_characters" in
# the previous cell can change that column's dtype.
df_temp = reduce_memory_usage(df_temp)

In [None]:
def target_plot_cat_cat(main_feature):
    """Bubble-plot the mean target for every value pair of ``main_feature`` and each other integer feature.

    For every column in the notebook-level ``int_cols`` (except ``"target"``,
    ``main_feature`` and ``"f_27_7"``) the rows of ``df_temp`` are grouped by
    the value pair; each pair becomes one marker whose colour is the mean of
    ``target`` and whose size is the number of rows. One facet is drawn per
    paired feature, two facets per row.

    Args:
        main_feature: Name of the ``df_temp`` column shown on the y axis.
    """
    plot_dfs = []
    for f in int_cols:
        if f in ("target", main_feature, "f_27_7"):
            continue
        pair_names = {main_feature: "f1", f: "f2"}
        mean_targets = (
            df_temp[[main_feature, f, "target"]]
            .groupby([main_feature, f])["target"]
            .mean()
            .rename("Mean target")
            .reset_index()
            .rename(columns=pair_names)
        )
        val_counts = (
            df_temp[[main_feature, f]]
            .value_counts()
            .rename("Count")
            .reset_index()
            .rename(columns=pair_names)
        )
        plot_df = pd.merge(val_counts, mean_targets, on=["f1", "f2"])
        plot_df["f1_name"] = main_feature
        plot_df["f2_name"] = f
        plot_dfs.append(plot_df)

    # ignore_index gives the combined frame unique row labels; without it every
    # per-feature frame contributes its own 0..n labels, and duplicated labels
    # are error-prone for any index-based alignment downstream.
    plot_mult_dfs = pd.concat(plot_dfs, ignore_index=True)

    g = sns.relplot(
        data=plot_mult_dfs, x="f2", y="f1", edgecolor="grey",
        hue="Mean target", size="Count", col="f2_name", col_wrap=2,
        hue_norm=(0.1, 0.9), sizes=(20, 2000), size_norm=(1000, 100000),
        palette=sns.color_palette("Spectral", as_cmap=True),
        height=8, legend="auto", aspect=1,
    )

    g.set_titles("Feature: {col_name}")
    y_ticks = plot_mult_dfs["f1"].unique()
    for facet_feature, ax in g.axes_dict.items():
        in_facet = plot_mult_dfs["f2_name"] == facet_feature
        ax.set_xlabel(facet_feature)
        ax.set_ylabel(main_feature)
        # Put a tick on every value that actually occurs in this facet.
        ax.set_xticks(plot_mult_dfs.loc[in_facet, "f2"].unique())
        ax.set_yticks(y_ticks)

    sns.move_legend(g, "upper left", bbox_to_anchor=(.70, .05))
    g.tight_layout()
    

## f_07

In [None]:
target_plot_cat_cat("f_07")

## f_08

In [None]:
target_plot_cat_cat("f_08")

## f_09

In [None]:
target_plot_cat_cat("f_09")

## f_10

In [None]:
target_plot_cat_cat("f_10")

## f_11

In [None]:
target_plot_cat_cat("f_11")

## f_12

In [None]:
target_plot_cat_cat("f_12")

## f_13

In [None]:
target_plot_cat_cat("f_13")

## f_14

In [None]:
target_plot_cat_cat("f_14")

## f_15

In [None]:
target_plot_cat_cat("f_15")

## f_16

In [None]:
target_plot_cat_cat("f_16")

## f_17

In [None]:
target_plot_cat_cat("f_17")

## f_18

In [None]:
target_plot_cat_cat("f_18")

## f_29

In [None]:
target_plot_cat_cat("f_29")

## f_30

In [None]:
target_plot_cat_cat("f_30")

## f_27_0

In [None]:
target_plot_cat_cat("f_27_0")

## f_27_1

In [None]:
target_plot_cat_cat("f_27_1")

## f_27_2

In [None]:
target_plot_cat_cat("f_27_2")

## f_27_3

In [None]:
target_plot_cat_cat("f_27_3")

## f_27_4

In [None]:
target_plot_cat_cat("f_27_4")

## f_27_5

In [None]:
target_plot_cat_cat("f_27_5")

## f_27_6

In [None]:
target_plot_cat_cat("f_27_6")

## f_27_7

In [None]:
target_plot_cat_cat("f_27_7")

## f_27_8

In [None]:
target_plot_cat_cat("f_27_8")

## f_27_9

In [None]:
target_plot_cat_cat("f_27_9")

# Float-Integer (continuous-discrete) Interactions

In [None]:
def int_float_scatter(main_feature):
    """Strip-plot ``main_feature`` against every feature of the other kind, coloured by ``target``.

    If ``main_feature`` is an integer column of ``train_df`` it is plotted
    against each column in ``float_cols``; if it is a float column it is
    plotted against each column in ``int_cols``. The first 50,000 rows of
    ``df_temp`` are drawn on a 20x2 grid in a new figure.

    Args:
        main_feature: Name of the column to compare with the others.
    """
    # plt.figure rather than plt.subplots: no full-figure Axes should sit
    # underneath the grid of panels.
    fig = plt.figure(figsize=(20, 300))
    sample = df_temp[0:50000]
    main_dtype = train_df[main_feature].dtype

    # The dtype predicates accept any integer/float width. Comparing the dtype
    # with the builtins ``int``/``float`` only matches the default widths, so
    # columns downcast to e.g. int8 or float16 matched neither branch and
    # nothing was drawn.
    if pd.api.types.is_integer_dtype(main_dtype):
        for n, feature in enumerate(float_cols):
            if feature != main_feature:
                ax = fig.add_subplot(20, 2, n + 1)
                sns.stripplot(data=sample, y=main_feature, x=feature, orient="h",
                              hue="target", jitter=1, dodge=False, s=1, ax=ax)
    elif pd.api.types.is_float_dtype(main_dtype):
        for n, feature in enumerate(int_cols):
            if feature != main_feature:
                ax = fig.add_subplot(20, 2, n + 1)
                sns.stripplot(data=sample, x=main_feature, y=feature, orient="h",
                              hue="target", jitter=1, dodge=False, s=1, ax=ax)

In [None]:
def target_plot_float_cat(main_feature):
    """Bubble-plot the mean target for bins of ``main_feature`` against each integer feature.

    ``train_df[main_feature]`` is cut into 10 equal-width bins. For every
    column in the notebook-level ``int_cols`` (except ``"target"``,
    ``main_feature`` and ``"f_27_7"``) the rows are grouped by
    (bin, value of that column taken from ``df_temp``); each observed group
    becomes one marker whose colour is the mean of ``target`` and whose size
    is the number of rows. One facet is drawn per integer feature, two facets
    per row. Markers are placed at the lower bound of their bin and the y
    ticks are labelled with the bin intervals.

    The bins are kept in a local frame, so ``df_temp`` is not modified.

    Args:
        main_feature: Name of the float column of ``train_df`` to bin.
    """
    f_discrete = pd.cut(train_df[main_feature], bins=10)

    plot_dfs = []
    for f in int_cols:
        if f in ("target", main_feature, "f_27_7"):
            continue
        pair = pd.DataFrame(
            {"f1": f_discrete, "f2": df_temp[f], "target": df_temp["target"]}
        )
        # observed=True keeps only the (bin, value) combinations that occur.
        plot_df = (
            pair.groupby(["f1", "f2"], observed=True)["target"]
            .agg(["size", "mean"])
            .rename(columns={"size": "Count", "mean": "Mean target"})
            .reset_index()
        )
        plot_df["f1_name"] = main_feature
        plot_df["f2_name"] = f
        # Read the lower bound from the Interval itself instead of parsing its
        # string form, which breaks for labels of unexpected width.
        plot_df["f1_lower_bound"] = (
            plot_df["f1"].map(lambda interval: interval.left).astype(float)
        )
        plot_df = plot_df.sort_values("f1")
        plot_dfs.append(plot_df)
    plot_mult_dfs = pd.concat(plot_dfs, ignore_index=True)

    g = sns.relplot(
        data=plot_mult_dfs, x="f2", y="f1_lower_bound", edgecolor="grey",
        hue="Mean target", size="Count", col="f2_name", col_wrap=2,
        hue_norm=(0.1, 0.9), sizes=(20, 2000), size_norm=(1000, 100000),
        palette=sns.color_palette("Spectral", as_cmap=True),
        height=8, legend="auto", aspect=1,
    )

    # Tick positions and their labels are taken from the same de-duplicated,
    # sorted table so that they always have equal length and matching order.
    y_ticks = (
        plot_mult_dfs[["f1_lower_bound", "f1"]]
        .drop_duplicates(subset="f1_lower_bound")
        .sort_values("f1_lower_bound")
    )
    y_positions = y_ticks["f1_lower_bound"].tolist()
    y_labels = y_ticks["f1"].astype(str).tolist()

    g.set_titles("Feature: {col_name}")
    for facet_feature, ax in g.axes_dict.items():
        in_facet = plot_mult_dfs["f2_name"] == facet_feature
        ax.set_xlabel(facet_feature)
        ax.set_ylabel(main_feature)
        ax.set_xticks(plot_mult_dfs.loc[in_facet, "f2"].unique())
        ax.set_yticks(y_positions)
        ax.set_yticklabels(y_labels)

    sns.move_legend(g, "upper left", bbox_to_anchor=(.75, .48))
    

## f_00

In [None]:
target_plot_float_cat("f_00")

## f_01

In [None]:
target_plot_float_cat("f_01")

## f_02

In [None]:
target_plot_float_cat("f_02")

## f_03

In [None]:
target_plot_float_cat("f_03")

## f_04

In [None]:
target_plot_float_cat("f_04")

## f_05

In [None]:
target_plot_float_cat("f_05")

## f_06

In [None]:
target_plot_float_cat("f_06")

## f_19

In [None]:
target_plot_float_cat("f_19")

## f_20

In [None]:
target_plot_float_cat("f_20")

## f_21

In [None]:
target_plot_float_cat("f_21")

## f_22

In [None]:
target_plot_float_cat("f_22")

## f_23

In [None]:
target_plot_float_cat("f_23")

## f_24

In [None]:
target_plot_float_cat("f_24")

## f_25

In [None]:
target_plot_float_cat("f_25")

## f_26

In [None]:
target_plot_float_cat("f_26")

## f_28

In [None]:
target_plot_float_cat("f_28")