# Model Interpretability - Identifying & Quantifying variable interactions

Explaining models through deeper study of variables and interactions

* Using seaborn's pointplots and countplots, two-way and three-way plots can be constructed to understand variable interactions. Numerical variables are handled by binning and then plotting
* Correlation Network Graphs to understand similarity between variables and groups of variables

## Prepare Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

def PrepareData(path='/kaggle/input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv'):
    """Load the credit-card default CSV and return the modelling columns.

    Args:
        path: Location of the CSV, passed straight to ``pd.read_csv``.
            Defaults to the Kaggle input path, so ``PrepareData()`` behaves
            as before; pass another path to run outside Kaggle.

    Returns:
        A new DataFrame with the 23 feature columns followed by the target
        column ``def_pay``. Any other columns in the file are dropped.

    Raises:
        KeyError: if any expected column is missing from the file.
    """
    raw = pd.read_csv(path)

    # Shorten the target name, and rename PAY_0 so the repayment-status
    # columns run PAY_1..PAY_6 without a gap.
    raw = raw.rename(columns={'default.payment.next.month': 'def_pay',
                              'PAY_0': 'PAY_1'})

    columns = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'PAY_2',
               'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
               'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
               'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'def_pay']

    # Copy so later column assignments on the result never touch `raw`.
    return raw[columns].copy()

# Build the working frame and preview its first five rows.
df = PrepareData()
df.head(n=5)

## Plot two-way interactions
A two-way interaction describes how a change in the first variable (the moderator) alters the effect that a change in the second variable has on the outcome.

In [None]:
def getTwoVarInteraction(df, col1, col2, dep_var, bins = 3, cardinality=10):
    """Plot how the mean of `dep_var` varies across `col1`, split by `col2`.

    Draws two side-by-side panels: a point plot of `dep_var` against `col1`
    with one line per `col2` level, and a count plot showing how many rows
    fall in each `col1` / `col2` combination.

    Args:
        df: DataFrame containing `col1`, `col2` and `dep_var`. Not modified.
        col1: Column shown on the x-axis.
        col2: Column used as the hue (line / bar colour).
        dep_var: Column whose values the point plot aggregates on the y-axis.
        bins: Number of quantile buckets used when a column is binned.
        cardinality: A column with more distinct values than this is treated
            as numeric and binned; otherwise it is plotted as-is.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Plot from a copy so the temporary "<col>_cut" columns never leak into
    # the caller's DataFrame.
    data = df.copy()

    def plot_column(col):
        """Return the column name to plot, binning `col` first if needed."""
        if len(data[col].unique()) > cardinality:
            # duplicates='drop' merges repeated quantile edges (heavily tied
            # data) instead of raising "Bin edges must be unique".
            data[col + '_cut'] = pd.qcut(data[col], bins, duplicates='drop')
            return col + '_cut'
        return col

    # Each column is binned independently of the other one's cardinality.
    x_col = plot_column(col1)
    hue_col = plot_column(col2)

    f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))
    sns.pointplot(x=x_col, y=dep_var, hue=hue_col, data=data,
                  dodge=False, ax=ax1, capsize=0.05)
    ax1.set_title(f'Interaction between {col1} and {col2}')
    sns.countplot(x=x_col, hue=hue_col, data=data, ax=ax2)

    plt.show()

In [None]:
# Default rate across credit limit, split by sex, using 3 quantile bins.
getTwoVarInteraction(df, col1='LIMIT_BAL', col2='SEX', dep_var='def_pay', bins=3)

In [None]:
# Default rate across age, split by sex, using 3 quantile bins.
getTwoVarInteraction(df, col1='AGE', col2='SEX', dep_var='def_pay', bins=3)

## Three Variable Interactions
These are two-way interactions plotted as small multiples faceted by the third variable

In [None]:
def getThreeVarInteraction(df, col1, col2, col3, dep_var, bins = 3, cardinality=10):
    """Plot the `col1` x `col2` interaction on `dep_var`, faceted by `col3`.

    Produces two seaborn ``catplot`` grids with one panel per `col3` level:
    first a point plot of `dep_var` against `col1` with `col2` as hue, then
    a count plot of the same `col1` / `col2` combinations.

    Args:
        df: DataFrame containing the three columns and `dep_var`. Not modified.
        col1: Column shown on the x-axis of every panel.
        col2: Column used as the hue.
        col3: Column whose levels define the panels.
        dep_var: Column whose values the point plot aggregates on the y-axis.
        bins: Number of quantile buckets used when a column is binned.
        cardinality: A column with more distinct values than this is treated
            as numeric and binned; otherwise it is plotted as-is.

    Returns:
        None. Both figures are displayed with ``plt.show()``.
    """
    # Plot from a copy so the temporary "<col>_cut" columns never leak into
    # the caller's DataFrame.
    data = df.copy()

    plot_cols = []
    for col in (col1, col2, col3):
        if len(data[col].unique()) > cardinality:
            # duplicates='drop' merges repeated quantile edges (heavily tied
            # data) instead of raising "Bin edges must be unique".
            data[col + '_cut'] = pd.qcut(data[col], bins, duplicates='drop')
            plot_cols.append(col + '_cut')
        else:
            plot_cols.append(col)
    x_col, hue_col, facet_col = plot_cols

    # Mean of dep_var per (x, hue) cell, one panel per facet level.
    sns.catplot(x=x_col, y=dep_var, hue=hue_col, col=facet_col,
                data=data, kind="point", dodge=True)
    plt.show()

    # Row counts for the same cells, to show how much data backs each point.
    sns.catplot(x=x_col, hue=hue_col, col=facet_col,
                data=data, kind="count", dodge=True)
    plt.show()

In [None]:
# LIMIT_BAL x SEX interaction on default, one panel per BILL_AMT5 level.
getThreeVarInteraction(
    df,
    col1='LIMIT_BAL',
    col2='SEX',
    col3='BILL_AMT5',
    dep_var='def_pay',
    bins=3,
    cardinality=10,
)

In [None]:
# LIMIT_BAL x SEX interaction on default, one panel per EDUCATION level.
getThreeVarInteraction(
    df,
    col1='LIMIT_BAL',
    col2='SEX',
    col3='EDUCATION',
    dep_var='def_pay',
    bins=3,
    cardinality=10,
)

## Correlation Network Graphs
A correlation network graph is a 2D representation of the relationships (correlations) in a dataset. It allows us to see groups of correlated variables, identify irrelevant variables, and discover or verify important, complex relationships that machine learning models should incorporate, all in two dimensions.

The graph shown below is generated on the [Default of Credit Card Clients Dataset](https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset). Categorical variables are first target-encoded: each category is replaced with the mean of the dependent variable, `default.payment.next.month`, for that category.

In [None]:
# networkx is required for the graph below and is not imported elsewhere
# in this notebook.
import networkx as nx

df = pd.read_csv('../input/default-of-credit-card-clients-dataset/UCI_Credit_Card.csv')
cat_vars = ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
num_vars = ['BILL_AMT1', 'BILL_AMT2','BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1','PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
dep_var = 'default.payment.next.month'

# Target-encode each categorical column: replace every level with the mean
# of the dependent variable over the rows having that level.
for var in cat_vars:
    means = df[[var, dep_var]].groupby(var).mean()[dep_var]
    df[var + '_enc'] = df[var].map(means)

# Pairwise correlations, reshaped to one row per (var1, var2) pair.
corr = df[num_vars + [x + '_enc' for x in cat_vars]].corr()
links = corr.stack().reset_index()
links.columns = ['var1', 'var2', 'value']

# Keep only correlations above 0.2 and drop each variable's link to itself.
links_filtered = links.loc[(links['value'] > 0.2) & (links['var1'] != links['var2'])]

G = nx.from_pandas_edgelist(links_filtered, 'var1', 'var2', edge_attr=["value"])

# links_filtered lists every pair in both directions, while the undirected
# graph stores each edge once. Read the widths from the graph so each width
# lines up with the edge it belongs to.
edge_widths = [value * 3 for _, _, value in G.edges(data='value')]

plt.figure(figsize=(15, 8))
nx.draw(G, with_labels=True, node_color='orange', node_size=1000, edge_color='gray', width=edge_widths, font_size=8)
plt.show()

We see that there are two groups of variables that are similar to one another. 