In [1]:
import sys
sys.path.append('nam')

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
from itertools import combinations

In [4]:
import torch
import numpy as np
import pandas as pd
import sklearn.metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import random_split

from nam.wrapper import NAMClassifier, MultiTaskNAMClassifier

  from tqdm.autonotebook import tqdm


In [5]:
random_state = 2016

In [6]:
import data_utils
data_x, data_y, columns = data_utils.load_dataset("Recidivism")
original_dfs = data_utils.load_recidivism_data()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=random_state)

In [8]:
device="cuda:7"

In [9]:
%%time
model = NAMClassifier(
    num_epochs=1000,
    num_learners=20,
    metric='auroc',
    early_stop_mode='max',
    monitor_loss=False,
    n_jobs=10,
    random_state=random_state,
    device=device
)

model.fit(X_train, y_train)

CPU times: user 3.04 s, sys: 1.57 s, total: 4.61 s
Wall time: 1min 15s


<nam.wrapper.wrapper.NAMClassifier at 0x7fdb39b93e20>

# regression trained on the logits

let's train a regression model to predict the predicted logits

full model first, to test how well it reproduces the original

In [10]:
y_train_regression = super(NAMClassifier, model).predict(X_train)
y_test_regression = super(NAMClassifier, model).predict(X_test)

In [11]:
from nam.wrapper import NAMRegressor

In [12]:
%%time
model = NAMRegressor(
    num_epochs=1000,
    num_learners=20,
    monitor_loss=False,
    n_jobs=10,
    output_reg=0,
    metric="mse",
    random_state=random_state,
    device=device
)

model.fit(X_train, y_train_regression)

CPU times: user 10.8 s, sys: 828 ms, total: 11.6 s
Wall time: 1min 23s


<nam.wrapper.wrapper.NAMRegressor at 0x7fdb3a287370>

In [13]:
pred = model.predict(X_test)
sk_metrics.roc_auc_score(y_test, pred)

0.7332789080800158

In [14]:
def var(x):
    return np.mean((x-x.mean())**2)

In [15]:
var(y_train_regression-model.predict(X_train))

0.01273581179877551

In [16]:
var(y_test_regression-pred)

0.011861964422434536

In [17]:
# very closely reproduces the original!

# coalition models trained to predict logits

In [18]:
%%time
single_models = []
for i in range(len(columns)):
    X_train_f = X_train[:, i:i+1]
    X_test_f = X_test[:, i:i+1]
    model = NAMRegressor(
        num_epochs=1000,
        num_learners=20,
        metric="mse",
        monitor_loss=False,
        n_jobs=10,
        output_reg=0,
        random_state=random_state,
        device=device
    )

    model.fit(X_train_f, y_train_regression)
    pred = model.predict(X_test_f)
    print(columns[i], var(y_test_regression-pred), sk_metrics.roc_auc_score(y_test, pred))
    single_models.append(model)

age 0.17107915620188596 0.6100990859758927
race 0.16669399068332816 0.5578775218367988
sex 0.19120595533944848 0.5455197878482283
priors_count 0.08042203168255756 0.681833704172069
length_of_stay 0.19391522325335458 0.585428972376516
c_charge_degree 0.16825890326150997 0.565035509809885
CPU times: user 1min 4s, sys: 4.71 s, total: 1min 8s
Wall time: 3min 56s


In [19]:
%%time
double_models = {}
for i,j in combinations(range(len(columns)), 2):
    X_train_f = X_train[:, [i,j]]
    X_test_f = X_test[:, [i,j]]
    model = NAMRegressor(
        num_epochs=1000,
        num_learners=20,
        metric="mse",
        monitor_loss=False,
        n_jobs=10,
        output_reg=0,
        random_state=random_state,
        device=device
    )

    model.fit(X_train_f, y_train_regression)
    pred = model.predict(X_test_f)
    print(columns[i], columns[j], var(y_test_regression-pred), sk_metrics.roc_auc_score(y_test, pred))
    double_models[(i,j)] = model

age race 0.14230508495867533 0.6083089220064777
age sex 0.15688721525120833 0.6302524371308287
age priors_count 0.03125010166825436 0.7349970386259224
age length_of_stay 0.15934563912709646 0.6458036528949432
age c_charge_degree 0.14061561311922205 0.6333071878685044
race sex 0.15327305369880792 0.5838895914370934
race priors_count 0.06690797176675162 0.6758495942117142
race length_of_stay 0.1601576499481214 0.5794822130802026
race c_charge_degree 0.1385886563860222 0.597864075597744
sex priors_count 0.0772788781758321 0.6883220480969836
sex length_of_stay 0.18323616105724502 0.5945638776392246
sex c_charge_degree 0.1587809327309068 0.5948386718103866
priors_count length_of_stay 0.07963690535277586 0.6813081269709145
priors_count c_charge_degree 0.06850494791311168 0.6861930602466211
length_of_stay c_charge_degree 0.16349815711099525 0.6072844466499122
CPU times: user 2min 59s, sys: 14.8 s, total: 3min 14s
Wall time: 12min 20s


In [20]:
full_var = var(y_train_regression)
full_var

0.21310962966641314

In [21]:
full_logit = y_test_regression
single_logits = [m.predict(X_test[:,[i]]) for i, m in enumerate(single_models)]
double_logits = {k: m.predict(X_test[:, k]) for k, m in double_models.items()}

In [22]:
r2_single = []
for i,sl in enumerate(single_logits):
    print(columns[i], 1-var(sl-full_logit)/full_var)
    r2_single.append(1-var(sl-full_logit)/full_var)

age 0.19941546778804065
race 0.21818799226446683
sex 0.10056671854559929
priors_count 0.6310137063310823
length_of_stay 0.09084749958978666
c_charge_degree 0.20889018056609499


In [23]:
r2_double = {}
for i,j in double_logits.keys():
    print(columns[i], columns[j], 1-var(double_logits[(i,j)]-full_logit)/full_var)
    r2_double[(i,j)] = (1-var(double_logits[(i,j)]-full_logit)/full_var)
    r2_double[(j,i)] = (1-var(double_logits[(i,j)]-full_logit)/full_var)

age race 0.3314025839475585
age sex 0.2720364133336858
age priors_count 0.8503636422544762
age length_of_stay 0.25250392744986727
age c_charge_degree 0.33236192447146007
race sex 0.27694926120562435
race priors_count 0.6849630416490041
race length_of_stay 0.24866709912570928
race c_charge_degree 0.3476435490316616
sex priors_count 0.6400827923550148
sex length_of_stay 0.1387248750855068
sex c_charge_degree 0.2613899460077538
priors_count length_of_stay 0.6288888457355589
priors_count c_charge_degree 0.6786179101305426
length_of_stay c_charge_degree 0.22737137230683047


In [24]:
print("", *columns, sep="|", end="|\n")
print("", *(["---"]*len(columns)), sep="|", end="|\n")
print("", *["%.3f"%v for v in r2_single], sep="|", end="|\n")

|age|race|sex|priors_count|length_of_stay|c_charge_degree|
|---|---|---|---|---|---|
|0.199|0.218|0.101|0.631|0.091|0.209|


In [25]:
print("", "", *columns, sep="|", end="|\n")
print("", *(["---"]*(len(columns)+1)), sep="|", end="|\n")
for i in range(len(columns)):
    print("", columns[i], sep="|", end="|")
    print(*["%.3f"%r2_double[(i,j)] if i!= j else "" for j in range(len(columns))], sep="|", end="|\n")

||age|race|sex|priors_count|length_of_stay|c_charge_degree|
|---|---|---|---|---|---|---|
|age||0.331|0.272|0.850|0.253|0.332|
|race|0.331||0.277|0.685|0.249|0.348|
|sex|0.272|0.277||0.640|0.139|0.261|
|priors_count|0.850|0.685|0.640||0.629|0.679|
|length_of_stay|0.253|0.249|0.139|0.629||0.227|
|c_charge_degree|0.332|0.348|0.261|0.679|0.227||


In [26]:
print("", "", *columns, sep="|", end="|\n")
print("", *(["---"]*(len(columns)+1)), sep="|", end="|\n")
for i in range(len(columns)):
    print("", columns[i], sep="|", end="|")
    print(*["%.3f"%((r2_double[(i,j)]-r2_single[i]-r2_single[j])) if i!= j else "" for j in range(len(columns))], sep="|", end="|\n")

||age|race|sex|priors_count|length_of_stay|c_charge_degree|
|---|---|---|---|---|---|---|
|age||-0.086|-0.028|0.020|-0.038|-0.076|
|race|-0.086||-0.042|-0.164|-0.060|-0.079|
|sex|-0.028|-0.042||-0.091|-0.053|-0.048|
|priors_count|0.020|-0.164|-0.091||-0.093|-0.161|
|length_of_stay|-0.038|-0.060|-0.053|-0.093||-0.072|
|c_charge_degree|-0.076|-0.079|-0.048|-0.161|-0.072||


In [27]:
# no positive interaction except for priors_count x age

# priors count x age investigation

In [28]:
columns[0], columns[3]

('age', 'priors_count')

In [29]:
X0=np.zeros_like(X_test)
X0[:, 0] = X_test[:, 0]
X3=np.zeros_like(X_test)
X3[:, 3] = X_test[:, 3]

In [30]:
np.cov(double_models[(0,3)].predict(X0), double_models[(0,3)].predict(X3))

array([[0.03546889, 0.00038584],
       [0.00038584, 0.00771755]])

In [31]:
# possible example D situation!

In [32]:
var(y_test_regression-double_models[(0,3)].predict(X0))/full_var, var(y_test_regression-double_models[(0,3)].predict(X3))/full_var

(0.8540800595969168, 0.9987154707622062)

In [33]:
var(y_test_regression-single_logits[0])/full_var, var(y_test_regression-single_logits[3])/full_var

(0.8005845322119594, 0.3689862936689177)

In [38]:
# probably just randomness, will need a bit further investigation