# Embedding Orthogonalization


In [1]:
import torch
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import statsmodels.api as sm
from patsy import dmatrices

DATA_DIR = '../data'
EMBEDDING_FILE = 'mimic_cfm.npy'
META_FILE = 'mimic_meta.csv'

REDUCE_DIM = True
N_COMPONENTS = 111
TARGET_DISEASE = 'Pleural Effusion'

## Load Metadata and Embedding

In [2]:
from src.utils import get_mimic_meta_data

train_df, val_df, test_df = get_mimic_meta_data(os.path.join(DATA_DIR, META_FILE))
print(f'DATASET SIZES: TRAIN {len(train_df)} | VAL {len(val_df)} | TEST {len(test_df)}')

emb = np.load(os.path.join(DATA_DIR, EMBEDDING_FILE))
train_emb = emb[train_df['idx']]
test_emb = emb[test_df['idx']]

DATASET SIZES: TRAIN 181342 | VAL 1413 | TEST 3041


## Apply Dimensionality Reduction

In [3]:
from src.svd import PCA

if REDUCE_DIM:
    pca = PCA()
    pca.fit(train_emb)
    print(f'Explained variance: {pca.get_total_variance(N_COMPONENTS):.3f}')

    train_emb = pca.transform(train_emb, num_components=N_COMPONENTS)
    test_emb = pca.transform(test_emb, num_components=N_COMPONENTS)
else:
    print('No dimensionality reduction was applied.')

Explained variance: 0.865


## Create Response

In [4]:
# Follow U-zeros strategy for now
train_df['response'] = (train_df[TARGET_DISEASE] == 1).astype(int)
test_df['response'] = (test_df[TARGET_DISEASE] == 1).astype(int)

## Train Base Classifier

In [5]:
from src.utils import eval_predictions
from src.classifier import get_classifier

def eval_classifier(x_train, train_df, x_test, test_df):
    model = get_classifier('nn')
    model.fit(x_train, train_df['response'].tolist())
    
    train_df['preds'] = model.predict_proba(x_train)[:, 1]
    test_df['preds'] = model.predict_proba(x_test)[:, 1]
    
    print('-'*75 + '\nTRAINING')
    eval_predictions(train_df['response'], train_df['preds'])
    print('-'*75 + '\nTESTING')
    eval_predictions(test_df['response'], test_df['preds'])

In [6]:
eval_classifier(
    train_emb, train_df, test_emb, test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9016 | ACC 0.8461 | SENS 0.6817 | SPEC 0.9017 | PREC 0.7014 | F1 0.6914
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.8691 | ACC 0.7988 | SENS 0.6865 | SPEC 0.8531 | PREC 0.6935 | F1 0.6900


## Explain Predictions with Protected Features

In [7]:
formula = 'preds ~ 1 + age + sex + race'
mod = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod.summary())
print(sm.stats.anova_lm(mod))

                            OLS Regression Results                            
Dep. Variable:                  preds   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     3290.
Date:                Mon, 31 Jul 2023   Prob (F-statistic):               0.00
Time:                        15:24:13   Log-Likelihood:                -23799.
No. Observations:              181342   AIC:                         4.761e+04
Df Residuals:                  181337   BIC:                         4.766e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0623      0.004     14.873

## Remove Protected Features from Result Prediction

Residuals = part of predictions that can't be explained by protected features

In [8]:
train_df['preds_adjusted'] = mod.resid

# Check: p-vals should all be one
formula = 'preds_adjusted ~ 1 + age + sex + race'
mod_check = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod_check.summary())

                            OLS Regression Results                            
Dep. Variable:         preds_adjusted   R-squared:                      -0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                -5.974e-12
Date:                Mon, 31 Jul 2023   Prob (F-statistic):               1.00
Time:                        15:24:14   Log-Likelihood:                -23799.
No. Observations:              181342   AIC:                         4.761e+04
Df Residuals:                  181337   BIC:                         4.766e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      2.969e-16      0.004   7.09e-14

## Adjust Embedding

In [9]:
# Create the design matrix
formula = '1 ~ age + sex + race'
_, x_train = dmatrices(formula, data=train_df)
_, x_test = dmatrices(formula, data=test_df)

print(x_train.shape)

(181342, 5)


In [10]:
from src.ortho import Orthogonalizator

ortho = Orthogonalizator()
train_emb_proj = ortho.fit_transform(x_train, train_emb)
test_emb_proj = ortho.transform(x_test, test_emb)

## Check classifier performance on adjusted embedding

In [18]:
eval_classifier(
    train_emb_proj, train_df, test_emb_proj, test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.5007 | ACC 0.5329 | SENS 0.9731 | SPEC 0.0224 | PREC 0.5358 | F1 0.6911
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.4696 | ACC 0.5465 | SENS 0.8655 | SPEC 0.1641 | PREC 0.5538 | F1 0.6755


In [19]:
formula = 'preds ~ 1 + age + sex + race'
mod_fixed = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod_fixed.summary())
print(sm.stats.anova_lm(mod_fixed))

                            OLS Regression Results                            
Dep. Variable:                  preds   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 0.0005923
Date:                Mon, 31 Jul 2023   Prob (F-statistic):               1.00
Time:                        15:55:59   Log-Likelihood:             4.6828e+05
No. Observations:              181342   AIC:                        -9.366e+05
Df Residuals:                  181337   BIC:                        -9.365e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.5373      0.000   1934.288

## Add Protected Features to classifier

In [13]:
train_mat = np.concatenate([train_emb_proj, x_train], axis=1)
test_mat = np.concatenate([test_emb_proj, x_test], axis=1)

eval_classifier(
    train_mat, train_df, test_mat, test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.8901 | ACC 0.8408 | SENS 0.6186 | SPEC 0.9161 | PREC 0.7140 | F1 0.6629
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.7405 | ACC 0.6968 | SENS 0.6179 | SPEC 0.7350 | PREC 0.5303 | F1 0.5708


## Classify protected features

In [14]:
train_df['response'] = (train_df['sex'] == 'M').astype(int)
test_df['response'] = (test_df['sex'] == 'M').astype(int)

eval_classifier(
    train_emb, train_df, test_emb, test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9648 | ACC 0.9003 | SENS 0.9100 | SPEC 0.8891 | PREC 0.9049 | F1 0.9074
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.9427 | ACC 0.8721 | SENS 0.8945 | SPEC 0.8453 | PREC 0.8739 | F1 0.8841


In [15]:
eval_classifier(
    train_emb_proj, train_df, test_emb_proj, test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.5005 | ACC 0.5301 | SENS 0.9404 | SPEC 0.0544 | PREC 0.5355 | F1 0.6824
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.5975 | ACC 0.5587 | SENS 0.9499 | SPEC 0.0897 | PREC 0.5558 | F1 0.7012


In [16]:
model = get_classifier('nn_nl')
model.fit(train_emb_proj, train_df['response'].tolist())

train_df['preds'] = model.predict_proba(train_emb_proj)[:, 1]
test_df['preds'] = model.predict_proba(test_emb_proj)[:, 1]

print('-'*75 + '\nTRAINING')
eval_predictions(train_df['response'], train_df['preds'])
print('-'*75 + '\nTESTING')
eval_predictions(test_df['response'], test_df['preds'])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9941 | ACC 0.9619 | SENS 0.9604 | SPEC 0.9637 | PREC 0.9684 | F1 0.9644
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.0000 | ACC 0.0026 | SENS 0.0048 | SPEC 0.0000 | PREC 0.0058 | F1 0.0052


## Project adjusted embedding back to original data space and classify again

In [17]:
eval_classifier(
    pca.inverse_transform(train_emb_proj), 
    train_df, 
    pca.inverse_transform(test_emb_proj),
    test_df
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.4977 | ACC 0.5369 | SENS 1.0000 | SPEC 0.0000 | PREC 0.5369 | F1 0.6987
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.6046 | ACC 0.5478 | SENS 0.9891 | SPEC 0.0188 | PREC 0.5472 | F1 0.7046
