# Embedding Orthogonalization


In [1]:
import torch
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import statsmodels.api as sm
from patsy import dmatrices

DATA_DIR = '../data'
EMBEDDING_FILE = 'cfm/embeddings.npy'
META_FILE = 'mimic_meta.csv'

REDUCE_DIM = True
N_COMPONENTS = 111
TARGET_DISEASE = 'Pleural Effusion'

## Load Metadata and Embedding

In [2]:
from src.utils import get_mimic_meta_data

train_df, val_df, test_df = get_mimic_meta_data(os.path.join(DATA_DIR, META_FILE))
print(f'DATASET SIZES: TRAIN {len(train_df)} | VAL {len(val_df)} | TEST {len(test_df)}')

emb = np.load(os.path.join(DATA_DIR, EMBEDDING_FILE))
train_emb = emb[train_df['idx']]
test_emb = emb[test_df['idx']]

DATASET SIZES: TRAIN 181342 | VAL 1413 | TEST 3041


## Apply Dimensionality Reduction

In [3]:
from src.svd import PCA

if REDUCE_DIM:
    pca = PCA()
    pca.fit(train_emb)
    print(f'Explained variance: {pca.get_total_variance(N_COMPONENTS):.3f}')

    train_emb = pca.transform(train_emb, num_components=N_COMPONENTS)
    test_emb = pca.transform(test_emb, num_components=N_COMPONENTS)
else:
    print('No dimensionality reduction was applied.')

Explained variance: 0.865


## Create Response

In [4]:
# Follow U-zeros strategy for now
train_df['response'] = (train_df[TARGET_DISEASE] == 1).astype(int)
test_df['response'] = (test_df[TARGET_DISEASE] == 1).astype(int)

## Train Base Classifier

In [5]:
from src.utils import eval_predictions
from src.classifier import get_classifier

def eval_classifier(x_train, train_df, x_test, test_df):
    model = get_classifier('log_reg')
    model.fit(x_train, train_df['response'].tolist())
    
    train_df['preds'] = model.predict_proba(x_train)[:, 1]
    test_df['preds'] = model.predict_proba(x_test)[:, 1]
    
    print('-'*75 + '\nTRAINING')
    eval_predictions(train_df['response'], train_df['preds'])
    print('-'*75 + '\nTESTING')
    eval_predictions(test_df['response'], test_df['preds'])

In [6]:
eval_classifier(
    train_emb, train_df, test_emb, test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 15 epochs took 5 seconds
---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9024 | ACC 0.8468 | SENS 0.6444 | SPEC 0.9153 | PREC 0.7205 | F1 0.6804
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.8687 | ACC 0.8020 | SENS 0.6482 | SPEC 0.8765 | PREC 0.7176 | F1 0.6811


## Explain Predictions with Protected Features

In [7]:
formula = 'preds ~ 1 + age + sex + race'
mod = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod.summary())
print(sm.stats.anova_lm(mod))

                            OLS Regression Results                            
Dep. Variable:                  preds   R-squared:                       0.067
Model:                            OLS   Adj. R-squared:                  0.067
Method:                 Least Squares   F-statistic:                     3273.
Date:                Fri, 28 Jul 2023   Prob (F-statistic):               0.00
Time:                        11:06:22   Log-Likelihood:                -25002.
No. Observations:              181342   AIC:                         5.001e+04
Df Residuals:                  181337   BIC:                         5.006e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept         0.0251      0.004      5.961

## Remove Protected Features from Result Prediction

Residuals = part of predictions that can't be explained by protected features

In [8]:
train_df['preds_adjusted'] = mod.resid

# Check: p-vals should all be one
formula = 'preds_adjusted ~ 1 + age + sex + race'
mod_check = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod_check.summary())

                            OLS Regression Results                            
Dep. Variable:         preds_adjusted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 5.895e-12
Date:                Fri, 28 Jul 2023   Prob (F-statistic):               1.00
Time:                        11:06:24   Log-Likelihood:                -25002.
No. Observations:              181342   AIC:                         5.001e+04
Df Residuals:                  181337   BIC:                         5.006e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      1.939e-15      0.004    4.6e-13

## Adjust Embedding

In [9]:
# Create the design matrix
formula = '1 ~ age + sex + race'
_, x_train = dmatrices(formula, data=train_df)
_, x_test = dmatrices(formula, data=test_df)

print(x_train.shape)

(181342, 5)


In [10]:
from src.ortho import Orthogonalizator

ortho = Orthogonalizator()
train_emb_proj = ortho.fit_transform(x_train, train_emb)
test_emb_proj = ortho.transform(x_test, test_emb)

## Check classifier performance on adjusted embedding

In [11]:
eval_classifier(
    train_emb_proj, train_df, test_emb_proj, test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 16 epochs took 6 seconds
---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.8886 | ACC 0.8392 | SENS 0.6039 | SPEC 0.9189 | PREC 0.7162 | F1 0.6553
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.7322 | ACC 0.6932 | SENS 0.6109 | SPEC 0.7330 | PREC 0.5256 | F1 0.5650


In [12]:
formula = 'preds_adjusted ~ 1 + age + sex + race'
mod_fixed = sm.OLS.from_formula(formula, data=train_df).fit()
print(mod_fixed.summary())
print(sm.stats.anova_lm(mod_fixed))

                            OLS Regression Results                            
Dep. Variable:         preds_adjusted   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 5.895e-12
Date:                Fri, 28 Jul 2023   Prob (F-statistic):               1.00
Time:                        11:06:33   Log-Likelihood:                -25002.
No. Observations:              181342   AIC:                         5.001e+04
Df Residuals:                  181337   BIC:                         5.006e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      1.939e-15      0.004    4.6e-13

## Add Protected Features to classifier

In [13]:
train_mat = np.concatenate([train_emb_proj, x_train], axis=1)
test_mat = np.concatenate([test_emb_proj, x_test], axis=1)

eval_classifier(
    train_mat, train_df, test_mat, test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 68 epochs took 26 seconds
---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9026 | ACC 0.8471 | SENS 0.6458 | SPEC 0.9153 | PREC 0.7208 | F1 0.6812
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.6816 | ACC 0.6304 | SENS 0.6129 | SPEC 0.6388 | PREC 0.4510 | F1 0.5197


## Classify protected features

In [14]:
train_df['response'] = (train_df['sex'] == 'M').astype(int)
test_df['response'] = (test_df['sex'] == 'M').astype(int)

eval_classifier(
    train_emb, train_df, test_emb, test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


convergence after 13 epochs took 5 seconds
---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9650 | ACC 0.9006 | SENS 0.9105 | SPEC 0.8891 | PREC 0.9049 | F1 0.9077
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.9422 | ACC 0.8701 | SENS 0.8926 | SPEC 0.8431 | PREC 0.8721 | F1 0.8823


In [15]:
eval_classifier(
    train_emb_proj, train_df, test_emb_proj, test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 0.14980938
Epoch 3, change: 0.13148636
Epoch 4, change: 0.04471670
Epoch 5, change: 0.03168320
Epoch 6, change: 0.01859652
Epoch 7, change: 0.00963655
Epoch 8, change: 0.00912887
Epoch 9, change: 0.00370869
Epoch 10, change: 0.00124124
Epoch 11, change: 0.00073789
Epoch 12, change: 0.00065967
Epoch 13, change: 0.00025116
Epoch 14, change: 0.00012586
Epoch 1, change: 1.00000000
Epoch 2, change: 0.13441191
Epoch 3, change: 0.07905986
Epoch 4, change: 0.05699683
Epoch 5, change: 0.02951427
Epoch 6, change: 0.02556773
Epoch 7, change: 0.01472332
Epoch 8, change: 0.00824123
Epoch 9, change: 0.00560101
Epoch 10, change: 0.00351592
Epoch 11, change: 0.00215772
Epoch 12, change: 0.00136266
Epoch 13, change: 0.00028017
Epoch 14, change: 0.00026330
Epoch 15, change: 0.00020963
Epoch 1, change: 1.00000000
Epoch 2, change: 0.46749578
Epoch 3, change: 0.24439645
Epoch 4, change: 0.15432841
Epoch 5, change: 0.11217630
Epoch 6, change: 0.08694006
Epoch 7, 



METRICS:	AUC 0.3867 | ACC 0.5452 | SENS 1.0000 | SPEC 0.0000 | PREC 0.5452 | F1 0.7057


In [16]:
model = get_classifier('nn_nl')
model.fit(train_emb_proj, train_df['response'].tolist())

train_df['preds'] = model.predict_proba(train_emb_proj)[:, 1]
test_df['preds'] = model.predict_proba(test_emb_proj)[:, 1]

print('-'*75 + '\nTRAINING')
eval_predictions(train_df['response'], train_df['preds'])
print('-'*75 + '\nTESTING')
eval_predictions(test_df['response'], test_df['preds'])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

241015
Epoch 36, change: 1.87438446
Epoch 37, change: 1.55961051
Epoch 38, change: 3.49802575
Epoch 39, change: 1.83379888
Epoch 40, change: 1.68381943
Epoch 41, change: 1.73947155
Epoch 42, change: 1.57412141
Epoch 43, change: 1.02288832
Epoch 44, change: 6.81753734
Epoch 45, change: 2.87161605
Epoch 46, change: 1.68801175
Epoch 47, change: 1.83506868
Epoch 48, change: 2.37425467
Epoch 49, change: 1.43710467
Epoch 50, change: 1.86786911
Epoch 51, change: 2.80188482
Epoch 52, change: 2.06790894
Epoch 53, change: 1.17473754
Epoch 54, change: 1.03575915
Epoch 55, change: 0.91598523
Epoch 56, change: 0.46967634
Epoch 57, change: 0.35515399
Epoch 58, change: 0.14261441
Epoch 59, change: 0.08090452
Epoch 60, change: 0.04026404
Epoch 61, change: 0.11100092
Epoch 62, change: 0.07783642
Epoch 63, change: 0.01064036
Epoch 64, change: 0.00474440
Epoch 65, change: 0.00305893
Epoch 66, change: 0.00317668
Epoch 67, change: 0.00142810
Epoch 68, change: 0.00105598
Epoch 69, change: 0.00079734
Epoch 7

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.9945 | ACC 0.9639 | SENS 0.9642 | SPEC 0.9636 | PREC 0.9685 | F1 0.9663
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.0000 | ACC 0.0023 | SENS 0.0042 | SPEC 0.0000 | PREC 0.0050 | F1 0.0046


## Project adjusted embedding back to original data space and classify again

In [17]:
eval_classifier(
    pca.inverse_transform(train_emb_proj), 
    train_df, 
    pca.inverse_transform(test_emb_proj),
    test_df
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


Epoch 1, change: 1.00000000
Epoch 2, change: 1.91544063
Epoch 3, change: 1.46506476
Epoch 4, change: 1.43527983
Epoch 5, change: 1.34011513
Epoch 6, change: 0.75676905
Epoch 7, change: 0.49818115
Epoch 8, change: 0.29325784
Epoch 9, change: 0.13349679
Epoch 10, change: 0.11648425
Epoch 11, change: 0.07984129
Epoch 12, change: 0.04955771
Epoch 13, change: 0.03064817
Epoch 14, change: 0.01649530
Epoch 15, change: 0.01167858
Epoch 16, change: 0.01906994
Epoch 17, change: 0.01844331
Epoch 18, change: 0.00328936
Epoch 19, change: 0.00114379
Epoch 20, change: 0.00104578
Epoch 21, change: 0.00062626
Epoch 22, change: 0.00029460
Epoch 23, change: 0.00027166
Epoch 24, change: 0.00011502
convergence after 25 epochs took 93 seconds
---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.5014 | ACC 0.5369 | SENS 1.0000 | SPEC 0.0000 | PREC 0.5369 | F1 0.6987
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 