# Embedding Orthogonalization


In [1]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
from src.eval import EmbeddingEvaluator, Disease


# Root directory holding the embedding arrays and the metadata files.
DATA_DIR = '../data'
# Dataset selector; must be one of the keys of _DATASET_FILES ('ch' selects the chex*/chexpert* files).
DATASET = 'ch'

# (embedding file, metadata file) for every supported dataset key.
_DATASET_FILES = {
    'mimic': ('mimic_chess.npy', 'mimic_meta.csv'),
    'ch': ('chex_chess.npy', 'chexpert_meta.csv'),
}
if DATASET not in _DATASET_FILES:
    # Fail loudly so that a typo cannot silently select another dataset's files.
    raise ValueError(
        f'Unknown DATASET {DATASET!r}; expected one of {sorted(_DATASET_FILES)}'
    )
EMBEDDING_FILE, META_FILE = _DATASET_FILES[DATASET]

# Forwarded to EmbeddingEvaluator as its `n_components` argument.
N_COMPONENTS = None
# Passed as `response` to evaluator.eval_classifier in the evaluation cells.
TARGET_DISEASE = Disease.PLEURAL_EFFUSION

## Load Metadata and Embedding

In [2]:
from src.utils import get_mimic_meta_data, get_chexpert_meta_data

# Load the train/val/test metadata splits with the loader matching DATASET.
# The MIMIC loader receives the metadata file path, the CheXpert loader the data directory.
if DATASET != 'mimic':
    train_df, val_df, test_df = get_chexpert_meta_data(DATA_DIR)
else:
    train_df, val_df, test_df = get_mimic_meta_data(os.path.join(DATA_DIR, META_FILE))
print(f'DATASET SIZES: TRAIN {len(train_df)} | VAL {len(val_df)} | TEST {len(test_df)}')

# Read the full embedding array and replace NaN/inf entries with finite numbers.
emb = np.nan_to_num(np.load(os.path.join(DATA_DIR, EMBEDDING_FILE)))
# Select the embedding rows referenced by each split's 'idx' column.
train_emb, test_emb = emb[train_df['idx']], emb[test_df['idx']]

DATASET SIZES: TRAIN 76205 | VAL 12673 | TEST 38240


In [3]:
# Build the evaluator from the train/test metadata frames and their embedding rows.
evaluator = EmbeddingEvaluator(
    train_df, test_df,
    train_emb, test_emb,
    n_components=N_COMPONENTS,
)

### Without Orthogonalization

In [4]:
# Baseline: evaluate the classifier for the target disease with orthogonalization off.
# NOTE(review): the saved output of this cell contains "Detected KeyboardInterrupt",
# so training appears to have been cut short here, whereas the orthogonalized run
# reports `max_epochs=3` reached. Re-run this cell to completion before comparing
# the two sets of metrics.
evaluator.eval_classifier(
    response=TARGET_DISEASE,
    ortho=False,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.7675 | ACC 0.7058 | SENS 0.5736 | SPEC 0.7966 | PREC 0.6593 | F1 0.6135
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.7717 | ACC 0.7096 | SENS 0.5810 | SPEC 0.7974 | PREC 0.6617 | F1 0.6187
---------------------------------------------------------------------------
                            OLS Regression Results                            
Dep. Variable:                 scores   R-squared:                       0.069
Model:                            OLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     1413.
Date:                Tue, 29 Aug 2023   Prob (F-statistic):               0.00
Time:                        11:01:30   Log-Likelihood:            -1.0743e+05
No. Observations:               76205   AIC:                         2.149e+05
Df Residuals:               

### With Orthogonalization

In [5]:
# Evaluate the classifier for the same target disease with orthogonalization on.
evaluator.eval_classifier(
    response=TARGET_DISEASE,
    ortho=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


---------------------------------------------------------------------------
TRAINING
METRICS:	AUC 0.7808 | ACC 0.7150 | SENS 0.5960 | SPEC 0.7966 | PREC 0.6679 | F1 0.6299
---------------------------------------------------------------------------
TESTING
METRICS:	AUC 0.7835 | ACC 0.7196 | SENS 0.6033 | SPEC 0.7990 | PREC 0.6720 | F1 0.6358
---------------------------------------------------------------------------
                            OLS Regression Results                            
Dep. Variable:                 scores   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                 2.912e-12
Date:                Tue, 29 Aug 2023   Prob (F-statistic):               1.00
Time:                        11:01:51   Log-Likelihood:            -1.1661e+05
No. Observations:               76205   AIC:                         2.332e+05
Df Residuals:               