In [15]:
import torch
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
from pytorch_lightning import seed_everything

from src.eval import Disease

# Directory (relative to this notebook) containing the embedding arrays and
# the metadata CSV files.
DATA_DIR = '../data'

# --- Active selection: MIMIC ---
META_FILE = 'mimic_meta.csv'
EMBEDDING_FILE = 'mimic_densenet_mimic.npy'

# Other MIMIC embedding files that can be assigned to EMBEDDING_FILE instead:
#   'mimic_cfm.npy'
#   'mimic_chess.npy'
#
# CheXpert selection (switch both names together):
#   EMBEDDING_FILE = 'chex_chess.npy'   or   'chex_densenet_chex.npy'
#   META_FILE = 'chexpert_meta.csv'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
from src.utils import get_mimic_meta_data, get_chexpert_meta_data

# Read the metadata table and split it into train / validation / test frames.
meta_path = os.path.join(DATA_DIR, META_FILE)
train_df, val_df, test_df = get_mimic_meta_data(meta_path)
# CheXpert variant of the line above:
#train_df, val_df, test_df = get_chexpert_meta_data(DATA_DIR)
print(f'DATASET SIZES: TRAIN {len(train_df)} | VAL {len(val_df)} | TEST {len(test_df)}')

# Load the full embedding matrix and replace NaN / infinite entries with
# finite numbers so they cannot propagate into the classifier.
emb_path = os.path.join(DATA_DIR, EMBEDDING_FILE)
emb = np.nan_to_num(np.load(emb_path))

# Each split's 'idx' column selects that split's rows of the embedding matrix.
train_emb = emb[train_df['idx']]
test_emb = emb[test_df['idx']]

DATASET SIZES: TRAIN 181342 | VAL 1413 | TEST 3041


In [17]:
from src.eval import EmbeddingEvaluator

# Bundle the train/test metadata frames with their matching embedding rows;
# the cells below read everything through this evaluator object.
evaluator = EmbeddingEvaluator(
    train_df,
    test_df,
    train_emb,
    test_emb,
)

In [18]:
# Total number of test rows, followed by the size of every race / sex subgroup.
print(len(test_df))
for label, column, value in (
    ('WHITE', 'race', 'WHITE'),
    ('BLACK', 'race', 'BLACK'),
    ('ASIAN', 'race', 'ASIAN'),
    ('MALE', 'sex', 'M'),
    ('FEMALE', 'sex', 'F'),
):
    print(label, len(test_df[test_df[column] == value]))

3041
WHITE 2235
BLACK 676
ASIAN 130
MALE 1658
FEMALE 1383


In [19]:
from src.utils import eval_predictions
from src.classifier import get_classifier


def _fit_and_score(evaluator, train_emb, test_emb, subsets, clf_name, clf_args) -> dict:
    """Train a fresh classifier on ``train_emb`` and score it on ``test_emb``.

    Args:
        evaluator: Object exposing ``train_df`` and ``test_df``; both must
            already carry the binary ``'response'`` column.
        train_emb: Training features handed to ``model.fit``.
        test_emb: Test features handed to ``model.predict_proba``.
        subsets: Mapping of subgroup name -> boolean mask over
            ``evaluator.test_df``.
        clf_name: Classifier name forwarded to ``get_classifier``.
        clf_args: Classifier arguments forwarded to ``get_classifier``.

    Returns:
        Dict mapping ``'ALL'`` and then every subgroup name (in ``subsets``
        order) to the ``'AUC'`` value reported by ``eval_predictions``.
    """
    model = get_classifier(clf_name, clf_args)
    model.fit(train_emb, evaluator.train_df['response'].tolist())

    # Column 1 of predict_proba is used as the score (presumably the positive
    # class -- confirm against the classifier). As before, the predictions are
    # stored on the evaluator's test frame.
    evaluator.test_df['preds'] = model.predict_proba(test_emb)[:, 1]

    scored = evaluator.test_df
    aucs = {
        'ALL': eval_predictions(scored['response'], scored['preds'], do_print=False)['AUC'],
    }
    for name, mask in subsets.items():
        subgroup = scored[mask]
        aucs[name] = eval_predictions(
            subgroup['response'],
            subgroup['preds'],
            do_print=False
        )['AUC']
    return aucs


def _append_rows(res, run_name, run_id, ortho, aucs) -> None:
    """Append one result row per entry of ``aucs`` to the column-lists in ``res``."""
    for subgroup, auc in aucs.items():
        res['id'].append(run_name)
        res['run'].append(run_id)
        res['subgroup'].append(subgroup)
        res['ortho'].append(ortho)
        res['auc'].append(auc)


def get_classifier_metrics(
        evaluator,
        run_name,
        response: Disease = Disease.PLEURAL_EFFUSION,
        clf_name: str = 'nn', clf_args = None,
        runs = 1,
) -> dict:
    """Compare classifier AUC on the raw vs. the ``*_ortho`` embeddings.

    For every run, one classifier is trained on ``evaluator.train_emb`` and a
    second one on ``evaluator.train_emb_ortho``. Each is scored on the
    matching test embedding for the whole test set and for the WHITE / BLACK /
    ASIAN / MALE / FEMALE subgroups.

    Side effects: writes a binary ``'response'`` column to
    ``evaluator.train_df`` and ``evaluator.test_df`` and a ``'preds'`` column
    to ``evaluator.test_df`` (overwritten by every fit), and prints a banner
    per run.

    Args:
        evaluator: Object exposing ``train_df``, ``test_df``, ``train_emb``,
            ``test_emb``, ``train_emb_ortho`` and ``test_emb_ortho``.
        run_name: Identifier stored in the ``'id'`` column of every row.
        response: Disease whose ``.value`` names the label column; labels
            equal to 1 are positives, every other value counts as negative.
        clf_name: Classifier name forwarded to ``get_classifier``.
        clf_args: Classifier arguments forwarded to ``get_classifier``.
        runs: Number of independent repetitions.

    Returns:
        Dict of equally long lists with keys ``'id'``, ``'run'``,
        ``'subgroup'``, ``'ortho'`` (0 = raw embedding, 1 = ortho embedding)
        and ``'auc'``. Per run the rows are: ALL + subgroups for the raw
        embedding, then ALL + subgroups for the ortho embedding.
    """
    evaluator.train_df['response'] = (evaluator.train_df[response.value] == 1).astype(int)
    evaluator.test_df['response'] = (evaluator.test_df[response.value] == 1).astype(int)

    # Build the masks from the frame they are applied to (the evaluator's own
    # test frame) rather than from a notebook-level global, so the boolean
    # index is guaranteed to align with the rows being selected.
    test_meta = evaluator.test_df
    subsets = {
        'WHITE': test_meta['race'] == 'WHITE',
        'BLACK': test_meta['race'] == 'BLACK',
        'ASIAN': test_meta['race'] == 'ASIAN',
        'MALE': test_meta['sex'] == 'M',
        'FEMALE': test_meta['sex'] == 'F',
    }

    res = {
        'id': [],
        'run': [],
        'subgroup': [],
        'ortho':  [],
        'auc': [],
    }

    for run_id in range(1, runs + 1):
        print('RUN', run_id, '-----------------')

        # The embeddings are read from the evaluator on every iteration.
        # Previously the local train/test variables were overwritten with the
        # ortho embeddings at the end of run 1, so from run 2 onwards the
        # "normal" classifier was silently trained on the ortho embeddings
        # while still being recorded as ortho=0.
        aucs_normal = _fit_and_score(
            evaluator, evaluator.train_emb, evaluator.test_emb,
            subsets, clf_name, clf_args,
        )
        _append_rows(res, run_name, run_id, 0, aucs_normal)

        aucs_ortho = _fit_and_score(
            evaluator, evaluator.train_emb_ortho, evaluator.test_emb_ortho,
            subsets, clf_name, clf_args,
        )
        _append_rows(res, run_name, run_id, 1, aucs_ortho)

    return res

In [None]:
# NOTE(review): seeding is left disabled, so results presumably differ
# between executions of this cell -- re-enable the line below to pin them.
#seed_everything(1337424242)

# Ten repetitions of the pleural-effusion comparison, ten epochs per classifier.
res = get_classifier_metrics(
    evaluator,
    run_name='MIMIC_CLF',
    response=Disease.PLEURAL_EFFUSION,
    clf_args={'max_epochs': 10},
    runs=10,
)

RUN 1 -----------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.


RUN 2 -----------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.


RUN 3 -----------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.


RUN 4 -----------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=10` reached.


RUN 5 -----------------


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Turn the column-lists collected above into a table.
df = pd.DataFrame(res)

# Append to the running results file. No header row is written and the frame's
# index is included as the first column, so every execution of this cell adds
# another batch of rows to 'subgroup.csv'.
df.to_csv(
    'subgroup.csv',
    mode='a',
    header=False,
)
print(df)