In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to the project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from common import *

In [3]:
import argparse

# Declare the experiment's command-line options. `dest` is left to argparse, which
# derives it from the long option name ('--display' -> 'display', '--mock' -> 'mock').
parser = argparse.ArgumentParser(description='Run Diagnosis experiments')
parser.add_argument('--data_dir', required=True, type=str)
parser.add_argument('--display', action='store_true')
parser.add_argument('--output_dir', type=str)
parser.add_argument('--mock', action='store_true')

# The argument list is passed explicitly so parse_args does not read the kernel's own
# sys.argv: data is read from the current directory, results go under outputs/, and
# --display is switched on (--mock stays False).
notebook_argv = ['--data_dir=.', '--output_dir=outputs/', '--display']
args = parser.parse_args(notebook_argv)

In [4]:
from dataloaders import readmission_dataset
# Build the readmission dataset from the parsed notebook arguments. Per the log output
# recorded below, this reads the structured data, reads the notes, and then stratifies.
data = readmission_dataset(args)

INFO - 2019-03-10 12:44:16,899 - Reading Structured data ...
INFO - 2019-03-10 12:44:17,053 - Reading Notes ...
INFO - 2019-03-10 12:44:37,987 - Stratifying ...


In [5]:
# Flatten the vocabulary's index -> word mapping into a list addressed by index.
# The list is pre-sized to vocab_size; any index absent from idx2word keeps the
# placeholder 0.
data_vocab = data.vocab
vc = [0] * data_vocab.vocab_size
for token_id, token in data_vocab.idx2word.items():
    vc[token_id] = token

In [6]:
from PatientVec.Experiments.modifiable_config_exp import attention_configs
# Call the first entry of attention_configs to build the experiment configuration.
# NOTE(review): structured=True presumably adds the structured-feature inputs to the
# model config — confirm in modifiable_config_exp.
avg_attn_config = attention_configs[0](data, structured=True, args=args)

In [7]:
# Replace the configured embedder with the ELMo embedder.
embedder_config = avg_attn_config['model']['embedder']
embedder_config['type'] = 'elmo_embedder'
# Drop the embedding-file setting. pop() with a default (instead of `del`) keeps this
# cell idempotent: re-running it after the key is already gone no longer raises KeyError.
embedder_config.pop('embedding_file', None)
# ELMo option/weight files are addressed relative to the notebook's working directory;
# `vc` (the index-ordered vocabulary list built earlier) is handed over as the words to cache.
embedder_config['elmo_options'] = {
    'options_file' : '../../elmo_2x4096_512_2048cnn_2xhighway_options.json',
    'weight_file' : '../../elmo_2x4096_512_2048cnn_2xhighway_weights_PubMed_only.hdf5',
    'vocab_to_cache' : vc
}

In [8]:
# Override the batch size stored under training_config -> common with 16.
# NOTE(review): presumably lowered to fit the ELMo model in memory — confirm.
avg_attn_config['training_config']['common']['bsize'] = 16

In [9]:
from PatientVec.models.Vanilla import ClassificationTrainer as BasicCT
from PatientVec.trainer import Trainer, Evaluator

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [10]:
import logging.config

# Re-initialise logging with a schema-version-1 config that defines no handlers or
# loggers of its own; its only setting disables every logger that already exists at
# this point. Loggers created after this call are unaffected.
logging.config.dictConfig(dict(version=1, disable_existing_loggers=True))

In [11]:
# Fetch the 'dev' split with structured features and pass it through filter_data_length
# with 90. Per the log output recorded below, 90 is a length percentile and the run then
# reports "Truncated all" (NOTE(review): truncate vs. drop semantics — confirm in the dataloader).
dev_data = data.filter_data_length(data.get_data('dev', structured=True), 90)

Pos Percentage [0.20937082]


INFO - 2019-03-10 12:44:41,131 - Maximum Sentence Length 588265.000000 , 90 percentile length 19174.000000 ... 
INFO - 2019-03-10 12:44:44,599 - Truncated all ...


In [12]:
# Fetch the 'train' split with structured features and pass it through filter_data_length
# with 90. Per the log output recorded below, 90 is a length percentile and the run then
# reports "Truncated all" (NOTE(review): truncate vs. drop semantics — confirm in the dataloader).
train_data = data.filter_data_length(data.get_data('train', structured=True), 90)

Pos Percentage [0.19738451]


INFO - 2019-03-10 12:44:54,148 - Maximum Sentence Length 721825.000000 , 90 percentile length 18689.000000 ... 
INFO - 2019-03-10 12:45:16,915 - Truncated all ...


In [13]:
# Build a Trainer from the BasicCT classification trainer class and the experiment config;
# the metric type comes from the dataset and display_metrics follows the --display flag.
trainer = Trainer(BasicCT, avg_attn_config, _type=data.metrics_type, display_metrics=args.display)
# Train on train_data with dev_data passed as the second argument (presumably the
# validation set — confirm in Trainer.train); save_on_metric is taken from the dataset.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, and the cells below
# were not executed afterwards.
trainer.train(train_data, dev_data, save_on_metric=data.save_on_metric)

INFO - 2019-03-10 12:45:17,891 - Caching character cnn layers for words in vocabulary.


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1840), HTML(value='')))




KeyboardInterrupt: 

In [None]:
from PatientVec.Experiments.evaluate import get_evaluator

In [None]:
# Obtain an evaluator for the experiment identified by the path-like name below.
# NOTE(review): presumably loads a previously saved model for that experiment — confirm
# in PatientVec.Experiments.evaluate.get_evaluator.
evaluator = get_evaluator(data, 'Attention/Average(hs=256)+Attention(additive)(hs=128)+Structured/')

In [None]:
# Run the evaluator on the dev split; the following cell reads output['attentions'].
output = evaluator.evaluate(dev_data)

In [None]:
# Largest value within each entry of output['attentions'], in the original order.
max_attentions = list(map(max, output['attentions']))

In [None]:
# Plot the per-entry maximum attention values in ascending order so the spread of peak
# attention across the evaluated dev data is visible at a glance.
# `plt` is not imported in this notebook directly (presumably provided by
# `from common import *` — confirm).
plt.plot(sorted(max_attentions))
plt.xlabel('Dev examples (sorted by max attention)')
plt.ylabel('Max attention weight')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.title('Peak attention per dev example');

Saving Models
==============

In [None]:
from common import generate_latex_tables
# Metric keys handed to generate_latex_tables along with the dataset.
keys_to_use = ['roc_auc', 'pr_auc']
generate_latex_tables(data, keys_to_use)

In [None]:
# Report results for every Readmission experiment folder whose name contains 'Structured'.
dirname = 'outputs/Readmission/Basic/'
exps = os.listdir(dirname)
for e in sorted(exps):
    # Skip experiments that are not 'Structured' variants.
    if 'Structured' not in e:
        continue
    print(e)
    # get_latest_model receives the experiment folder path; its result is printed as-is.
    latest_model = get_latest_model(os.path.join(dirname, e))
    print_results_from_model(latest_model)

In [None]:
# Report results for every Diagnosis experiment folder whose name contains 'Structured'.
dirname = 'outputs/Diagnosis/Basic/'
exps = os.listdir(dirname)
for e in sorted(exps):
    # Skip experiments that are not 'Structured' variants.
    if 'Structured' not in e:
        continue
    print(e)
    # get_latest_model receives the experiment folder path; its result is printed as-is.
    latest_model = get_latest_model(os.path.join(dirname, e))
    print_results_from_model(latest_model)