In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
from common import *

In [4]:
import argparse

# The notebook has no real command line, so the options are declared here and
# parsed from a fixed argument list. The resulting `args` namespace is what the
# dataset loaders and config builders in later cells receive.
parser = argparse.ArgumentParser(description='Run Diagnosis experiments')
parser.add_argument('--data_dir', type=str, required=True)
parser.add_argument('--display', action='store_true')   # dest defaults to 'display'
parser.add_argument('--output_dir', type=str)            # None when not supplied
parser.add_argument('--mock', action='store_true')       # dest defaults to 'mock'

args = parser.parse_args([
    '--data_dir=.',
    '--output_dir=outputs/',
    '--display',
])

In [5]:
from PatientVec.models.Vanilla import ClassificationTrainer as BasicCT
from PatientVec.models.Hierarchical import ClassificationTrainer as HierCT
from PatientVec.trainer import Trainer, Evaluator
from PatientVec.Experiments.modifiable_config_exp import vanilla_configs, attention_configs, hierarchical_configs, structured_configs

from PatientVec.Experiments.hyperparam_exps import get_basic_data
from PatientVec.models.baselines.LR import LR, LDA

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [6]:
from dataloaders import hip_dataset, knee_dataset

In [None]:
# Hip-surgery experiments trained with `balanced=True` / `class_weight=False`.
#
# For every yr value the cell trains and evaluates each attention config and
# then each vanilla config. The two groups used to be two copy-pasted loops that
# differed only in the optimiser override; they are now driven by one table.
# A commented-out LR baseline run (get_basic_data with truncate=100, LR with
# norm='l2') that used to sit at the top of the loop has been removed as dead code.

TRUNCATE = 98     # forwarded as `truncate=` to get_basic_data
BATCH_SIZE = 8    # written to training_config['common']['bsize']
N_ITERS = 15      # forwarded as `n_iters=` to Trainer.train

# (config builders, value for training_config['type']) in execution order.
# None means "leave training_config['type'] exactly as the builder produced it".
# NOTE(review): only the attention group switches to RMSprop here, whereas the
# replicate cell below applies RMSprop to the vanilla configs — confirm that the
# difference is intentional.
config_groups = [
    (attention_configs, 'RMSprop'),
    (vanilla_configs, None),
]

for yr in [1, 2, 3]:
    data = hip_dataset(args, yr=yr)
    train_data, dev_data = get_basic_data(data, structured=False, truncate=TRUNCATE)

    for config_builders, optimizer_type in config_groups:
        for e in config_builders:
            config = e(data, structured=False, args=args)
            if args.output_dir is not None:
                config['exp_config']['basepath'] = args.output_dir
            config['training_config']['common']['bsize'] = BATCH_SIZE
            config['training_config']['common']['class_weight'] = False
            config['training_config']['common']['balanced'] = True
            # Tag the experiment name so these runs are distinguishable from others.
            config['exp_config']['exp_name'] += '+Balanced'
            if optimizer_type is not None:
                config['training_config']['type'] = optimizer_type
            print(config)

            trainer = Trainer(BasicCT, config, _type=data.metrics_type, display_metrics=True)
            trainer.train(train_data, dev_data, n_iters=N_ITERS, save_on_metric=data.save_on_metric)

            # Evaluate from the directory the trainer's model reports as its dirname.
            evaluator = Evaluator(BasicCT, trainer.model.dirname, _type=data.metrics_type, display_metrics=True)
            _ = evaluator.evaluate(dev_data, save_results=True)
            print('-'*300)

    print('='*500)

In [None]:
# Hip-surgery experiments with the predictor's `replicate` option switched on.
#
# For every yr value each vanilla config is built, adjusted (batch size,
# replicate/alpha on the predictor, RMSprop as training type), trained and then
# evaluated on the dev split.

TRUNCATE = 98          # forwarded as `truncate=` to get_basic_data
BATCH_SIZE = 8         # written to training_config['common']['bsize']
N_ITERS = 15           # forwarded as `n_iters=` to Trainer.train
REPLICATE_ALPHA = 0.3  # written to model.predictor.alpha

for yr in (1, 2, 3):
    data = hip_dataset(args, yr=yr)
    train_data, dev_data = get_basic_data(data, structured=False, truncate=TRUNCATE)

    for build_config in vanilla_configs:
        config = build_config(data, structured=False, args=args)
        if args.output_dir is not None:
            config['exp_config']['basepath'] = args.output_dir
        config['training_config']['common']['bsize'] = BATCH_SIZE
        config['model']['predictor']['replicate'] = True
        config['model']['predictor']['alpha'] = REPLICATE_ALPHA
        # Tag the experiment name so these runs are distinguishable from others.
        config['exp_config']['exp_name'] += '+Replicate'
        config['training_config']['type'] = 'RMSprop'
        print(config)

        trainer = Trainer(BasicCT, config, _type=data.metrics_type, display_metrics=True)
        trainer.train(train_data, dev_data, n_iters=N_ITERS, save_on_metric=data.save_on_metric)

        # Evaluate from the directory the trainer's model reports as its dirname.
        evaluator = Evaluator(BasicCT, trainer.model.dirname, _type=data.metrics_type, display_metrics=True)
        _ = evaluator.evaluate(dev_data, save_results=True)
        print('-'*300)

    print('='*500)

In [None]:
# Build the hip-surgery dataset for yr = 1, 2, 3 in turn. Nothing else happens
# in this cell, so afterwards `data` simply holds the yr=3 dataset.
for yr in range(1, 4):
    data = hip_dataset(args, yr=yr)

Saving Models
==============

In [18]:
from common import generate_latex_tables

# Metric names handed to generate_latex_tables for every knee-surgery dataset.
keys_to_use = ['accuracy', 'roc_auc', 'pr_auc']

# One call per yr value; each call receives the freshly loaded dataset.
for yr in range(1, 4):
    data = knee_dataset(args, yr=yr)
    generate_latex_tables(data, keys_to_use)

INFO - 2019-03-07 17:45:33,752 - Reading Structured data ...
INFO - 2019-03-07 17:45:33,910 - Reading Notes ...
INFO - 2019-03-07 17:45:34,649 - Stratifying ...
INFO - 2019-03-07 17:45:34,763 - Reading Structured data ...
INFO - 2019-03-07 17:45:34,920 - Reading Notes ...
INFO - 2019-03-07 17:45:36,095 - Stratifying ...
INFO - 2019-03-07 17:45:36,186 - Reading Structured data ...
INFO - 2019-03-07 17:45:36,343 - Reading Notes ...
INFO - 2019-03-07 17:45:37,707 - Stratifying ...


In [24]:
# Hip-surgery results: for each model family, read the three per-yr CSV files
# and render a single HTML table whose columns are grouped by metric, with one
# sub-column per yr value.
for model in ('Attention', 'baselines', 'Basic'):
    frames_by_year = {}
    for yr in range(1, 4):
        csv_path = 'Text-encoding-EHR/results/HipSurgery_' + str(yr) + '/' + model + '.csv'
        # Use the 'Method' column as the row index (and remove it from the columns).
        df = pd.read_csv(csv_path).set_index('Method')
        frames_by_year[yr] = df

    # Passing the dict makes concat use its keys (1, 2, 3) as the outer column level.
    yr_df = pd.concat(frames_by_year, axis=1)
    # Swap the levels so the metric name is outermost, then group columns by metric.
    yr_df.columns = yr_df.columns.swaplevel(0, 1)
    yr_df = yr_df.sort_index(axis=1, level=0)
    display(HTML(yr_df.to_html()))

Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Average(hs=256) +Attention(additive)(hs=128),0.85489,0.869085,0.64511,0.338696,0.254872,0.372558,0.847882,0.793791,0.810393
"CNN(hs=64)(kernels=3,5,7,9) +Attention(additive)(hs=128)",0.862776,0.757098,0.567823,0.273516,0.259278,0.276672,0.794734,0.815622,0.808006
LSTM(hs=128) +Attention(additive)(hs=128),0.735016,0.873817,0.862776,0.367419,0.404569,0.342527,0.869379,0.852519,0.819012


Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
LR +BOW +norm=l2,0.862776,0.858044,0.847003,0.415968,0.412722,0.35876,0.819119,0.819864,0.814649
LR +TFIDF +norm=l2,0.87224,0.865931,0.842271,0.374768,0.369283,0.356594,0.811882,0.818131,0.815744


Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Average(hs=256),0.87224,0.894322,0.870662,0.417312,0.387561,0.356003,0.826416,0.827678,0.807048
"CNN(hs=64)(kernels=3,5,7,9)",0.883281,0.897476,0.90694,0.409353,0.408185,0.420606,0.838107,0.845982,0.846955
LSTM(hs=128),0.764984,0.589905,0.895899,0.213168,0.156853,0.122745,0.683922,0.602846,0.579069


In [25]:
# Knee-surgery results: for each model family, read the three per-yr CSV files
# and render a single HTML table whose columns are grouped by metric, with one
# sub-column per yr value.
for model in ('Attention', 'baselines', 'Basic'):
    frames_by_year = {}
    for yr in range(1, 4):
        csv_path = 'Text-encoding-EHR/results/KneeSurgery_' + str(yr) + '/' + model + '.csv'
        # Use the 'Method' column as the row index (and remove it from the columns).
        df = pd.read_csv(csv_path).set_index('Method')
        frames_by_year[yr] = df

    # Passing the dict makes concat use its keys (1, 2, 3) as the outer column level.
    yr_df = pd.concat(frames_by_year, axis=1)
    # Swap the levels so the metric name is outermost, then group columns by metric.
    yr_df.columns = yr_df.columns.swaplevel(0, 1)
    yr_df = yr_df.sort_index(axis=1, level=0)
    display(HTML(yr_df.to_html()))

Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Average(hs=256) +Attention(additive)(hs=128),0.875156,0.80025,0.883895,0.584225,0.560217,0.54308,0.882891,0.881102,0.87157
"CNN(hs=64)(kernels=3,5,7,9) +Attention(additive)(hs=128)",0.813983,0.870162,0.877653,0.633362,0.55696,0.568251,0.899954,0.861172,0.876522
LSTM(hs=128) +Attention(additive)(hs=128),0.875156,0.905119,0.875156,0.596392,0.552613,0.51841,0.881602,0.87514,0.864989


Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
LR +BOW +norm=l2,0.891386,0.88764,0.881398,0.588659,0.557406,0.548971,0.872613,0.866329,0.871315
LR +TFIDF +norm=l2,0.885144,0.873908,0.871411,0.615401,0.578271,0.575673,0.876429,0.877235,0.878303


Unnamed: 0_level_0,accuracy,accuracy,accuracy,pr_auc,pr_auc,pr_auc,roc_auc,roc_auc,roc_auc
Unnamed: 0_level_1,1,2,3,1,2,3,1,2,3
Method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Average(hs=256),0.902622,0.883895,0.882647,0.526643,0.458182,0.408885,0.859654,0.855897,0.841065
"CNN(hs=64)(kernels=3,5,7,9)",0.893883,0.88015,0.911361,0.52717,0.539622,0.512994,0.864904,0.851047,0.868805
LSTM(hs=128),0.897628,0.600499,0.435705,0.138312,0.109922,0.148654,0.617889,0.509863,0.590208
