In [1]:
# Enable IPython's autoreload extension so that edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from dataset import Dataset
import numpy as np

In [3]:
from common import *

In [4]:
# Build the Dataset wrapper for the 'Diagnosis' task from its preprocess directory.
data = Dataset(
    dirname='preprocess/Diagnosis/',
    name='Diagnosis',
)

INFO - 2019-02-12 12:36:27,458 - Reading Structured data ...
INFO - 2019-02-12 12:36:27,555 - Reading Notes ...
INFO - 2019-02-12 12:36:37,450 - Stratifying ...


In [5]:
# Every dataframe column whose name starts with 'y_' is treated as one label.
labellist = list(filter(lambda column: column.startswith('y_'), data.dataframe.columns))
data.generate_labels(labellist, len(labellist), 'multilabel')

In [None]:
# Register a 'onehot' encoding for each of these three fields, in this order.
for onehot_field in ('gender_y', 'age_y', 'ethnicity_y') :
    data.generate_encoded_field(onehot_field, 'onehot')

In [None]:
# Columns named 'feature...' get the 'trivial' encoding.
features = [column for column in data.dataframe.columns if column.startswith('feature')]
for feature_name in features :
    data.generate_encoded_field(feature_name, 'trivial')

# Select which encoded fields make up the structured input (matched by regex).
structured_patterns = [r'^feature', 'gender_y', 'age_y', 'ethnicity_y']
data.set_structured_params(regexs=structured_patterns)

In [None]:
from trainer import Trainer, Evaluator

In [None]:
from models.Vanilla import ClassificationTrainer as BasicCT
from models.Hierarchical import ClassificationTrainer as HierCT

In [None]:
from Experiments.experiments import experiments, hierarchical_experiments, structured_experiments

Basic Experiments
=================

In [None]:
# Fetch the three splits, each with structured features included.
train_data, dev_data, test_data = (
    data.get_data(split_name, structured=True)
    for split_name in ('train', 'dev', 'test')
)

In [None]:
# Apply the length filter (truncate=90) to the train and dev splits; test is left as is.
train_data, dev_data = (
    data.filter_data_length(split_data, truncate=90)
    for split_data in (train_data, dev_data)
)

In [None]:
# Train and evaluate every configuration in `experiments` with the basic
# (Vanilla) classification trainer, using structured features.
for e in experiments :
    config = e(data, structured=True)
    print(config)
    # Multilabel training; the metric passed as save_on_metric is macro ROC-AUC.
    trainer = Trainer(BasicCT, config, _type='multilabel')
    trainer.train(train_data, dev_data, save_on_metric='macro_roc_auc')

    # The evaluator is pointed at the directory of the model the trainer produced.
    evaluator = Evaluator(BasicCT, trainer.model.dirname, _type='multilabel')
    _ = evaluator.evaluate(dev_data, save_results=True)
    # Separator width now matches the other two experiment loops (was 300 here,
    # which produced a needlessly long line in the cell output).
    print('='*30)

In [None]:
# Train and evaluate every configuration in `hierarchical_experiments` with the
# Hierarchical classification trainer.
# NOTE(review): unlike the other experiment loops, no `_type='multilabel'` or
# `save_on_metric` is passed here, and configs are built with structured=False
# — confirm that the Trainer/Evaluator defaults are what is intended.
for experiment in hierarchical_experiments :
    config = experiment(data, structured=False)
    print(config)

    trainer = Trainer(HierCT, config)
    trainer.train(train_data, dev_data)

    # Evaluate from the directory of the model that was just trained.
    model_dir = trainer.model.dirname
    evaluator = Evaluator(HierCT, model_dir)
    _ = evaluator.evaluate(dev_data, save_results=True)
    print('=' * 30)

In [None]:
# Re-fetch train/dev with the structured columns passed as encodings, then apply
# the same length filter (truncate=90) as before.
structured_kwargs = dict(structured=True, encodings=data.structured_columns)
train_data = data.get_data('train', **structured_kwargs)
dev_data = data.get_data('dev', **structured_kwargs)
train_data, dev_data = (
    data.filter_data_length(split_data, truncate=90)
    for split_data in (train_data, dev_data)
)

# Each structured experiment is run twice: with and without the structured input.
for experiment in structured_experiments :
    for use_structured in (True, False) :
        config = experiment(data, structured=use_structured, encodings=data.structured_columns)
        print(config)

        trainer = Trainer(BasicCT, config, _type='multilabel')
        trainer.train(train_data, dev_data, save_on_metric='macro_roc_auc')

        # Evaluate from the directory of the model that was just trained.
        model_dir = trainer.model.dirname
        evaluator = Evaluator(BasicCT, model_dir, _type='multilabel')
        _ = evaluator.evaluate(dev_data, save_results=True)
        print('=' * 30)

Pos Percentage [0.21358209 0.07716941 0.10244004 0.31971179 0.13028647 0.13014724
 0.20870897 0.07160013 0.26850917 0.31939852 0.09485189 0.19033033
 0.28253681 0.41581677 0.26697762 0.07180897 0.13039089 0.0897003
 0.04953183 0.04076021 0.08917818 0.14152947 0.18249852 0.14615893
 0.07797   ]
Pos Percentage [0.20548768 0.06969758 0.10194272 0.31744442 0.12237132 0.13458842
 0.2032846  0.06649309 0.25776087 0.31484078 0.0929301  0.19587422
 0.29541358 0.42339275 0.25475666 0.06789505 0.12096936 0.08531945
 0.0530743  0.03745243 0.0817144  0.1327859  0.17684759 0.13198478
 0.07790907]


INFO - 2019-02-12 12:36:43,823 - Maximum Sentence Length 721825.000000 , 90 percentile length 16714.400000 ... 
INFO - 2019-02-12 12:36:43,830 - Pos Percentage of remaining data ... 
INFO - 2019-02-12 12:36:43,850 - [0.19322401 0.07650062 0.10272277 0.3112237  0.12059097 0.12650835
 0.20064975 0.07023515 0.25978496 0.32545637 0.09316986 0.18722927
 0.28480817 0.42090811 0.24957457 0.06973236 0.1237237  0.08249536
 0.04641089 0.03774752 0.08048422 0.12349165 0.15741027 0.12925433
 0.06845606]
INFO - 2019-02-12 12:36:44,645 - Maximum Sentence Length 407062.000000 , 90 percentile length 16420.400000 ... 
INFO - 2019-02-12 12:36:44,647 - Pos Percentage of remaining data ... 
INFO - 2019-02-12 12:36:44,650 - [0.18517694 0.06877365 0.10549744 0.30847986 0.11395504 0.12819942
 0.19719564 0.06543512 0.24905408 0.32383708 0.09014022 0.19519252
 0.30202537 0.42777654 0.23213888 0.06209659 0.1135099  0.07834409
 0.04941019 0.03360783 0.07589584 0.11551302 0.1502337  0.11662586
 0.06855108]
INFO -

{'model': {'type': 'seq_classifier_with_structured_attention', 'embedder': {'type': 'token_embedder', 'vocab_size': 28172, 'embed_size': 200, 'embedding_file': 'preprocess/Diagnosis/embedding_matrix.npy'}, 'decoder': {'num_layers': 2, 'hidden_dims': [128, 25], 'activations': ['tanh', 'linear']}, 'predictor': {'type': 'multilabel'}, 'structured': {'use_structured': True, 'structured_dim': 142}, 'encoder': {'type': 'average', 'projection': True, 'hidden_size': 256, 'activation': 'relu'}, 'attention': {'similarity': {'type': 'additive', 'hidden_size': 128, 'tensor_2_dim': 142}}}, 'training_config': {'type': 'Adam', 'groups': [('.*', {'lr': 0.001, 'weight_decay': 1e-05})], 'common': {'bsize': 32, 'class_weight': True}}, 'exp_config': {'exp_name': 'Diagnosis/Structured Attention/Average(hs=256)+Attention(additive)(all)(hs=128)+Structured'}}
Setting Embedding


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=808), HTML(value='')))

HBox(children=(IntProgress(value=0, max=141), HTML(value='')))

{'macro_roc_auc': 0.8304698503581013, 'macro_pr_auc': 0.49911071178985866}


Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11,label_12,label_13,label_14,label_15,label_16,label_17,label_18,label_19,label_20,label_21,label_22,label_23,label_24
1/f1-score,0.509,0.546,0.453,0.588,0.459,0.406,0.436,0.369,0.602,0.729,0.451,0.493,0.559,0.642,0.507,0.321,0.472,0.375,0.173,0.2,0.228,0.433,0.57,0.495,0.423
1/precision,0.392,0.396,0.311,0.526,0.327,0.282,0.321,0.242,0.487,0.662,0.301,0.365,0.472,0.588,0.396,0.204,0.334,0.247,0.1,0.117,0.138,0.304,0.462,0.351,0.287
1/recall,0.725,0.877,0.838,0.667,0.771,0.722,0.678,0.772,0.789,0.811,0.901,0.759,0.685,0.707,0.704,0.746,0.804,0.784,0.626,0.695,0.666,0.751,0.744,0.838,0.808
1/support,832.0,309.0,474.0,1386.0,512.0,576.0,886.0,294.0,1119.0,1455.0,405.0,877.0,1357.0,1922.0,1043.0,279.0,510.0,352.0,222.0,151.0,341.0,519.0,675.0,524.0,308.0


Model not saved on  macro_roc_auc 0.8304698503581013


HBox(children=(IntProgress(value=0, max=808), HTML(value='')))

HBox(children=(IntProgress(value=0, max=141), HTML(value='')))

{'macro_roc_auc': 0.8423368189900142, 'macro_pr_auc': 0.5266592731971445}


Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11,label_12,label_13,label_14,label_15,label_16,label_17,label_18,label_19,label_20,label_21,label_22,label_23,label_24
1/f1-score,0.526,0.528,0.462,0.606,0.464,0.361,0.473,0.388,0.596,0.735,0.483,0.485,0.565,0.674,0.501,0.311,0.466,0.433,0.176,0.218,0.229,0.418,0.568,0.532,0.449
1/precision,0.395,0.372,0.318,0.494,0.323,0.229,0.366,0.259,0.476,0.654,0.333,0.343,0.461,0.568,0.408,0.189,0.323,0.307,0.101,0.128,0.137,0.278,0.429,0.389,0.311
1/recall,0.785,0.909,0.842,0.785,0.822,0.852,0.67,0.772,0.797,0.84,0.877,0.827,0.73,0.829,0.646,0.864,0.835,0.736,0.676,0.748,0.683,0.838,0.84,0.845,0.808
1/support,832.0,309.0,474.0,1386.0,512.0,576.0,886.0,294.0,1119.0,1455.0,405.0,877.0,1357.0,1922.0,1043.0,279.0,510.0,352.0,222.0,151.0,341.0,519.0,675.0,524.0,308.0


Model Saved on  macro_roc_auc 0.8423368189900142


HBox(children=(IntProgress(value=0, max=808), HTML(value='')))

HBox(children=(IntProgress(value=0, max=141), HTML(value='')))

{'macro_roc_auc': 0.8460961811673168, 'macro_pr_auc': 0.5363069322831943}


Unnamed: 0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11,label_12,label_13,label_14,label_15,label_16,label_17,label_18,label_19,label_20,label_21,label_22,label_23,label_24
1/f1-score,0.547,0.586,0.526,0.614,0.453,0.399,0.466,0.406,0.612,0.753,0.497,0.503,0.565,0.644,0.507,0.326,0.434,0.462,0.177,0.229,0.246,0.433,0.541,0.524,0.427
1/precision,0.428,0.437,0.391,0.524,0.313,0.275,0.355,0.277,0.534,0.736,0.348,0.375,0.477,0.609,0.381,0.202,0.292,0.33,0.103,0.134,0.153,0.293,0.388,0.376,0.288
1/recall,0.76,0.89,0.806,0.742,0.824,0.722,0.679,0.762,0.716,0.77,0.869,0.762,0.693,0.682,0.756,0.832,0.845,0.767,0.644,0.815,0.63,0.832,0.896,0.865,0.825
1/support,832.0,309.0,474.0,1386.0,512.0,576.0,886.0,294.0,1119.0,1455.0,405.0,877.0,1357.0,1922.0,1043.0,279.0,510.0,352.0,222.0,151.0,341.0,519.0,675.0,524.0,308.0


Model Saved on  macro_roc_auc 0.8460961811673168


HBox(children=(IntProgress(value=0, max=808), HTML(value='')))

HBox(children=(IntProgress(value=0, max=141), HTML(value='')))

In [None]:
from PatientVec.models.baselines.LR import LR, LDA

In [None]:
# LR baseline: configure, fit on the train split, evaluate on dev and save the results.
lr_config = {
    'vocab': data.vocab,
    'stop_words': True,
    'exp_name': data.name,
    'type': 'multilabel',
}
lr = LR(lr_config)
lr.train(train_data)
lr.evaluate(dev_data, save_results=True)
# lr.get_features(n=20)

In [None]:
# LDA baseline: same configuration keys as the LR baseline; fit on train, evaluate on dev.
lda_config = {
    'vocab': data.vocab,
    'stop_words': True,
    'exp_name': data.name,
    'type': 'multilabel',
}
lda = LDA(lda_config)
lda.train(train_data)
lda.evaluate(dev_data, save_results=True)
# print(lda.get_topics(n=10))
# topics = lda.get_topics(n=10)
# print([topics[i] for i in np.argsort(lda.lda_classifier.coef_[0])])

In [None]:
# `topics` was previously assigned only in commented-out code, so this cell raised
# NameError on a fresh kernel. It is now built here, using the same call as that
# commented-out line.
topics = lda.get_topics(n=10)
# Order the topics by ascending value of the first row of the classifier coefficients.
[topics[i] for i in np.argsort(lda.lda_classifier.coef_[0])]

In [None]:
# Take column 0 of the predictions array.
# NOTE(review): `outputs` is not assigned in any visible cell (the experiment loops
# discard the result of evaluator.evaluate(...) into `_`), so this cell and the
# attention cells below fail on a fresh kernel — confirm where `outputs` should come from.
preds = outputs['predictions'][:, 0]

In [None]:
from common import collapse_and_print_word_attn, print_sent_attn

In [None]:
# Print word-level and sentence-level attention for the dev example at index n.
n = 50
example = dev_data.X[n]
word_attn = outputs['word_attentions'][n]
sent_attn = outputs['sentence_attentions'][n]
collapse_and_print_word_attn(data.vocab, example, word_attn)
print_sent_attn(data.vocab, example, sent_attn)

In [None]:
# Display the first entry of the sentence-attention outputs.
outputs['sentence_attentions'][0]

In [None]:
from scipy.stats import spearmanr, kendalltau

In [None]:
# For every example, Kendall's tau between sentence position (0..len-1) and the
# attention values at those positions.
sentence_attentions = outputs['sentence_attentions']
corrs = [kendalltau(range(len(attention)), attention) for attention in sentence_attentions]

In [None]:
# Pair each correlation statistic with the length of its attention vector.
# `value == value` is False only for NaN, so NaN statistics are dropped.
rho, leng = zip(*[
    (corr[0], len(attention))
    for corr, attention in zip(corrs, outputs['sentence_attentions'])
    if corr[0] == corr[0]
])

In [None]:
# Histogram (30 bins) of the correlation statistics in `rho`.
# NOTE(review): `plt` is not imported in any visible cell — presumably it arrives
# through `from common import *`; confirm.
plt.hist(rho, bins=30)

In [None]:
# Same pairing as for `rho`, but using the second element of each correlation
# result; NaN entries are dropped via the self-equality check.
pval, leng1 = zip(*[
    (corr[1], len(attention))
    for corr, attention in zip(corrs, outputs['sentence_attentions'])
    if corr[1] == corr[1]
])

Saving Models
==============

In [None]:
import os
# For every basic experiment, push the latest saved model for both variants:
# first without, then with structured features.
for experiment in experiments :
    for use_structured in (False, True) :
        config = experiment(data, structured=use_structured)
        exp_name = config['exp_config']['exp_name']
        filename = os.path.join('outputs/classification/', exp_name)
        push_latest_model(filename, exp_name)
    
# for e in hierarchical_experiments :
#     config = e(data, structured=False)
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/hierarchical_classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
#     config = e(data, structured=True)
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/hierarchical_classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
# for e in structured_experiments :
#     config = e(data, structured=True, encodings=['gender_y', 'ethnicity_y', 'age_y'])
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
#     config = e(data, structured=False, encodings=['gender_y', 'ethnicity_y', 'age_y'])
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
#     config = e(data, structured=True, encodings=data.structured_columns)
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
#     config = e(data, structured=False, encodings=data.structured_columns)
#     filename = config['exp_config']['exp_name']
#     filename = os.path.join('outputs/classification/', filename)
#     push_latest_model(filename, config['exp_config']['exp_name'])
    
    
# Push the latest model for every entry found in the baselines output directory.
baselines_dir = 'outputs/baselines/Diagnosis/baselines/'
for baseline_name in os.listdir(baselines_dir) :
    filename = os.path.join(baselines_dir, baseline_name)
    push_latest_model(filename, os.path.join('Diagnosis/baselines/', baseline_name))

In [None]:
# Input directory with the evaluation files and output directory for the CSV
# summaries, both namespaced by dataset; the output directory is created if missing.
dataset = 'Diagnosis'
dataset_path, output_path = (
    os.path.join(root, dataset)
    for root in ('latex_evals', 'Text-encoding-EHR/results/')
)
os.makedirs(output_path, exist_ok=True)

In [None]:
# Write one CSV per sub-directory of `dataset_path`, with one row per evaluation
# JSON file found there: the method name plus the metrics listed in `keys_to_use`.
dirs = os.listdir(dataset_path)
keys_to_use = ['macro_roc_auc', 'macro_pr_auc']
for d in dirs :
    subpath = os.path.join(dataset_path, d)
    if not os.path.isdir(subpath) :
        # Stray files next to the result directories would make os.listdir fail.
        logging.error("%s not a directory", d)
        continue

    output_file = os.path.join(output_path, d + '.csv')
    records = []
    for f in sorted(os.listdir(subpath)) :
        result_path = os.path.join(subpath, f)
        if not os.path.isfile(result_path) :
            logging.error("%s not a file", f)
            continue

        # Context manager closes the handle; the payload gets its own name so the
        # outer loop variable `d` is no longer overwritten.
        with open(result_path) as result_file :
            evaluation = json.load(result_file)

        results = {k: evaluation['results'][k] for k in keys_to_use}
        # Drops the last 14 characters of the file name (presumably a fixed
        # suffix — TODO confirm) and reformats the separators for display.
        results['Method'] = f[:-14].replace('+', ' +').replace('_', ':')
        records.append(results)

    if not records :
        # pd.concat([]) used to raise ValueError here; skip empty directories instead.
        logging.warning("no result files found in %s, skipping", subpath)
        continue

    # Build the frame once from all records instead of concatenating 1-row frames.
    dfs = pd.DataFrame(records)
    dfs.to_csv(output_file, columns=['Method'] + keys_to_use, index=False)