In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from common import *

In [3]:
import argparse

# Recreate the experiment scripts' command-line interface inside the notebook
# so that the project helpers can be given the usual `args` namespace.
parser = argparse.ArgumentParser(description='Run Diagnosis experiments')

# (flag, add_argument keyword options), registered in this order.
_ARGUMENT_SPECS = (
    ("--data_dir", {"type": str, "required": True}),
    ('--display', {"dest": 'display', "action": 'store_true'}),
    ("--output_dir", {"type": str}),
    ("--mock", {"dest": 'mock', "action": 'store_true'}),
)
for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

# There is no real command line in a notebook, so a fixed argument list is
# parsed instead of sys.argv.
args = parser.parse_args(['--data_dir=.', '--output_dir=outputs/', '--display'])

In [None]:
# Build the readmission dataset object from the parsed notebook arguments.
# NOTE(review): which fields of `args` are read (presumably `data_dir`) is not
# visible from this notebook — confirm in `dataloaders`.
from dataloaders import readmission_dataset
data = readmission_dataset(args)

In [None]:
from PatientVec.Experiments.training_exps import get_basic_data

In [None]:
# Fetch the train and dev sets from the project helper.
# NOTE(review): `truncate=100` presumably limits document length — confirm the
# exact meaning/units in `get_basic_data`.
train_data, dev_data = get_basic_data(data, truncate=100)

In [None]:
# Call the dataset's `generate_bowder` on the training set only, with stop
# words enabled and no normalisation.
# NOTE(review): presumably this fits the bag-of-words vectoriser later used by
# `get_vec_encoding` — confirm in the dataset class.
data.generate_bowder(train_data, stop_words=True, norm=None)

In [None]:
# Replace each set's X with its 'tfidf' vector encoding.
# X is overwritten in place, so re-running this cell feeds the already-encoded
# X back into `get_vec_encoding` — re-run the data-loading cell first.
train_data.X = data.get_vec_encoding(train_data, _type='tfidf')
dev_data.X = data.get_vec_encoding(dev_data, _type='tfidf')

In [None]:
def _as_dense_array(features):
    """Return `features` as a dense ``numpy.ndarray``.

    Objects that expose ``.todense()`` (e.g. scipy sparse matrices) are
    densified first; anything else is assumed to be dense already and is
    simply converted with ``np.array``.
    """
    if hasattr(features, "todense"):
        features = features.todense()
    return np.array(features)


# The first run replaces X with an ndarray, which has no `.todense()`, so the
# original cell raised AttributeError when executed a second time. Going
# through the helper makes the conversion safe to re-run.
train_data.X = _as_dense_array(train_data.X)
dev_data.X = _as_dense_array(dev_data.X)

In [None]:
from PatientVec.Experiments.configs import vector_experiment
# Build the experiment configuration for this dataset and display it as the
# cell output for inspection.
config = vector_experiment(data)
config

In [None]:
from trainer import Trainer, Evaluator
from models.Baseline import ClassificationTrainer as BaseCT

In [None]:
# Train the baseline classification model described by `config`, using the dev
# set during training and the dataset's own metric for checkpoint selection.
trainer = Trainer(BaseCT, config, _type=data.metrics_type, display_metrics=args.display)
trainer.train(train_data, dev_data, save_on_metric=data.save_on_metric)

# Build an evaluator from the directory recorded on the trained model and
# score the dev set, saving the results.
evaluator = Evaluator(BaseCT, trainer.model.dirname, _type=data.metrics_type, display_metrics=args.display)
_ = evaluator.evaluate(dev_data, save_results=True)
# Visual separator (300 '=' characters) after the run's output.
print('='*300)

In [None]:
from PatientVec.models.baselines.LR import LR, LDA

# Re-fetch the train and dev sets. This rebinds `train_data`/`dev_data`, so the
# tf-idf/dense X assigned by the earlier cells is no longer reachable through
# these names.
train_data, dev_data = get_basic_data(data, truncate=100)
# Configure the LR baseline as a classifier with stop words enabled and 'l2'
# normalisation, named after the dataset; then train it and evaluate on dev.
lr = LR({'vocab' : data.vocab, 'stop_words' : True, 'exp_name' : data.name, 'type' : 'classifier', 'norm' : 'l2'})
lr.train(train_data)
lr.evaluate(dev_data, save_results=True)

In [None]:
# Print the LR model's features with n=40.
# NOTE(review): presumably the 40 highest-weighted features — confirm in `LR`.
lr.print_all_features(n=40)

Attention Comparison
====================

In [None]:
# Evaluate the most recent saved model for experiment #3 on the structured dev
# set and keep its outputs (attention weights are read from them later).
# NOTE(review): `experiments`, `BasicCT`, `get_latest_model` and `os` are not
# defined in any visible cell — presumably they come from
# `from common import *`. Only `BaseCT` is imported explicitly; confirm that
# `BasicCT` is a distinct, intended class.
lstm_attention_config = experiments[3](data, structured=True)
# The model directory is looked up under the literal 'outputs' folder, not
# under `args.output_dir`.
lstm_attn_eval = Evaluator(BasicCT, get_latest_model(os.path.join('outputs', lstm_attention_config['exp_config']['exp_name'])))
# Rebinds `dev_data` to the structured dev set, filtered with truncate=90.
dev_data = data.get_data('dev', structured=True)
dev_data = data.filter_data_length(dev_data, truncate=90)
lstm_outputs = lstm_attn_eval.evaluate(dev_data, save_results=False)
# Drop the reference to the evaluator once its outputs have been captured.
del lstm_attn_eval

In [None]:
# Same procedure as the previous cell, for structured experiment #1 with the
# dataset's structured columns passed as encodings.
# NOTE(review): `structured_experiments`, `BasicCT`, `get_latest_model` and
# `os` are not defined in any visible cell — presumably from
# `from common import *`; confirm.
lstm_structured_attention_config = structured_experiments[1](data, structured=True, encodings=data.structured_columns)
lstm_struct_attn_eval = Evaluator(BasicCT, 
                                  get_latest_model(os.path.join('outputs', lstm_structured_attention_config['exp_config']['exp_name'])))
# Rebinds `dev_data` again; later cells see this version (structured, with
# encodings, filtered with truncate=90).
dev_data = data.get_data('dev', structured=True, encodings=data.structured_columns)
dev_data = data.filter_data_length(dev_data, truncate=90)
lstm_struct_outputs = lstm_struct_attn_eval.evaluate(dev_data, save_results=False)
# Drop the reference to the evaluator once its outputs have been captured.
del lstm_struct_attn_eval

In [None]:
from attention_comparison import get_comparative_measures

In [None]:
# One row of comparison measures per dev document, pairing the attention of
# the plain LSTM model with that of the structured LSTM model.
measures = pd.DataFrame([
    get_comparative_measures(doc, attn_plain, attn_struct)
    for doc, attn_plain, attn_struct in zip(
        dev_data.X, lstm_outputs['attentions'], lstm_struct_outputs['attentions']
    )
])

# Attach the first label column of the dev set to every row.
y = np.array(dev_data.y)[:, 0]
measures['y'] = y


def _indices_to_words(index_lists):
    """Map each list of vocabulary indices to the corresponding words."""
    return [[data.vocab.idx2word[idx] for idx in indices] for indices in index_lists]


# Replace the index lists in columns 'A' and 'B' with readable words.
words_A = _indices_to_words(list(measures['A']))
measures['A'] = words_A

words_B = _indices_to_words(list(measures['B']))
measures['B'] = words_B

# Boolean row masks selecting each label value.
measures_0 = measures['y'] == 0
measures_1 = measures['y'] == 1

In [None]:
# Largest attention weight per document for each model, plotted together in a
# single 30-bin histogram call.
lstm_max_attentions = list(map(max, lstm_outputs['attentions']))
lstm_struct_max_attentions = list(map(max, lstm_struct_outputs['attentions']))
_ = plt.hist([lstm_max_attentions, lstm_struct_max_attentions], bins=30)

In [None]:
# Scatter the 'haus' measure against the 'len' measure, one small point per document.
# NOTE(review): presumably document length vs. Hausdorff distance — confirm in
# `get_comparative_measures`.
plt.scatter(measures['len'], measures['haus'], s=5)

In [None]:
# Overlay the distributions of the 'emd' measure for the two label values.
# The histograms are drawn semi-transparent and labelled: previously the
# second was drawn opaque on top of the first and neither was identified,
# so overlapping bars were hidden and the classes could not be told apart.
# NOTE(review): 'emd' is presumably an earth mover's distance — confirm in
# `get_comparative_measures`.
_ = plt.hist(measures[measures_0]['emd'], bins=30, alpha=0.5, label='y = 0')
_ = plt.hist(measures[measures_1]['emd'], bins=30, alpha=0.5, label='y = 1')
_ = plt.legend()

In [None]:
from attention_comparison import get_comparative_measures

In [None]:
# For each label mask in turn, draw one figure with the histograms of the
# 'A&B', 'A-B' and 'B-A' measures, with a legend naming the three series.
# NOTE(review): presumably set intersection/differences of the attended word
# sets A and B — confirm in `get_comparative_measures`.
for c in [measures_0, measures_1] :
    plt.hist([measures[c]['A&B'], measures[c]['A-B'], measures[c]['B-A']], label=['A&B', 'A-B', 'B-A'])
    plt.legend()
    plt.show()

In [None]:
# For each label mask in turn, draw two figures: a 30-bin histogram of the
# 'jacc' measure, then a scatter of 'jacc' against 'A|B'.
# The `label` passed to hist is never shown because no legend is drawn.
for c in [measures_0, measures_1] :
    plt.hist([measures[c]['jacc']], label=['jacc'], bins=30)
    plt.show()
    plt.scatter(measures[c]['jacc'], measures[c]['A|B'], s=1)
    plt.show()

Baselines
==========

In [None]:
# Train and evaluate the LDA baseline (stop words enabled, named after the
# dataset).
# NOTE(review): `train_data` and `dev_data` are whatever the previous cells
# last bound to these names. In top-to-bottom order `dev_data` is the
# structured, truncate=90 dev set from the attention comparison section,
# while `train_data` comes from the LR cell — confirm this pairing is intended.
lda = LDA({'vocab' : data.vocab, 'stop_words' : True, 'exp_name' : data.name})
lda.train(train_data)
lda.evaluate(dev_data, save_results=True)

# Fetch the topics once and reuse them; the original called `get_topics`
# twice with the same arguments.
topics = lda.get_topics(n=10)
print(topics)
# Topics reordered by ascending coefficient of the first row of the LDA
# classifier's `coef_`.
print([topics[i] for i in np.argsort(lda.lda_classifier.coef_[0])])

In [None]:
# Display the topics ordered by ascending coefficient of the first row of the
# LDA classifier's `coef_` (same list the previous cell prints).
[topics[i] for i in np.argsort(lda.lda_classifier.coef_[0])]

In [None]:
# Take the first column of the predictions array.
# NOTE(review): `outputs` is not assigned in any visible cell, so this fails on
# a fresh kernel — it presumably came from an evaluator run in a deleted cell;
# confirm which model it belongs to.
preds = outputs['predictions'][:, 0]

In [None]:
from common import collapse_and_print_word_attn, print_sent_attn

In [None]:
# Index of the dev document to inspect.
n = 50
# Print the word-level and sentence-level attention for that document.
# NOTE(review): relies on `outputs`, which is not assigned in any visible cell.
collapse_and_print_word_attn(data.vocab, dev_data.X[n], outputs['word_attentions'][n])
print_sent_attn(data.vocab, dev_data.X[n], outputs['sentence_attentions'][n])

In [None]:
# Display the sentence attention values of the first document.
# NOTE(review): relies on `outputs`, which is not assigned in any visible cell.
outputs['sentence_attentions'][0]

In [None]:
from scipy.stats import spearmanr, kendalltau

In [None]:
# Kendall's tau between sentence position (0, 1, ..., n-1) and the attention
# value at that position, computed separately for every document.
corrs = [
    kendalltau(range(len(sent_attn)), sent_attn)
    for sent_attn in outputs['sentence_attentions']
]

In [None]:
# Pair each document's correlation statistic with its number of sentences,
# dropping documents whose statistic is NaN (NaN is the only value that does
# not compare equal to itself).
rho, leng = zip(*(
    (result[0], len(sent_attn))
    for result, sent_attn in zip(corrs, outputs['sentence_attentions'])
    if result[0] == result[0]
))

In [None]:
# 30-bin histogram of the non-NaN correlation statistics collected in `rho`.
plt.hist(rho, bins=30)

In [None]:
# Pair each document's p-value with its number of sentences, dropping
# documents whose p-value is NaN (NaN is the only value that does not compare
# equal to itself).
pval, leng1 = zip(*(
    (result[1], len(sent_attn))
    for result, sent_attn in zip(corrs, outputs['sentence_attentions'])
    if result[1] == result[1]
))

Saving Models
==============

In [None]:
from common import generate_latex_tables
# Metric keys passed to the table generator.
# NOTE(review): the '1/' prefix presumably selects the metrics of class 1 —
# confirm against the saved evaluation results.
keys_to_use = ['1/precision', '1/recall', '1/f1-score', 'accuracy', 'roc_auc', 'pr_auc']
generate_latex_tables(data, keys_to_use)

In [1]:
import torch

In [4]:
# Random scratch tensors for experimenting with batched matrix products.
a = torch.randn(5, 10, 20)      # shape (5, 10, 20)
b = torch.randn(100, 1, 10)     # shape (100, 1, 10)
c = torch.randn(100, 50, 20)    # shape (100, 50, 20); not used by the bmm call that follows

In [6]:
# torch.bmm requires both operands to have the same size in dimension 0.
# Here `b` is (100, 1, 10) and `a` is (5, 10, 20), so this call raises the
# RuntimeError recorded in the cell output below.
# NOTE(review): the intended product is not clear from this notebook — confirm
# which operands were meant before changing the call.
torch.bmm(b, a)

RuntimeError: Expected tensor to have size 100 at dimension 0, but got size 5 for argument #2 'batch2' (while checking arguments for bmm)