In [1]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from dataset import Dataset
import numpy as np

In [5]:
import argparse

# Rebuild a command-line style option set inside the notebook; the resulting
# `args` object is handed to the dataloaders and trainers in later cells.
parser = argparse.ArgumentParser(description='Run Diagnosis experiments')
parser.add_argument("--data_dir", type=str, required=True)
parser.add_argument('--display', dest='display', action='store_true')
parser.add_argument("--output_dir", type=str)
parser.add_argument("--mock", dest='mock', action='store_true')

# parse_known_args returns a (namespace, leftover_argv) pair. Unpack it so that
# `args` is the Namespace itself and attribute access such as `args.display`
# works; flags the parser does not declare (here `--lr=1`) are collected in
# `unknown_args` instead of raising an error.
args, unknown_args = parser.parse_known_args(['--data_dir=.', '--output_dir=outputs/', '--display', '--lr=1'])

In [None]:
# Instantiate the project Dataset named 'imdb', pointing it at the
# preprocess/IMDB/ directory (relative to the notebook's working directory).
data_imdb = Dataset(name='imdb', dirname='preprocess/IMDB/')

In [None]:
# Register 'sentiment' as the only label; the label count and the string
# 'binary' are passed straight through to generate_labels.
labellist = ['sentiment']
data_imdb.generate_labels(labellist, len(labellist), 'binary')
# Attributes consumed later when configuring models (e.g. the LR config reads
# data_imdb.metrics_type).
data_imdb.save_on_metric = 'roc_auc'
data_imdb.metrics_type = 'classifier'

In [None]:
from dataloaders import readmission_dataset, diagnosis_dataset
# Build the readmission and diagnosis datasets; both loaders receive the
# parsed `args` object.
data_read = readmission_dataset(args)
data_diag = diagnosis_dataset(args)

In [None]:
# Fetch the unstructured data for IMDB and readmission, keeping only the first
# element of the returned pair.
# NOTE(review): get_basic_data is not imported by any earlier cell in view;
# presumably it comes from the `PatientVec.Experiments.hyperparam_exps` star
# import further down — confirm and make sure that import runs first.
# NOTE(review): truncate is 100 for IMDB but 95 for readmission; the unit of
# `truncate` is not visible here — confirm the difference is intentional.
train_data_imdb, _ = get_basic_data(data_imdb, structured=False, truncate=100)
train_data_read, _ = get_basic_data(data_read, structured=False, truncate=95)

In [None]:
# Same as the IMDB/readmission fetch, for the diagnosis dataset (truncate=95);
# only the first element of the returned pair is kept.
train_data_diag, _ = get_basic_data(data_diag, structured=False, truncate=95)

In [None]:
# Call generate_bowder on the IMDB training data (stop_words=True, norm=None),
# then store the 'bow' vector encoding of that data on its .X attribute.
data_imdb.generate_bowder(train_data_imdb, stop_words=True, norm=None)
train_data_imdb.X = data_imdb.get_vec_encoding(train_data_imdb, _type='bow')

In [None]:
# Call generate_bowder on the readmission training data (stop_words=True,
# norm=None), then store its 'bow' vector encoding on the .X attribute.
data_read.generate_bowder(train_data_read, stop_words=True, norm=None)
train_data_read.X = data_read.get_vec_encoding(train_data_read, _type='bow')

In [None]:
# Call generate_bowder on the diagnosis training data (stop_words=True,
# norm=None), then store its 'bow' vector encoding on the .X attribute.
data_diag.generate_bowder(train_data_diag, stop_words=True, norm=None)
train_data_diag.X = data_diag.get_vec_encoding(train_data_diag, _type='bow')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Overlay the kernel density estimate of X.mean(0) for the three datasets on a
# single explicit axis. Each curve is labelled so the figure can be read on its
# own; without labels the three overlaid curves cannot be told apart.
fig, ax = plt.subplots()
sns.kdeplot(train_data_imdb.X.mean(0), label='imdb', ax=ax)
sns.kdeplot(train_data_read.X.mean(0), label='readmission', ax=ax)
sns.kdeplot(train_data_diag.X.mean(0), label='diagnosis', ax=ax)
ax.set(title='Density of X.mean(0) per dataset', xlabel='X.mean(0)', ylabel='density')
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend();

In [None]:
from PatientVec.Experiments.hyperparam_exps import *

In [None]:
# Re-fetch the IMDB data, this time keeping both returned elements and using
# truncate=90. This rebinds train_data_imdb, replacing the object whose .X was
# set in the bag-of-words cell above.
train_data_imdb, dev_data_imdb = get_basic_data(data_imdb, structured=False, truncate=90)

In [None]:
# Build an `LR` model from a fresh config, train it on the IMDB training data
# and evaluate it on the dev data (saving results), once per loop iteration.
# NOTE(review): `l` is never read inside the body, so all seven iterations
# build the same config — confirm whether a per-iteration setting was intended.
for l in range(1, 8) :
    config = dict(
        vocab=data_imdb.vocab,
        stop_words=True,
        exp_name=data_imdb.name,
        type=data_imdb.metrics_type,
        norm=None,
    )
    lr = LR(config)
    lr.train(train_data_imdb)
    lr.evaluate(dev_data_imdb, save_results=True)

In [None]:
# Call generate_bowder with norm='l2' on the training data, then store the
# 'bow' vector encodings of the train and dev data on their .X attributes.
# NOTE(review): `data`, `train_data` and `dev_data` are not assigned by any
# earlier cell in view — this cell depends on kernel state from elsewhere;
# confirm which dataset they are meant to refer to.
data.generate_bowder(train_data, stop_words=True, norm='l2')
train_data.X = data.get_vec_encoding(train_data, _type='bow')
dev_data.X = data.get_vec_encoding(dev_data, _type='bow')

In [None]:
# Take the first entry of vector_configs and call it to build a config for
# `data` in structured mode.
# NOTE(review): vector_configs is not defined in any cell in view; presumably
# it is provided by the hyperparam_exps star import — confirm.
e = vector_configs[0]
config = e(data, structured=True, args=args)
# Regularisation weight is the reciprocal of X.shape[0]
# (presumably the number of training examples — confirm).
config['model']['reg']['weight'] = 1.0 / train_data.X.shape[0]
# Override the learning rate of the first training group.
config['training_config']['groups'][0][1]['lr'] = 0.01
# Bare expression: displays the final config as the cell output.
config

In [None]:
# Build a Trainer around VectorCT with the config above and train it, passing
# the dev data as the second argument and the dataset's save_on_metric.
# NOTE(review): Trainer and VectorCT are not imported by any earlier cell in
# view (Trainer is only imported explicitly further down) — confirm where they
# come from.
trainer = Trainer(VectorCT, config, _type=data.metrics_type, display_metrics=args.display)
trainer.train(train_data, dev_data, save_on_metric=data.save_on_metric)

In [None]:
# Build an Evaluator for VectorCT from the trained model's dirname and
# evaluate it on the dev data, saving results.
evaluator = Evaluator(VectorCT, trainer.model.dirname, _type=data.metrics_type, display_metrics=args.display)
# The return value is bound to `_` so it is not shown as the cell output.
_ = evaluator.evaluate(dev_data, save_results=True)

In [None]:
# Print every (name, value) pair yielded by the decoder's named_parameters().
for n, v in evaluator.model.model.decoder.named_parameters() :
    print(n, v)

In [None]:
# Row 0 of the weight of the first entry in the decoder's _linear_layers,
# moved to CPU and converted to a NumPy array.
weight = evaluator.model.model.decoder._linear_layers[0].weight[0].cpu().data.numpy()

In [None]:
# np.argsort sorts ascending, so idx holds the positions of the ten smallest
# entries of `weight`.
idx = np.argsort(weight)[:10]
# Bare expression: displays the positions together with their weights.
idx, weight[idx]

In [None]:
# Entries of data.bowder.words_to_keep at the positions in idx
# (presumably the vocabulary terms behind the selected weights — confirm that
# words_to_keep is aligned with the weight vector).
np.array(data.bowder.words_to_keep)[idx]

In [None]:
# Load the unstructured 'train' and 'test' data and pass each through
# filter_data_length with the value 95. This rebinds train_data.
# NOTE(review): the meaning of 95 (absolute length vs. percentile) is not
# visible here — confirm.
train_data = data.filter_data_length(data.get_data('train', structured=False), 95)
test_data = data.filter_data_length(data.get_data('test', structured=False), 95)

In [None]:
# Flatten the idx2word mapping into a list `vc` with vocab_size slots, placing
# each word at its index. Slots with no idx2word entry keep the placeholder 0.
vc = [0 for _ in range(data.vocab.vocab_size)]
for i in data.vocab.idx2word :
    v = data.vocab.idx2word[i]
    vc[i] = v

In [None]:
from PatientVec.Experiments.modifiable_config_exp import attention_configs
# Call the first entry of attention_configs to build a config for `data` in
# unstructured mode.
avg_attn_config = attention_configs[0](data, structured=False, args=args)

In [None]:
# Switch the config's embedder to the ELMo embedder: set its type, drop the
# 'embedding_file' entry, and attach the ELMo options (option/weight file
# paths, the vocabulary list `vc` to cache, and the scalar-mix parameters).
avg_attn_config['model']['embedder']['type'] = 'elmo_embedder'
del avg_attn_config['model']['embedder']['embedding_file']
avg_attn_config['model']['embedder']['elmo_options'] = dict(
    options_file='../../elmo_2x2048_256_2048cnn_1xhighway_options.json',
    weight_file='../../elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5',
    vocab_to_cache=vc,
    scalar_mix_parameters=[0.3, 0.3, 0.3],
)

In [None]:
# Override the 'bsize' entry of the common training settings with 64.
avg_attn_config['training_config']['common']['bsize'] = 64

In [None]:
from PatientVec.models.Vanilla import ClassificationTrainer as BasicCT
from PatientVec.trainer import Trainer, Evaluator

In [None]:
import logging.config

# Apply a minimal schema-version-1 logging configuration whose only setting is
# to disable the loggers that already exist.
logging.config.dictConfig(dict(version=1, disable_existing_loggers=True))

In [None]:
# Build a Trainer around BasicCT with the ELMo attention config and train it.
# The test data is passed as the second argument to train(), and the dataset's
# save_on_metric is forwarded. This rebinds `trainer`.
trainer = Trainer(BasicCT, avg_attn_config, _type=data.metrics_type, display_metrics=args.display)
trainer.train(train_data, test_data, save_on_metric=data.save_on_metric)

In [None]:
# Run the trained model's evaluate() on the result of test_data.mock(200).
# NOTE(review): mock(200) presumably yields a 200-example subset — confirm.
outputs = trainer.model.evaluate(test_data.mock(200))

In [None]:
# Print every (name, value) pair yielded by the trained model's
# named_parameters().
for n, v in trainer.model.model.named_parameters() :
    print(n, v)