In [1]:
import os

# Name of the repository root directory; the rest of the notebook expects to
# run with this directory as its working directory.
PROJECT_DIR_NAME = 'master-thesis'

cwd = os.getcwd()
new_cwd = os.path.dirname(cwd) # parent directory

# Protection against running this cell multiple times: only step up one level
# while the parent directory is the repository root. A plain `if` is used
# instead of `assert` so that re-running the cell is a harmless no-op (an
# assert would abort "Run All" and is stripped entirely under `python -O`).
# `os.path.basename` is used instead of splitting on '/' so the check does not
# depend on the platform's path separator.
if os.path.basename(new_cwd) == PROJECT_DIR_NAME:
    os.chdir(new_cwd)
else:
    print('Oops, directory already changed previously as indended. Ignoring...')

In [2]:
import os
from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
from rnn_utils import RNN, train_one_epoch, eval_model, compute_loss

import torch
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.model_selection import ParameterGrid, ParameterSampler

import numpy as np

# Project-wide settings object; later cells read `random_seed`, `data_base`
# and `model_ready_dataset_folder` from it.
from config import Settings
settings = Settings()

import wandb

# Parameters

In [9]:
# Location of the dataset, relative to the model-ready dataset folder.
dataset_id = 'diag_only/mimic_iv_quick_baseline_dataset'

# Model / data-loading options.
grouping = 'ccs'    # forwarded to DiagnosesDataset as its second argument
batch_size = 64     # shared by the train, validation and test DataLoaders

# Reproducibility

In [10]:
# Reproducibility: seed every random number generator used by this notebook
# with the single seed configured in the project settings.
seed = settings.random_seed

for seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed)

# Request deterministic cuDNN kernels and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Create dataset

In [17]:
# Datasets live under <data_base>/<model_ready_dataset_folder>/<dataset_id>.
dataset_folder = os.path.join(
    settings.data_base,
    settings.model_ready_dataset_folder,
    dataset_id,
)
print('dataset at', dataset_folder)

dataset at data/model_ready_dataset/diag_only/mimic_iv_quick_baseline_dataset


In [18]:
# Full dataset. It is also what every collate function below is built from.
dataset = DiagnosesDataset(os.path.join(dataset_folder,'mimic_iv_quick_baseline_dataset.json'),grouping)

# Pre-computed train / validation / test subsets stored next to the full dataset.
train_dataset = DiagnosesDataset(os.path.join(dataset_folder,'train_subset.json'),grouping)
val_dataset = DiagnosesDataset(os.path.join(dataset_folder,'val_subset.json'),grouping)
test_dataset = DiagnosesDataset(os.path.join(dataset_folder,'test_subset.json'),grouping)

# Report the size of each split. A bare `len(...)` expression in the middle of
# a cell is evaluated and thrown away (only the last expression of a cell is
# displayed), so the sizes are printed explicitly.
for split_name, split in (('train', train_dataset),
                          ('val', val_dataset),
                          ('test', test_dataset)):
    print(split_name, 'size:', len(split))

# Only the training loader shuffles. Each loader gets its own collate instance,
# all of them built from the full dataset.
train_dataloader = DataLoader(train_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset),shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset)) #batch_size here is arbitrary and doesn't affect total validation speed
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset))

# wandb

In [19]:
# Authenticate this session with Weights & Biases before any run is created.
# Per the recorded output below, this prompts for an API key when none is
# stored and appends it to the user's netrc file.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/debian/.netrc


True

## Define data and train conditions

In [20]:
# Pull a single training batch and read the label dimension off its target
# sequences; the same number is used as the model's input size.
# NOTE(review): assumes axis 2 of the target tensor is the label axis - confirm
# against the collate function.
first_batch = next(iter(train_dataloader))
n_labels = input_size = first_batch['target_sequences']['sequence'].shape[2]

# reduction='none' keeps the loss un-aggregated (one value per element).
criterion = torch.nn.BCEWithLogitsLoss(reduction='none')

## Define search space

In [21]:
# Grid of model hyperparameters to sweep over (every combination is tried).
hyperparameters = dict(
    hidden_size=[50, 75, 100],
    num_layers=[2],
    lr=[0.01, 0.02, 0.03],
    model=['rnn', 'gru', 'lstm'],
)

# Settings shared by every run of the sweep.
meta_parameters = dict(epochs=10)

params = ParameterGrid(hyperparameters)
print('params:', len(params))

# Alternative kept for reference: sample the grid at random instead of
# enumerating it exhaustively.
#random_params = ParameterSampler(params.param_grid,n_iter=len(params)-1,random_state=231)
#next(iter(random_params))

params: 27


In [28]:
# Debugging aid: grab the input sequences of one training batch for inspection.
sample_batch = next(iter(train_dataloader))
a = sample_batch['train_sequences']['sequence']


In [29]:
# Show the `batch_sizes` attribute of the sampled input sequences.
# NOTE(review): presumably `a` is a packed sequence, so each entry is the number
# of sequences still active at that time step - confirm against the collate function.
a.batch_sizes

tensor([64, 30, 18, 12, 11,  6,  5,  5,  5,  5,  5,  3,  2,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

## Run models

In [30]:
# Train and evaluate one model per hyperparameter combination, logging each
# combination as its own W&B run.
#
# NOTE(review): the recorded output of this cell is
# "TypeError: RNN.forward() takes 2 positional arguments but 3 were given".
# RNN.forward is not called directly here, so the mismatch presumably sits in
# the rnn_utils helpers (train_one_epoch / eval_model) - fix it there.
for param_set in params:
    config = {**param_set,
              **meta_parameters}

    # Using the run as a context manager finishes it when the block exits,
    # including when training raises, so consecutive configurations never
    # leak into each other's run and no run is left dangling.
    with wandb.init(
        project="basic_deterministic_model_tunning",
        config=config
    ):
        model = RNN(input_size=input_size,
                    hidden_size=config['hidden_size'],
                    num_layers=config['num_layers'],
                    n_labels=n_labels,
                    model=config['model'])

        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

        # Training: log the loss returned by every epoch.
        for epoch in range(1,config['epochs']+1):
            loss = train_one_epoch(model,train_dataloader,epoch,criterion,optimizer)

            wandb.log({'epoch':epoch,'loss':loss})

        # Evaluation on train and validation data: take the second element of
        # eval_model's result and keep only the entries whose name matches '_adm'.
        train_metrics = eval_model(model,train_dataloader,dataset,metrics=['roc','f1'])[1].filter(regex='_adm')
        val_metrics = eval_model(model,val_dataloader,dataset,metrics=['roc','f1'])[1].filter(regex='_adm')

        # Prefix the metric names so both splits can share one log entry.
        train_metrics.index = ['train_' + n for n in train_metrics.index]
        val_metrics.index = ['val_' + n for n in val_metrics.index]

        # Final summary entry: all metrics plus the loss of the last epoch.
        log = dict()

        log.update(train_metrics.to_dict())
        log.update(val_metrics.to_dict())
        log.update({'loss':loss})

        wandb.log(log)

VBox(children=(Label(value='0.034 MB of 0.034 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

TypeError: RNN.forward() takes 2 positional arguments but 3 were given

# best model

- name: silver-salad-21
- lr: 0.01
- model_type: lstm
- num_layers: 1
- hidden_size: 75
- epochs: 15

In [None]:
# Second search grid: same learning rates and model types as the first one,
# with hidden sizes 25 / 50 / 75.
hyperparameters = dict(
    hidden_size=[25, 50, 75],
    num_layers=[2],
    lr=[0.01, 0.02, 0.03],
    model=['rnn', 'gru', 'lstm'],
)

In [13]:
# Display the shared run settings currently held by the kernel.
# NOTE(review): the recorded output shows {'epochs': 15}, while the only
# definition of meta_parameters in this notebook sets 'epochs' to 10 - the
# displayed value comes from kernel state that a fresh run will not reproduce.
meta_parameters

{'epochs': 15}

In [16]:
# Configuration of the best run listed above, logged as a new W&B run.
param_set = {
    'hidden_size': 75,
    'num_layers': 1,
    'lr': 0.01,
    'model_type': 'lstm',
    'epochs': 15,
    'mode': 'metrics',
}
wandb.init(project="basic_deterministic_model_tunning", config=param_set)




In [19]:
# Re-train the best configuration described by `param_set`.
model = RNN(input_size=input_size,
            hidden_size=param_set['hidden_size'],
            num_layers=param_set['num_layers'],
            n_labels=n_labels,
            model=param_set['model_type'])
optimizer = torch.optim.Adam(model.parameters(), lr=param_set['lr'])

# The number of epochs is read from `param_set`, like every other setting of
# this cell. It used to come from `config`, a variable left over from the
# sweep cell that may be undefined or describe a different run.
for epoch in range(1,param_set['epochs']+1):
    loss = train_one_epoch(model,train_dataloader,epoch,criterion,optimizer)

In [None]:
# Train and evaluate a single configuration (`param_set`) and log it as one
# W&B run. This is the body of the sweep loop, de-indented so the cell is
# valid on its own.
#
# Values in meta_parameters take precedence over param_set, so
# meta_parameters['epochs'] wins over any 'epochs' stored in param_set.
config = {**param_set,
          **meta_parameters}

# The notebook spells the model-type key both ways ('model' in the search
# grid, 'model_type' in the hand-written best configuration); accept either.
model_kind = config['model_type'] if 'model_type' in config else config['model']

# Using the run as a context manager finishes it when the block exits,
# including when training raises.
with wandb.init(
    project="basic_deterministic_model_tunning",
    config=config
):
    model = RNN(input_size=input_size,
                hidden_size=config['hidden_size'],
                num_layers=config['num_layers'],
                n_labels=n_labels,
                model=model_kind)

    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

    # Training: log the loss returned by every epoch.
    for epoch in range(1,config['epochs']+1):
        loss = train_one_epoch(model,train_dataloader,epoch,criterion,optimizer)

        wandb.log({'epoch':epoch,'loss':loss})

    # Evaluation on train and validation data: take the second element of
    # eval_model's result and keep only the entries whose name matches '_adm'.
    train_metrics = eval_model(model,train_dataloader,dataset,metrics=['roc','f1'])[1].filter(regex='_adm')
    val_metrics = eval_model(model,val_dataloader,dataset,metrics=['roc','f1'])[1].filter(regex='_adm')

    # Prefix the metric names so both splits can share one log entry.
    train_metrics.index = ['train_' + n for n in train_metrics.index]
    val_metrics.index = ['val_' + n for n in val_metrics.index]

    # Final summary entry: all metrics plus the loss of the last epoch.
    log = dict()

    log.update(train_metrics.to_dict())
    log.update(val_metrics.to_dict())
    log.update({'loss':loss})

    wandb.log(log)