In [11]:
import os
cwd = os.getcwd()

# protection against running this cell multiple times
assert os.path.dirname(cwd).split('/')[-1] == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# change working directory (if assert passed)
new_cwd = os.path.dirname(cwd) # parent directory
os.chdir(new_cwd)

AssertionError: Oops, directory already changed previously as indended. Ignoring...

In [12]:
import sys
import os
this_dir = ".."
if this_dir not in sys.path:
    sys.path.append(this_dir)

from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
from rnn_utils import train_one_epoch, eval_model

from mourga_variational.variational_rnn import VariationalRNN

import torch
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.model_selection import ParameterGrid, ParameterSampler

import numpy as np

from config import Settings; settings = Settings()

import wandb

# Reproducibility

In [13]:
# Reproducibility
seed = settings.random_seed

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

<torch._C.Generator at 0x10ef5afb0>

# Create dataset

In [14]:
dataset_id = 'diag_only'
grouping = 'ccs'
batch_size=64

In [15]:
dataset_folder = os.path.join(settings.data_base,settings.model_ready_dataset_folder,dataset_id)
print('dataset at',dataset_folder)

dataset = DiagnosesDataset(os.path.join(dataset_folder,'dataset.json'),grouping)

train_dataset = DiagnosesDataset(os.path.join(dataset_folder,'train_subset.json'),grouping)
val_dataset = DiagnosesDataset(os.path.join(dataset_folder,'val_subset.json'),grouping)
test_dataset = DiagnosesDataset(os.path.join(dataset_folder,'test_subset.json'),grouping)


len(train_dataset)
len(val_dataset)
len(test_dataset)


train_dataloader = DataLoader(train_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset),shuffle=True)
val_dataloader = DataLoader(val_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset)) #batch_size here is arbitrary and doesn't affect total validation speed
test_dataloader = DataLoader(test_dataset,batch_size=batch_size,collate_fn=MYCOLLATE(dataset))

dataset at data/model_ready_dataset/diag_only


5249

1125

1125

# Define model

## Hyperparameters

In [16]:
input_size = next(iter(train_dataloader))['target_sequences']['sequence'].shape[2]
hidden_size = 100
num_layers = 1
n_labels = input_size
model_type = 'gru'

# Now for wandb

In [17]:
wandb.login()



True

## define data and train conditions

In [18]:
criterion = torch.nn.BCEWithLogitsLoss(reduction='none')

## Define search space

In [19]:
hyperparameters = {
    'hidden_size':[50,75,100], #-1 is to have the same as input size
    'num_layers':[1,2],
    'lr':[0.01,0.02],
    'model':['rnn','gru','lstm'],
    'dropout':[0.1,0.2],
    'batch_size':[64],
    'name':['variational']
}

meta_parameters = {
    'epochs':10
}

params = ParameterGrid(hyperparameters)
print(f'params:',len(params))

random_params = ParameterSampler(params.param_grid,n_iter=len(params)-1,random_state=231)
next(iter(random_params))

params: 72


{'num_layers': 2,
 'name': 'variational',
 'model': 'rnn',
 'lr': 0.02,
 'hidden_size': 100,
 'dropout': 0.2,
 'batch_size': 64}

wandb: Network error (ConnectionError), entering retry loop.


## Run models

In [10]:
for param_set in params:
    config = {**param_set, 
              **meta_parameters}
    
    wandb.init(
        project="thesis_mc_dropout_model_tunning", 
        config=config
    )
    
    model = VariationalRNN(input_size=input_size,
                          hidden_size=config['hidden_size'],
                          num_layers=config['num_layers'],
                          n_labels=n_labels,
                          rnn_type=config['model'],
                          dropouti=config['dropout'],
                          dropoutw=config['dropout'],
                          dropouto=config['dropout'])
    
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    
    for epoch in range(1,config['epochs']+1):
        loss = train_one_epoch(model,train_dataloader,epoch,criterion,optimizer)
        
        wandb.log({'epoch':epoch,'loss':loss})
        
    val_loss, val_metrics = eval_model(model,val_dataloader,dataset,metrics=['roc','f1'])[1].filter(regex='_adm')
    val_metrics.index = ['val_' + n for n in val_metrics.index]
    
    log = val_metrics.to_dict()
    log.update({'loss':val_loss})
   
    wandb.log({'val_loss':log['loss'],
               'val_roc':log['recall@roc_adm'],
               'val_f1':log['f1_adm'],
              })

ValueError: Target size (torch.Size([64, 33, 272])) must be the same as input size (torch.Size([15, 64, 33, 272]))

wandb: Network error (ConnectionError), entering retry loop.
wandb: Network error (ConnectionError), entering retry loop.
