In [7]:
import os
from rnn_utils import DiagnosesDataset, split_dataset, MYCOLLATE
from rnn_utils import RNN, train_one_epoch, eval_model

import torch
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.model_selection import ParameterGrid, ParameterSampler

import numpy as np

from config import Settings; settings = Settings()

import wandb

# Parameters

In [14]:
# Name of the model-ready dataset folder to load.
dataset_id = 'diag_only'

# Data-loading / model settings.
batch_size = 64
grouping = 'ccs'  # grouping argument handed to DiagnosesDataset

# Reproducibility

In [15]:
# Reproducibility: pin cuDNN to deterministic kernels (no auto-tuning), then
# seed every random number generator used here with the seed from Settings.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

seed = settings.random_seed

torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Create dataset

In [16]:
# Folder holding the model-ready files for the selected `dataset_id`.
dataset_folder = os.path.join(
    settings.data_base,
    settings.model_ready_dataset_folder,
    dataset_id,
)
print('dataset at', dataset_folder)

dataset at data/model_ready_dataset/diag_only


In [17]:
# Full dataset; it is the object handed to MYCOLLATE for every dataloader below.
dataset = DiagnosesDataset(os.path.join(dataset_folder, 'dataset.json'), grouping)

# Train / validation / test subsets, each read from its own JSON file in the
# same folder as the full dataset.
train_dataset = DiagnosesDataset(os.path.join(dataset_folder, 'train_subset.json'), grouping)
val_dataset = DiagnosesDataset(os.path.join(dataset_folder, 'val_subset.json'), grouping)
test_dataset = DiagnosesDataset(os.path.join(dataset_folder, 'test_subset.json'), grouping)

# Report the split sizes explicitly: a bare `len(...)` expression is only
# displayed by Jupyter when it is the last expression of the cell.
print('train:', len(train_dataset),
      '| val:', len(val_dataset),
      '| test:', len(test_dataset))

# Only the training loader is shuffled.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              collate_fn=MYCOLLATE(dataset), shuffle=True)
# batch_size here is arbitrary and doesn't affect total validation speed
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                            collate_fn=MYCOLLATE(dataset))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                             collate_fn=MYCOLLATE(dataset))

# wandb

In [18]:
# Authenticate with Weights & Biases before any run is created; the call's
# return value is shown as the cell output.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msnovaisg[0m (use `wandb login --relogin` to force relogin)


True

## Define data and training conditions

In [20]:
# Take the size of the third axis of the target sequences from one training
# batch; the same value is used as model input width and as number of labels.
input_size = next(iter(train_dataloader))['target_sequences']['sequence'].shape[2]
n_labels = input_size

# Loss applied to the raw model outputs (logits).
criterion = torch.nn.BCEWithLogitsLoss()

## Define search space

In [26]:
# Hyperparameters explored exhaustively by the grid search.
# NOTE(review): an earlier comment said a hidden_size of -1 means "same as
# input size"; -1 is not part of this grid -- confirm whether RNN supports it.
hyperparameters = dict(
    hidden_size=[25, 50, 100, 150],
    num_layers=[1, 2],
    lr=[0.01, 0.02, 0.03],
    model=['rnn', 'gru', 'lstm'],
)

# Settings shared by every run of the search.
meta_parameters = dict(epochs=15)

params = ParameterGrid(hyperparameters)
print('params:', len(params))

# A random search over the same space could be built with
# ParameterSampler(params.param_grid, n_iter=..., random_state=...) instead.

params: 72


## Run models

In [11]:
# Grid search: train one model per hyperparameter combination and track each
# combination as its own W&B run.
for param_set in params:
    # Merge the sampled hyperparameters with the settings shared by all runs.
    config = {**param_set, **meta_parameters}

    # Use the run as a context manager so it is always finished before the
    # next configuration starts, even if training raises or is interrupted.
    with wandb.init(project="prognosis_modelling", config=config) as run:
        model = RNN(
            input_size=input_size,
            hidden_size=config['hidden_size'],
            num_layers=config['num_layers'],
            n_labels=n_labels,
            model=config['model'],
        )

        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

        # Log the training loss once per epoch (epochs are numbered from 1).
        for epoch in range(1, config['epochs'] + 1):
            loss = train_one_epoch(model, train_dataloader, epoch, criterion, optimizer)
            run.log({"loss": loss})

        # Evaluate on validation data once, after the final epoch. The epoch
        # number is taken from the config rather than from the loop variable,
        # which would be undefined (or stale) if no epoch had run.
        val_results = eval_model(model, val_dataloader, dataset, criterion,
                                 config['epochs'], 'validation')
        run.log({
            'val_loss': val_results['loss'],
            'recall@10': val_results['last adm']['recall10']['mean'],
            'recall@20': val_results['last adm']['recall20']['mean'],
            'recall@30': val_results['last adm']['recall30']['mean'],
        })

[34m[1mwandb[0m: Network error (ConnectionError), entering retry loop.


Problem at: <ipython-input-11-10f269797694> 5 <module>


KeyboardInterrupt: 

# Best model

- lr: 0.03
- model: 'lstm'
- num_layers: 1
- hidden_size: 50
- epochs: 15