This pipeline should generate a dataset that serves as an intermediary step to processing the data for training neural networks


Structure (json)

Each level is a patient

- A patient may contain several admissions
    - each admission has information:
        - codes
            - can be formatted in different ways
        - timestamps
        - clinical notes (todo)

# Data structure

(easier to visualize in md editor rather than md compiled)

{metadata : {groupings: \[list_of_icd9_groupings\] <br>
 data: { <br>
  patient_1: { <br>
   grouping1: \[code1,code2,code3\], <br>
   ... <br>
   groupingn: \[code1,code2,code3\] <br>
  }, <br>
  ..., <br>
  patient_n: {<br>
  ...<br>
  }<br>
}

# What is the advantage of having data stored in this way?


- data available on demand to input to models, no need to perform data retrieval and processing operations

In [2]:
from Mimic import Mimic
from ICDCodesGrouper import ICDCodesGrouper

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from torch.utils.data import Dataset
from torch import nn
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence, pack_sequence
import torch.nn.functional as F

import json

In [3]:
grouper = ICDCodesGrouper()
mimic = Mimic(grouper=grouper)

## Get patients eligible for modelling

In [4]:
eligible_patients = np.loadtxt('data/eligible_patients.txt',dtype=int)

## Build dictionary of data

In [5]:
adm = mimic.read_admissions().where(lambda df: df.SUBJECT_ID.isin(eligible_patients)).dropna(how='all')[['SUBJECT_ID','ADMITTIME','HADM_ID']]
adm.head(1)
adm.shape

Unnamed: 0,SUBJECT_ID,ADMITTIME,HADM_ID
25361,20957.0,2100-06-24 22:37:00,113808.0


(19917, 3)

In [6]:
# where it all begins
data = {}

# dumb, error prone way of getting only the columns of icd codings (eg.: icd9,ccs,chapters,etc)
grouping_columns = [col for col in mimic.read_diagnoses() if col not in ["ROW_ID","SUBJECT_ID","HADM_ID","SEQ_NUM"]]

for p in tqdm(eligible_patients):
    
    p = int(p)
    
    data[p] = {}
    
    p_adm_data = adm[adm.SUBJECT_ID == p]
    
    # sanity check that all admissions are sorted inside each patient data
    assert p_adm_data.ADMITTIME.is_monotonic_increasing, f'Oopsie, p={p}'
    
    for hadm in p_adm_data['HADM_ID']:
        #diagnoses data
        diag_data = mimic.get_diagnoses_for_admission(hadm)
        
        # if no information about diagnostics then ignore
        if diag_data.ICD9_CODE.isna().all():
            continue
        
        # sanity check that diagnostics are sorted (important for future experiments)
        assert diag_data.SEQ_NUM.is_monotonic_increasing, f'Oopsie, p={p}, hadm_id={hadm}'
        
        for grouping in grouping_columns:
            if grouping not in data[p].keys():
                data[p][grouping] = [diag_data[grouping].tolist()]
            else:
                data[p][grouping].append(diag_data[grouping].tolist())

HBox(children=(IntProgress(value=0, max=7499), HTML(value='')))




In [7]:
# add metadata of available groupings
metadata = {'groupings':grouping_columns}
data = {'metadata':metadata,'data':data}

# save

In [8]:
with open('data/model_data.json', 'w') as fp:
    json.dump(data, fp)

# Test

### Read

In [9]:
with open('data/model_data.json','r') as fp:
    data = json.load(fp)

### read all patient ids

In [10]:
patient_ids = list(data['data'].keys()) # patient id's
len(patient_ids)

7499

### get a patient with 4 admissions

In [None]:
### for e in patient_ids:
    if len(data['data'][e]['ICD9_CODE']) == 4:
        print('found')
        break
data['data'][e]['ICD9_CODE']

## One hot encode

In [None]:
coding = 'ccs'
new_data = [pat[coding] for pat in data['data']]

In [7]:
data

{'metadata': {'groupings': ['ICD9_CODE', 'ccs', 'icd9chapters']},
 'data': {'17': {'ICD9_CODE': [['7455', '45829', 'V1259', '2724'],
    ['4239', '5119', '78551', '4589', '311', '7220', '71946', '2724']],
   'ccs': [[213, 238, 117, 53], [97, 130, 249, 117, 657, 205, 204, 53]],
   'icd9chapters': [[14, 7, 18, 3], [7, 8, 16, 7, 5, 13, 13, 3]]},
  '21': {'ICD9_CODE': [['41071',
     '78551',
     '5781',
     '5849',
     '40391',
     '4280',
     '4592',
     '5070',
     '42731',
     '4271',
     '41401',
     '25000',
     '28521',
     '1122',
     '2720',
     '2749',
     'V1046',
     '43889'],
    ['0388',
     '78552',
     '40391',
     '42731',
     '70709',
     '5119',
     '6823',
     '99859',
     '00845',
     '5720',
     '99592',
     'V0980',
     '25000',
     '2859',
     '43889',
     '2749',
     '41401',
     '185',
     '4439',
     '2449',
     'E8788']],
   'ccs': [[100,
     249,
     153,
     157,
     99,
     108,
     121,
     129,
     106,
     106,


# Testing

In [2]:
from torch.utils.data import Dataset
class DiagnosesDataset(Dataset):
    def __init__(self, diagnoses_file,
                 grouping='ccs' # desired grouping to use (for both input and output currently)
                ):
        
        # load admissions data
        with open(diagnoses_file,'r') as fp:
            self.data = json.load(fp)
        
        # list patients
        self.patients = list(self.data['data'].keys())
        
        self.grouping = grouping
        
        # necessary data of each code_grouping (eg. ccs, chapters) for posterior padding and one_hot_encoding of batches
        self.grouping_data = {}
        for grouping_code in self.data['metadata']['groupings']:
            self.grouping_data[grouping_code] = {}
            
            # get all codes of this group
            all_data_grouping = [self.data['data'][pat][grouping_code] for pat in self.data['data']]
            
            #flatten list of lists of lists
            all_data_grouping = [item for sublist in all_data_grouping for item in sublist]
            all_data_grouping = [item for sublist in all_data_grouping for item in sublist]
            
            # store n_labels this group
            self.grouping_data[grouping_code]['n_labels'] = len(set(all_data_grouping))
            
            # store unique sorted codes from dataset
            self.grouping_data[grouping_code]['sorted'] = sorted(set(all_data_grouping))
            
            # store code2int & int2code
            int2code = dict(enumerate(self.grouping_data[grouping_code]['sorted']))
            code2int = {ch: ii for ii, ch in int2code.items()}
            
            self.grouping_data[grouping_code]['int2code'] = int2code
            self.grouping_data[grouping_code]['code2int'] = code2int
        
    def __str__(self):
        return 'Available groupings: ' +str(self.data['metadata']['groupings'])

    def __len__(self):
        return len(self.data['data'])

    def __getitem__(self, idx):
        """
        gets original converted from int2code
        """
        patient_data = self.data['data'][self.patients[idx]][self.grouping]
        
        # convert codes to code2int
        #patient_data_2int = [[self.grouping_data[self.grouping]['code2int'][code] for code in adm] for adm in patient_data]
        
        train = patient_data[:-1]
        target = patient_data[1:]
        
        return {'train':train,'target':target}
    
                
                
# TEST
dataset = DiagnosesDataset('data/model_data.json')
#test: get first patient's train,test
dataset[0]

from torch.utils.data import DataLoader
dl = DataLoader(dataset,
                batch_size=3,
                collate_fn = lambda x:x)

dl_iter = iter(dl)

{'train': [[213, 238, 117, 53]],
 'target': [[97, 130, 249, 117, 657, 205, 204, 53]]}

In [5]:
class MYCOLLATE:
    def __init__(self,dataset):
        self.dataset = dataset
    
    def __call__(self,batch):
        patients = {'train':[],'target':[]}
        
        grouping_code = self.dataset.grouping
        n_labels = self.dataset.grouping_data[grouping_code]['n_labels']
        code2int = self.dataset.grouping_data[grouping_code]['code2int']
        
        # <Nº admissions - 1> of each patient
        seq_lengths = []
        for pat in batch:
            train_admissions = []
            target_admissions = []
            seq_lengths.append(len(pat))

            # convert each train admission into a multi-hot vector
            for train_admission in pat['train']:
                admission = (F.one_hot(torch.tensor(list(map(lambda code: code2int[code],train_admission))),num_classes=n_labels)
                             .sum(dim=0).float() #one-hot of each diagnose to multi-hot vector of diagnoses
                            )
                train_admissions.append(admission)

            # convert each target admission into a one-hot vector
            for target_admission in pat['target']:
                # convert each admission to multi-hot vector
                admission = (F.one_hot(torch.tensor(list(map(lambda code: code2int[code],target_admission))),num_classes=n_labels)
                             .sum(dim=0).float() #one-hot of each diagnose to multi-hot vector of diagnoses
                            )
                target_admissions.append(admission)

            # stack multiple train admissions of a single patient into a single tensor
            if len(train_admissions) > 1:
                train_admissions = torch.stack(train_admissions)
            else:
                train_admissions = train_admissions[0].view((1,-1))

            # stack multiple target admissions of a single patient into a single tensor
            if len(target_admissions) > 1:
                target_admissions = torch.stack(target_admissions)
            else:
                target_admissions = target_admissions[0].view((1,-1))

            # store final train and test tensors
            patients['train'].append(train_admissions)
            patients['target'].append(target_admissions)

        # pad sequences (some patients have more admissions than others)
        patients['train'] = pack_sequence(patients['train'],enforce_sorted=False)
        patients['target'] = pack_sequence(patients['target'],enforce_sorted=False)
        
        # pack the padded sequences
        
        #patients['train'] = pack_padded_sequence(patients['train'],lengths=seq_lengths,batch_first=True)
        #patients['target'] = pack_padded_sequence(patients['target'],lengths=seq_lengths,batch_first=True)
        return {'train_sequence':patients['train'],'target_sequence':patients['target']}

dl = DataLoader(dataset,batch_size=4,collate_fn=MYCOLLATE(dataset))

dl_iter = iter(dl)

In [6]:
batch = next(dl_iter)

In [7]:
batch['train_sequence']

PackedSequence(data=tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), batch_sizes=tensor([4]), sorted_indices=tensor([0, 1, 2, 3]), unsorted_indices=tensor([0, 1, 2, 3]))

In [8]:
sequences,lens = pad_packed_sequence(batch['train_sequence'],batch_first=True)

In [9]:
dataset.grouping_data[dataset.grouping]['n_labels']

272

In [10]:
input_size = dataset.grouping_data[dataset.grouping]['n_labels'] # vocab
hidden_size = 50 # 
num_layers = 1
rnn = nn.RNN(input_size=input_size,
             hidden_size=hidden_size,
             num_layers=num_layers
            )

In [11]:
for data in iter(dl):
    res = rnn.forward(data['train_sequence'])
    breaka

In [None]:
class RNN(nn.Module):
    
    def __init__(self,input_size,hidden_size,num_layers,n_labels):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = hum_layers
        
        self.rnn = nn.RNN(input_size=input_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers
                         )
        
        self.fc = nn.Linear(in_features = hidden_size,
                            out_features=n_labels
                           )
        self.

In [16]:
res[1].shape

torch.Size([1, 4, 50])