In [None]:
import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import functional as F
pd.set_option('future.no_silent_downcasting',True)

class MimicDataSetPhenotype(Dataset):
    """Map-style dataset that flattens one stay's time-series CSV into padded triplets.

    Each item reads the CSV named in the ``stay`` column of the listfile, turns
    every non-missing cell into a ``(time, value, variable index)`` observation,
    keeps the last ``seq_len`` observations and right-pads shorter sequences
    with ``pad_value``.

    Args:
        data_dir: Prefix that is string-concatenated with each ``stay`` entry,
            so it has to end with a path separator.
        csv_file: Path of the listfile; read once with ``pd.read_csv``.
        mean_variance: ``{column name: {'mean': ..., 'variance': ...}}`` for
            every non-categorical column.
        cat_dict: ``{column name: {raw value: numeric code}}`` for the
            categorical columns.
        mode: Stored on the instance; not used by this class.
        seq_len: Fixed length of every returned sequence.
        pad_value: Filler for all three channels. The mask marks positions
            whose variable index differs from it.
        device: Device every returned tensor is moved to.
    """

    # Raw cell contents that are treated as missing measurements.
    _MISSING_MARKERS = ['ERROR','no data','.','-','/','VERIFIED','CLOTTED',"*",'ERROR DISREGARD PREVIOUS RESULT OF 32','DISREGARD PREVIOUSLY REPORTED 33']

    def __init__(self, data_dir, csv_file, mean_variance , cat_dict, mode, seq_len, pad_value = 0, device = DEVICE):
        super().__init__()
        self.data_dir = data_dir
        self.csv_file = csv_file
        self.seq_len = seq_len
        self.mode = mode
        self.data_df = pd.read_csv(csv_file)
        self.mean_variance = mean_variance
        self.pad_value = pad_value
        self.device = device
        self.cat_dict = cat_dict

    def __len__(self):
        """Number of stays listed in the listfile."""
        return len(self.data_df)

    def __getitem__(self, idx):
        """Load stay ``idx`` and return its padded model inputs.

        Returns:
            dict with
            ``encoder_input``: ``[time, variable index, value]`` tensors, each of
            length ``seq_len`` (times are shifted so the earliest kept
            observation is 0);
            ``encoder_mask``: int tensor of shape ``(1, seq_len)``, 1 where the
            variable index is not ``pad_value``;
            ``variables``: column name per position (``'pad token'`` for padding);
            ``label``: int64 scalar, the argmax over the listfile columns from
            the third one onwards.

        Raises:
            KeyError: if a non-missing cell belongs to a column absent from
                ``mean_variance`` or holds a category missing from ``cat_dict``.
        """
        path = self.data_dir + self.data_df['stay'][idx]
        data = pd.read_csv(path)
        data.replace(self._MISSING_MARKERS, np.nan, inplace=True)
        id_name_dict = dict(enumerate(data.columns))
        sample = self.extract(data.values, id_name_dict)
        if len(sample[0]) >= self.seq_len :
            # Keep only the most recent seq_len observations.
            sample = [channel[-self.seq_len:] for channel in sample]
        times, values, variable_ids, variable_names = sample
        num_padd_tokens = self.seq_len - len(times)

        variable_input = self._pad_channel(variable_ids, torch.int64, num_padd_tokens)
        value_input = self._pad_channel(values, torch.float, num_padd_tokens)

        time_tensor = torch.tensor(times, dtype=torch.float)
        if time_tensor.numel() > 0:
            # min() of an empty tensor raises, so a stay without any usable
            # observation is left as-is and becomes an all-padding sequence.
            time_tensor = time_tensor - time_tensor.min()
        time_input = torch.cat([
            time_tensor,
            torch.tensor([self.pad_value]*num_padd_tokens, dtype=torch.float)
        ])
        variables = variable_names + ['pad token']*num_padd_tokens

        assert variable_input.size(0) == self.seq_len
        assert value_input.size(0) == self.seq_len
        assert time_input.size(0) == self.seq_len
        cols = self.data_df.columns[2:]
        # NOTE: tensors are moved to self.device per item, inside __getitem__.
        return {
            "encoder_input" : [time_input.to(self.device), variable_input.to(self.device), value_input.to(self.device)],
            "encoder_mask": (variable_input != self.pad_value).unsqueeze(0).int().to(self.device),
            "variables" : variables,
            "label" : torch.tensor(self.data_df[cols].values[idx].argmax(), dtype=torch.int64).to(self.device)
        }

    def _pad_channel(self, items, dtype, num_padd_tokens):
        """Convert ``items`` to a tensor of ``dtype`` and append ``num_padd_tokens`` pad values."""
        return torch.cat([
            torch.tensor(items, dtype=dtype),
            torch.tensor([self.pad_value]*num_padd_tokens, dtype=dtype)
        ])

    def extract(self, values, id_name_dict):
        """Flatten a 2-D value table into four parallel observation lists.

        Args:
            values: 2-D array; column 0 holds the timestamp of each row.
            id_name_dict: ``{column index: column name}`` for ``values``.

        Returns:
            ``[times, values, column indices, column names]`` with one entry per
            non-missing cell, in row-major order. Categorical cells are mapped
            through ``cat_dict``; all others become
            ``(float(cell) - mean) / variance``.
        """
        sample = [[],[],[],[]]
        for row in values:
            time = row[0]
            for j in range(1, values.shape[1]):
                raw = row[j]
                if self.isNAN(raw):
                    continue
                name = id_name_dict[j]
                if name in self.cat_dict:
                    val = self.cat_dict[name][raw]
                else:
                    stats = self.mean_variance[name]
                    # NOTE(review): scales by the variance, not its square
                    # root - confirm this is the intended normalisation.
                    val = (float(raw) - stats['mean'])/stats['variance']
                sample[0].append(time)
                sample[1].append(val)
                sample[2].append(j)
                sample[3].append(name)
        return sample

    def isNAN(self, val):
        """True for NaN: the only values that compare unequal to themselves."""
        return val!=val

from sklearn.metrics import roc_auc_score
from tqdm import tqdm

def calculate_roc_auc(model, data_loader):
    """Compute ROC-AUC of ``model`` over every batch in ``data_loader``.

    The model is called as ``model(batch['encoder_input'], batch['encoder_mask'])``;
    a sigmoid is applied to its output and the result is scored against
    ``batch['label']`` with ``sklearn.metrics.roc_auc_score``.

    The model is evaluated in eval mode with gradients disabled, and its
    previous train/eval mode is restored afterwards so that calling this
    between training epochs does not silently leave dropout switched off.

    Args:
        model: Module called with the batch's encoder input and mask.
        data_loader: Iterable of batches shaped like the dataset's item dict.

    Returns:
        The ROC-AUC score returned by ``roc_auc_score``.
    """
    was_training = getattr(model, 'training', False)
    model.eval()
    batch_probabilities = []
    batch_labels = []

    try:
        with torch.no_grad():
            for inputs in tqdm(data_loader, leave=False):
                outputs = model(inputs['encoder_input'], inputs['encoder_mask'])
                # Sigmoid turns the raw model output into probabilities.
                probabilities = torch.sigmoid(outputs)

                batch_probabilities.append(probabilities.cpu().numpy())
                batch_labels.append(inputs['label'].cpu().numpy())
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    probabilities_all = np.concatenate(batch_probabilities)
    labels_all = np.concatenate(batch_labels)

    return roc_auc_score(labels_all, probabilities_all)

from sklearn.metrics import average_precision_score
from tqdm import tqdm

def calculate_auc_prc(model, data_loader):
    """Compute average precision (area under the PR curve) over ``data_loader``.

    The model is called as ``model(batch['encoder_input'], batch['encoder_mask'])``;
    a sigmoid is applied to its output and the result is scored against
    ``batch['label']`` with ``sklearn.metrics.average_precision_score``.

    The model is evaluated in eval mode with gradients disabled, and its
    previous train/eval mode is restored afterwards so that calling this
    between training epochs does not silently leave dropout switched off.

    Args:
        model: Module called with the batch's encoder input and mask.
        data_loader: Iterable of batches shaped like the dataset's item dict.

    Returns:
        The score returned by ``average_precision_score``.
    """
    was_training = getattr(model, 'training', False)
    model.eval()
    batch_probabilities = []
    batch_labels = []

    try:
        with torch.no_grad():
            for inputs in tqdm(data_loader, leave=False):
                outputs = model(inputs['encoder_input'], inputs['encoder_mask'])
                # Sigmoid turns the raw model output into probabilities.
                probabilities = torch.sigmoid(outputs)

                batch_probabilities.append(probabilities.cpu().numpy())
                batch_labels.append(inputs['label'].cpu().numpy())
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    probabilities_all = np.concatenate(batch_probabilities)
    labels_all = np.concatenate(batch_labels)

    return average_precision_score(labels_all, probabilities_all)


In [None]:
import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import functional as F
from tqdm import tqdm
import pickle
pd.set_option('future.no_silent_downcasting',True)

class Normalizer:
    """Collects per-variable mean and variance over every stay in a listfile.

    Args:
        data: DataFrame with a ``stay`` column naming one CSV per stay.
        data_dir: Prefix that is string-concatenated with each ``stay`` entry,
            so it has to end with a path separator.

    Attributes:
        mean_var_dict: ``{column name: {'mean': ..., 'variance': ...}}`` for
            every non-categorical measurement column; computed eagerly in
            ``__init__`` by reading every listed file.
    """

    # Raw cell contents that are treated as missing measurements.
    _MISSING_MARKERS = ['ERROR','no data','.','-','/','VERIFIED','CLOTTED',"*",'ERROR DISREGARD PREVIOUS RESULT OF 32','DISREGARD PREVIOUSLY REPORTED 33']

    def __init__(self, data, data_dir):
        self.data = data
        self.data_dir = data_dir
        self.categorical_variables = ['Glascow coma scale eye opening',
                                 'Glascow coma scale motor response',
                                 'Glascow coma scale verbal response']
        self.mean_var_dict = self.get_mean_var()

    def get_mean_var(self):
        """Read every stay file and return mean/variance per numeric column.

        The set of variables is taken from the header of the first listed
        file (minus the categorical columns and the leading time column).
        Values are accumulated by column *name*, so a file whose columns are
        ordered differently is still attributed correctly.

        Returns:
            ``{column name: {'mean': np.mean(values), 'variance': np.var(values)}}``.
            A column without any usable value yields NaN statistics.

        Raises:
            KeyError: if a file contains a measurement column that the first
                file did not have.
            ValueError: if a non-missing cell cannot be converted with ``float``.
        """
        first_path = self.data_dir + self.data['stay'][0]
        header = pd.read_csv(first_path).drop(labels=self.categorical_variables, axis=1)
        # Column 0 is the time axis and is not normalised.
        variable_values = {name: [] for name in header.columns[1:]}

        for stay in tqdm(self.data['stay']):
            df = pd.read_csv(self.data_dir + stay)
            df = df.drop(labels=self.categorical_variables, axis=1)
            df = df.replace(self._MISSING_MARKERS, np.nan)
            for name in df.columns[1:]:
                observed = df[name].dropna()
                variable_values[name].extend(float(v) for v in observed)

        result_dict = {}
        for feature, values in variable_values.items():
            result_dict[feature] = {'mean': np.mean(values), 'variance': np.var(values)}
        return result_dict

    def isNAN(self, val):
        """True for NaN: the only values that compare unequal to themselves."""
        return val!=val
    

# Listfiles for the phenotyping task; each has a 'stay' column naming a CSV under data_dir.
train_data_path = "/data/datasets/mimic3_18var/root/phenotyping/train_listfile.csv"
val_data_path = "/data/datasets/mimic3_18var/root/phenotyping/val_listfile.csv"

# Concatenated directly with the 'stay' file names, hence the trailing slash.
data_dir = "/data/datasets/mimic3_18var/root/phenotyping/train/"


# Flip to True to recompute the statistics from the training listfile and
# write them to normalizer.pkl in the current working directory.
save = False
if save:
    normalizer = Normalizer(pd.read_csv(train_data_path), data_dir)
    with open('normalizer.pkl', 'wb') as file:
        pickle.dump(normalizer, file)

    print("Completed Saving Normalizer........")

In [None]:
# Builds the normalizer in memory regardless of the `save` flag; the constructor
# reads every stay file in the training listfile.
normalizer = Normalizer(pd.read_csv(train_data_path), data_dir)

In [None]:
# Display the per-variable {'mean', 'variance'} statistics.
normalizer.mean_var_dict

In [None]:
import pickle 
import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import functional as F
pd.set_option('future.no_silent_downcasting',True)


# Numeric code for every raw string seen in the three Glasgow coma scale
# columns (the column names keep the dataset's "Glascow" spelling). Several
# spellings of the same answer share one code.
category_config = {
    "Glascow coma scale verbal response": {
            "No Response-ETT": 1,
            "No Response": 1,
            "1 No Response": 1,
            "1.0 ET/Trach": 1,
            "2 Incomp sounds": 2,
            "Incomprehensible sounds": 2,
            "3 Inapprop words": 3,
            "Inappropriate Words": 3,
            "4 Confused": 4,
            "Confused": 4,
            "5 Oriented": 5,
            "Oriented": 5
    },
    "Glascow coma scale eye opening": {
            "None": 0,
            "1 No Response": 1,
            "2 To pain": 2, 
            "To Pain": 2,
            "3 To speech": 3, 
            "To Speech": 3,
            "4 Spontaneously": 4,
            "Spontaneously": 4
        },
    "Glascow coma scale motor response": {
            "1 No Response": 1,
            "No response": 1,
            "2 Abnorm extensn": 2,
            "Abnormal extension": 2,
            "3 Abnorm flexion": 3,
            "Abnormal Flexion": 3,
            "4 Flex-withdraws": 4,
            "Flex-withdraws": 4,
            "5 Localizes Pain": 5,
            "Localizes Pain": 5,
            "6 Obeys Commands": 6,
            "Obeys Commands": 6
        }
}

class Categorizer:
    """Holder for the categorical encoding table plus the listfile it goes with.

    ``category_dict`` is a reference to the module-level ``category_config``
    mapping (not a copy); ``data`` and ``data_dir`` are stored untouched and
    nothing is read from disk.
    """

    def __init__(self, data, data_dir):
        self.category_dict = category_config
        self.data, self.data_dir = data, data_dir
        
# Same listfile locations as in the normalizer cell, repeated so this cell
# can be run on its own.
train_data_path = "/data/datasets/mimic3_18var/root/phenotyping/train_listfile.csv"
val_data_path = "/data/datasets/mimic3_18var/root/phenotyping/val_listfile.csv"

# Concatenated directly with the 'stay' file names, hence the trailing slash.
data_dir = "/data/datasets/mimic3_18var/root/phenotyping/train/"


# Flip to True to write the categorizer to categorizer.pkl in the current
# working directory.
save = False

if save:
    categorizer = Categorizer(pd.read_csv(train_data_path), data_dir)
    with open("categorizer.pkl", "wb") as file:
        pickle.dump(categorizer, file)

    print("Completed Saving Categorizer........")

In [None]:
# Builds the categorizer in memory regardless of the `save` flag.
categorizer = Categorizer(pd.read_csv(train_data_path), data_dir)

In [None]:
# Exploratory training dataset with sequences truncated/padded to 400 observations.
train_ds = MimicDataSetPhenotype(data_dir, train_data_path, normalizer.mean_var_dict, categorizer.category_dict, 'training', 400)

In [None]:
# Shuffled batches of 32 stays for the exploration cells below.
train_dataloader = DataLoader(train_ds, batch_size = 32, shuffle=True)


In [None]:
# Label columns are every listfile column from the third one onwards (the same
# slice the dataset's __getitem__ uses). Defined here so this cell does not
# depend on a later cell having been run first.
cols = train_ds.data_df.columns[2:]
# Class index (argmax over the label columns) of the first training stay.
train_ds.data_df[cols].values[0].argmax()

In [None]:
# Grab a single batch: the loop binds `batch` to the first one and exits immediately.
for batch in train_dataloader:
    break

In [None]:
# encoder_input[0] is the time channel (first element of the list the dataset returns).
batch['encoder_input'][0]

In [None]:
# encoder_input[1] is the variable-index channel; padded positions hold the pad value.
batch['encoder_input'][1]

In [None]:
# encoder_input[2] is the value channel (normalised numbers / category codes).
batch['encoder_input'][2]

In [None]:
# Shape of the collated label tensor for the batch.
batch['label'].shape

In [None]:
# Show the label tensor itself (the stray trailing comma wrapped it in a 1-tuple).
batch['label']

In [None]:
# Label columns: every listfile column from the third one onwards.
cols = (train_ds.data_df.columns[2:])

In [None]:
# Display the label column names.
cols

In [None]:
# Number of label columns, read off the first row's shape.
train_ds.data_df[cols].values[0].shape

In [None]:
# Validation dataset; stays from the validation listfile are read from the same
# data_dir as the training stays, and normalised with the training statistics.
val_ds = MimicDataSetPhenotype(data_dir, val_data_path, normalizer.mean_var_dict, categorizer.category_dict, 'validation', 400)

In [None]:
# Batches of 32 validation stays (shuffled, like the training loader).
val_dataloader = DataLoader(val_ds, batch_size = 32, shuffle=True)


In [None]:
# Smoke test: iterate the whole validation loader once and discard the batches,
# which loads and parses every validation stay.
for batch in tqdm(val_dataloader):
    pass

In [3]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
MAX_LEN = 448
batch_size = 32
d_model = 64
num_heads = 8
N = 2
num_variables = 18 
num_variables += 1 #for no variable embedding while doing padding
d_ff = 128
epochs = 100
learning_rate = 8e-4
drop_out = 0.2
sinusoidal = True
th_val_roc = 0.84
# Kept for reference: the metric helper used below only returns ROC-AUC values,
# so there is no PR metric to compare this threshold against.
th_val_pr = 0.48
num_classes = 25
# NOTE: num_variables and drop_out are not passed to Model below; they only
# appear in the checkpoint file name.

import os
import pickle

import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

import pandas as pd
import numpy as np
from utils import MimicDataSetPhenotype, calculate_multi_class_metrics
pd.set_option('future.no_silent_downcasting',True)

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import functional as F

from model import Model
from tqdm import tqdm
from normalizer import Normalizer
from categorizer import Categorizer


train_data_path = "/data/datasets/mimic3_18var/root/phenotyping/train_listfile.csv"
val_data_path = "/data/datasets/mimic3_18var/root/phenotyping/val_listfile.csv"

data_dir = "/data/datasets/mimic3_18var/root/phenotyping/train/"


# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that were produced by this project.
with open('normalizer.pkl', 'rb') as file:
    normalizer = pickle.load(file)

with open('categorizer.pkl', 'rb') as file:
    categorizer = pickle.load(file)
    

mean_variance = normalizer.mean_var_dict
cat_dict = categorizer.category_dict


train_ds = MimicDataSetPhenotype(data_dir, train_data_path, mean_variance, cat_dict, 'training', MAX_LEN)
val_ds = MimicDataSetPhenotype(data_dir, val_data_path, mean_variance, cat_dict, 'validation', MAX_LEN)
# test_ds = MimicDataSetPhenotype(test_data_dir, test_data_path, mean_variance, cat_dict,'testing', MAX_LEN)

train_dataloader = DataLoader(train_ds, batch_size = batch_size, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle=True)
# test_dataloader = DataLoader(test_ds, batch_size = 1, shuffle=True)

model = Model(d_model, num_heads, d_ff, num_classes, N, sinusoidal).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

total_params = sum(p.numel() for p in model.parameters())
print(f'Total number of parameters: {total_params}')

for epoch in range(epochs):
    # Evaluation helpers in this project put the model in eval mode (the
    # notebook's calculate_roc_auc does); switch training behaviour such as
    # dropout back on at the start of every epoch.
    model.train()
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False):
        inp = batch['encoder_input']
        mask = batch['encoder_mask']
        y = batch['label']
        outputs = model(inp, mask)
        # CrossEntropyLoss takes int64 class indices of shape (batch,), which
        # is what the dataset's 'label' collates to. Assumes the model returns
        # (batch, num_classes) logits - TODO confirm against model.py.
        loss = criterion(outputs, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    roc_auc_micro, roc_auc_macro = calculate_multi_class_metrics(model, val_dataloader)
    # print(f'Epoch {epoch + 1}/{epochs}, Train AUC-ROC: {calculate_roc_auc(model, train_dataloader):.3f}')
    print(f'Epoch {epoch + 1}/{epochs}, Validation Micro AUC-ROC: {roc_auc_micro:.3f}')
    print(f'Epoch {epoch + 1}/{epochs}, Validation Macro AUC-ROC: {roc_auc_macro:.3f}')
    # Early stop on the metrics that are actually computed (the previous check
    # referenced names that were never assigned).
    if (roc_auc_micro > th_val_roc) or (roc_auc_macro > th_val_roc):
        print("Reached threshold limit stopping...............")
        break

# print("Testing...............")
# print(f"Validation AUC-ROC, AUC-PRC: {calculate_roc_auc(model, test_dataloader):.3f}, {calculate_auc_prc(model, test_dataloader):.3f}")

# Constructing the file path
file_path = f"model_maxlen{MAX_LEN}_batch{batch_size}_dmodel{d_model}_heads{num_heads}_N{N}_vars{num_variables}_dff{d_ff}_epochs{epochs}_lr{learning_rate}_dropout{drop_out}_sinusoidal{sinusoidal}_testing.pth"

# Make sure the checkpoint directory exists so the save cannot fail after training.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), "models/"+ file_path)


Total number of parameters: 77186


                                                                                            

KeyboardInterrupt: 

In [None]:
# Smoke test: iterate the whole training loader once and discard the batches,
# which loads and parses every training stay.
for batch in train_dataloader:
    pass
    