Importing

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

In [8]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.1-py3-none-any.whl (517 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 KB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.1


Dataset

In [3]:
df = pd.read_csv('/content/dataset.csv')

print('Negative : {}'.format(len(df[df['hospital_death']==0])))
print('Positive : {}'.format(len(df[df['hospital_death']==1])))

Negative : 8635
Positive : 570


In [5]:
def clean_data(dataframe, 
               solve_missing : str ='drop', 
               solve_missing_cat : str = 'drop',
               drop_list : list = [], 
               category_list : list = []): 
    df = dataframe.drop_duplicates()
    df = df.drop(drop_list, axis=1)
    if solve_missing == 'drop' or solve_missing_cat == 'drop':
        df = df.dropna()#
        object_dtype = list(df.select_dtypes(include='object').columns)
        for col in df.columns:
            if col in category_list or col in object_dtype:
                df[col] = df[col].astype('category')
    else:
        object_dtype = list(df.select_dtypes(include='object').columns)
        for col in df.columns:
            if col in category_list or col in object_dtype:
                df[col] = df[col].astype('category')
                if solve_missing_cat == 'mode':#
                    df[col] = df[col].fillna(df[col].mode()[0], inplace=False)
            else:
                if solve_missing == 'mean':#
                    df[col] = df[col].fillna(df[col].mean(), inplace=False)
                else:#
                    df[col] = df[col].fillna(df[col].median(), inplace=False)
    return df

def split_xy(dataframe, label : str):
    return dataframe.drop(labels=label, axis=1), dataframe[label]

def encode(df):
    category_dtype = list(df.select_dtypes(include='category').columns)
    cat = pd.get_dummies(df, columns = category_dtype, drop_first = True)
    for col in cat.columns:
        if cat[col].dtype == np.uint8:
            cat[col] = cat[col].astype('category')
    return cat

def scale(features : tuple):
    trainFeatures, valFeatures, testFeatures = features
    scaler = StandardScaler()
    category_dtype = list(trainFeatures.select_dtypes(include='category').columns)
    continuous_dtype = list(filter(lambda c: c not in category_dtype, trainFeatures.columns))

    scaler.fit(trainFeatures[continuous_dtype])
    cont_xtrain = scaler.transform(trainFeatures[continuous_dtype])#
    cont_xval = scaler.transform(valFeatures[continuous_dtype])
    cont_xtest = scaler.transform(testFeatures[continuous_dtype])
    
    cat_xtrain = trainFeatures[category_dtype]
    cat_xval = valFeatures[category_dtype]
    cat_xtest = testFeatures[category_dtype]
    print(cat_xtrain.shape, cat_xval.shape, cat_xtrain.shape)
    xtrain = np.concatenate((cont_xtrain, cat_xtrain), axis=1)
    xval = np.concatenate((cont_xval, cat_xval), axis=1)
    xtest = np.concatenate((cont_xtest, cat_xtest), axis=1)
    return scaler, xtrain, xval, xtest

In [6]:
category_list = ['elective_surgery', 'ethnicity', 'icu_admit_source', 'icu_stay_type', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem', 
                 'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'hospital_death']
drop_list = ['encounter_id', 'patient_id', 'hospital_id', 'Unnamed: 83']
clean_df = clean_data(df, 'drop', 'drop', drop_list, category_list)
print('Negative : {}'.format(len(clean_df[clean_df['hospital_death']==0])))
print('Positive : {}'.format(len(clean_df[clean_df['hospital_death']==1])))
# clean_df.info()
features, labels = split_xy(clean_df, 'hospital_death')
print(features.shape)
print(labels.shape)
####
encoded_features = encode(features)#one hot encoding
print('Encoded Features Shape : {}'.format(encoded_features.shape))
X_train, X_test, ytrain, ytest = train_test_split(encoded_features, labels, test_size = 0.2, stratify = labels)
X_test, X_val, ytest, yval = train_test_split(X_test, ytest, test_size = 0.5, stratify = ytest)
scaler, xtrain, xval, xtest = scale((X_train, X_val, X_test))
print(scaler.mean_)
print(scaler.var_)
print(xtrain.shape, ytrain.shape)
print(xval.shape, yval.shape)
print(xtest.shape, ytest.shape)
xtrain = np.array(xtrain, dtype='float32')
xval = np.array(xval, dtype='float32')
xtest = np.array(xtest, dtype='float32')
ytrain = np.array(ytrain, dtype='float32')
yval = np.array(yval, dtype='float32')
ytest = np.array(ytest, dtype='float32')


Negative : 4322
Positive : 281
(4603, 80)
(4603,)
Encoded Features Shape : (4603, 109)
(3682, 45) (461, 45) (3682, 45)
[6.21072787e+01 3.00508095e+01 1.69672102e+02 1.04745519e+02
 5.63270830e-01 8.64883922e+01 1.78632808e+02 5.58511901e+02
 1.71917436e-01 3.04182510e-02 3.57332971e+00 5.54345464e+00
 0.00000000e+00 4.31884845e+00 1.00127648e+02 1.43400326e-01
 9.83932645e+01 3.33256382e+01 3.64336882e+01 2.87343835e-01
 8.90285171e+01 5.24913091e+01 8.90274307e+01 5.25024443e+01
 1.05896524e+02 6.83742531e+01 1.13730038e+02 6.93552417e+01
 1.14087724e+02 6.90809343e+01 3.42675177e+01 1.25263444e+01
 9.92653449e+01 8.96632265e+01 1.57385932e+02 9.71450299e+01
 1.57367463e+02 9.71499919e+01 3.73686193e+01 3.62747218e+01
 7.78742531e+01 6.39022271e+01 7.79302010e+01 6.39546442e+01
 9.47365562e+01 8.28489951e+01 9.92180880e+01 8.52096687e+01
 9.93242803e+01 8.51567083e+01 2.57243346e+01 1.75717002e+01
 9.80765888e+01 9.42281369e+01 1.40445139e+02 1.17364747e+02
 1.40372895e+02 1.17326996e

Model

In [9]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable
from torchmetrics import Accuracy, MeanSquaredError, Precision, Recall

class Model(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_dim, 1024)
        self.gnorm1 = nn.GroupNorm(32, 1024)
        
        self.layer2 = nn.Linear(1024, 512)
        self.gnorm2 = nn.GroupNorm(16, 512)
        
        self.layer3 = nn.Linear(512, 128)
        self.gnorm3 = nn.GroupNorm(4, 128)
        self.layer4 = nn.Linear(128, output_dim)
        
    def forward(self, x):
        x = F.relu(self.gnorm1(self.layer1(x)))
        x = F.relu(self.gnorm2(self.layer2(x)))
        x = F.relu(self.gnorm3(self.layer3(x)))
        x = torch.sigmoid(self.layer4(x))
        return x

def L2(params):
    l2_lambda = 0.001
    l2_reg = torch.tensor(0.)
    for param in params:
        l2_reg += torch.sum(torch.pow(param, 2))
    return l2_lambda * l2_reg


In [10]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, x, y, device):
        self._x = x
        self._y = y
        self._device = device

    def __len__(self):
        return len(self._x)

    def __getitem__(self, idx):
        X, Y = self._x[idx], self._y[idx].ravel()
        return torch.as_tensor(X, dtype=torch.float32, device=self._device), torch.as_tensor(Y, dtype=torch.float32, device=self._device)
    
def train_loop(dataloader, valloader, model, loss_fn, optimizer, max_iter, metrics : dict, device):
    model.train()
    history = dict()
    history['Loss'] = []
    history['val_Loss'] = []
    for m in metrics:
        history[m['name']] = []
        history['val_{}'.format(m['name'])] = []
    size = len(dataloader.dataset)
    for itr in range(max_iter):
        real_time = dict()
        real_time['Loss'] = []
        for m in metrics:
            real_time[m['name']] = []
        for batch, (X, y) in enumerate(dataloader):
            
            X.to(device)
            y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)
            loss += L2(model.parameters())
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
           
            real_time['Loss'].append(loss.item())
            for m in metrics:
                if m['type'] == 'float':
                    real_time[m['name']].append(m['fn'](pred, y).item())
                else:
                    real_time[m['name']].append(m['fn'](pred, y.type(torch.uint8)).item())
        history['Loss'].append(np.mean(real_time['Loss']))
        for m in metrics:
            history[m['name']].append(np.mean(real_time[m['name']]))
            print(f"{m['name']}: {np.mean(real_time[m['name']]):>8f}", end='\t')
        print('')
        epoch_loss = np.mean(real_time['Loss'])
        print(f"Loss: {epoch_loss:>7f} [{itr:>5d}/{max_iter:>5d}]")
        print('Val', end = '\t')
        tmp = dict()
        tmp['Loss'] = []
        for m in metrics:
            tmp[m['name']] = []
        model.eval()
        with torch.no_grad():
            for X, y in valloader:
                X.to(device)
                y.to(device)
                pred = model(X)
                test_loss = loss_fn(pred, y)
                tmp['Loss'].append(test_loss.item())
                for m in metrics:
                    if m['type'] == 'float':
                        tmp[m['name']].append(m['fn'](pred, y).item())
                    else:
                        tmp[m['name']].append(m['fn'](pred, y.type(torch.uint8)).item())
        for k, v in tmp.items():
            print(f"{k}: {np.mean(v):>8f}", end='\t')
            history['val_{}'.format(k)].append(np.mean(v))
        print('###END')
        
    return history

Training and Validation

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model     = Model(xtrain.shape[-1], 1)#
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)

if torch.cuda.is_available():
    loss_fn   = nn.BCELoss().cuda()#binary cross entropy
    metrics = [
        {'name': 'Accuracy', 'fn' : Accuracy(task="binary").cuda(), 'type' : 'int'},
        {'name': 'MeanSquaredError', 'fn': MeanSquaredError().cuda(), 'type' : 'float'},
        {'name': 'Precision', 'fn': Precision(task="binary").cuda(), 'type': 'int'},
        {'name': 'Recall', 'fn': Recall(task="binary").cuda(), 'type':'int'}
    ]
    
else:
    loss_fn = nn.BCELoss().cpu()
    metrics = [
        {'name': 'Accuracy', 'fn' : Accuracy(task="binary").cpu(), 'type' : 'int'},
        
    ]
train_dataset = CustomDataset(xtrain, ytrain, device)
val_dataset = CustomDataset(xval, yval, device)
test_dataset = CustomDataset(xtest, ytest, device)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [12]:
epochs = 5
history = train_loop(train_loader, val_loader, model, loss_fn, optimizer, epochs,  metrics,device)

Accuracy: 0.763462	
Loss: 2.830772 [    0/    5]
Val	Loss: 0.580351	Accuracy: 0.852495	###END
Accuracy: 0.842761	
Loss: 2.797851 [    1/    5]
Val	Loss: 0.548298	Accuracy: 0.891540	###END
Accuracy: 0.893936	
Loss: 2.765639 [    2/    5]
Val	Loss: 0.519147	Accuracy: 0.915401	###END
Accuracy: 0.921062	
Loss: 2.737945 [    3/    5]
Val	Loss: 0.492568	Accuracy: 0.930586	###END
Accuracy: 0.933115	
Loss: 2.711376 [    4/    5]
Val	Loss: 0.468390	Accuracy: 0.932755	###END
