In [None]:
# %pip install torchsummaryX

In [None]:
import datetime
import gc
import os
import pickle
import zipfile
import numpy.typing as npt
import numpy as np
import pandas as pd
import sklearn
import torch
import wandb
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   OrdinalEncoder, StandardScaler)
from sklearn.svm import SVC
from torchsummary import summary
from tqdm.auto import tqdm
import torch.nn as nn

In [None]:
# Choose the compute device once; extra DataLoader workers are only
# requested when a GPU is available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE_N_WORKERS = 4 if DEVICE == 'cuda' else 0

DEVICE

In [4]:
# Hyperparameters for the run, gathered in one place.
config = dict(
    epochs=50,
    batch_size=32,
    init_lr=5e-4,
    dropout_rate=0.2,
    scheduler_factor=0.8,
    scheduler_patience=2,
)

# Read Data


In [5]:
# Both input files are expected in ./data relative to the working directory.
data_filename, metadata_filename = (
    os.path.join(os.getcwd(), 'data', file_name)
    for file_name in ('S1File.csv', 'metadata.csv')
)

In [6]:
# Load the visit table and the variable metadata in one go.
df, metadata = (pd.read_csv(csv_path)
                for csv_path in (data_filename, metadata_filename))

In [7]:
features = metadata.variable.to_list()
label = 'UCX_abnormal'  # UCX test result
diagnosis = 'UTI_diag'  # ED diagnosis

# Map UCX and clinical diagnosis to int.
# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on already-encoded columns would wipe them out. Only
# encode a column while it still holds the raw text values.
for _column, _mapping in ((label, {'yes': 1, 'no': 0}),
                          (diagnosis, {'Yes': 1, 'No': 0})):
    if not pd.api.types.is_numeric_dtype(df[_column]):
        df[_column] = df[_column].map(_mapping)

# Reorder columns: the two targets first, then the features listed in metadata
df = df[[label, diagnosis] + features]

# Data Preprocessing


In [8]:
def trim_missing(df: pd.DataFrame, max_missing_pct: float = 0.1) -> pd.DataFrame:
    """
    Remove sparsely reported columns and incomplete rows, then cast the
    numeric columns to float.

    Steps:
      1. Drop every non-demographic column whose share of 'not_reported'
         values is greater than ``max_missing_pct`` percent of the rows.
      2. Drop every row that contains 'not_reported', 'other' or '4+' in any
         remaining column.
      3. Cast 'ua_ph', 'ua_spec_grav' and 'age' to float.

    NOTE(review): this function used to be documented as dropping columns
    with more than 10% missing values, but the comparison has always been
    "percentage > 0.1", i.e. a 0.1% cut-off. The default keeps that existing
    behaviour so downstream results do not change; pass ``max_missing_pct=10``
    if the 10% rule is what was intended.

    Parameters
    ----------
    df : raw dataframe; it is not modified.
    max_missing_pct : column cut-off, expressed in percent (0-100).

    Returns
    -------
    A new, independent dataframe (a copy, so later column assignments do not
    trigger chained-assignment warnings).

    Raises
    ------
    KeyError if one of the three numeric columns is absent or was dropped in
    step 1.
    """
    placeholder = 'not_reported'
    # Demographic columns are never dropped, however many values are missing.
    demographic = {'age', 'gender', 'race', 'ethnicity', 'lang',
                   'employStatus', 'maritalStatus', 'chief_complaint'}

    # 1. Drop sparsely reported, non-demographic columns.
    candidates = [col for col in df.columns if col not in demographic]
    missing_pct = df[candidates].eq(placeholder).mean() * 100
    sparse_cols = list(missing_pct[missing_pct > max_missing_pct].index)
    df = df.drop(columns=sparse_cols)

    # 2. Drop observations with not_reported or other unusable values.
    unusable = df.isin([placeholder, 'other', '4+']).any(axis=1)
    df = df.loc[~unusable].copy()

    # 3. Convert numeric features to float. No imputation is needed here:
    #    step 2 already removed every row holding a placeholder value.
    for col in ('ua_ph', 'ua_spec_grav', 'age'):
        df[col] = df[col].astype(float)

    return df

In [9]:
def encode_features(df: pd.DataFrame) -> tuple[pd.DataFrame, ColumnTransformer]:
    """
    Encode the cleaned feature dataframe for modelling.

    - 'ua_ph', 'ua_spec_grav' and 'age' are passed through unchanged.
    - 'chief_complaint', 'race', 'ethnicity', 'maritalStatus' and
      'employStatus' are one-hot encoded.
    - The five urinalysis grades ('ua_blood', 'ua_glucose', 'ua_ketones',
      'ua_leuk', 'ua_protein') are ordinal-encoded on the fixed scale
      negative < small < moderate < large.
    - Every remaining column is ordinal-encoded with categories learned from
      the data.

    Parameters
    ----------
    df : cleaned feature dataframe (targets already removed).

    Returns
    -------
    (encoded dataframe carrying the same index as ``df``,
     the fitted ColumnTransformer)
    """
    numeric_cols = ['ua_ph', 'ua_spec_grav', 'age']
    # Named *_cols so the builtin `ord` is no longer shadowed.
    ordinal_cols = ['ua_blood', 'ua_glucose', 'ua_ketones', 'ua_leuk',
                    'ua_protein']
    onehot_cols = ['chief_complaint', 'race', 'ethnicity',
                   'maritalStatus', 'employStatus']
    handled = set(numeric_cols + ordinal_cols + onehot_cols)
    label_cols = [col for col in df.columns if col not in handled]

    grade_scale = ['negative', 'small', 'moderate', 'large']

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), onehot_cols),
            ('label', OrdinalEncoder(), label_cols),
            ('ordinal',
             OrdinalEncoder(categories=[list(grade_scale)
                                        for _ in ordinal_cols]),
             ordinal_cols),
        ],
        # OneHotEncoder emits a sparse matrix; with the default threshold the
        # stacked result can itself come back sparse, which the DataFrame
        # constructor below cannot wrap. 0 forces a dense array every time.
        sparse_threshold=0,
    )

    transformed = preprocessor.fit_transform(df)

    # Output columns follow the transformer order: onehot, label, ordinal.
    onehot_col_names = preprocessor.named_transformers_[
        'onehot'].get_feature_names_out(onehot_cols)
    new_column_names = list(onehot_col_names) + label_cols + ordinal_cols
    # Preserve the original index so the concat below aligns row by row
    df_transformed = pd.DataFrame(
        transformed, columns=new_column_names, index=df.index)  # type: ignore

    df_final = pd.concat([df[numeric_cols], df_transformed], axis=1)

    return df_final, preprocessor

In [None]:
# Clean the raw frame with trim_missing and preview the first rows.
df_cleaned = trim_missing(df)
df_cleaned.head()

In [None]:
# Positional split: the first two columns of df_cleaned are taken as the
# targets, everything from the third column onward as model input.
Y = df_cleaned.iloc[:, :2]
X, encoder = encode_features(df_cleaned.iloc[:, 2:])
print(f'Feature X shape: {X.shape}',
      f'Label Y shape: {Y.shape}, where'
      f'\n\tthe first column is true label ({label})'
      f'\n\tthe second column is ed diagnosis ({diagnosis})',
      sep='\n')

In [None]:
# Preview the encoded feature matrix.
X.head()

In [None]:
# Preview the two target columns.
Y.head()

# Split Data


In [14]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Only the `label` column of Y is used as the modelling target.
y_train = Y_train[label]
y_test = Y_test[label]

assert all(target.name == label for target in (y_train, y_test))

In [15]:
# Carve the validation set out of the remaining training rows: 25% of them,
# using the same fixed seed as the first split.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.25,
                                                  random_state=42)

In [None]:
# Number of rows in the train / validation / test feature splits.
len(X_train), len(X_val), len(X_test)

In [None]:
# Number of dimensions (length of .shape) of each target split, not the
# number of samples.
len(y_train.shape), len(y_val.shape), len(y_test.shape)

# Datasets


In [18]:
class TrainDataset(torch.utils.data.Dataset):
    """Dataset that pairs each feature row with its target value."""

    def __init__(self, X: np.ndarray, y: np.ndarray):
        n_rows = len(X)
        assert n_rows == len(y), 'inconsistent shape between X and y'
        self.features, self.labels = X, y
        self.length = n_rows
        self.n_feature = X.shape[1]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        # Both tensors are float32; the target is wrapped in a list so it
        # comes back with shape (1,).
        row = torch.tensor(self.features[i], dtype=torch.float32)
        target = torch.tensor([self.labels[i]], dtype=torch.float32)
        return row, target

In [19]:
class TestDataset(torch.utils.data.Dataset):
    """Label-free dataset: each item is a single float32 feature row."""

    def __init__(self, X: np.ndarray):
        self.features, self.length = X, len(X)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return torch.tensor(self.features[i], dtype=torch.float32)

# Dataloader


In [20]:
# Hand the splits to the dataset wrappers as plain NumPy arrays.
train_data = TrainDataset(X=X_train.to_numpy(), y=y_train.to_numpy())
val_data = TrainDataset(X=X_val.to_numpy(), y=y_val.to_numpy())
test_data = TestDataset(X=X_test.to_numpy())

In [None]:
# Training batches are shuffled; the final partial batch is dropped so every
# training batch has exactly `batch_size` rows.
# pin_memory is only requested when a GPU is in use.
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           num_workers=DEVICE_N_WORKERS,
                                           batch_size=config['batch_size'],
                                           pin_memory=(DEVICE == 'cuda'),
                                           shuffle=True,
                                           drop_last=True)
# Validation keeps its final partial batch: dropping it would leave up to
# batch_size - 1 validation samples out of every reported metric.
val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                         num_workers=0,
                                         batch_size=config['batch_size'],
                                         pin_memory=(DEVICE == 'cuda'),
                                         shuffle=False,
                                         drop_last=False)
test_loader = torch.utils.data.DataLoader(dataset=test_data,
                                          num_workers=0,
                                          batch_size=config['batch_size'],
                                          pin_memory=(DEVICE == 'cuda'),
                                          shuffle=False)

print("Batch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(
    len(train_data), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(
    len(val_data), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(
    len(test_data), len(test_loader)))

In [None]:
# Testing code to check if your data loaders are working.
# Pull a single batch into dedicated names: a top-level
# `for i, (feature, label) in ...` loop would rebind the global `label`
# (the target column name) to a tensor and break a re-run of the split cell.
_sample_feature, _sample_label = next(iter(train_loader))
print(_sample_feature.shape, _sample_label.shape)

# NN


In [23]:
class NN(torch.nn.Module):

    def __init__(self, input_size: int, dropout_rate: float):

        super(NN, self).__init__()

        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 512),
            torch.nn.BatchNorm1d(512),
            torch.nn.GELU(),
            torch.nn.Linear(512, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Linear(2048, 512),
            torch.nn.BatchNorm1d(512),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(512, 256),
            torch.nn.BatchNorm1d(256),
            torch.nn.GELU(),
            torch.nn.Linear(256, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.GELU(),
            torch.nn.Linear(128, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Setup

In [24]:
model = NN(input_size=train_data.n_feature,
           dropout_rate=config['dropout_rate']).to(DEVICE)

In [25]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config['init_lr'])
scaler = torch.amp.GradScaler('cuda')
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=config['scheduler_factor'],
                                                       patience=config['scheduler_patience'])

In [None]:
wandb.login(key="c3a06f318f071ae7444755a93fa8a5cbff1f6a86")

In [None]:
run = wandb.init(
    name='nn',
    reinit=True,  # Allows reinitalizing runs when you re-run this cell
    # id     = "y28t31uz", ### Insert specific run id here if you want to resume a previous run
    # resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
    project="map",  # Project should be created in your wandb account
    config=config  # Wandb Config for your run
)

In [None]:
# Save your model architecture as a string with str(model)
model_arch = str(model)

# Save it in a txt file
arch_file = open("model_arch.txt", "w")
file_write = arch_file.write(model_arch)
arch_file.close()

# log it in your wandb run with wandb.save()
wandb.save('model_arch.txt')

# Training and Validation Functions

In [29]:
def train(model, dataloader, criterion, optimizer, scaler):
    """
    return total_loss, total_acc
    """

    model.train()
    total_loss, total_acc = 0, 0

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Train')

    for i, (feature, label) in enumerate(dataloader):
        optimizer.zero_grad()
        feature = feature.to(DEVICE)
        label = label.to(DEVICE)

        # Forward Propagation
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            # print(feature.device)
            # print(next(model.parameters()).device)
            logits = model(feature)
            loss = criterion(logits, label)

        # Backpropagation
        scaler.scale(loss).backward()

        # GD
        scaler.step(optimizer)
        scaler.update()

        # Record
        prediction = (logits >= 0.5).int()
        total_loss += loss.item()
        total_acc += torch.sum(prediction == label).item() / logits.shape[0]
        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))),
                              acc="{:.04f}%".format(float(total_acc*100 / (i + 1))))
        batch_bar.update()

        # Release memory
        del feature, label, logits, prediction
        torch.cuda.empty_cache()

    batch_bar.close()

    total_loss /= len(dataloader)
    total_acc /= len(dataloader)
    return total_loss, total_acc

In [30]:
def eval(model, dataloader, criterion):
    """
    return total_loss, total_acc, precision, recall, f1
    """

    model.eval()
    total_loss, total_acc = 0, 0
    predictions, labels = [], []

    batch_bar = tqdm(total=len(dataloader), dynamic_ncols=True,
                     leave=False, position=0, desc='Val')

    for i, (feature, label) in enumerate(dataloader):
        feature = feature.to(DEVICE)
        label = label.to(DEVICE)

        # Forward Propagation
        with torch.inference_mode():
            logits = model(feature)
            loss = criterion(logits, label)

        # Record
        prediction = (logits >= 0.5).int()
        total_loss += loss.item()
        total_acc += torch.sum(prediction == label).item() / logits.shape[0]
        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))),
                              acc="{:.04f}%".format(float(total_acc*100 / (i + 1))))
        batch_bar.update()

        labels.extend(label.tolist())
        predictions.extend(prediction.tolist())

        # Release memory
        del feature, label, logits, prediction
        torch.cuda.empty_cache()

    batch_bar.close()

    total_loss /= len(dataloader)
    total_acc /= len(dataloader)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return (total_loss, total_acc, precision, recall, f1)

In [31]:
def test(model, test_loader):

    model.eval()
    predictions = []

    with torch.no_grad():
        for i, feature in enumerate(tqdm(test_loader)):

            feature = feature.to(DEVICE)
            logits = model(feature)
            prediction = (logits >= 0.5).int()
            predictions.extend(prediction.tolist())

    return predictions

In [32]:
def model_performace(model, X_train, X_test, y_train, y_test,
                     ljust_len=30):
    """
    Print accuracy figures and a classification report for a fitted
    estimator that exposes `score(X, y)` and `predict(X)`.

    Reported: training accuracy, test accuracy overall, by `gender`
    (1 is labelled Male, 0 Female), and for each one-hot column whose name
    contains 'employStatus'; then the classification report on the test set.

    A subgroup with no rows in X_test is reported as 'n/a' instead of being
    scored (scikit-learn estimators raise when asked to score zero samples).

    Parameters
    ----------
    model : fitted estimator.
    X_train, X_test : feature dataframes; X_test needs a `gender` column.
    y_train, y_test : targets aligned with the feature dataframes.
    ljust_len : width the subgroup names are padded to.
    """
    def _subgroup_accuracy(rows):
        # `rows` is a boolean mask over X_test.
        if not rows.any():
            return 'n/a'
        return "%.4f" % model.score(X_test[rows], y_test[rows])

    print('Training accuracy: {}'.format(
        "%.4f" % model.score(X_train, y_train)))

    male, female = X_test.gender == 1, X_test.gender == 0
    print('Test accuracy:\n\t{}{}\n\t{}{}\n\t{}{}'.format(
        'General population'.ljust(ljust_len),
        "%.4f" % model.score(X_test, y_test),
        'Male'.ljust(ljust_len),
        _subgroup_accuracy(male),
        'Female'.ljust(ljust_len),
        _subgroup_accuracy(female)))

    employ_cols = X_test.columns[X_test.columns.str.contains('employStatus')]
    for employ_col in employ_cols:
        # The text after the last underscore is used as the printed name.
        print('\t{}{}'.format(
            employ_col.split('_')[-1].ljust(ljust_len),
            _subgroup_accuracy(X_test[employ_col] == 1)))

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print('\n', report)

In [33]:
def save_checkpoint(file_path, model, optimizer, scaler, scheduler,
                    epoch, train_acc, val_acc, precision, recall, f1):
    """
    Write a training checkpoint to `file_path` with torch.save.

    The checkpoint holds the epoch number, the state_dict of the model,
    optimizer, scaler and scheduler, and the given metrics. The accuracy
    keys are spelled 'train_accuray' / 'val_accuray'; that spelling is part
    of the stored format and is kept as is.
    """
    stateful = (('model', model), ('optimizer', optimizer),
                ('scaler', scaler), ('scheduler', scheduler))

    checkpoint = {'epoch': epoch}
    for prefix, component in stateful:
        checkpoint[f'{prefix}_state_dict'] = component.state_dict()
    checkpoint.update({
        'train_accuray': train_acc,
        'val_accuray': val_acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })

    torch.save(checkpoint, file_path)

# Experiment

In [None]:
# Register the model with wandb; log="all" asks for both gradients and
# parameters to be tracked (per the wandb.watch documentation).
wandb.watch(model, log="all")

In [None]:
# Best validation accuracy seen so far; the training loop saves a checkpoint
# whenever an epoch beats it.
best_score = 0

In [None]:
# Start from a clean slate before the long-running loop.
torch.cuda.empty_cache()
gc.collect()

# Epochs are counted from 1 so the same number is used for logging and for
# the checkpoint.
for epoch in range(1, config['epochs'] + 1):
    print(f"\nEpoch {epoch}/{config['epochs']}")

    # Learning rate in effect for this epoch (read before the scheduler steps).
    curr_lr = float(optimizer.param_groups[0]['lr'])

    train_loss, train_acc = train(
        model, train_loader, criterion, optimizer, scaler)
    val_loss, val_acc, precision, recall, f1 = eval(
        model, val_loader, criterion)

    # ReduceLROnPlateau is driven by the validation loss.
    scheduler.step(val_loss)

    print(f"\tTrain Acc {train_acc*100:.04f}%"
          f"\tTrain Loss {train_loss:.04f}"
          f"\t Learning Rate {curr_lr:.07f}")
    print(f"\tVal Acc {val_acc*100:.04f}%\tVal Loss {val_loss:.04f}")
    print(f"\tVal Precison {precision:.04f}"
          f"\tRecall {recall:.04f}\tF1 {f1:.04f}")

    wandb.log({
        'lr': curr_lr,
        'train_acc': train_acc*100,
        'train_loss': train_loss,
        'val_acc': val_acc*100,
        'val_loss': val_loss,
        'val_precison': precision,
        'val_recall': recall,
        'val_f1': f1
    })

    # Keep the checkpoint with the highest validation accuracy so far.
    if val_acc > best_score:
        best_score = val_acc
        save_checkpoint(f'{run.id}_best_model.pt',
                        model, optimizer, scaler, scheduler,
                        epoch, train_acc, val_acc, precision, recall, f1)
        print(f'Best model saved at epoch {epoch}')

run.finish()

Train:  57%|█████▋    | 644/1121 [00:33<00:09, 48.99it/s, acc=85.5833%, loss=0.6650]

# Inference

In [None]:
# Predict on the test loader with the model as it stands after the last
# training epoch; no saved checkpoint is reloaded in this cell.
y_pred = test(model, test_loader)
len(y_pred)

In [None]:
# Classification report of the test predictions against y_test.
print(classification_report(y_test, y_pred))

In [35]:
# Save the final-epoch state.
# - Use the `run` handle for the id: the module-level `wandb.run` is cleared
#   once `run.finish()` has been called, so `wandb.run.id` fails here.
# - Write to a separate "_last_model" file so the best-validation checkpoint
#   saved during training is not overwritten with last-epoch weights.
# - Pass `f1` in the f1 slot (it used to receive `best_score`, which holds a
#   validation accuracy).
save_checkpoint(f'{run.id}_last_model.pt',
                model, optimizer, scaler, scheduler,
                epoch, train_acc, val_acc, precision, recall, f1)

<All keys matched successfully>