In [115]:
import datetime
import gc
import os
import pickle
import zipfile
import numpy.typing as npt
import numpy as np
import pandas as pd
import sklearn
import torch
import wandb
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, OneHotEncoder,
                                   OrdinalEncoder, StandardScaler)
from sklearn.svm import SVC
from torchsummaryX import summary
from tqdm.auto import tqdm

In [54]:
def empty_cache():
    if (DEVICE =='gpu'):
        torch.cuda.empty_cache()
    elif (DEVICE == 'mps'):
        torch.mps.empty_cache()


if torch.cuda.is_available():
    DEVICE = 'cuda'
    DEVICE_N_WORKERS = 4
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
    DEVICE_N_WORKERS = 0
else:
    DEVICE = 'cpu'
    DEVICE_N_WORKERS = 0

DEVICE

'mps'

In [80]:
config = {
    'epochs': 50,
    'batch_size': 64,
    'context': 30,
    'init_lr': 1e-3,
    'dropout_rate': 0.35,
    'scheduler_factor': 0.5,
    'scheduler_patience': 2,
    'architecture': 'diamond'
}

# Read Data


In [56]:
data_filename = os.path.join(os.getcwd(), 'data', 'S1File.csv')
metadata_filename = os.path.join(os.getcwd(), 'data', 'metadata.csv')

In [57]:
df = pd.read_csv(data_filename)
metadata = pd.read_csv(metadata_filename)

In [58]:
features = metadata.variable.to_list()
label = 'UCX_abnormal'  # UCX test result
diagnosis = 'UTI_diag'  # ED diagnosis

# Map UCX and clinical diagnosis to int
df[label] = df[label].map({'yes': 1, 'no': 0})
df[diagnosis] = df[diagnosis].map({'Yes': 1, 'No': 0})

# Reorder columns
df = df[[label] + [diagnosis] + features]

# Data Preprocessing


In [59]:
def trim_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    First, drop the columns with not_reported values > 10%
    Then, drop observations with not_reported or other values
    return cleaned dataframe
    """
    # Drop the columns with not_reported values > 10%
    drop = []
    demo = ['age', 'gender', 'race', 'ethnicity', 'lang',
            'employStatus', 'maritalStatus', 'chief_complaint']
    cols = [i for i in df.columns if i not in demo]
    for col in cols:
        ratio = df[col][df[col] == 'not_reported'].count()/df.shape[0]*100
        if ratio > 0.1:
            drop.append(col)
    df = df.drop(labels=drop, axis=1)

    # Drop observations with not_reported or other values
    df= df[~df.apply(lambda row: row =='not_reported').any(axis=1)]
    df= df[~df.apply(lambda row: row =='other').any(axis=1)]
    df= df[~df.apply(lambda row: row =='4+').any(axis=1)]

    # Convert numeric features to float
    num = ['ua_ph', 'ua_spec_grav', 'age']
    for col in num:
        mean = df[(df[col] != 'not_reported') & (df[col]!= 'other')][col].astype(
            'float').mean()
        df[col] = df[col].replace('not_reported', mean)
        df[col] = df[col].astype(float)

    return df

In [60]:
def encode_features(df: pd.DataFrame) -> tuple[pd.DataFrame, ColumnTransformer]:
    """
    Input the cleaned dataframe,
    OneHotEncode the categorical (non-ordinal) attributes,
    OrdinalEncode the ordinal attributes
    return the final dataframe
    """

    other = ['ua_ph', 'ua_spec_grav', 'age']
    ord = ['ua_blood', 'ua_glucose', 'ua_ketones', 'ua_leuk', 'ua_protein']
    onehot = ['chief_complaint', 'race', 'ethnicity',
              'maritalStatus', 'employStatus']
    label = [i for i in df.columns if i not in ord+other+onehot]

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), onehot),
            ('label', OrdinalEncoder(), label),
            ('ordinal', OrdinalEncoder(categories=[
             ['negative', 'small', 'moderate', 'large']]* len(ord)), ord)
        ])

    transformed = preprocessor.fit_transform(df)

    onehot_col_names = preprocessor.named_transformers_[
        'onehot'].get_feature_names_out(onehot)
    new_column_names = list(onehot_col_names) + label + ord
    # Preserve the original index
    df_transformed = pd.DataFrame(
        transformed, columns=new_column_names, index=df.index)  # type: ignore

    df_final = pd.concat([df[other], df_transformed], axis=1)

    return df_final, preprocessor

In [61]:
df_cleaned = trim_missing(df)
df_cleaned.head()

Unnamed: 0,UCX_abnormal,UTI_diag,ua_blood,ua_color,ua_glucose,ua_ketones,ua_leuk,ua_nitrite,ua_ph,ua_protein,...,MISCELLANEOUS_MEDICAL_SUPPLIES__DEVICES__NON_DRUG,MUSCLE_RELAXANTS,PRE_NATAL_VITAMINS,PSYCHOTHERAPEUTIC_DRUGS,SEDATIVE_HYPNOTICS,SKIN_PREPS,SMOKING_DETERRENTS,THYROID_PREPS,UNCLASSIFIED_DRUG_PRODUCTS,VITAMINS
0,1,1,negative,yellow,negative,negative,small,negative,7.5,negative,...,No,No,No,No,No,No,No,No,No,No
2,1,0,negative,yellow,negative,negative,small,negative,5.0,small,...,No,No,No,Yes,Yes,No,No,Yes,Yes,No
3,1,1,negative,yellow,negative,negative,large,negative,5.5,small,...,No,No,No,No,No,No,No,No,No,Yes
4,0,0,negative,orange,negative,small,small,positive,6.0,moderate,...,No,No,No,No,No,No,No,No,No,No
5,1,0,large,yellow,negative,large,small,negative,6.0,small,...,No,No,No,No,No,No,No,No,No,No


In [62]:
X, encoder = encode_features(df_cleaned.iloc[:, 2:])
Y = df_cleaned.iloc[:, :2]
print(f'Feature X shape: {X.shape}')
print(f'Label Y shape: {Y.shape}, where'
      f'\n\tthe first column is true label ({label})'
      f'\n\tthe second column is ed diagnosis ({diagnosis})')

Feature X shape: (59792, 153)
Label Y shape: (59792, 2), where
	the first column is true label (UCX_abnormal)
	the second column is ed diagnosis (UTI_diag)


In [63]:
X.head()

Unnamed: 0,ua_ph,ua_spec_grav,age,chief_complaint_ABDOMINAL PAIN,chief_complaint_ALTERED MENTAL STATUS,chief_complaint_BACK PAIN,chief_complaint_CHEST PAIN,chief_complaint_DIZZINESS,chief_complaint_DYSURIA,chief_complaint_EMESIS,...,SKIN_PREPS,SMOKING_DETERRENTS,THYROID_PREPS,UNCLASSIFIED_DRUG_PRODUCTS,VITAMINS,ua_blood,ua_glucose,ua_ketones,ua_leuk,ua_protein
0,7.5,1.02,83.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5.0,1.016,78.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,5.5,1.016,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0
4,6.0,1.03,55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
5,6.0,1.03,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,1.0,1.0


In [64]:
Y.head()

Unnamed: 0,UCX_abnormal,UTI_diag
0,1,1
2,1,0
3,1,1
4,0,0
5,1,0


# Split Data


In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=42)
y_train, y_test = Y_train[label], Y_test[label]

assert y_train.name == label
assert y_test.name == label

In [66]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.25,
                                                  random_state=42)

In [67]:
len(X_train), len(X_val), len(X_test)

(35874, 11959, 11959)

In [68]:
len(y_train.shape), len(y_val.shape), len(y_test.shape)

(1, 1, 1)

# Datasets


In [85]:
class TrainDataset(torch.utils.data.Dataset):

    def __init__(self, X: np.ndarray, y: np.ndarray):
        assert len(X) == len(y), 'inconsistent shape between X and y'
        self.features = X
        self.labels = y
        self.length = len(X)
        self.n_feature = X.shape[1]

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        feature = torch.FloatTensor(self.features[i])
        label = torch.tensor(self.labels[i])
        return feature, label

In [86]:
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, X: np.ndarray):
        self.features = X
        self.length = len(X)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        feature = torch.FloatTensor(self.features[i])
        return feature

# Dataloader


In [87]:
train_data = TrainDataset(X=X_train.values, y=y_train.values)
val_data = TrainDataset(X=X_val.values, y=y_val.values)
test_data = TestDataset(X=X_test)

In [88]:
train_loader = torch.utils.data.DataLoader(dataset=train_data,
                                           num_workers=DEVICE_N_WORKERS,
                                           batch_size=config['batch_size'],
                                           pin_memory=True,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_data,
                                         num_workers=0,
                                         batch_size=config['batch_size'],
                                         pin_memory=True,
                                         shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_data,
                                          num_workers=0,
                                          batch_size=config['batch_size'],
                                          pin_memory=True,
                                          shuffle=False)

print("Batch size: ", config['batch_size'])
print("Train dataset samples = {}, batches = {}".format(
    train_data.__len__(), len(train_loader)))
print("Validation dataset samples = {}, batches = {}".format(
    val_data.__len__(), len(val_loader)))
print("Test dataset samples = {}, batches = {}".format(
    test_data.__len__(), len(test_loader)))

Batch size:  64
Train dataset samples = 35874, batches = 561
Validation dataset samples = 11959, batches = 187
Test dataset samples = 11959, batches = 187


In [89]:
# Testing code to check if your data loaders are working
for i, data in enumerate(train_loader):
    feature, label = data
    print(feature.shape, label.shape)
    break

torch.Size([64, 153]) torch.Size([64])


# NN


In [134]:
class NN(torch.nn.Module):

    def __init__(self, input_size: int, dropout_rate: float):

        super(NN, self).__init__()

        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_size, 512),
            # torch.nn.BatchNorm1d(512),
            torch.nn.GELU(),
            torch.nn.Linear(512, 1024),
            # torch.nn.BatchNorm1d(1024),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(1024, 2048),
            # torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Linear(2048, 2048),
            # torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(2048, 4096),
            # torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Linear(4096, 2048),
            # torch.nn.BatchNorm1d(2048),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(2048, 1024),
            # torch.nn.BatchNorm1d(1024),
            torch.nn.GELU(),
            torch.nn.Linear(1024, 512),
            # torch.nn.BatchNorm1d(512),
            torch.nn.GELU(),
            torch.nn.Dropout(dropout_rate),
            torch.nn.Linear(512, 256),
            torch.nn.GELU(),
            torch.nn.Linear(256, 128),
            torch.nn.GELU(),
            torch.nn.Linear(128, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)


model = NN(input_size=train_data.n_feature,
           dropout_rate=config['dropout_rate'])
summary(model, feature.shape)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1              [-1, 64, 512]          78,848
              GELU-2              [-1, 64, 512]               0
            Linear-3             [-1, 64, 1024]         525,312
              GELU-4             [-1, 64, 1024]               0
           Dropout-5             [-1, 64, 1024]               0
            Linear-6             [-1, 64, 2048]       2,099,200
              GELU-7             [-1, 64, 2048]               0
            Linear-8             [-1, 64, 2048]       4,196,352
              GELU-9             [-1, 64, 2048]               0
          Dropout-10             [-1, 64, 2048]               0
           Linear-11             [-1, 64, 4096]       8,392,704
             GELU-12             [-1, 64, 4096]               0
           Linear-13             [-1, 64, 2048]       8,390,656
             GELU-14             [-1, 6

In [122]:
from torchsummary import summary

In [12]:
def model_performace(model, X_train, X_test, y_train, y_test,
                     ljust_len=30):
    print('Training accuracy: {}'.format(
        "%.4f" % model.score(X_train, y_train)))

    male, female = X_test.gender == 1, X_test.gender == 0
    print('Test accuracy:\n\t{}{}\n\t{}{}\n\t{}{}'.format(
        'General population'.ljust(ljust_len),
        "%.4f" % model.score(X_test, y_test),
        'Male'.ljust(ljust_len),
        "%.4f" % model.score(X_test[male], y_test[male]),
        'Female'.ljust(ljust_len),
        "%.4f" % model.score(X_test[female], y_test[female])))

    employ_cols = X_test.columns[X_test.columns.str.contains('employStatus')]
    for employ_col in employ_cols:
        rows = X_test[employ_col] == 1
        print('\t{}{}'.format(
            employ_col.split('_')[-1].ljust(ljust_len),
            "%.4f" % model.score(X_test[rows], y_test[rows])))

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print('\n', report)