In [30]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder, OrdinalEncoder,
                                   StandardScaler)
from sklearn.svm import SVC

In [31]:
data_filename = os.path.join(os.getcwd(), 'data', 'S1File.csv')
metadata_filename = os.path.join(os.getcwd(), 'data', 'metadata.csv')

In [32]:
df = pd.read_csv(data_filename)
metadata = pd.read_csv(metadata_filename)

In [33]:
features = metadata.variable.to_list()
label = 'UCX_abnormal'  # UCX test result
diagnosis = 'UTI_diag'  # ED diagnosis

# Map UCX and clinical diagnosis to int
df[label] = df[label].map({'yes': 1, 'no': 0})
df[diagnosis] = df[diagnosis].map({'Yes': 1, 'No': 0})

# Reorder columns
df = df[[label] + [diagnosis] + features]

# Data Preprocessing


In [34]:
def trim_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    First, drop the columns with not_reported values > 10%
    Then, drop observations with not_reported or other values
    return cleaned dataframe
    """
    # Drop the columns with not_reported values > 10%
    drop = []
    demo = ['age', 'gender', 'race', 'ethnicity', 'lang',
            'employStatus', 'maritalStatus', 'chief_complaint']
    cols = [i for i in df.columns if i not in demo]
    for col in cols:
        ratio = df[col][df[col] == 'not_reported'].count()/df.shape[0]*100
        if ratio > 0.1:
            drop.append(col)
    df = df.drop(labels=drop, axis=1)

    # Drop observations with not_reported or other values
    df= df[~df.apply(lambda row: row =='not_reported').any(axis=1)]
    df= df[~df.apply(lambda row: row =='other').any(axis=1)]
    df= df[~df.apply(lambda row: row =='4+').any(axis=1)]

    # Convert numeric features to float
    num = ['ua_ph', 'ua_spec_grav', 'age']
    for col in num:
        mean = df[(df[col] != 'not_reported') & (df[col]!= 'other')][col].astype(
            'float').mean()
        df[col] = df[col].replace('not_reported', mean)
        df[col] = df[col].astype(float)

    return df

In [35]:
def encode_features(df: pd.DataFrame) -> tuple[pd.DataFrame, ColumnTransformer]:
    """
    Input the cleaned dataframe,
    OneHotEncode the categorical (non-ordinal) attributes,
    OrdinalEncode the ordinal attributes
    return the final dataframe
    """

    other = ['ua_ph', 'ua_spec_grav', 'age']
    ord = ['ua_blood', 'ua_glucose', 'ua_ketones', 'ua_leuk', 'ua_protein']
    onehot = ['chief_complaint', 'race', 'ethnicity',
              'maritalStatus', 'employStatus']
    label = [i for i in df.columns if i not in ord+other+onehot]

    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), onehot),
            ('label', OrdinalEncoder(), label),
            ('ordinal', OrdinalEncoder(categories=[
             ['negative', 'small', 'moderate', 'large']]* len(ord)), ord)
        ])

    transformed = preprocessor.fit_transform(df)

    onehot_col_names = preprocessor.named_transformers_[
        'onehot'].get_feature_names_out(onehot)
    new_column_names = list(onehot_col_names) + label + ord
    # Preserve the original index
    df_transformed = pd.DataFrame(
        transformed, columns=new_column_names, index=df.index)  # type: ignore

    df_final = pd.concat([df[other], df_transformed], axis=1)

    return df_final, preprocessor

In [36]:
df_cleaned = trim_missing(df)
df_cleaned.head()

Unnamed: 0,UCX_abnormal,UTI_diag,ua_blood,ua_color,ua_glucose,ua_ketones,ua_leuk,ua_nitrite,ua_ph,ua_protein,...,MISCELLANEOUS_MEDICAL_SUPPLIES__DEVICES__NON_DRUG,MUSCLE_RELAXANTS,PRE_NATAL_VITAMINS,PSYCHOTHERAPEUTIC_DRUGS,SEDATIVE_HYPNOTICS,SKIN_PREPS,SMOKING_DETERRENTS,THYROID_PREPS,UNCLASSIFIED_DRUG_PRODUCTS,VITAMINS
0,1,1,negative,yellow,negative,negative,small,negative,7.5,negative,...,No,No,No,No,No,No,No,No,No,No
2,1,0,negative,yellow,negative,negative,small,negative,5.0,small,...,No,No,No,Yes,Yes,No,No,Yes,Yes,No
3,1,1,negative,yellow,negative,negative,large,negative,5.5,small,...,No,No,No,No,No,No,No,No,No,Yes
4,0,0,negative,orange,negative,small,small,positive,6.0,moderate,...,No,No,No,No,No,No,No,No,No,No
5,1,0,large,yellow,negative,large,small,negative,6.0,small,...,No,No,No,No,No,No,No,No,No,No


In [37]:
X, encoder = encode_features(df_cleaned.iloc[:, 2:])
Y = df_cleaned.iloc[:, :2]
print(f'Feature X shape: {X.shape}')
print(f'Label Y shape: {Y.shape}, where'
      f'\n\tthe first column is true label ({label})'
      f'\n\tthe second column is ed diagnosis ({diagnosis})')

Feature X shape: (59792, 153)
Label Y shape: (59792, 2), where
	the first column is true label (UCX_abnormal)
	the second column is ed diagnosis (UTI_diag)


In [38]:
X.head()

Unnamed: 0,ua_ph,ua_spec_grav,age,chief_complaint_ABDOMINAL PAIN,chief_complaint_ALTERED MENTAL STATUS,chief_complaint_BACK PAIN,chief_complaint_CHEST PAIN,chief_complaint_DIZZINESS,chief_complaint_DYSURIA,chief_complaint_EMESIS,...,SKIN_PREPS,SMOKING_DETERRENTS,THYROID_PREPS,UNCLASSIFIED_DRUG_PRODUCTS,VITAMINS,ua_blood,ua_glucose,ua_ketones,ua_leuk,ua_protein
0,7.5,1.02,83.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5.0,1.016,78.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,5.5,1.016,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0
4,6.0,1.03,55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
5,6.0,1.03,47.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.0,1.0,1.0


In [39]:
Y.head()

Unnamed: 0,UCX_abnormal,UTI_diag
0,1,1
2,1,0
3,1,1
4,0,0
5,1,0


# Training


In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2,
                                                    random_state=42)
y_train, y_test = Y_train[label], Y_test[label]

assert y_train.name == label
assert y_test.name == label

In [41]:
def model_performace(model, X_train, X_test, y_train, y_test,
                     ljust_len=30):
    print('Training accuracy: {}'.format(
        "%.4f" % model.score(X_train, y_train)))

    male, female = X_test.gender == 1, X_test.gender == 0
    print('Test accuracy:\n\t{}{}\n\t{}{}\n\t{}{}'.format(
        'General population'.ljust(ljust_len),
        "%.4f" % model.score(X_test, y_test),
        'Male'.ljust(ljust_len),
        "%.4f" % model.score(X_test[male], y_test[male]),
        'Female'.ljust(ljust_len),
        "%.4f" % model.score(X_test[female], y_test[female])))

    employ_cols = X_test.columns[X_test.columns.str.contains('employStatus')]
    for employ_col in employ_cols:
        rows = X_test[employ_col] == 1
        print('\t{}{}'.format(
            employ_col.split('_')[-1].ljust(ljust_len),
            "%.4f" % model.score(X_test[rows], y_test[rows])))

    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print('\n', report)

### Logistic Regression


In [42]:
model = LogisticRegression(n_jobs=-1, max_iter=2000)
model.fit(X_train, y_train)

In [43]:
model_performace(model, X_train, X_test, y_train, y_test)

Training accuracy: 0.8297
Test accuracy:
	General population            0.8311
	Male                          0.8893
	Female                        0.8052
	Disabled                      0.8492
	Full Time                     0.8431
	Not Employed                  0.8345
	On Active Military Duty       1.0000
	Part Time                     0.8187
	Retired                       0.8142
	Self Employed                 0.8478
	Student - Full Time           0.8575
	Student - Part Time           0.8571
	Unknown                       0.7647

               precision    recall  f1-score   support

           0       0.86      0.94      0.90      9277
           1       0.68      0.46      0.55      2682

    accuracy                           0.83     11959
   macro avg       0.77      0.70      0.72     11959
weighted avg       0.82      0.83      0.82     11959



### Naive Bayes


In [44]:
model = GaussianNB()
model.fit(X_train, y_train)

In [45]:
model_performace(model, X_train, X_test, y_train, y_test)

Training accuracy: 0.6850
Test accuracy:
	General population            0.6837
	Male                          0.7099
	Female                        0.6720
	Disabled                      0.6352
	Full Time                     0.7689
	Not Employed                  0.7398
	On Active Military Duty       0.0000
	Part Time                     0.7172
	Retired                       0.5723
	Self Employed                 0.7403
	Student - Full Time           0.8280
	Student - Part Time           0.1905
	Unknown                       0.7059

               precision    recall  f1-score   support

           0       0.87      0.69      0.77      9277
           1       0.38      0.65      0.48      2682

    accuracy                           0.68     11959
   macro avg       0.63      0.67      0.63     11959
weighted avg       0.76      0.68      0.71     11959



### kNN


In [46]:
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': list(range(1, 21))}
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_k = grid_search.best_params_['n_neighbors']
best_k

17

In [47]:
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(X_train, y_train)
model_performace(model, X_train, X_test, y_train, y_test)

Training accuracy: 0.8168
Test accuracy:
	General population            0.8108
	Male                          0.8830
	Female                        0.7788
	Disabled                      0.8153
	Full Time                     0.8383
	Not Employed                  0.8247
	On Active Military Duty       1.0000
	Part Time                     0.8107
	Retired                       0.7761
	Self Employed                 0.8358
	Student - Full Time           0.8387
	Student - Part Time           0.8095
	Unknown                       0.7059

               precision    recall  f1-score   support

           0       0.82      0.97      0.89      9277
           1       0.70      0.28      0.40      2682

    accuracy                           0.81     11959
   macro avg       0.76      0.62      0.64     11959
weighted avg       0.79      0.81      0.78     11959



### MLP


In [48]:
model = MLPClassifier(hidden_layer_sizes=(512, 512, 512),
                      max_iter=50, verbose=True)
model.fit(X_train, y_train)

Iteration 1, loss = 0.46940892
Iteration 2, loss = 0.39536704
Iteration 3, loss = 0.39002572
Iteration 4, loss = 0.38549666
Iteration 5, loss = 0.37908697
Iteration 6, loss = 0.37696277
Iteration 7, loss = 0.37466359
Iteration 8, loss = 0.37094112
Iteration 9, loss = 0.36875624
Iteration 10, loss = 0.36613043
Iteration 11, loss = 0.36235400
Iteration 12, loss = 0.35813264
Iteration 13, loss = 0.35421956
Iteration 14, loss = 0.35068307
Iteration 15, loss = 0.34673188
Iteration 16, loss = 0.34438637
Iteration 17, loss = 0.33987821
Iteration 18, loss = 0.33386638
Iteration 19, loss = 0.32822954
Iteration 20, loss = 0.32183520
Iteration 21, loss = 0.31596192
Iteration 22, loss = 0.31007711
Iteration 23, loss = 0.30357111
Iteration 24, loss = 0.29226125
Iteration 25, loss = 0.28823633
Iteration 26, loss = 0.28143370
Iteration 27, loss = 0.27025241
Iteration 28, loss = 0.26264297
Iteration 29, loss = 0.25308551
Iteration 30, loss = 0.24236954
Iteration 31, loss = 0.23060000
Iteration 32, los



In [49]:
model_performace(model, X_train, X_test, y_train, y_test)

Training accuracy: 0.9616
Test accuracy:
	General population            0.8053
	Male                          0.8898
	Female                        0.7676
	Disabled                      0.8303
	Full Time                     0.8202
	Not Employed                  0.8081
	On Active Military Duty       1.0000
	Part Time                     0.7936
	Retired                       0.7843
	Self Employed                 0.8269
	Student - Full Time           0.8333
	Student - Part Time           0.8095
	Unknown                       0.7059

               precision    recall  f1-score   support

           0       0.87      0.88      0.88      9277
           1       0.57      0.54      0.55      2682

    accuracy                           0.81     11959
   macro avg       0.72      0.71      0.71     11959
weighted avg       0.80      0.81      0.80     11959



### SVM


In [None]:
model = SVC(shrinking=False, verbose=True)
model.fit(X_train, y_train)

In [None]:
model_performace(model, X_train, X_test, y_train, y_test)

# Save Models


In [None]:
# models = {}

In [110]:
# # Store model results
# models['logistic regression'] = {
#     'model': model,
#     'training accuracy': model.score(X_train, y_train),
#     'test accuracy': model.score(X_test, y_test),
#     'precision': precision_score(y_test, y_pred),
#     'recall': recall_score(y_test, y_pred),
#     'f1': f1_score(y_test, y_pred)
# }

In [111]:
# # Save models to local file
# with open('model_performances.pkl', 'wb') as f:
#     pickle.dump(models, f)