In [27]:
import os
import pickle

import numpy as np
import pandas as pd
from plotnine import (aes, after_stat, facet_wrap, geom_bar, geom_label,
                      geom_text, ggplot, position_dodge2, stage)
from scipy import stats
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder, OrdinalEncoder,
                                   StandardScaler)
from sklearn.svm import SVC, LinearSVC, NuSVC

In [2]:
# Both input CSVs live in ./data, resolved against the current working directory.
data_filename, metadata_filename = (
    os.path.join(os.getcwd(), 'data', csv_name)
    for csv_name in ('S1File.csv', 'metadata.csv')
)

In [3]:
# Read the records file and the metadata file into DataFrames (in that order).
df, metadata = map(pd.read_csv, (data_filename, metadata_filename))

# Data Preprocessing


In [4]:
def trim(df, max_not_reported_pct=0.1):
    """
    Clean the raw dataframe and return a new, trimmed dataframe.

    Steps:
      1. Drop the columns 'ID', 'PATID', 'split' and 'alt_diag', plus every
         non-demographic column whose share of 'not_reported' values is
         greater than ``max_not_reported_pct`` percent of the rows.
      2. Drop every row that contains 'not_reported', 'other' or '4+' in
         any of the remaining columns.
      3. Cast 'ua_ph', 'ua_spec_grav' and 'age' to float.

    NOTE(review): the threshold is a percentage, so the default of 0.1
    means 0.1%. An earlier docstring said 10%; the default here keeps what
    the code has always computed. Pass ``max_not_reported_pct=10`` if 10%
    was the real intent -- confirm before changing the default.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.
    max_not_reported_pct : float, default 0.1
        Maximum tolerated percentage (0-100) of 'not_reported' entries in a
        non-demographic column before that column is dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, keeping the original row index of the rows
        that survive.

    Raises
    ------
    KeyError
        If one of the four columns dropped in step 1 is missing, or if a
        numeric column from step 3 is absent after step 1.
    """
    bookkeeping = ['ID', 'PATID', 'split', 'alt_diag']
    demo = ['UCX_abnormal', 'age', 'gender', 'race', 'ethnicity',
            'lang', 'employStatus', 'maritalStatus', 'chief_complaint']
    numeric = ['ua_ph', 'ua_spec_grav', 'age']
    invalid_values = ['not_reported', 'other', '4+']

    # Demographic columns are never dropped for being sparsely reported.
    candidates = [col for col in df.columns if col not in demo]
    not_reported_pct = (
        df[candidates].eq('not_reported').sum() / len(df) * 100)
    too_sparse = not_reported_pct[not_reported_pct > max_not_reported_pct]
    to_drop = bookkeeping + [col for col in too_sparse.index
                             if col not in bookkeeping]
    trimmed = df.drop(columns=to_drop)

    # One pass over the frame instead of one per sentinel value. The copy
    # makes the result independent of `df`, so the dtype casts below do not
    # trigger SettingWithCopyWarning.
    has_invalid = trimmed.isin(invalid_values).any(axis=1)
    trimmed = trimmed.loc[~has_invalid].copy()

    # Rows holding 'not_reported'/'other' are already gone, so no imputation
    # is needed before casting.
    for col in numeric:
        trimmed[col] = trimmed[col].astype(float)

    return trimmed

In [5]:
def encode(df):
    """
    Encode the cleaned dataframe for modelling.

    Column groups:
      * one-hot encoded: 'chief_complaint', 'race', 'ethnicity',
        'maritalStatus', 'employStatus', 'dispo';
      * ordinal encoded with the fixed order negative < small < moderate <
        large: 'ua_blood', 'ua_glucose', 'ua_ketones', 'ua_leuk',
        'ua_protein';
      * passed through unchanged: 'ua_ph', 'ua_spec_grav', 'age';
      * every remaining column: ordinal encoded with the category order
        chosen by ``OrdinalEncoder``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing all columns named above.

    Returns
    -------
    (pandas.DataFrame, ColumnTransformer)
        The encoded frame (pass-through columns first, then one-hot,
        remaining and ordinal columns, on the original row index) and the
        fitted transformer.
    """
    # Named *_cols so the builtin `ord` is not shadowed.
    ordinal_cols = ['ua_blood', 'ua_glucose', 'ua_ketones', 'ua_leuk',
                    'ua_protein']
    passthrough_cols = ['ua_ph', 'ua_spec_grav', 'age']
    onehot_cols = ['chief_complaint', 'race', 'ethnicity',
                   'maritalStatus', 'employStatus', 'dispo']
    assigned = set(ordinal_cols + passthrough_cols + onehot_cols)
    label_cols = [col for col in df.columns if col not in assigned]

    severity_levels = ['negative', 'small', 'moderate', 'large']
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(), onehot_cols),
            ('label', OrdinalEncoder(), label_cols),
            ('ordinal',
             OrdinalEncoder(categories=[severity_levels] * len(ordinal_cols)),
             ordinal_cols),
        ],
        # Always return a dense array. With the default threshold the
        # stacked result becomes a scipy sparse matrix whenever it is sparse
        # enough, and pd.DataFrame below does not expand such a matrix into
        # the named columns.
        sparse_threshold=0,
    )

    transformed = preprocessor.fit_transform(df)

    # Column names follow the order of the transformers listed above.
    onehot_col_names = preprocessor.named_transformers_[
        'onehot'].get_feature_names_out(onehot_cols)
    new_column_names = list(onehot_col_names) + label_cols + ordinal_cols
    # Preserve the original index so the concat below aligns row by row.
    df_transformed = pd.DataFrame(
        transformed, columns=new_column_names, index=df.index)  # type: ignore

    df_final = pd.concat([df[passthrough_cols], df_transformed], axis=1)

    return df_final, preprocessor

In [6]:
# Run both preprocessing stages, then split off the 'UTI_diag' target column.
df_cleaned = trim(df)
final_df, encoder = encode(df_cleaned)
y = final_df['UTI_diag']
X = final_df.drop(columns='UTI_diag')

print(f'Feature X shape: {X.shape}', f'Label y shape: {y.shape}', sep='\n')

Feature X shape: (59792, 166)
Label y shape: (59792,)


# Training


In [8]:
# Hold out 20% of the rows as the test set; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

In [13]:
def model_performace(model, X_train, X_test, y_train, y_test,
                     ljust_len=30):
    """
    Print an accuracy report for a fitted classifier.

    Printed, in order:
      * training accuracy (``model.score`` on the training split);
      * test accuracy for the whole test split, for rows with
        ``X_test.gender == 1`` (labelled 'Male') and ``X_test.gender == 0``
        (labelled 'Female'), and for each column of ``X_test`` whose name
        contains 'employStatus' (rows where that column equals 1, labelled
        with the text after the last underscore of the column name);
      * the scikit-learn classification report on the test split.

    Test predictions are computed once and sliced per subgroup instead of
    re-running the model for every subgroup. This assumes the model predicts
    each row independently and that ``model.score`` is plain accuracy, as it
    is for scikit-learn classifiers. A subgroup with no rows in the test
    split is reported as 'n/a' rather than raising.

    NOTE(review): the 1 = male / 0 = female coding is taken from the labels
    used here -- confirm against the fitted encoder.

    Parameters
    ----------
    model : fitted estimator exposing ``score`` and ``predict``.
    X_train, X_test : pandas.DataFrame
        Feature splits; ``X_test`` must have a ``gender`` column.
    y_train, y_test : array-like
        Matching label splits.
    ljust_len : int, default 30
        Width to which subgroup labels are left-justified.

    Returns
    -------
    None
    """
    print('Training accuracy: {}'.format(
        "%.4f" % model.score(X_train, y_train)))

    # Single prediction pass, reused for every subgroup and for the report.
    y_pred = model.predict(X_test)

    def subgroup_accuracy(mask):
        """Format the test accuracy over the rows selected by ``mask``."""
        mask = np.asarray(mask, dtype=bool)
        if not mask.any():
            return 'n/a'
        return "%.4f" % accuracy_score(y_test[mask], y_pred[mask])

    groups = [
        ('General population', np.ones(len(y_pred), dtype=bool)),
        ('Male', X_test.gender == 1),
        ('Female', X_test.gender == 0),
    ]
    employ_cols = X_test.columns[X_test.columns.str.contains('employStatus')]
    groups += [(employ_col.split('_')[-1], X_test[employ_col] == 1)
               for employ_col in employ_cols]

    print('Test accuracy:')
    for group_name, mask in groups:
        print('\t{}{}'.format(
            group_name.ljust(ljust_len), subgroup_accuracy(mask)))

    report = classification_report(y_test, y_pred)
    print('\n', report)

### Logistic Regression


In [20]:
# Logistic regression with the iteration cap raised to 2000, fitted on the
# training split.
model = LogisticRegression(max_iter=2000, n_jobs=-1)
model.fit(X=X_train, y=y_train)

In [21]:
# Accuracy breakdown and classification report for the logistic-regression fit.
model_performace(model=model, X_train=X_train, X_test=X_test,
                 y_train=y_train, y_test=y_test)

Training accuracy: 0.8318
Test accuracy:
	General population            0.8327
	Male                          0.8358
	Female                        0.8313
	Disabled                      0.8285
	Full Time                     0.8557
	Not Employed                  0.8379
	On Active Military Duty       1.0000
	Part Time                     0.8529
	Retired                       0.8089
	Self Employed                 0.8448
	Student - Full Time           0.8333
	Student - Part Time           0.7143
	Unknown                       0.8824

               precision    recall  f1-score   support

         0.0       0.87      0.93      0.90      9378
         1.0       0.65      0.50      0.56      2581

    accuracy                           0.83     11959
   macro avg       0.76      0.71      0.73     11959
weighted avg       0.82      0.83      0.82     11959



### MLP


In [16]:
# Three hidden layers of 512 units each, trained for at most 30 iterations
# with per-iteration loss logging.
# random_state makes the otherwise randomly initialised and shuffled training
# reproducible, so re-running the notebook yields the same metrics; 42 is the
# same seed the train/test split uses.
model = MLPClassifier(hidden_layer_sizes=(512, 512, 512),
                      max_iter=30, verbose=True, random_state=42)
model.fit(X_train, y_train)

Iteration 1, loss = 0.42004687
Iteration 2, loss = 0.33828819
Iteration 3, loss = 0.32703345
Iteration 4, loss = 0.32456161
Iteration 5, loss = 0.32051777
Iteration 6, loss = 0.31657311
Iteration 7, loss = 0.31485682
Iteration 8, loss = 0.31301486
Iteration 9, loss = 0.31182255
Iteration 10, loss = 0.30851475
Iteration 11, loss = 0.30555041
Iteration 12, loss = 0.30394689
Iteration 13, loss = 0.29986917
Iteration 14, loss = 0.29605025
Iteration 15, loss = 0.29299207
Iteration 16, loss = 0.29061088
Iteration 17, loss = 0.28614915
Iteration 18, loss = 0.28258855
Iteration 19, loss = 0.27684068
Iteration 20, loss = 0.27230183
Iteration 21, loss = 0.26655842
Iteration 22, loss = 0.26319490
Iteration 23, loss = 0.25651230
Iteration 24, loss = 0.25056954
Iteration 25, loss = 0.24400880
Iteration 26, loss = 0.23812677
Iteration 27, loss = 0.23305134
Iteration 28, loss = 0.22566394
Iteration 29, loss = 0.22021962
Iteration 30, loss = 0.21072650




In [17]:
# Accuracy breakdown and classification report for the MLP fit.
model_performace(model=model, X_train=X_train, X_test=X_test,
                 y_train=y_train, y_test=y_test)

Training accuracy: 0.9028
Test accuracy:
	General population            0.8278
	Male                          0.8296
	Female                        0.8270
	Disabled                      0.8040
	Full Time                     0.8486
	Not Employed                  0.8365
	On Active Military Duty       1.0000
	Part Time                     0.8381
	Retired                       0.8126
	Self Employed                 0.8149
	Student - Full Time           0.8280
	Student - Part Time           0.8095
	Unknown                       0.8235

               precision    recall  f1-score   support

         0.0       0.89      0.89      0.89      9378
         1.0       0.60      0.61      0.60      2581

    accuracy                           0.83     11959
   macro avg       0.75      0.75      0.75     11959
weighted avg       0.83      0.83      0.83     11959



### SVM


In [11]:
# Support-vector classifier with the shrinking heuristic switched off and
# solver progress printed while fitting.
model = SVC(verbose=True, shrinking=False)
model.fit(X=X_train, y=y_train)

[LibSVM]...........
*.
*.
*
optimization finished, #iter = 12041
obj = -20244.764812, rho = -0.477214
nSV = 21119, nBSV = 21051
Total nSV = 21119


In [12]:
# Accuracy breakdown and classification report for the SVC fit.
model_performace(model=model, X_train=X_train, X_test=X_test,
                 y_train=y_train, y_test=y_test)

Training accuracy: 0.8063
Test accuracy:
	General population            0.8111
	Male                          0.8247
	Female                        0.8051
	Disabled                      0.8190
	Full Time                     0.8304
	Not Employed                  0.8179
	On Active Military Duty       1.0000
	Part Time                     0.8198
	Retired                       0.7861
	Self Employed                 0.8418
	Student - Full Time           0.8038
	Student - Part Time           0.7619
	Unknown                       0.9412

               precision    recall  f1-score   support

         0.0       0.82      0.96      0.89      9378
         1.0       0.66      0.26      0.37      2581

    accuracy                           0.81     11959
   macro avg       0.74      0.61      0.63     11959
weighted avg       0.79      0.81      0.78     11959



'              precision    recall  f1-score   support\n\n         0.0       0.82      0.96      0.89      9378\n         1.0       0.66      0.26      0.37      2581\n\n    accuracy                           0.81     11959\n   macro avg       0.74      0.61      0.63     11959\nweighted avg       0.79      0.81      0.78     11959\n'

### Naive Bayes


In [22]:
from sklearn.naive_bayes import GaussianNB

In [23]:
# Gaussian naive Bayes with default settings, fitted on the training split.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [24]:
# Accuracy breakdown and classification report for the naive Bayes fit.
model_performace(model=model, X_train=X_train, X_test=X_test,
                 y_train=y_train, y_test=y_test)

Training accuracy: 0.6866
Test accuracy:
	General population            0.6789
	Male                          0.6554
	Female                        0.6894
	Disabled                      0.6098
	Full Time                     0.7338
	Not Employed                  0.7032
	On Active Military Duty       1.0000
	Part Time                     0.7446
	Retired                       0.6196
	Self Employed                 0.6836
	Student - Full Time           0.7500
	Student - Part Time           0.7143
	Unknown                       0.6471

               precision    recall  f1-score   support

         0.0       0.97      0.61      0.75      9378
         1.0       0.40      0.93      0.56      2581

    accuracy                           0.68     11959
   macro avg       0.68      0.77      0.65     11959
weighted avg       0.84      0.68      0.71     11959



### kNN


In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# Choose n_neighbors from 1..20 by 5-fold cross-validated accuracy on the
# training split, then display the winning value.
knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=[k for k in range(1, 21)])
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid,
                           scoring='accuracy', cv=5)
grid_search.fit(X=X_train, y=y_train)
best_k = grid_search.best_params_['n_neighbors']
best_k

15

In [29]:
# Fit kNN with the selected number of neighbours and print its accuracy
# breakdown and classification report.
model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
model_performace(model=model, X_train=X_train, X_test=X_test,
                 y_train=y_train, y_test=y_test)

Training accuracy: 0.8283
Test accuracy:
	General population            0.8083
	Male                          0.8236
	Female                        0.8011
	Disabled                      0.8162
	Full Time                     0.8300
	Not Employed                  0.8217
	On Active Military Duty       1.0000
	Part Time                     0.8233
	Retired                       0.7756
	Self Employed                 0.8090
	Student - Full Time           0.8199
	Student - Part Time           0.6667
	Unknown                       0.9412

               precision    recall  f1-score   support

         0.0       0.84      0.93      0.88      9378
         1.0       0.59      0.36      0.45      2581

    accuracy                           0.81     11959
   macro avg       0.72      0.65      0.67     11959
weighted avg       0.79      0.81      0.79     11959



### Models


In [None]:
# models = {}

In [110]:
# # Store model results
# models['logistic regression'] = {
#     'model': model,
#     'training accuracy': model.score(X_train, y_train),
#     'test accuracy': model.score(X_test, y_test),
#     'precision': precision_score(y_test, y_pred),
#     'recall': recall_score(y_test, y_pred),
#     'f1': f1_score(y_test, y_pred)
# }

In [111]:
# # Save models to local file
# with open('model_performances.pkl', 'wb') as f:
#     pickle.dump(models, f)