In [27]:
# Loading data:


import pandas as pd

# For all_perscriptions.cvs: mapping the antimicrobials into one-hot representation
p_data = pd.read_csv('../../data/data sample/amr-uti-antimicrobial-resistance-in-urinary-tract-infections-1.0.0/all_prescriptions.csv')
print("The shape of prescriptions csv: ",p_data.shape)

# Create one-hot encoded columns for 'prescription'
one_hot = pd.get_dummies(p_data['prescription'])

p_data_encoded = pd.concat([p_data, one_hot], axis=1)
p_data_encoded = p_data_encoded.drop('prescription', axis=1)
p_data_encoded = p_data_encoded.drop('is_train', axis=1)




# For all_uti_features.csv: directly read the csv file
uti_data = pd.read_csv('../../data/data sample/amr-uti-antimicrobial-resistance-in-urinary-tract-infections-1.0.0/all_uti_features.csv')
print("The shape of UTI feature csv: ", uti_data.shape)

# Merge CSVs 
final_data = pd.merge(uti_data, p_data_encoded, on='example_id', how='left')
prescription_columns = ['CIP','LVX', 'NIT', 'SXT']

# For patiences that don't have prescription, fill 0 on four antimicrobial
final_data[prescription_columns] = final_data[prescription_columns].fillna(0)






The shape of prescriptions csv:  (15806, 3)
The shape of UTI feature csv:  (116902, 791)
The shape of label csv:  (116902, 7)
(116902, 795)
(116902, 6)
(116902, 800)


In [35]:
# For all_uti_resist_labels.csv: directly read the csv file
label_data = pd.read_csv('../../data/data sample/amr-uti-antimicrobial-resistance-in-urinary-tract-infections-1.0.0/all_uti_resist_labels.csv')

print(sum(final_data['example_id']==label_data['example_id']))


(116902, 7)
116902


In [51]:

# Split training and test data
train_data = final_data[final_data['is_train']==1]
train_label = label_data[label_data['is_train']==1]

test_data = final_data[final_data['is_train']==0]
test_label = label_data[label_data['is_train']==0]





# Dealing with NAN label:

## Strategy 1: remove samples with NAN label directly

In [52]:
# Subset the label data for the relevant columns
label_columns_data = label_data[prescription_columns]


nan_rows = label_columns_data.isna().any(axis=1)


nan_count = nan_rows.sum()

print("Number of samples with any NaN values in labels:", nan_count)


Number of samples with any NaN values in labels: 16133


In [53]:

# Drop rows with any NaN values in the specified prescription columns
label_data_clean = label_data.dropna(subset=prescription_columns)
valid_ids = label_data_clean['example_id']


train_data_clean = train_data[train_data['example_id'].isin(valid_ids)]
test_data_clean = test_data[test_data['example_id'].isin(valid_ids)]

train_label_clean = train_label[train_label['example_id'].isin(valid_ids)]
test_label_clean = test_label[test_label['example_id'].isin(valid_ids)]

# Show the shapes of the cleaned datasets
print("Cleaned Train Data Shape:", train_data_clean.shape)
print("Cleaned Train Label Shape:", train_label_clean.shape)
print("Cleaned Test Data Shape:", test_data_clean.shape)
print("Cleaned Test Label Shape:", test_label_clean.shape)


Cleaned Train Data Shape: (69184, 800)
Cleaned Train Label Shape: (69184, 7)
Cleaned Test Data Shape: (31585, 800)
Cleaned Test Label Shape: (31585, 7)


In [54]:
# Split training and test data

x_train = train_data_clean.values
y_train = train_label_clean[prescription_columns].values

x_test=  test_data_clean.values
y_test = test_label_clean[prescription_columns].values

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)



(69184, 800)
(69184, 4)
(31585, 800)
(31585, 4)


## Strategy 2: fill in NAN label based on cluster result

todo

# Question: 
The uncomplicated column in label data: features? label to predict? or train seperate models based on uncomplicated or complicated

# Train Models: XGBoost and LR

In [61]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report




## LR

In [58]:
lr_model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
lr_model.fit(x_train, y_train)


## XGBoost

In [59]:
xgb_model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(x_train, y_train)


# Model Evaluation

In [62]:

# Predictions
y_pred_lr = lr_model.predict(x_test)
y_pred_xgb = xgb_model.predict(x_test)

# Evaluate Logistic Regression
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr, target_names=prescription_columns))

# Evaluate XGBoost
print("XGBoost Performance:")
print(classification_report(y_test, y_pred_xgb, target_names=prescription_columns))


Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6718
           1       0.99      1.00      0.99      6723
           2       1.00      1.00      1.00      7037
           3       0.59      0.41      0.49      8127

   micro avg       0.91      0.83      0.87     28605
   macro avg       0.89      0.85      0.87     28605
weighted avg       0.88      0.83      0.85     28605
 samples avg       0.39      0.39      0.38     28605

XGBoost Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6718
           1       1.00      1.00      1.00      6723
           2       1.00      1.00      1.00      7037
           3       1.00      1.00      1.00      8127

   micro avg       1.00      1.00      1.00     28605
   macro avg       1.00      1.00      1.00     28605
weighted avg       1.00      1.00      1.00     28605
 samples avg       0.50

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
