In [1]:
# Restore `sub_data` from IPython's %store persistence (`-r` = restore).
# The variable must have been saved earlier with `%store sub_data`
# (presumably in another notebook/session -- confirm where it is produced).
%store -r sub_data

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# --- Experiment configuration -------------------------------------------------
# Named constants instead of inline magic numbers, so the run can be tuned in
# one place.
TARGET_COL = 'Health_status'
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
SPLIT_SEED = 42      # seed of the single train/test split
N_RUNS = 100         # number of oversampling seeds to evaluate
MAX_ITER = 1000      # iteration cap for the logistic-regression solver

# Split original data.
# The split is done ONCE: every run below is scored on the same stratified
# hold-out set, so the spread reported at the end reflects the randomness of
# the oversampling step only, not of the train/test partition.
X = sub_data.drop(columns=[TARGET_COL])
Y = sub_data[TARGET_COL]

# `predict_proba(...)[:, 1]` + `roc_auc_score` below are only meaningful for a
# two-class target; fail early with a clear message instead of deep inside the loop.
assert Y.nunique() == 2, (
    f"Expected a binary target, found {Y.nunique()} classes in {TARGET_COL!r}"
)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, stratify=Y, random_state=SPLIT_SEED
)

# Store metrics (one entry per run, in run order)
accuracies = []
f1_macros = []
auc_scores = []

# Loop over N_RUNS different random states
for i in range(N_RUNS):
    # Oversample the training data only; the test set keeps its original
    # class distribution.
    ros = RandomOverSampler(random_state=i)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    # Train model.
    # random_state is passed explicitly: scikit-learn documents it as used by
    # the 'liblinear' solver, so leaving it unset makes a run non-reproducible.
    # NOTE(review): after oversampling the classes should already be balanced,
    # so class_weight='balanced' is likely a no-op here -- kept to preserve the
    # original configuration; confirm whether both are intended.
    lr = LogisticRegression(
        class_weight='balanced',
        max_iter=MAX_ITER,
        solver='liblinear',
        random_state=i,
    )
    lr.fit(X_resampled, y_resampled)

    # Predict and evaluate on the untouched hold-out set
    y_pred = lr.predict(X_test)
    y_prob = lr.predict_proba(X_test)[:, 1]  # second probability column, used as the score for AUC

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_prob)

    accuracies.append(acc)
    f1_macros.append(f1)
    auc_scores.append(auc)

# Print results.
# np.std uses its default ddof=0, i.e. the population standard deviation over
# the N_RUNS runs.
print(f"Accuracy: Mean = {np.mean(accuracies):.4f}, Std = {np.std(accuracies):.4f}")
print(f"F1 Macro: Mean = {np.mean(f1_macros):.4f}, Std = {np.std(f1_macros):.4f}")
print(f"AUC:      Mean = {np.mean(auc_scores):.4f}, Std = {np.std(auc_scores):.4f}")


Accuracy: Mean = 0.6798, Std = 0.0049
F1 Macro: Mean = 0.6329, Std = 0.0046
AUC:      Mean = 0.7563, Std = 0.0025
