In [1]:
%store -r obj_data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc
)
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Spliting features and target
X = obj_data.drop(columns=['Health_status'])
Y = obj_data['Health_status']

# Train-test spliting (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [4]:
# Applying oversampling ONLY on training data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Training the Random Forest on resampled data
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced', 
    random_state=42
)
rf.fit(X_resampled, y_resampled)

In [5]:
# Predicting on original (non-resampled) test set
y_pred = rf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.75      0.79       657
         1.0       0.36      0.47      0.41       194

    accuracy                           0.69       851
   macro avg       0.59      0.61      0.60       851
weighted avg       0.72      0.69      0.70       851

[[493 164]
 [102  92]]


In [6]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42)


scores = cross_val_score(rf, X, Y, cv=cv, scoring='accuracy', n_jobs=-1)


print("All 100 accuracies:\n", scores)

print(f"\nMean accuracy: {scores.mean():.4f}")
print(f"Std  accuracy: {scores.std():.4f}")

All 100 accuracies:
 [0.73090482 0.70740306 0.70975323 0.74735605 0.69294118 0.72855464
 0.71445358 0.73090482 0.73443008 0.72588235 0.72032902 0.72855464
 0.73207991 0.70622797 0.72       0.70152761 0.73443008 0.73207991
 0.71680376 0.72705882 0.73678026 0.69212691 0.73325499 0.73795535
 0.72941176 0.72385429 0.72385429 0.73325499 0.71445358 0.73647059
 0.72855464 0.71680376 0.72385429 0.70857814 0.73176471 0.72385429
 0.72385429 0.71092832 0.72032902 0.74470588 0.72385429 0.70740306
 0.71562867 0.71210341 0.72823529 0.70387779 0.75088132 0.72032902
 0.72032902 0.73647059 0.71562867 0.73443008 0.72502938 0.75088132
 0.7        0.72150411 0.70857814 0.74735605 0.71797885 0.72
 0.73913043 0.73207991 0.72972973 0.70152761 0.70941176 0.69800235
 0.72737955 0.74970623 0.70975323 0.72       0.72620447 0.73678026
 0.71445358 0.72385429 0.73529412 0.72032902 0.72150411 0.72385429
 0.7226792  0.73176471 0.71210341 0.75440658 0.71680376 0.7027027
 0.72470588 0.71210341 0.74148061 0.70975323 0.7

In [7]:
# Get 100 F1 scores
f1_scores = cross_val_score(rf, X, Y, cv=cv, scoring='f1_macro', n_jobs=-1)
print("\nAll 100 F1 scores:\n", f1_scores)
print(f"\nMean F1-score: {f1_scores.mean():.4f}")
print(f"Std  F1-score: {f1_scores.std():.4f}")


All 100 F1 scores:
 [0.64956186 0.61782609 0.6038511  0.66287715 0.58929672 0.64114523
 0.62813769 0.63979136 0.6357131  0.63789381 0.63738739 0.6422398
 0.64079532 0.6191044  0.59669059 0.61742655 0.65466135 0.63848768
 0.6256104  0.63999883 0.64821212 0.59360965 0.64406099 0.64698453
 0.64525752 0.6291774  0.63151688 0.64180115 0.62479384 0.65857577
 0.64114523 0.6221088  0.6291774  0.62104432 0.64402222 0.65686956
 0.62556988 0.60740166 0.61637043 0.65744839 0.64038007 0.60196071
 0.62347886 0.61703632 0.63534828 0.59650034 0.65827954 0.63195787
 0.63195787 0.65134591 0.62113486 0.65466135 0.63592584 0.65942007
 0.6        0.62482212 0.62540291 0.64306546 0.63644908 0.63180961
 0.6567002  0.64079532 0.63531652 0.61072972 0.62186092 0.61560553
 0.6401429  0.66062291 0.62089576 0.63510101 0.63803827 0.65465165
 0.61897278 0.62556988 0.64594545 0.62622538 0.63838332 0.64554299
 0.62700394 0.63717649 0.61939646 0.66024829 0.62788195 0.61620542
 0.6464     0.62284169 0.65229181 0.619762

In [None]:
auc_scores = cross_val_score(rf, X, Y, cv=cv, scoring='roc_auc', n_jobs=-1)
print("\nAll 100 AUC scores:\n", auc_scores)
print(f"\nMean AUC: {auc_scores.mean():.4f}")
print(f"Std  AUC: {auc_scores.std():.4f}")



All 100 AUC scores:
 [0.74428439 0.69656671 0.70723689 0.76025641 0.70874717 0.74650473
 0.71122252 0.71902901 0.72897123 0.72029796 0.73446939 0.72565865
 0.73208429 0.72001251 0.72938144 0.69397762 0.7585793  0.7447316
 0.70999062 0.73932141 0.74153054 0.67643459 0.75231057 0.72143527
 0.73778131 0.7353795  0.72416012 0.73184108 0.70920106 0.74937139
 0.74018108 0.73266488 0.70977106 0.72252971 0.72072228 0.74005555
 0.73967895 0.70594235 0.73212164 0.7335303  0.71949191 0.72094337
 0.73111142 0.71838649 0.7099494  0.67555587 0.759764   0.71939776
 0.71372733 0.75263232 0.70490671 0.73325331 0.73200584 0.7552611
 0.70082663 0.72993457 0.73094666 0.72964427 0.71601782 0.71192167
 0.75258517 0.74025169 0.71555336 0.69172921 0.72773133 0.70093678
 0.72973842 0.76263553 0.7272045  0.71508046 0.73494798 0.73942789
 0.71645562 0.72119293 0.72593192 0.68723815 0.74593199 0.7377646
 0.73828174 0.72623837 0.71079885 0.76033674 0.71188941 0.7159631
 0.72970361 0.715012   0.74871722 0.70627187

In [1]:
%store -r obj_data

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Split original data
X = obj_data.drop(columns=['Health_status'])
Y = obj_data['Health_status']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

# Store metrics
accuracies = []
f1_macros = []
auc_scores = []

# Loop over 100 different random states
for i in range(100):
    # Oversample training data
    ros = RandomOverSampler(random_state=i)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    # Train model
    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=10,
        min_samples_leaf=5,
        class_weight='balanced',
        random_state=i
    )
    rf.fit(X_resampled, y_resampled)

    # Predict and evaluate
    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)[:, 1]  # for AUC

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_prob)

    accuracies.append(acc)
    f1_macros.append(f1)
    auc_scores.append(auc)

# Print results
print(f"Accuracy: Mean = {np.mean(accuracies):.4f}, Std = {np.std(accuracies):.4f}")
print(f"F1 Macro: Mean = {np.mean(f1_macros):.4f}, Std = {np.std(f1_macros):.4f}")
print(f"AUC:      Mean = {np.mean(auc_scores):.4f}, Std = {np.std(auc_scores):.4f}")


Accuracy: Mean = 0.6985, Std = 0.0073
F1 Macro: Mean = 0.6066, Std = 0.0077
AUC:      Mean = 0.6937, Std = 0.0035
