In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc
)
from imblearn.over_sampling import RandomOverSampler

In [17]:
%store -r machine_data

In [18]:
# Spliting features and target
X = machine_data.drop(columns=['Health_status'])
Y = machine_data['Health_status']

# Train-test spliting (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [19]:
# Applying oversampling ONLY on training data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Training the Random Forest on resampled data
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced', 
    random_state=42
)
rf.fit(X_resampled, y_resampled)

In [20]:
X_resampled.shape

(5252, 33)

In [21]:
y_resampled.shape

(5252,)

In [22]:
X_train.shape

(3403, 33)

In [23]:
y_train.shape

(3403,)

In [24]:
# Predicting on original (non-resampled) test set
y_pred = rf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      0.76      0.80       657
         1.0       0.40      0.54      0.46       194

    accuracy                           0.71       851
   macro avg       0.62      0.65      0.63       851
weighted avg       0.74      0.71      0.72       851

[[499 158]
 [ 90 104]]


In [25]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=42)


scores = cross_val_score(rf, X_resampled, y_resampled, cv=cv, scoring='accuracy', n_jobs=-1)


print("All 100 accuracies:\n", scores)

print(f"\nMean accuracy: {scores.mean():.4f}")
print(f"Std  accuracy: {scores.std():.4f}")

All 100 accuracies:
 [0.83349191 0.82588011 0.83904762 0.81333333 0.82380952 0.82968601
 0.82017127 0.80666667 0.81809524 0.80666667 0.82302569 0.83063749
 0.80190476 0.83142857 0.82285714 0.84776403 0.82017127 0.82095238
 0.84190476 0.80285714 0.82492864 0.80970504 0.80666667 0.82285714
 0.82761905 0.81255947 0.82397716 0.84095238 0.81428571 0.83238095
 0.82397716 0.81065652 0.83428571 0.83428571 0.83047619 0.82492864
 0.82968601 0.83809524 0.83142857 0.83238095 0.81826832 0.80399619
 0.8247619  0.8152381  0.83809524 0.83254044 0.83444339 0.82571429
 0.81238095 0.82761905 0.82397716 0.80780209 0.83142857 0.83142857
 0.82666667 0.82017127 0.82588011 0.81714286 0.81904762 0.83428571
 0.81731684 0.81446242 0.82857143 0.82571429 0.8352381  0.82492864
 0.81351094 0.81619048 0.83809524 0.80380952 0.82017127 0.82207422
 0.82190476 0.82285714 0.81619048 0.80685062 0.83824929 0.8152381
 0.81142857 0.84380952 0.81255947 0.81160799 0.84380952 0.83809524
 0.8247619  0.80685062 0.81921979 0.830476

In [26]:
# Get 100 F1 scores
f1_scores = cross_val_score(rf, X, Y, cv=cv, scoring='f1_macro', n_jobs=-1)
print("\nAll 100 F1 scores:\n", f1_scores)
print(f"\nMean F1-score: {f1_scores.mean():.4f}")
print(f"Std  F1-score: {f1_scores.std():.4f}")


All 100 F1 scores:
 [0.6884163  0.63803827 0.62068197 0.70678152 0.63534828 0.67749496
 0.65024807 0.6819155  0.6411503  0.6727696  0.69637324 0.66804693
 0.67519084 0.65258689 0.64626167 0.64957047 0.7010709  0.65931906
 0.64929145 0.67112266 0.66809672 0.62025736 0.65331664 0.68922625
 0.67098269 0.66807752 0.6384445  0.65051462 0.67019887 0.66784621
 0.67643477 0.66778256 0.65757323 0.67648079 0.66797551 0.66792543
 0.68404319 0.65654991 0.66339094 0.65711461 0.66705607 0.66974029
 0.66180424 0.6608036  0.62626777 0.61198974 0.67762221 0.67500273
 0.65286065 0.69049207 0.63950424 0.69489715 0.64251208 0.6819112
 0.64469392 0.67228523 0.6608198  0.63270339 0.68642709 0.66064352
 0.68390164 0.67762782 0.64483    0.63679044 0.65544181 0.63903759
 0.67125381 0.67655251 0.67643477 0.64570822 0.65015058 0.69130934
 0.66780712 0.65258689 0.66789333 0.64453601 0.67221491 0.6742037
 0.68942604 0.64354182 0.64634733 0.67442014 0.65152315 0.6773695
 0.65215774 0.67654161 0.66665899 0.66600267

In [27]:
auc_scores = cross_val_score(rf, X, Y, cv=cv, scoring='roc_auc', n_jobs=-1)
print("\nAll 100 AUC scores:\n", auc_scores)
print(f"\nMean AUC: {auc_scores.mean():.4f}")
print(f"Std  AUC: {auc_scores.std():.4f}")



All 100 AUC scores:
 [0.78489385 0.7419699  0.72150042 0.81685428 0.75270304 0.78992296
 0.75062373 0.77590265 0.72270951 0.77618965 0.7798412  0.76360056
 0.77182287 0.75298624 0.75507606 0.74255049 0.78848719 0.76753127
 0.74665416 0.78885624 0.78052378 0.72044909 0.7563668  0.78139462
 0.76706688 0.77298404 0.75809286 0.75692385 0.75870857 0.78398447
 0.78188894 0.75852438 0.74851324 0.77245153 0.76852841 0.7606741
 0.77263098 0.75522917 0.78101939 0.7588006  0.76979083 0.76353779
 0.76899842 0.77030175 0.74068079 0.69708453 0.79386935 0.75053743
 0.76923077 0.79490665 0.72585479 0.78824397 0.76924163 0.7811132
 0.74913566 0.78135543 0.76293367 0.75292253 0.76786273 0.74970927
 0.78861272 0.79213545 0.75061589 0.73929018 0.75678118 0.75019222
 0.77336064 0.76749988 0.76887117 0.75776339 0.75647664 0.77106184
 0.76123115 0.74949969 0.77496385 0.7393259  0.76775095 0.75459367
 0.78792996 0.75865131 0.75248317 0.79197069 0.74634782 0.77065353
 0.75065219 0.76944562 0.77859373 0.754852

In [2]:
%store -r machine_data

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# Split original data
X = machine_data.drop(columns=['Health_status'])
Y = machine_data['Health_status']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42
)

# Store metrics
accuracies = []
f1_macros = []
auc_scores = []

# Loop over 100 different random states
for i in range(100):
    # Oversample training data
    X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=i)
    ros = RandomOverSampler(random_state=i)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

    # Train model
    rf = RandomForestClassifier(
        n_estimators=400,
        max_depth=10,
        min_samples_leaf=5,
        class_weight='balanced',
        random_state=i
    )
    rf.fit(X_resampled, y_resampled)

    # Predict and evaluate
    y_pred = rf.predict(X_test)
    y_prob = rf.predict_proba(X_test)[:, 1]  # for AUC

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_prob)

    accuracies.append(acc)
    f1_macros.append(f1)
    auc_scores.append(auc)

# Print results
print(f"Accuracy: Mean = {np.mean(accuracies):.4f}, Std = {np.std(accuracies):.4f}")
print(f"F1 Macro: Mean = {np.mean(f1_macros):.4f}, Std = {np.std(f1_macros):.4f}")
print(f"AUC:      Mean = {np.mean(auc_scores):.4f}, Std = {np.std(auc_scores):.4f}")


Accuracy: Mean = 0.7325, Std = 0.0150
F1 Macro: Mean = 0.6561, Std = 0.0167
AUC:      Mean = 0.7585, Std = 0.0152


In [None]:
print(accuracies)

[0.7168037602820212, 0.717978848413631, 0.7109283196239718, 0.7309048178613397, 0.717978848413631, 0.7156286721504113, 0.7203290246768508, 0.7215041128084606, 0.7168037602820212, 0.7027027027027027, 0.7121034077555817, 0.7074030552291422, 0.7309048178613397, 0.717978848413631, 0.7144535840188014, 0.7215041128084606, 0.7297297297297297, 0.7109283196239718, 0.7097532314923619, 0.7144535840188014, 0.7121034077555817, 0.7191539365452408, 0.7121034077555817, 0.7097532314923619, 0.7144535840188014, 0.7132784958871915, 0.7191539365452408, 0.7262044653349001, 0.7062279670975323, 0.7168037602820212, 0.7250293772032902, 0.7203290246768508, 0.7097532314923619, 0.7203290246768508, 0.7203290246768508, 0.7144535840188014, 0.7144535840188014, 0.7132784958871915, 0.7262044653349001, 0.7191539365452408, 0.7203290246768508, 0.7238542890716804, 0.7085781433607521, 0.7203290246768508, 0.7156286721504113, 0.7121034077555817, 0.717978848413631, 0.7085781433607521, 0.7203290246768508, 0.7144535840188014, 0.7