In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Step 1: Load the data
# The CSV path is relative to the directory the notebook is run from.
data = "../dummy-data.csv"
df = pd.read_csv(filepath_or_buffer=data)

In [3]:
# Define the class labels
# Listed in alphabetical order on purpose: LabelEncoder numbers the classes in
# sorted order, so (as long as every label occurs in the data) class_labels[i]
# is the name of encoded class i. Keep this list sorted when adding labels.
class_labels = [
    "Anaemia",
    "Arrhythmia",
    "Atherosclerosis",
    "Autonomic Dysfunction",
    "Cardiovascular Disease (CVD)",
    "Chronic Fatigue Syndrome (CFS)",
    "Diabetes",
    "Healthy",
    "Hypertension",
    "Respiratory Disease (COPD or Asthma)",
    "Stress-related Disorders",
]

In [4]:
# Disease classification logic
def classify_disease(row):
    """Assign one disease label to a record using ordered threshold rules.

    The rules are evaluated top to bottom and the first one that matches
    decides the label, so a record meeting several criteria receives the
    label of the earliest rule. A comparison against a float NaN is False,
    which means a rule whose inputs are NaN is skipped rather than raising.

    Args:
        row: Record indexed by column name (a DataFrame row or any mapping).

    Returns:
        str: The label of the first matching rule, or "Healthy" if none match.
    """
    # Rule 1: elevated systolic or diastolic pressure.
    if row["Blood Pressure (systolic)"] >= 140:
        return "Hypertension"
    if row["Blood Pressure (diastolic)"] >= 90:
        return "Hypertension"

    # Rule 2: resting heart rate outside 60-100 bpm.
    heart_rate = row["Heart Rate (bpm)"]
    if heart_rate < 60 or heart_rate > 100:
        return "Cardiovascular Disease (CVD)"

    # Rule 3: low heart-rate variability.
    if row["HRV SDNN (ms)"] < 50:
        return "Chronic Fatigue Syndrome (CFS)"

    # Rule 4: HbA1c above 6.4 %.
    hba1c = row["Hemoglobin A1c (%)"]
    if hba1c > 6.4:
        return "Diabetes"

    # Rule 5: haemoglobin below the gender-specific cut-off (0 = male, 1 = female).
    gender = row["Gender (0-M;1-F)"]
    if gender == 0 and row["Hemoglobin (g/dl)"] < 13.5:
        return "Anaemia"
    if gender == 1 and row["Hemoglobin (g/dl)"] < 12.0:
        return "Anaemia"

    # Rule 6: HbA1c in (5.7, 6.4]. The upper bound is already implied here
    # because values above 6.4 returned "Diabetes" in rule 4.
    if hba1c > 5.7 and hba1c <= 6.4:
        return "Atherosclerosis"

    # Rule 7: short mean RR interval or very high SDNN.
    if row["Mean RRi (ms)"] < 600 or row["HRV SDNN (ms)"] > 100:
        return "Arrhythmia"

    # Rule 8: high stress index or sympathetic (SNS) index.
    if row["Stress Index"] > 70 or row["SNS Index"] > 1.0:
        return "Stress-related Disorders"

    # Rule 9: fast breathing or low oxygen saturation.
    if row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 95:
        return "Respiratory Disease (COPD or Asthma)"

    # Rule 10: low parasympathetic (PNS) index. The SNS clause cannot be what
    # triggers this rule: any record with SNS Index > 1.0 already returned in
    # rule 8. NOTE(review): confirm whether a different threshold was intended.
    if row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0:
        return "Autonomic Dysfunction"

    return "Healthy"

In [5]:
# Apply disease classification
# Run the rule-based classifier once per row and keep the label in a new column.
df["Disease Classification"] = df.apply(func=classify_disease, axis="columns")

In [6]:
# Encode labels using LabelEncoder
# The encoder is fitted on the label strings and then used to replace them with
# integer codes. The string column is overwritten in place, so re-running this
# cell on its own would refit the encoder on integers and lose the name mapping;
# re-run the classification cell first.
label_encoder = LabelEncoder()
label_encoder.fit(df["Disease Classification"])
df["Disease Classification"] = label_encoder.transform(df["Disease Classification"])

In [7]:
# Split features and target
# Target is the encoded disease column; features are every other column of df.
y = df["Disease Classification"]
X = df.drop("Disease Classification", axis=1)

In [8]:
# Display the class distribution before resampling
# Header and per-class counts are emitted on separate lines by a single print call.
print("Class Distribution before SMOTE:", pd.Series(y).value_counts(), sep="\n")

Class Distribution before SMOTE:
Disease Classification
8     219610
4     101114
5      42896
10     30639
7      21191
6      14726
0       8304
2       6582
1       3369
9       1525
3         44
Name: count, dtype: int64


In [9]:
# Step 2: Resample the data using SMOTE
# The seed is fixed so that the resampled output is reproducible.
# NOTE(review): resampling is applied to the unscaled features, and no
# train/test split is visible in this notebook — confirm both are intended.
smote = SMOTE(random_state=42)
(X_resampled, y_resampled) = smote.fit_resample(X=X, y=y)

In [10]:
# Display the class distribution after resampling
# Header and per-class counts are emitted on separate lines by a single print call.
print("Class Distribution After SMOTE:", pd.Series(y_resampled).value_counts(), sep="\n")

Class Distribution After SMOTE:
Disease Classification
7     219610
10    219610
0     219610
8     219610
6     219610
4     219610
5     219610
2     219610
9     219610
1     219610
3     219610
Name: count, dtype: int64


In [11]:
# Step 3: Scale the features using StandardScaler
# The scaler is fitted on the resampled feature matrix and then applied to that
# same matrix; `scaler` stays available for transforming other data later.
scaler = StandardScaler()
scaler.fit(X_resampled)
X_scaled = scaler.transform(X_resampled)

In [12]:
# Save preprocessed data
# Class names are taken from the fitted encoder rather than from a hand-maintained
# list: the encoder numbers classes in sorted order and only knows the labels that
# actually occur in the data, so this guarantees that class_labels[i] is the name
# of encoded class i in y_resampled.
encoded_class_labels = label_encoder.classes_.tolist()

preprocessed_data = {
    "X_scaled": X_scaled,
    "y_resampled": y_resampled,
    "class_labels": encoded_class_labels,
    # Additive entry: X_scaled is a bare array, so keep the column order with it.
    "feature_names": list(X.columns),
}
pd.to_pickle(preprocessed_data, "preprocessed_data.pkl")

print("Preprocessed data saved to 'preprocessed_data.pkl'.")

Preprocessed data saved to 'preprocessed_data.pkl'.
