In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import os

In [2]:
# Load data: read the CSV found at this path (relative to the kernel's working
# directory) into the DataFrame that the following cells work on.
data = "../dummy-data.csv"
df = pd.read_csv(filepath_or_buffer=data)

In [3]:
# Preview the first five rows to inspect the column names and sample values
df.head(n=5)

Unnamed: 0,Heart Rate (bpm),Breathing Rate (brpm),Oxygen Saturation (%),Blood Pressure (systolic),Blood Pressure (diastolic),Stress Index,Recovery Ability,PNS Index,SNS Index,RMSSD (ms),SD2 (ms),Hemoglobin A1c (%),Mean RRi (ms),SD1 (ms),HRV SDNN (ms),Hemoglobin (g/dl),Gender (0-M;1-F)
0,80.3,12.2,96.4,107.3,74.2,39.6,0,-0.9,0.4,49.7,67.9,4.7,958.8,82.3,87.9,17.1,0
1,73.1,17.7,95.9,92.4,70.8,98.7,0,-0.6,0.6,34.3,67.4,3.7,853.5,50.9,63.2,14.7,1
2,72.2,18.0,96.0,102.4,75.6,45.3,0,-0.9,0.4,40.5,59.0,4.3,873.3,65.4,54.7,13.8,0
3,70.6,14.7,95.1,110.0,62.2,77.8,0,-0.6,0.5,35.3,58.2,4.4,693.9,82.6,75.8,15.8,0
4,99.5,19.5,97.6,110.2,73.0,57.3,0,-0.2,0.5,37.9,62.3,3.0,826.3,63.9,79.0,12.5,1


In [4]:
# Define the class labels: the eleven labels that classify_disease can return
# (ten conditions plus "Healthy").
# NOTE(review): no other cell visible in this notebook reads this list, and its
# order is not alphabetical, so list positions should not be assumed to match
# the integer codes produced by LabelEncoder — confirm before using it to decode.
class_labels = [
    "Hypertension",
    "Cardiovascular Disease (CVD)",
    "Chronic Fatigue Syndrome (CFS)",
    "Stress-related Disorders",
    "Healthy",
    "Diabetes",
    "Anaemia",
    "Atherosclerosis",
    "Arrhythmia",
    "Respiratory Disease (COPD or Asthma)",
    "Autonomic Dysfunction",
]

In [5]:
def classify_disease(row):
    """Assign one disease label to a row of measurements via ordered threshold rules.

    The rules are evaluated from top to bottom and the first one that matches
    decides the label, so a row satisfying several rules receives the label of
    the earliest one. A row matching no rule is labelled "Healthy". Comparisons
    against NaN evaluate to False, so NaN values never trigger a rule.

    Args:
        row: Mapping-like record (for example a DataFrame row) indexed by the
            measurement column names used below.

    Returns:
        str: The label of the first matching rule, or "Healthy" if none match.
    """
    # Blood pressure is checked first and therefore outranks every other rule.
    if row["Blood Pressure (systolic)"] >= 140 or row["Blood Pressure (diastolic)"] >= 90:
        return "Hypertension"

    pulse = row["Heart Rate (bpm)"]
    if pulse < 60 or pulse > 100:
        return "Cardiovascular Disease (CVD)"

    if row["HRV SDNN (ms)"] < 50:
        return "Chronic Fatigue Syndrome (CFS)"

    a1c = row["Hemoglobin A1c (%)"]
    if a1c > 6.4:
        return "Diabetes"

    # Sex-specific haemoglobin cut-offs: 13.5 g/dl for code 0 (M), 12.0 g/dl for code 1 (F).
    sex = row["Gender (0-M;1-F)"]
    if (sex == 0 and row["Hemoglobin (g/dl)"] < 13.5) or (
        sex == 1 and row["Hemoglobin (g/dl)"] < 12.0
    ):
        return "Anaemia"

    # Checked after the anaemia rule, so anaemia wins when both apply.
    if 5.7 < a1c <= 6.4:
        return "Atherosclerosis"

    if row["Mean RRi (ms)"] < 600 or row["HRV SDNN (ms)"] > 100:
        return "Arrhythmia"

    if row["Stress Index"] > 70 or row["SNS Index"] > 1.0:
        return "Stress-related Disorders"

    if row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 95:
        return "Respiratory Disease (COPD or Asthma)"

    # The SNS test below repeats the one in the stress rule above, which has
    # already returned for every row with SNS Index > 1.0; only the PNS test
    # can still match at this point.
    if row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0:
        return "Autonomic Dysfunction"

    return "Healthy"


In [6]:
# Label every row: axis=1 hands each row to classify_disease, and the returned
# labels are stored in the "Disease Classification" column.
disease_labels = df.apply(classify_disease, axis=1)
df["Disease Classification"] = disease_labels

In [7]:
# Show how many rows received each label
class_counts = df["Disease Classification"].value_counts()
print("Class Distribution Before Sampling:")
print(class_counts)

Class Distribution Before Sampling:
Disease Classification
Hypertension                            219610
Cardiovascular Disease (CVD)            101114
Chronic Fatigue Syndrome (CFS)           42896
Stress-related Disorders                 30639
Healthy                                  21191
Diabetes                                 14726
Anaemia                                   8304
Atherosclerosis                           6582
Arrhythmia                                3369
Respiratory Disease (COPD or Asthma)      1525
Autonomic Dysfunction                       44
Name: count, dtype: int64


In [8]:
# Replace the string labels with integer codes using a fitted LabelEncoder.
# NOTE: the column is overwritten in place, so running this cell a second time
# would fit the encoder on the integer codes instead of the label strings.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["Disease Classification"])
df["Disease Classification"] = encoded_labels

In [9]:
# Split features and target.
# "Unnamed: 0" appears to be the row index written as the first CSV column (it
# holds 0, 1, 2, ... in the df.head() output). It identifies the row rather than
# measuring anything, so it is kept out of the feature matrix. errors="ignore"
# keeps this cell working when that column is not present.
X = df.drop(columns=["Disease Classification"]).drop(
    columns=["Unnamed: 0"], errors="ignore"
)
y = df["Disease Classification"]

In [10]:
# Save the features, the encoded target and the fitted label encoder together
# in one pickle file, stored as the tuple (X, y, label_encoder).
artifacts = (X, y, label_encoder)
with open("preprocessed_data.pkl", "wb") as pkl_file:
    pickle.dump(artifacts, pkl_file)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
