In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [2]:
# Step 1: Read the raw CSV from a path relative to the working directory
data = "../dummy-data.csv"
df = pd.read_csv(filepath_or_buffer=data)

# Print a schema summary, then leave the first five rows as the cell's output
print("Dataset Overview:")
df.info()
df.head(n=5)

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Heart Rate (bpm)            450000 non-null  float64
 1   Breathing Rate (brpm)       450000 non-null  float64
 2   Oxygen Saturation (%)       450000 non-null  float64
 3   Blood Pressure (systolic)   450000 non-null  float64
 4   Blood Pressure (diastolic)  450000 non-null  float64
 5   Stress Index                450000 non-null  float64
 6   Recovery Ability            450000 non-null  int64  
 7   PNS Index                   450000 non-null  float64
 8   SNS Index                   450000 non-null  float64
 9   RMSSD (ms)                  450000 non-null  float64
 10  SD2 (ms)                    450000 non-null  float64
 11  Hemoglobin A1c (%)          450000 non-null  float64
 12  Mean RRi (ms)               450000 non-null  float64
 

Unnamed: 0,Heart Rate (bpm),Breathing Rate (brpm),Oxygen Saturation (%),Blood Pressure (systolic),Blood Pressure (diastolic),Stress Index,Recovery Ability,PNS Index,SNS Index,RMSSD (ms),SD2 (ms),Hemoglobin A1c (%),Mean RRi (ms),SD1 (ms),HRV SDNN (ms),Hemoglobin (g/dl),Gender (0-M;1-F)
0,80.3,12.2,96.4,107.3,74.2,39.6,0,-0.9,0.4,49.7,67.9,4.7,958.8,82.3,87.9,17.1,0
1,73.1,17.7,95.9,92.4,70.8,98.7,0,-0.6,0.6,34.3,67.4,3.7,853.5,50.9,63.2,14.7,1
2,72.2,18.0,96.0,102.4,75.6,45.3,0,-0.9,0.4,40.5,59.0,4.3,873.3,65.4,54.7,13.8,0
3,70.6,14.7,95.1,110.0,62.2,77.8,0,-0.6,0.5,35.3,58.2,4.4,693.9,82.6,75.8,15.8,0
4,99.5,19.5,97.6,110.2,73.0,57.3,0,-0.2,0.5,37.9,62.3,3.0,826.3,63.9,79.0,12.5,1


In [3]:
# Drop the 'Gender' column.
# errors="ignore" keeps this cell re-runnable: on a second execution the column
# has already been removed and a plain drop would raise KeyError.
df = df.drop(columns=["Gender (0-M;1-F)"], errors="ignore")

# Ensure numeric data: anything that cannot be parsed as a number becomes NaN
df = df.apply(pd.to_numeric, errors="coerce")
# Handle missing values by filling each column's NaNs with that column's median
df = df.fillna(df.median())

In [4]:
# Rule table: disease label -> predicate over a single row (any object that can
# be indexed by column name). classify_disease walks this dict in insertion
# order and returns the first label whose predicate is truthy, so the ORDER of
# the entries at the bottom of this cell is part of the logic.


def _hypertension(row):
    # Systolic >= 140 or diastolic >= 90, with heart rate inside 60-100 bpm
    # and HbA1c no higher than 5.7
    return (
        (
            row["Blood Pressure (systolic)"] >= 140
            or row["Blood Pressure (diastolic)"] >= 90
        )
        and 60 <= row["Heart Rate (bpm)"] <= 100
        and row["Hemoglobin A1c (%)"] <= 5.7
    )


def _cardiovascular_disease(row):
    # Heart rate outside 60-100 bpm, SDNN below 50 ms, and oxygen saturation
    # of at least 95 %
    return (
        (row["Heart Rate (bpm)"] < 60 or row["Heart Rate (bpm)"] > 100)
        and row["HRV SDNN (ms)"] < 50
        and row["Oxygen Saturation (%)"] >= 95
    )


def _chronic_fatigue(row):
    # SDNN below 50 ms together with a Recovery Ability value above 1
    return row["HRV SDNN (ms)"] < 50 and row["Recovery Ability"] > 1


def _diabetes(row):
    # HbA1c above 6.4
    return row["Hemoglobin A1c (%)"] > 6.4


def _anaemia(row):
    # Hemoglobin below 13.5 g/dl with oxygen saturation of at least 95 %
    return row["Hemoglobin (g/dl)"] < 13.5 and row["Oxygen Saturation (%)"] >= 95


def _atherosclerosis(row):
    # HbA1c in the half-open band (5.7, 6.4] with systolic pressure below 140
    return (
        row["Hemoglobin A1c (%)"] > 5.7
        and row["Hemoglobin A1c (%)"] <= 6.4
        and row["Blood Pressure (systolic)"] < 140
    )


def _arrhythmia(row):
    # Mean RR interval below 600 ms or SDNN above 100 ms, with heart rate of
    # at least 60 bpm
    return (
        (row["Mean RRi (ms)"] < 600 or row["HRV SDNN (ms)"] > 100)
        and row["Heart Rate (bpm)"] >= 60
    )


def _stress_related(row):
    # Stress Index above 70 or SNS Index above 1.0
    return row["Stress Index"] > 70 or row["SNS Index"] > 1.0


def _respiratory_disease(row):
    # Breathing rate above 20 brpm or oxygen saturation below 95 %
    return row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 95


def _autonomic_dysfunction(row):
    # PNS Index below -1.0 or SNS Index above 1.0.
    # "Stress-related Disorders" is listed earlier and also fires on
    # SNS Index > 1.0, so under first-match classification a row can only end
    # up labelled "Autonomic Dysfunction" through the PNS Index condition.
    return row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0


disease_rules = {
    "Hypertension": _hypertension,
    "Cardiovascular Disease (CVD)": _cardiovascular_disease,
    "Chronic Fatigue Syndrome (CFS)": _chronic_fatigue,
    "Diabetes": _diabetes,
    "Anaemia": _anaemia,
    "Atherosclerosis": _atherosclerosis,
    "Arrhythmia": _arrhythmia,
    "Stress-related Disorders": _stress_related,
    "Respiratory Disease (COPD or Asthma)": _respiratory_disease,
    "Autonomic Dysfunction": _autonomic_dysfunction,
}

In [5]:
# General function to classify disease based on the rules
def classify_disease(row, rules=None):
    """Return the label of the first rule that matches ``row``.

    Parameters
    ----------
    row
        A single record, passed unchanged to each rule predicate.
    rules : dict, optional
        Mapping of label -> predicate taking ``row``. When omitted, the
        notebook-level ``disease_rules`` table is used, so existing calls
        such as ``df.apply(classify_disease, axis=1)`` behave as before.

    Returns
    -------
    The key of the first predicate (in the mapping's iteration order) that
    returns a truthy value, or ``"Healthy"`` when no predicate matches.
    Only one label is ever returned, even if several rules would match.
    """
    # Resolve the default at call time so a later re-definition of
    # disease_rules in the notebook is picked up.
    active_rules = disease_rules if rules is None else rules
    for disease, rule in active_rules.items():
        if rule(row):
            return disease
    return "Healthy"

In [6]:
# Label every row: with axis=1, classify_disease is called once per row
disease_labels = df.apply(classify_disease, axis=1)
df["Disease Classification"] = disease_labels

In [7]:
# Encode the target with LabelEncoder; the feature columns are already numeric.
# After this cell the column holds integer codes, so re-running it without
# first re-running the classification cell would fit the encoder on those
# codes instead of the disease names.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df["Disease Classification"])
df["Disease Classification"] = encoded_target

# Show which integer code each class name was assigned
class_names = label_encoder.classes_
label_mapping = dict(zip(class_names, label_encoder.transform(class_names)))
print("Label Encoder Mapping:")
print(label_mapping)

Label Encoder Mapping:
{'Anaemia': np.int64(0), 'Arrhythmia': np.int64(1), 'Atherosclerosis': np.int64(2), 'Autonomic Dysfunction': np.int64(3), 'Cardiovascular Disease (CVD)': np.int64(4), 'Chronic Fatigue Syndrome (CFS)': np.int64(5), 'Diabetes': np.int64(6), 'Healthy': np.int64(7), 'Hypertension': np.int64(8), 'Respiratory Disease (COPD or Asthma)': np.int64(9), 'Stress-related Disorders': np.int64(10)}


In [8]:
# Separate the encoded target (y) from the remaining columns (X)
y = df["Disease Classification"]
X = df.drop(columns=["Disease Classification"])

In [9]:
# Count rows per encoded class before any resampling
class_counts_before = pd.Series(y).value_counts()
print("Class Distribution before SMOTE:")
print(class_counts_before)

Class Distribution before SMOTE:
Disease Classification
6     117506
5     104678
10     66751
8      32889
2      32377
1      31944
7      17789
0      17120
4      16342
9      12419
3        185
Name: count, dtype: int64


In [10]:
# Oversample with SMOTE; random_state is fixed so the result is repeatable.
# NOTE(review): resampling is applied to the full X / y and no train/test
# split is visible in this notebook. If evaluation data is later drawn from
# the resampled set it will contain synthetic samples - confirm where the
# split happens.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

In [11]:
# Count rows per encoded class in the resampled target
class_counts_after = pd.Series(y_resampled).value_counts()
print("Class Distribution After SMOTE:")
print(class_counts_after)

Class Distribution After SMOTE:
Disease Classification
7     117506
10    117506
0     117506
6     117506
8     117506
1     117506
4     117506
5     117506
9     117506
2     117506
3     117506
Name: count, dtype: int64


In [12]:
# Bundle the resampled features/target with the encoder's class names.
# NOTE(review): the "X_scaled" key stores X_resampled as-is; no scaling step
# appears in the cells shown here. The key name is kept unchanged because
# whatever loads this pickle presumably looks it up by that name - confirm.
preprocessed_data = dict(
    X_scaled=X_resampled,
    y_resampled=y_resampled,
    class_labels=label_encoder.classes_,
)

# Write the bundle, then the pre-resampling feature frame, to the working directory
pd.to_pickle(preprocessed_data, "preprocessed_data.pkl")
X.to_pickle("original_features.pkl")

print("Preprocessed data saved and original X features saved.")

Preprocessed data saved and original X features saved.
