In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
import os

In [2]:
# 1. Load data with error handling
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)


data = load_data("C:/Users/syafi/Desktop/syafiq-project/dummy-data.csv")

In [3]:
# Display basic information about the dataset
print("Data Head:")
print(data.head())
print("\nData Description:")
print(data.describe())

Data Head:
   Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
0                 0       -0.9        0.4        49.7      67.

In [4]:
# Hypertension
data["Hypertension"] = (
    (data["Blood Pressure (systolic)"].between(130, 139))
    | (data["Blood Pressure (diastolic)"].between(80, 89))
    | (data["Blood Pressure (systolic)"] > 140)
    | (data["Blood Pressure (diastolic)"] > 90)
)

# Atherosclerosis
data["Atherosclerosis"] = (data["Blood Pressure (systolic)"] > 140) | (
    data["Hemoglobin A1c (%)"] > 6.5
)

# Cardiovascular Disease (CVD)
data["Cardiovascular Disease (CVD)"] = (
    (data["Heart Rate (bpm)"] > 100)
    | (data["Heart Rate (bpm)"] < 60)
    | (data["Blood Pressure (systolic)"] > 140)
    | (data["Blood Pressure (diastolic)"] > 90)
    | (data["HRV SDNN (ms)"] < 50)
)

# Respiratory Disease
data["Respiratory Disease (COPD or Asthma)"] = (data["Breathing Rate (brpm)"] > 20) | (
    data["Oxygen Saturation (%)"] < 90
)

# Chronic Fatigue Syndrome (CFS)
data["Chronic Fatigue Syndrome (CFS)"] = (
    (data["HRV SDNN (ms)"] < 50)
    | (data["RMSSD (ms)"] < 30)
    | (data["Recovery Ability"] == 0)
)

# Diabetes
data["Diabetes"] = data["Hemoglobin A1c (%)"] > 6.5

# Arrhythmias
data["Arrhythmias"] = (data["HRV SDNN (ms)"] > 100) | (data["Mean RRi (ms)"] < 600)

# Stress-related Disorders
data["Stress-related Disorders"] = (data["Stress Index"] > 70) | (
    data["SNS Index"] > 1.0
)

# Autonomic Dysfunction
data["Autonomic Dysfunction"] = (data["PNS Index"] < -1.0) | (data["SNS Index"] > 1.0)

# Anaemia
data["Anaemia"] = np.where(
    (data["Gender (0-M;1-F)"] == 0) & (data["Hemoglobin (g/dl)"] < 13.5),
    True,
    (data["Gender (0-M;1-F)"] == 1) & (data["Hemoglobin (g/dl)"] < 12.0),
)

In [5]:
# Define priority order of diseases
diseases = [
    "Hypertension",
    "Atherosclerosis",
    "Cardiovascular Disease (CVD)",
    "Respiratory Disease (COPD or Asthma)",
    "Chronic Fatigue Syndrome (CFS)",
    "Diabetes",
    "Arrhythmias",
    "Stress-related Disorders",
    "Autonomic Dysfunction",
    "Anaemia",
]   

In [6]:
# Assign disease classifications based on priority
def classify_disease_vectorized(row):
    for disease in diseases:
        if row[disease]:
            return disease
    return "Healthy"

In [7]:
# Apply the classification
data["Disease Classification"] = data.apply(classify_disease_vectorized, axis=1)

# Display the class distribution
print("Class Distribution Before Sampling:")
data["Disease Classification"].value_counts()

Class Distribution Before Sampling:


Disease Classification
Hypertension                            279469
Cardiovascular Disease (CVD)             53941
Atherosclerosis                          51592
Chronic Fatigue Syndrome (CFS)           51101
Respiratory Disease (COPD or Asthma)     13066
Stress-related Disorders                   374
Arrhythmias                                294
Autonomic Dysfunction                       70
Healthy                                     67
Anaemia                                     26
Name: count, dtype: int64

In [8]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
data["Disease Classification"] = label_encoder.fit_transform(data["Disease Classification"])

# Save label mapping for reuse
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

In [9]:
# Split features and target
X = data.drop(columns=["Disease Classification"])
y = data["Disease Classification"]

# Display the first few rows of features and target
print("\nFeature Sample", X.head())
print("\nTarget Sample", y.head())


Feature Sample    Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  ...  Hypertension  \
0                 0       -0.9        0.4       

In [10]:
# Applying SMOTE for balancing the class distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display the class distribution after resampling
print("Class Distribution After SMOTE:")
print(pd.Series(y_resampled).value_counts())

Class Distribution After SMOTE:
Disease Classification
5    279469
7    279469
2    279469
4    279469
3    279469
8    279469
1    279469
0    279469
9    279469
6    279469
Name: count, dtype: int64


In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display scaled features
print("\nFirst 5 rows of Scaled Features:\n", X_train[:5])


First 5 rows of Scaled Features:
 [[-1.10666568e+00 -2.60426946e-01  7.14608658e-01  1.03848479e+00
  -7.67795052e-01  8.86332300e-01 -5.97199658e-02  1.25149501e+00
  -1.78278709e-01  2.08178695e-01  7.32884947e-01 -8.02049655e-01
  -1.92959136e+00  1.40409699e+00  5.48174651e-05 -4.48564880e-01
  -6.06658072e-01 -3.33333168e-01 -4.48624017e-01 -6.26463572e-01
  -7.79306837e-01 -8.96268796e-01 -4.09382339e-01  1.70518819e+00
   9.14710388e-01  9.17700930e-01  1.05093528e+00]
 [-8.27928843e-01  2.13905052e+00 -1.68593009e+00  2.46763451e-01
  -3.74061865e-01  1.24133662e+00 -1.51947084e+00 -8.39467426e-01
   3.93852764e-01  2.15057526e-01  1.80698793e+00 -2.90120366e-01
  -1.72325223e+00 -1.20252086e-01 -4.65413066e-01 -1.84342860e+00
  -6.06658072e-01 -3.33333168e-01 -4.48624017e-01 -6.26463572e-01
   1.28319162e+00  1.11573671e+00 -4.09382339e-01  1.70518819e+00
   9.14710388e-01  9.17700930e-01  1.05093528e+00]
 [-4.84019992e-01 -7.26028107e-01  3.06256401e-01 -5.11105442e-01
  -1.

In [12]:
# Save the preprocessed data and scaler for later use
with open("preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
