In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.combine import SMOTEENN
import pickle
import os

In [2]:
# 1. Load data with error handling
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)


data = load_data("C:/Users/syafi/Desktop/TM/Heart_Rate_Classification/dummy-data.csv")

data.head()

Unnamed: 0,Heart Rate (bpm),Breathing Rate (brpm),Oxygen Saturation (%),Blood Pressure (systolic),Blood Pressure (diastolic),Stress Index,Recovery Ability,PNS Index,SNS Index,RMSSD (ms),SD2 (ms),Hemoglobin A1c (%),Mean RRi (ms),SD1 (ms),HRV SDNN (ms),Hemoglobin (g/dl),Gender (0-M;1-F)
0,80.3,12.2,96.4,107.3,74.2,39.6,0,-0.9,0.4,49.7,67.9,4.7,958.8,82.3,87.9,17.1,0
1,73.1,17.7,95.9,92.4,70.8,98.7,0,-0.6,0.6,34.3,67.4,3.7,853.5,50.9,63.2,14.7,1
2,72.2,18.0,96.0,102.4,75.6,45.3,0,-0.9,0.4,40.5,59.0,4.3,873.3,65.4,54.7,13.8,0
3,70.6,14.7,95.1,110.0,62.2,77.8,0,-0.6,0.5,35.3,58.2,4.4,693.9,82.6,75.8,15.8,0
4,99.5,19.5,97.6,110.2,73.0,57.3,0,-0.2,0.5,37.9,62.3,3.0,826.3,63.9,79.0,12.5,1


In [3]:
classification_criteria = {
    "Hypertension": lambda row: (
        (
            row["Blood Pressure (systolic)"] > 130
            or row["Blood Pressure (diastolic)"] > 80
        )
        and (60 <= row["Heart Rate (bpm)"] <= 100)
    ),
    "Atherosclerosis": lambda row: (
        row["Blood Pressure (systolic)"] > 140 or row["Hemoglobin A1c (%)"] > 7.0
    ),
    "Cardiovascular Disease (CVD)": lambda row: (
        row["Heart Rate (bpm)"] > 100
        or row["Heart Rate (bpm)"] < 60
        or row["Blood Pressure (systolic)"] > 140
        or row["Blood Pressure (diastolic)"] > 90
    ),
    "Respiratory Disease (COPD or Asthma)": lambda row: (
        row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 90
    ),
    "Chronic Fatigue Syndrome (CFS)": lambda row: (
        row["HRV SDNN (ms)"] < 50
        or row["RMSSD (ms)"] < 30
        or row["Recovery Ability"] == 0
    ),
    "Diabetes": lambda row: (row["Hemoglobin A1c (%)"] > 6.5),
    "Arrhythmias": lambda row: (
        row["HRV SDNN (ms)"] > 100 or row["Mean RRi (ms)"] < 600
    ),
    "Stress-related Disorders": lambda row: (
        row["Stress Index"] > 70 or row["SNS Index"] > 1.0
    ),
    "Autonomic Dysfunction": lambda row: (
        row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0
    ),
    "Anaemia": lambda row: (
        (row["Gender (0-M;1-F)"] == 0 and row["Hemoglobin (g/dl)"] < 13.5)
        or (row["Gender (0-M;1-F)"] == 1 and row["Hemoglobin (g/dl)"] < 12.0)
    ),
    "Healthy": lambda row: True,  # Default category
}

# Function to classify disease based on criteria
def classify_disease(row):
    for disease, condition in classification_criteria.items():
        if condition(row):
            return disease
    return "Healthy"

# Apply the classification function
data["Disease Classification"] = data.apply(classify_disease, axis=1)

# Display the class distribution
print("Class Distribution Before Sampling:")
data["Disease Classification"].value_counts()

Class Distribution Before Sampling:


Disease Classification
Atherosclerosis                         152809
Hypertension                            115644
Cardiovascular Disease (CVD)             99122
Chronic Fatigue Syndrome (CFS)           53545
Respiratory Disease (COPD or Asthma)     28039
Stress-related Disorders                   352
Arrhythmias                                284
Healthy                                     67
Autonomic Dysfunction                       65
Diabetes                                    48
Anaemia                                     25
Name: count, dtype: int64

In [4]:
"""
# Function for stratified sampling
def stratified_sample(df, max_samples):
    return df.groupby("Disease Classification", group_keys=False).apply(
        lambda x: x.sample(
            n=min(len(x), max_samples // len(df["Disease Classification"].unique())),
            random_state=42,
        )
    )


# Limit dataset to 200,000 samples
data = stratified_sample(data, max_samples=200000)

# Display the class distribution after sampling
print("\nClass Distribution After Sampling:")
data["Disease Classification"].value_counts()
"""

'\n# Function for stratified sampling\ndef stratified_sample(df, max_samples):\n    return df.groupby("Disease Classification", group_keys=False).apply(\n        lambda x: x.sample(\n            n=min(len(x), max_samples // len(df["Disease Classification"].unique())),\n            random_state=42,\n        )\n    )\n\n\n# Limit dataset to 200,000 samples\ndata = stratified_sample(data, max_samples=200000)\n\n# Display the class distribution after sampling\nprint("\nClass Distribution After Sampling:")\ndata["Disease Classification"].value_counts()\n'

In [5]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
data["Disease Classification"] = label_encoder.fit_transform(
    data["Disease Classification"]
)

# Save label mapping for reuse
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

# Split features and target
X = data.drop(columns=["Disease Classification"])
y = data["Disease Classification"]

# Display the first few rows of features and target
X.head(), y.head()

(   Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
 0              80.3                   12.2                   96.4   
 1              73.1                   17.7                   95.9   
 2              72.2                   18.0                   96.0   
 3              70.6                   14.7                   95.1   
 4              99.5                   19.5                   97.6   
 
    Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
 0                      107.3                        74.2          39.6   
 1                       92.4                        70.8          98.7   
 2                      102.4                        75.6          45.3   
 3                      110.0                        62.2          77.8   
 4                      110.2                        73.0          57.3   
 
    Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
 0                 0       -0.9        0.4        49.7    

In [6]:
# Apply SMOTE-ENN for balancing
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Display the class distribution after resampling with disease names
resampled_counts_named = (
    pd.Series(y_resampled)
    .value_counts()
    .rename(index=lambda x: [k for k, v in label_mapping.items() if v == x][0])
)
print("\nClass Distribution After SMOTE-ENN with Disease Names:")
print(resampled_counts_named)


Class Distribution After SMOTE-ENN with Disease Names:
Disease Classification
Anaemia                                 152809
Arrhythmias                             152809
Autonomic Dysfunction                   152809
Diabetes                                152809
Healthy                                 152809
Stress-related Disorders                152809
Respiratory Disease (COPD or Asthma)    150751
Chronic Fatigue Syndrome (CFS)          149799
Hypertension                            113126
Cardiovascular Disease (CVD)             86519
Atherosclerosis                          54461
Name: count, dtype: int64


In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display scaled features
X_train[:5]

array([[-1.10068274, -0.35545791,  0.68544741, -0.31469136, -0.59438685,
         0.53050141, -0.14020379, -0.28459813,  0.22262704, -0.04499016,
         1.31507878,  1.17842863, -0.8857903 , -0.4258558 ,  0.33096843,
        -0.34411947,  1.63634875],
       [ 0.46888114,  0.03097598,  0.86298453,  1.07679402,  0.28024758,
        -0.48196781, -0.14020379,  1.21306318, -0.35806702,  1.04700999,
         1.13333406,  0.37890216, -1.52016072,  1.51887906, -0.04120641,
         1.00008144, -0.61111667],
       [ 0.82579569, -0.03205186,  0.67356624, -0.88795928, -0.11851017,
         0.56951359, -0.14020379, -0.52999798,  1.13854471,  0.99010397,
        -1.61439047, -0.94217897, -1.44545229,  0.69340612,  0.48348366,
        -1.03915704, -0.61111667],
       [-0.58138546,  0.06088181,  0.86705834,  0.50548106, -0.97421626,
        -0.01627507, -0.14020379, -1.93692219,  0.10080705,  0.92936787,
         1.20047295, -0.65458214, -0.89788555,  0.72714943, -0.02107384,
        -0.3826632 

In [8]:
# Save the preprocessed data and scaler for later use
with open("preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
