In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
import os

In [2]:
# 1. Load data with error handling
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)

data = "../dummy-data.csv"
df = load_data(data)

In [3]:
# Display basic information about the dataset
df.head()

Unnamed: 0,Heart Rate (bpm),Breathing Rate (brpm),Oxygen Saturation (%),Blood Pressure (systolic),Blood Pressure (diastolic),Stress Index,Recovery Ability,PNS Index,SNS Index,RMSSD (ms),SD2 (ms),Hemoglobin A1c (%),Mean RRi (ms),SD1 (ms),HRV SDNN (ms),Hemoglobin (g/dl),Gender (0-M;1-F)
0,80.3,12.2,96.4,107.3,74.2,39.6,0,-0.9,0.4,49.7,67.9,4.7,958.8,82.3,87.9,17.1,0
1,73.1,17.7,95.9,92.4,70.8,98.7,0,-0.6,0.6,34.3,67.4,3.7,853.5,50.9,63.2,14.7,1
2,72.2,18.0,96.0,102.4,75.6,45.3,0,-0.9,0.4,40.5,59.0,4.3,873.3,65.4,54.7,13.8,0
3,70.6,14.7,95.1,110.0,62.2,77.8,0,-0.6,0.5,35.3,58.2,4.4,693.9,82.6,75.8,15.8,0
4,99.5,19.5,97.6,110.2,73.0,57.3,0,-0.2,0.5,37.9,62.3,3.0,826.3,63.9,79.0,12.5,1


In [4]:
def classify_disease(row):
    disease_scores = {
        "Hypertension": 0,
        "Atherosclerosis": 0,
        "Cardiovascular Disease (CVD)": 0,
        "Respiratory Disease (COPD or Asthma)": 0,
        "Chronic Fatigue Syndrome (CFS)": 0,
        "Diabetes": 0,
        "Arrhythmias": 0,
        "Stress-related Disorders": 0,
        "Autonomic Dysfunction": 0,
        "Anaemia": 0,
    }

    # Increment scores based on feature thresholds
    if (
        row["Blood Pressure (systolic)"] >= 130
        or row["Blood Pressure (diastolic)"] >= 80
    ):
        disease_scores["Hypertension"] += 1

    if row["Blood Pressure (systolic)"] > 140 or row["Hemoglobin A1c (%)"] > 6.5:
        disease_scores["Atherosclerosis"] += 1

    if (
        row["Heart Rate (bpm)"] > 100
        or row["Heart Rate (bpm)"] < 60
        or row["Blood Pressure (systolic)"] > 140
        or row["Blood Pressure (diastolic)"] > 90
        or row["HRV SDNN (ms)"] < 50
    ):
        disease_scores["Cardiovascular Disease (CVD)"] += 1

    if row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 90:
        disease_scores["Respiratory Disease (COPD or Asthma)"] += 1

    if (
        row["HRV SDNN (ms)"] < 50
        or row["RMSSD (ms)"] < 30
        or row["Recovery Ability"] == 1
    ):
        disease_scores["Chronic Fatigue Syndrome (CFS)"] += 1

    if row["Hemoglobin A1c (%)"] > 6.5:
        disease_scores["Diabetes"] += 1

    if row["HRV SDNN (ms)"] > 100 or row["Mean RRi (ms)"] < 600:
        disease_scores["Arrhythmias"] += 1

    if row["Stress Index"] > 70 or row["SNS Index"] > 1.0:
        disease_scores["Stress-related Disorders"] += 1

    if row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0:
        disease_scores["Autonomic Dysfunction"] += 1

    if (row["Gender (0-M;1-F)"] == 0 and row["Hemoglobin (g/dl)"] < 13.5) or (
        row["Gender (0-M;1-F)"] == 1 and row["Hemoglobin (g/dl)"] < 12.0
    ):
        disease_scores["Anaemia"] += 1

    # Determine the disease with the highest score
    most_likely_disease = max(disease_scores, key=disease_scores.get)

    # If all scores are zero, classify as Healthy
    if all(score == 0 for score in disease_scores.values()):
        return "Healthy"

    return most_likely_disease

In [5]:
# Apply disease classification

df["Disease Classification"] = df.apply(classify_disease, axis=1)

In [6]:
# Display the class distribution
print("Class Distribution Before Sampling:")
print(df["Disease Classification"].value_counts())

Class Distribution Before Sampling:
Disease Classification
Hypertension                            285296
Cardiovascular Disease (CVD)             51327
Atherosclerosis                          49081
Stress-related Disorders                 27967
Healthy                                  21125
Respiratory Disease (COPD or Asthma)     12458
Anaemia                                   1225
Chronic Fatigue Syndrome (CFS)            1215
Arrhythmias                                247
Autonomic Dysfunction                       59
Name: count, dtype: int64


In [7]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
df["Disease Classification"] = label_encoder.fit_transform(df["Disease Classification"])

In [8]:
# Save label mapping for reuse
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

In [9]:
# Split features and target
X = df.drop(columns=["Disease Classification"])
y = df["Disease Classification"]

In [10]:
# Applying SMOTE for balancing the class distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display the class distribution after resampling
print("Class Distribution After SMOTE:")
print(pd.Series(y_resampled).value_counts())

Class Distribution After SMOTE:
Disease Classification
6    285296
9    285296
0    285296
7    285296
2    285296
4    285296
5    285296
8    285296
1    285296
3    285296
Name: count, dtype: int64


In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

In [12]:
# Standardize features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display scaled features
print("\nFirst 5 rows of Scaled Features:\n", X_train[:5])


First 5 rows of Scaled Features:
 [[ 0.87934984  1.50293168 -1.90378603  3.34195915  2.50275966  0.83655601
   0.44414969 -1.61783435  1.68693727 -1.86164409 -0.83175893 -0.96324126
   1.5716806  -1.78042224  1.0078355   0.60726494 -0.59109844]
 [ 0.36889468 -0.51226378  0.36646255 -1.65203137 -0.81186811  0.92422899
  -0.82994018  0.39462431  1.30426057 -1.76924388 -0.2169404   0.52690992
   1.4936651  -0.15792898  0.7727667   0.66712125 -0.59109844]
 [-1.32184344 -0.44183425  0.80889488 -0.07715913  0.07428137  0.27885884
  -0.82994018  1.25195351 -1.54816047 -0.28207093  0.67271873 -0.98094871
   0.44613435  0.7725689   0.30176226  1.13706927 -0.59109844]
 [-1.44059782  0.13655697  0.85230118 -0.68365467  0.11515972 -1.1456853
  -0.82994018  1.78866321 -1.08430652  1.05513273  0.93654501 -0.09407329
   0.28891548 -0.25915438  0.88107297 -0.16537115 -0.59109844]
 [ 0.40100091 -0.76406495  0.51758111  1.18839204 -1.45644035 -0.9166827
  -0.82994018 -0.72345681  1.15720651 -0.10519514

In [13]:
# Save the preprocessed data and scaler for later use
with open("preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
