In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
import os

In [2]:
# 1. Load data with error handling
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)

data = "../dummy-data.csv"
df = load_data(data)

In [3]:
# Display basic information about the dataset
df.head()

Unnamed: 0,Heart Rate (bpm),Breathing Rate (brpm),Oxygen Saturation (%),Blood Pressure (systolic),Blood Pressure (diastolic),Stress Index,Recovery Ability,PNS Index,SNS Index,RMSSD (ms),SD2 (ms),Hemoglobin A1c (%),Mean RRi (ms),SD1 (ms),HRV SDNN (ms),Hemoglobin (g/dl),Gender (0-M;1-F)
0,80.3,12.2,96.4,107.3,74.2,39.6,0,-0.9,0.4,49.7,67.9,4.7,958.8,82.3,87.9,17.1,0
1,73.1,17.7,95.9,92.4,70.8,98.7,0,-0.6,0.6,34.3,67.4,3.7,853.5,50.9,63.2,14.7,1
2,72.2,18.0,96.0,102.4,75.6,45.3,0,-0.9,0.4,40.5,59.0,4.3,873.3,65.4,54.7,13.8,0
3,70.6,14.7,95.1,110.0,62.2,77.8,0,-0.6,0.5,35.3,58.2,4.4,693.9,82.6,75.8,15.8,0
4,99.5,19.5,97.6,110.2,73.0,57.3,0,-0.2,0.5,37.9,62.3,3.0,826.3,63.9,79.0,12.5,1


In [4]:
# Disease classification based on thresholds
def classify_disease(row):
    """
    Classifies diseases for an individual based on health thresholds.

    :param row: A pandas Series representing a single individual's health data.
    :return: The disease associated with the individual.
    """
    if (
        row["Blood Pressure (systolic)"] >= 130
        or row["Blood Pressure (diastolic)"] >= 80
    ):
        return "Hypertension"

    if row["Blood Pressure (systolic)"] > 140 or row["Hemoglobin A1c (%)"] > 6.5:
        return "Atherosclerosis"

    if (
        row["Heart Rate (bpm)"] > 100
        or row["Heart Rate (bpm)"] < 60
        or row["Blood Pressure (systolic)"] > 140
        or row["Blood Pressure (diastolic)"] > 90
        or row["HRV SDNN (ms)"] < 50
    ):
        return "Cardiovascular Disease (CVD)"

    if row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 90:
        return "Respiratory Disease (COPD or Asthma)"

    if (
        row["HRV SDNN (ms)"] < 50
        or row["RMSSD (ms)"] < 30
        or row["Recovery Ability"] == 0
    ):
        return "Chronic Fatigue Syndrome (CFS)"

    if row["Hemoglobin A1c (%)"] > 6.5:
        return "Diabetes"

    if row["HRV SDNN (ms)"] > 100 or row["Mean RRi (ms)"] < 600:
        return "Arrhythmias"

    if row["Stress Index"] > 70 or row["SNS Index"] > 1.0:
        return "Stress-related Disorders"

    if row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0:
        return "Autonomic Dysfunction"

    if (row["Gender (0-M;1-F)"] == 0 and row["Hemoglobin (g/dl)"] < 13.5) or (
        row["Gender (0-M;1-F)"] == 1 and row["Hemoglobin (g/dl)"] < 12.0
    ):
        return "Anaemia"

    return "Healthy"

 # Apply disease classification
df["Disease Classification"] = df.apply(classify_disease, axis=1)

In [5]:
# Display the class distribution
print("Class Distribution Before Sampling:")
print(df["Disease Classification"].value_counts())

Class Distribution Before Sampling:
Disease Classification
Hypertension                            285296
Cardiovascular Disease (CVD)             51327
Chronic Fatigue Syndrome (CFS)           51047
Atherosclerosis                          49081
Respiratory Disease (COPD or Asthma)     12458
Stress-related Disorders                   352
Arrhythmias                                282
Healthy                                     67
Autonomic Dysfunction                       65
Anaemia                                     25
Name: count, dtype: int64


In [6]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
df["Disease Classification"] = label_encoder.fit_transform(df["Disease Classification"])

In [7]:
# Save label mapping for reuse
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

In [8]:
# Split features and target
X = df.drop(columns=["Disease Classification"])
y = df["Disease Classification"]

In [9]:
# Applying SMOTE for balancing the class distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display the class distribution after resampling
print("Class Distribution After SMOTE:")
print(pd.Series(y_resampled).value_counts())

Class Distribution After SMOTE:
Disease Classification
5    285296
7    285296
2    285296
4    285296
3    285296
8    285296
1    285296
0    285296
9    285296
6    285296
Name: count, dtype: int64


In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

In [11]:
# Standardize features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display scaled features
print("\nFirst 5 rows of Scaled Features:\n", X_train[:5])


First 5 rows of Scaled Features:
 [[ 0.87690292  1.50298394 -1.90574122  3.2050784   2.51798064  0.81129448
  -0.05463831 -1.60461163  1.50125902 -2.07630897 -0.64371516 -1.09981865
   1.27283726 -1.70995782  1.02091325  0.66615944 -0.60321761]
 [-1.32463171 -0.23392174  0.83711995  0.45092345  0.76748948 -0.79668166
  -1.51767363  0.89780895 -0.8995189   0.51741419  0.7353604  -0.51221992
  -0.5251719  -0.47279674 -0.28879667  0.91317463 -0.60321761]
 [-0.85945433 -0.10120892  0.50247536 -0.4068023  -0.59202049  0.95191805
  -0.05463831  0.11221247 -0.11094864 -0.09008602 -1.79301307 -0.13312162
  -0.10285495  0.48344514  0.29558506  0.77896539  1.65777655]
 [-0.84087131 -0.0938254   0.77281333  0.7544841  -1.44474522 -0.66760665
  -0.05463831  1.11885531  1.27264496  1.31334352  0.5972793  -1.32980216
  -0.00880717  1.43360807 -0.94478589  0.66615944 -0.60321761]
 [-0.79013622 -1.01377153  0.38787112  0.29554929 -0.59190026  0.66953197
  -0.05463831 -0.1699626  -0.56330622  0.322409

In [12]:
# Save the preprocessed data and scaler for later use
with open("preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
