In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
import pickle
import os

In [9]:
# 1. Load data with error handling
def load_data(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")
    return pd.read_csv(file_path)


data = load_data("C:/Users/syafi/Desktop/py-1/syafiq-project/dummy-data.csv")

# Display basic information about the dataset
data_head = data.head()
data_describe = data.describe()
data_info = data.info()
print("Data Head:")
print(data_head)
print("\nData Description:")
print(data_describe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450000 entries, 0 to 449999
Data columns (total 17 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Heart Rate (bpm)            450000 non-null  float64
 1   Breathing Rate (brpm)       450000 non-null  float64
 2   Oxygen Saturation (%)       450000 non-null  float64
 3   Blood Pressure (systolic)   450000 non-null  float64
 4   Blood Pressure (diastolic)  450000 non-null  float64
 5   Stress Index                450000 non-null  float64
 6   Recovery Ability            450000 non-null  int64  
 7   PNS Index                   450000 non-null  float64
 8   SNS Index                   450000 non-null  float64
 9   RMSSD (ms)                  450000 non-null  float64
 10  SD2 (ms)                    450000 non-null  float64
 11  Hemoglobin A1c (%)          450000 non-null  float64
 12  Mean RRi (ms)               450000 non-null  float64
 13  SD1 (ms)      

In [10]:
classification_criteria = {
    "Hypertension": lambda row: (
        (
            row["Blood Pressure (systolic)"] > 130
            or row["Blood Pressure (diastolic)"] > 80
        )
        and (60 <= row["Heart Rate (bpm)"] <= 100)
    ),
    "Atherosclerosis": lambda row: (
        row["Blood Pressure (systolic)"] > 140 or row["Hemoglobin A1c (%)"] > 7.0
    ),
    "Cardiovascular Disease (CVD)": lambda row: (
        row["Heart Rate (bpm)"] > 100
        or row["Heart Rate (bpm)"] < 60
        or row["Blood Pressure (systolic)"] > 140
        or row["Blood Pressure (diastolic)"] > 90
    ),
    "Respiratory Disease (COPD or Asthma)": lambda row: (
        row["Breathing Rate (brpm)"] > 20 or row["Oxygen Saturation (%)"] < 90
    ),
    "Chronic Fatigue Syndrome (CFS)": lambda row: (
        row["HRV SDNN (ms)"] < 50
        or row["RMSSD (ms)"] < 30
        or row["Recovery Ability"] == 0
    ),
    "Diabetes": lambda row: (row["Hemoglobin A1c (%)"] > 6.5),
    "Arrhythmias": lambda row: (
        row["HRV SDNN (ms)"] > 100 or row["Mean RRi (ms)"] < 600
    ),
    "Stress-related Disorders": lambda row: (
        row["Stress Index"] > 70 or row["SNS Index"] > 1.0
    ),
    "Autonomic Dysfunction": lambda row: (
        row["PNS Index"] < -1.0 or row["SNS Index"] > 1.0
    ),
    "Anaemia": lambda row: (
        (row["Gender (0-M;1-F)"] == 0 and row["Hemoglobin (g/dl)"] < 13.5)
        or (row["Gender (0-M;1-F)"] == 1 and row["Hemoglobin (g/dl)"] < 12.0)
    ),
    "Healthy": lambda row: True,  # Default category
}

In [11]:
# Function to classify disease based on criteria
def classify_disease(row):
    for disease, condition in classification_criteria.items():
        if condition(row):
            return disease
    return "Healthy"

In [12]:
# Apply the classification function
data["Disease Classification"] = data.apply(classify_disease, axis=1)

# Display the class distribution
print("Class Distribution Before Sampling:")
data["Disease Classification"].value_counts()

Class Distribution Before Sampling:


Disease Classification
Atherosclerosis                         152809
Hypertension                            115644
Cardiovascular Disease (CVD)             99122
Chronic Fatigue Syndrome (CFS)           53545
Respiratory Disease (COPD or Asthma)     28039
Stress-related Disorders                   352
Arrhythmias                                284
Healthy                                     67
Autonomic Dysfunction                       65
Diabetes                                    48
Anaemia                                     25
Name: count, dtype: int64

In [13]:
# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
data["Disease Classification"] = label_encoder.fit_transform(data["Disease Classification"])

# Save label mapping for reuse
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
with open("label_mapping.pkl", "wb") as f:
    pickle.dump(label_mapping, f)

# Split features and target
X = data.drop(columns=["Disease Classification"])
y = data["Disease Classification"]

# Display the first few rows of features and target
print("\nFeature Sample", X.head())
print("\nTarget Sample", y.head())


Feature Sample    Heart Rate (bpm)  Breathing Rate (brpm)  Oxygen Saturation (%)  \
0              80.3                   12.2                   96.4   
1              73.1                   17.7                   95.9   
2              72.2                   18.0                   96.0   
3              70.6                   14.7                   95.1   
4              99.5                   19.5                   97.6   

   Blood Pressure (systolic)  Blood Pressure (diastolic)  Stress Index  \
0                      107.3                        74.2          39.6   
1                       92.4                        70.8          98.7   
2                      102.4                        75.6          45.3   
3                      110.0                        62.2          77.8   
4                      110.2                        73.0          57.3   

   Recovery Ability  PNS Index  SNS Index  RMSSD (ms)  SD2 (ms)  \
0                 0       -0.9        0.4        49.7    

In [14]:
# Applying SMOTE for balancing the class distribution
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Display the class distribution after resampling with disease names
resampled_counts_named = (
    pd.Series(y_resampled).value_counts().rename(
        index=lambda x: [k for k, v in label_mapping.items() if v == x][0]
        )
)

print("\nClass Distribution after SMOTE with Disease Names:")
resampled_counts_named


Class Distribution after SMOTE with Disease Names:


Disease Classification
Chronic Fatigue Syndrome (CFS)          152809
Atherosclerosis                         152809
Hypertension                            152809
Cardiovascular Disease (CVD)            152809
Respiratory Disease (COPD or Asthma)    152809
Autonomic Dysfunction                   152809
Arrhythmias                             152809
Anaemia                                 152809
Stress-related Disorders                152809
Diabetes                                152809
Healthy                                 152809
Name: count, dtype: int64

In [15]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display scaled features
print("\nFirst 5 rows of Scaled Features:\n")
print(X_train[:5])


First 5 rows of Scaled Features:

[[-0.08934813 -0.60731912  0.27349898 -0.68717907 -0.62221633 -0.74149026
  -0.15897414 -1.23864041  1.36924238  0.73156004  1.09724213 -0.31504622
  -1.56008858 -0.96769316  0.21569355  1.19080098 -0.6537002 ]
 [-0.64559725 -0.27940427  0.88019871 -0.01840411  0.4192912   0.19562219
  -0.15897414 -0.70662485 -0.71829599 -0.53622215  1.5762534  -0.01576187
  -0.8522198   0.91260277 -0.24504177  0.29953984  1.52975325]
 [-0.5271925  -0.25585553  0.60584333 -0.37923869  0.2864573  -0.78010165
   1.34299696 -0.27543659  0.75743978  0.65856242 -0.30429914  0.08058356
  -0.37502173 -1.00834348  1.52028856 -1.19615656 -0.6537002 ]
 [-1.18784866 -0.41993383  0.47813376  0.2765283  -1.31259677 -0.81875542
  -0.15897414  0.76859939  1.21471213  0.38250942  0.42920576  0.13497086
  -0.40078241  0.23363662  1.22842808 -1.50404233 -0.6537002 ]
 [ 2.79015434  2.25752771  0.0759952   1.62036671  2.59555726 -0.36377344
   1.34299696 -0.96325614  0.37211836 -1.018803

In [16]:
# Save the preprocessed data and scaler for later use
with open("preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\nPreprocessing completed and data saved.")


Preprocessing completed and data saved.
