In [1]:
import pandas as pd
import os

# Paths (relative to AI/notebooks/)
PROCESSED_DIR = "../processed/"
MERGED_FILE = os.path.join(PROCESSED_DIR, "merged_dataset.csv")

bp_file = os.path.join(PROCESSED_DIR, "bp_people_features.csv")
spo2_file = os.path.join(PROCESSED_DIR, "spo2_clean_no_datetime.csv")
garmin_file = os.path.join(PROCESSED_DIR, "sourceA_all_clean.csv")

print("Files exist:",
      os.path.exists(bp_file),
      os.path.exists(spo2_file),
      os.path.exists(garmin_file))


Files exist: True True True


In [2]:
bp = pd.read_csv(bp_file)
spo2 = pd.read_csv(spo2_file)
garmin = pd.read_csv(garmin_file)

print("BP:", bp.shape, "columns:", bp.columns[:6].tolist(), "...")
print("SpO2:", spo2.shape, "columns:", spo2.columns[:6].tolist(), "...")
print("Garmin:", garmin.shape, "columns:", garmin.columns[:6].tolist(), "...")


BP: (374, 17) columns: ['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration', 'Quality of Sleep'] ...
SpO2: (970, 5) columns: ['SPO2', 'SPO2_NORM', 'IS_LOW_90', 'IS_LOW_88', 'SPO2_STD_FLAG'] ...
Garmin: (4854, 41) columns: ['DATE', 'DAY_OF_THE_WEEK', 'MONTH', 'SEASON', 'ACTIVITY_STEPS', 'ACTIVITY_DISTANCE'] ...


In [3]:
# Get smallest dataset length for alignment
min_len = min(len(bp), len(spo2), len(garmin))
print("Aligning to minimum length:", min_len)

bp_trimmed = bp.sample(n=min_len, random_state=42).reset_index(drop=True)
spo2_trimmed = spo2.sample(n=min_len, random_state=42).reset_index(drop=True)
garmin_trimmed = garmin.sample(n=min_len, random_state=42).reset_index(drop=True)

# Combine horizontally
merged = pd.concat([bp_trimmed, spo2_trimmed, garmin_trimmed], axis=1)
print("Merged dataset shape:", merged.shape)


Aligning to minimum length: 374
Merged dataset shape: (374, 63)


In [4]:
# Drop unnecessary columns (like names, IDs, gender, etc.)
non_numeric = merged.select_dtypes(exclude=["number"]).columns
merged_clean = merged.drop(columns=non_numeric, errors="ignore")

print("Dropped non-numeric:", list(non_numeric))
print("Final shape:", merged_clean.shape)


Dropped non-numeric: ['Gender', 'Occupation', 'BMI Category', 'Sleep Disorder', 'bp_category', 'DATE', 'PARTICIPANT_ID', 'GENDER', 'OCCUPATION', 'BMI CATEGORY', 'BLOOD PRESSURE', 'SLEEP DISORDER']
Final shape: (374, 51)


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
merged_scaled = pd.DataFrame(scaler.fit_transform(merged_clean),
                             columns=merged_clean.columns)

merged_scaled.to_csv(MERGED_FILE, index=False)
print(f"✅ Final merged dataset saved → {MERGED_FILE}")
merged_scaled.head()


✅ Final merged dataset saved → ../processed/merged_dataset.csv


Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Daily Steps,SBP,DBP,HR,...,PHYSICAL ACTIVITY LEVEL,STRESS LEVEL,HEART RATE,DAILY STEPS,HEART_RATE,SPO2,STRESS_LEVEL,SLEEP_HOURS,STEPS,ANOMALY
0,0.882038,0.8125,1.0,1.0,0.0,0.0,0.285714,0.37037,0.25,0.0,...,,,,,0.538462,0.5,1.0,0.491308,0.108274,1.0
1,0.088472,0.125,0.111111,0.4,0.0,1.0,0.285714,0.37037,0.25,0.333333,...,,,,,,,,,,
2,0.040214,0.0625,0.074074,0.4,0.0,1.0,0.714286,0.185185,0.25,0.238095,...,,,,,0.907692,0.8,0.0,0.186097,0.666739,0.0
3,0.871314,0.8125,1.0,1.0,0.0,0.0,0.285714,0.37037,0.25,0.0,...,,,,,,,,,,
4,0.152815,0.15625,0.074074,0.4,0.0,1.0,0.285714,0.37037,0.25,0.333333,...,,,,,,,,,,
