In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import os
from pathlib import Path


In [2]:
# Correct relative paths from inside AI/notebooks/
RAW_DIR = "../raw/"
PROCESSED_DIR = "../processed/"

# Create processed directory if it doesn't exist
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Test if it works
print("RAW files:", os.listdir(RAW_DIR))
print("PROCESSED folder ready:", os.path.exists(PROCESSED_DIR))

RAW files: ['1487.csv', '2201.csv', '2210.csv', '3379.csv', '4891.csv', '5359.csv', '5544.csv', '6008.csv', '6777.csv', '7359.csv', '8176.csv', '9775.csv', 'BloodPressureData.csv', 'sp02Data.csv']
PROCESSED folder ready: True


In [3]:
VALID_RANGES = {
    "ACTIVITY_HR_AVERAGE": (30, 220),
    "ACTIVITY_HR_MIN": (30, 220),
    "ACTIVITY_HR_MAX": (30, 220),
    "SLEEP_HR_AVERAGE": (30, 220),
    "SLEEP_HR_MIN": (30, 220),
    "SLEEP_HR_MAX": (30, 220),
    "ACTIVITY_STEPS": (0, 50000),
    "SLEEP_TOTALSLEEPTIME": (0, 50000),
    "SLEEP_WAKEUPCOUNT": (0, 50),
    "SLEEP_WAKEUPDURATION": (0, 60000),
}

def clean_participant(file_path, participant_id):
    # Load CSV
    df = pd.read_csv(file_path)

    # Standardize column names
    df.columns = df.columns.str.strip().str.upper()

    # Convert date and sort chronologically
    if "DATE" in df.columns:
        df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
        df = df.dropna(subset=["DATE"]).sort_values("DATE")

    # Remove duplicates and fill gaps
    df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")

    # Clip values within physiological limits
    for col, limits in VALID_RANGES.items():
        if col in df.columns:
            lo, hi = limits
            df[col] = df[col].clip(lo, hi)

    # Derived proxy features
    if {"SLEEP_HR_MAX", "SLEEP_HR_MIN"}.issubset(df.columns):
        df["SLEEP_HRV_PROXY"] = df["SLEEP_HR_MAX"] - df["SLEEP_HR_MIN"]

    if {"SLEEP_WAKEUPCOUNT", "SLEEP_WAKEUPDURATION"}.issubset(df.columns):
        df["STRESS_PROXY"] = df["SLEEP_WAKEUPCOUNT"] + (df["SLEEP_WAKEUPDURATION"] / 60.0)

    # Normalize numeric columns
    num_cols = df.select_dtypes(include=np.number).columns
    if len(num_cols) > 0:
        scaler = MinMaxScaler()
        df[num_cols] = scaler.fit_transform(df[num_cols])

    # Add participant ID
    df["PARTICIPANT_ID"] = participant_id

    return df


In [None]:

all_dfs = []
for file in os.listdir(RAW_DIR):
    if file.endswith(".csv"):
        pid = file.replace(".csv", "")
        fp = os.path.join(RAW_DIR, file)
        print(f" Cleaning {pid}...")
        df_clean = clean_participant(fp, pid)
        out_fp = os.path.join(PROCESSED_DIR, f"{pid}_clean.csv")
        df_clean.to_csv(out_fp, index=False)
        print(f" Saved: {out_fp}")
        all_dfs.append(df_clean)

merged = pd.concat(all_dfs, axis=0).reset_index(drop=True)
merged_out = os.path.join(PROCESSED_DIR, "sourceA_all_clean.csv")
merged.to_csv(merged_out, index=False)
print(f"\nðŸ’¾ Merged dataset saved â†’ {merged_out}  shape={merged.shape}")

ðŸ§¼ Cleaning 1487...
âœ… Saved: ../processed/1487_clean.csv
ðŸ§¼ Cleaning 2201...
âœ… Saved: ../processed/2201_clean.csv
ðŸ§¼ Cleaning 2210...
âœ… Saved: ../processed/2210_clean.csv
ðŸ§¼ Cleaning 3379...
âœ… Saved: ../processed/3379_clean.csv
ðŸ§¼ Cleaning 4891...


  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(met

âœ… Saved: ../processed/4891_clean.csv
ðŸ§¼ Cleaning 5359...
âœ… Saved: ../processed/5359_clean.csv
ðŸ§¼ Cleaning 5544...
âœ… Saved: ../processed/5544_clean.csv
ðŸ§¼ Cleaning 6008...
âœ… Saved: ../processed/6008_clean.csv
ðŸ§¼ Cleaning 6777...
âœ… Saved: ../processed/6777_clean.csv
ðŸ§¼ Cleaning 7359...


  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df["DATE"] = pd.to_datetime(df["DATE"], errors="coerce", dayfirst=True)
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")
  df = df.drop_duplicates().fillna(method="ffill").fillna(method="bfill")


âœ… Saved: ../processed/7359_clean.csv
ðŸ§¼ Cleaning 8176...
âœ… Saved: ../processed/8176_clean.csv
ðŸ§¼ Cleaning 9775...
âœ… Saved: ../processed/9775_clean.csv
ðŸ§¼ Cleaning BloodPressureData...
âœ… Saved: ../processed/BloodPressureData_clean.csv
ðŸ§¼ Cleaning sp02Data...
âœ… Saved: ../processed/sp02Data_clean.csv

ðŸ’¾ Merged dataset saved â†’ ../processed/sourceA_all_clean.csv  shape=(4854, 41)
