In [16]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Paths (relative to AI/notebooks)
RAW_DIR = "../raw/"
PROCESSED_DIR = "../processed/"
os.makedirs(PROCESSED_DIR, exist_ok=True)

RAW_FILE = os.path.join(RAW_DIR, "sp02Data.csv")  # your uploaded file
CLEAN_FILE = os.path.join(PROCESSED_DIR, "spo2_clean_no_datetime.csv")

print("File found:", os.path.exists(RAW_FILE))


File found: True


In [17]:
raw = pd.read_csv(RAW_FILE)
print("Columns:", list(raw.columns))
raw.head()


Columns: ['Heart_Rate', 'SpO2', 'Stress_Level', 'Sleep_Hours', 'Steps', 'Anomaly']


Unnamed: 0,Heart_Rate,SpO2,Stress_Level,Sleep_Hours,Steps,Anomaly
0,82,99,5,7.150235,14479,0
1,72,98,10,5.290899,14058,1
2,84,97,2,5.790723,6622,0
3,97,95,7,4.573354,9052,0
4,71,98,2,6.301869,2378,0


In [18]:
def clean_spo2_data(df):
    # Normalize headers
    df.columns = df.columns.str.strip().str.replace(r"\s+", "_", regex=True).str.upper()

    # Detect possible SpO2 column
    spo2_col = None
    for c in df.columns:
        if re.search(r"SPO2|OXYGEN|SAT", c, re.I):
            spo2_col = c
            break

    if spo2_col is None:
        raise ValueError("❌ No SpO2 column detected.")

    # Convert to numeric
    df["SPO2"] = pd.to_numeric(df[spo2_col], errors="coerce")

    # Remove unrealistic values
    df = df[df["SPO2"].between(70, 100)]  # physiological realistic range

    # Fix scaling (0–1 -> 0–100)
    if df["SPO2"].max() <= 1.5:
        df["SPO2"] *= 100

    # Compute statistical quality metrics
    df["SPO2_STD_FLAG"] = (df["SPO2"].std() < 1).astype(int)  # very stable readings
    df["IS_LOW_90"] = (df["SPO2"] < 90).astype(int)
    df["IS_LOW_88"] = (df["SPO2"] < 88).astype(int)

    # Normalize (0–1 scale) for model input
    df["SPO2_NORM"] = (df["SPO2"] - 70) / (100 - 70)

    # Drop all non-numeric/unnecessary columns
    keep_cols = ["SPO2", "SPO2_NORM", "IS_LOW_90", "IS_LOW_88", "SPO2_STD_FLAG"]
    df = df[keep_cols].reset_index(drop=True)

    return df


In [19]:
clean_df = clean_spo2_data(raw)
clean_df.to_csv(CLEAN_FILE, index=False)

print(f"✅ Cleaned SpO2 file saved → {CLEAN_FILE}")
print(f"Shape: {clean_df.shape}")
clean_df.head()


✅ Cleaned SpO2 file saved → ../processed/spo2_clean_no_datetime.csv
Shape: (970, 5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["SPO2_STD_FLAG"] = (df["SPO2"].std() < 1).astype(int)  # very stable readings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["IS_LOW_90"] = (df["SPO2"] < 90).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["IS_LOW_88"] = (df["SPO2"] < 88).astype(int)
A value is trying to be set 

Unnamed: 0,SPO2,SPO2_NORM,IS_LOW_90,IS_LOW_88,SPO2_STD_FLAG
0,99,0.966667,0,0,0
1,98,0.933333,0,0,0
2,97,0.9,0,0,0
3,95,0.833333,0,0,0
4,98,0.933333,0,0,0
