# 01_data_cleaning
This notebook:
- Sets up paths and imports
- Loads raw data
- Demonstrates cleaning functions from `src/data_utils.py`
- Saves cleaned / processed files into `data/processed/`


In [None]:
# Make the repository's src/ folder importable from this notebook
import os
import sys

# The repo root is taken to be the parent of the current working directory
# (i.e. the notebook is launched from a folder one level below it) - adjust if needed
repo_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
src_path = os.path.join(repo_root, "src")

# Prepend src/ only once, so re-running this cell does not stack duplicate entries
if src_path not in sys.path:
    sys.path.insert(0, src_path)

print("repo_root:", repo_root)
print("src_path:", src_path)


In [None]:
# Core imports
import pandas as pd
from data_utils import load_csv, save_csv, standardize_columns, ensure_animal_and_day, merge_on_animal_day, summarize_df

# Data directories: raw inputs and cleaned outputs both live under <repo_root>/data
data_root = os.path.join(repo_root, "data")
RAW_DIR = os.path.join(data_root, "raw")
PROCESSED_DIR = os.path.join(data_root, "processed")

# Create both folders up front; exist_ok keeps the cell safe to re-run
for data_dir in (RAW_DIR, PROCESSED_DIR):
    os.makedirs(data_dir, exist_ok=True)


## If you already have files:
Place your raw CSV/XLSX files in `data/raw/`, then point the file paths in the loading cell below at them. Common filenames:
- pr8_clinical.csv
- pr8_survival.csv
- bleo_histology.csv
- immune_panel.csv


In [None]:
# Optional: create small synthetic example datasets to test the pipeline.
# The loading cell below reads these demo files by default, so only skip this
# cell if you have pointed that cell at your own files.

# Clinical demo table: one row per animal (a single day / weight record each).
# Every column already holds scalars, so no .explode() is needed: the previous
# .explode(["Day", "Weight"]) call left the frame unchanged and its list-of-columns
# form is only accepted by pandas >= 1.3.
df_clinical = pd.DataFrame({
    "AnimalID": [f"A{i}" for i in range(1, 9)],
    "Group": ["PR8"] * 4 + ["Bleo"] * 4,
    "Day": [0, 1, 2, 3, 0, 1, 2, 3],
    "Weight": [20.1, 19.5, 18.9, 18.2, 20.5, 20.0, 19.5, 19.0],
})

# Survival demo table: one row per animal with a time and an event value
df_surv = pd.DataFrame({
    "AnimalID": [f"A{i}" for i in range(1, 9)],
    "Group": ["PR8"] * 4 + ["Bleo"] * 4,
    "time": [5, 6, 7, 8, 3, 4, 9, 10],
    "event": [1, 1, 0, 1, 1, 0, 1, 1],
})

# Save synthetic raw files (index=False keeps the row index out of the CSV)
df_clinical.to_csv(os.path.join(RAW_DIR, "demo_clinical.csv"), index=False)
df_surv.to_csv(os.path.join(RAW_DIR, "demo_survival.csv"), index=False)

print("Wrote demo files to:", RAW_DIR)


In [None]:
# Example: load your raw file(s), run the cleaning helpers, and write processed copies
clinical_path = os.path.join(RAW_DIR, "demo_clinical.csv")  # replace with your filename
survival_path = os.path.join(RAW_DIR, "demo_survival.csv")

# Stage 1 - read both raw CSVs
df_clin, df_surv = load_csv(clinical_path), load_csv(survival_path)

# Stage 2 - standardize column names
df_clin, df_surv = standardize_columns(df_clin), standardize_columns(df_surv)

# Stage 3 - ensure the ID and day columns are typed appropriately (if present);
# the survival table passes its "time" column as the day column
df_clin = ensure_animal_and_day(df_clin, id_col="animalid", day_col="day")
df_surv = ensure_animal_and_day(df_surv, id_col="animalid", day_col="time")

# Stage 4 - quick summaries, clinical first
for label, frame in (("Clinical summary:", df_clin), ("Survival summary:", df_surv)):
    print(label, summarize_df(frame))

# Stage 5 - save processed versions
for frame, filename in ((df_clin, "clinical_cleaned.csv"), (df_surv, "survival_cleaned.csv")):
    save_csv(frame, os.path.join(PROCESSED_DIR, filename))

print("Saved cleaned files to:", PROCESSED_DIR)
