# Validation Dataset
This notebook processes raw patient CSV files by extracting inspired oxygen, SpO₂, and hemoglobin values, assigning each patient an anonymous ID, and combining all data into a single dataset. It also creates a mapping between original filenames and anonymous IDs. The result is saved as a clean validation dataset and a mapping file for further analysis.

In [None]:
import pandas as pd
from pathlib import Path

# Input and output paths
input_folder = Path("path_to_folder")
output_csv = Path("validation_set.csv")
mapping_csv = Path("validation_set_mapping.csv")

# Signal columns every cleaned patient table must provide, in output order.
SIGNAL_COLUMNS = ["Insp.O2(%)", "SpO2(%)"]


def extract_hb(last_row: pd.Series, source_name: str) -> float:
    """Return the hemoglobin value stored in the trailing row of a patient file.

    The row is searched for the first string cell containing "hb"
    (case-insensitive). The value is read from the cell to its right, or from
    the cell to its left when the label sits in the last column.

    Args:
        last_row: Last non-empty row of the raw patient table.
        source_name: Filename used in error messages.

    Returns:
        The hemoglobin value as a float.

    Raises:
        ValueError: If no "hb" label is present, or the neighbouring cell is
            empty or not numeric.
    """
    hb_idx = next(
        (
            j
            for j, cell in enumerate(last_row)
            if isinstance(cell, str) and "hb" in cell.lower()
        ),
        None,
    )
    if hb_idx is None:
        raise ValueError(f"No Hb value found in {source_name}")

    hb_val_idx = hb_idx + 1 if hb_idx + 1 < len(last_row) else hb_idx - 1
    raw_value = last_row.iloc[hb_val_idx]
    try:
        hb = float(raw_value)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Hb value {raw_value!r} in {source_name} is not numeric"
        ) from err
    # A blank neighbouring cell parses as NaN; fail loudly instead of writing
    # a NaN Hb into every row of this patient.
    if pd.isna(hb):
        raise ValueError(f"Hb value is empty in {source_name}")
    return hb


def load_patient_file(file_path: Path, patient_id: int) -> pd.DataFrame:
    """Read one raw patient CSV and return its cleaned measurement table.

    Text after "#" is treated as a comment by the parser and fully empty rows
    are dropped. The last remaining row must carry the Hb label and value; it
    is removed from the measurements and its value is broadcast into an "Hb"
    column.

    Args:
        file_path: Path of the raw patient CSV.
        patient_id: Anonymous ID written into the "Anon. Patient_ID" column.

    Returns:
        DataFrame with columns "Anon. Patient_ID", "Insp.O2(%)", "SpO2(%)"
        and "Hb", in that order.

    Raises:
        ValueError: If the file has no data rows, no usable Hb value, or lacks
            one of the expected signal columns.
    """
    df = pd.read_csv(file_path, comment="#").dropna(how="all")
    if df.empty:
        raise ValueError(f"No data rows found in {file_path.name}")

    hb = extract_hb(df.iloc[-1], file_path.name)
    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the raw table.
    df = df.iloc[:-1].copy()

    # Normalise header spelling (drop spaces and dots), then restore the
    # canonical dotted name for the inspired-oxygen column.
    df.columns = [col.strip().replace(" ", "").replace(".", "") for col in df.columns]
    df = df.rename(columns={"InspO2(%)": "Insp.O2(%)"})

    missing = [col for col in SIGNAL_COLUMNS if col not in df.columns]
    if missing:
        raise ValueError(
            f"Missing column(s) {missing} in {file_path.name}; "
            f"found {list(df.columns)}"
        )

    df["Hb"] = hb
    df["Anon. Patient_ID"] = patient_id
    return df[["Anon. Patient_ID", *SIGNAL_COLUMNS, "Hb"]]  # reorder


# The guard is true when this code runs as a notebook cell or as a script, so
# the dataset is still built there; it only prevents file I/O when the helpers
# above are imported from elsewhere.
if __name__ == "__main__":
    # Sort so anonymous IDs are reproducible (glob order is not guaranteed),
    # and skip this script's own outputs in case they live in the input folder.
    output_paths = {output_csv.resolve(), mapping_csv.resolve()}
    files = sorted(
        path
        for path in input_folder.glob("*.csv")
        if path.resolve() not in output_paths
    )
    if not files:
        raise ValueError(f"No CSV files found in {input_folder}")

    # Storage
    all_data = []
    mappings = []

    # Loop through files; IDs are simply 1, 2, 3... in sorted filename order
    for patient_id, file_path in enumerate(files, start=1):
        mappings.append({"Original Filename": file_path.name, "Anon. Patient_ID": patient_id})
        all_data.append(load_patient_file(file_path, patient_id))

    # Save final combined dataset
    final_df = pd.concat(all_data, ignore_index=True)
    final_df.to_csv(output_csv, index=False)

    # Save mapping
    pd.DataFrame(mappings).to_csv(mapping_csv, index=False)

    print(f"Done! {len(files)} files processed.\nSaved:\n- {output_csv}\n- {mapping_csv}")
