# Creation of Measurements Dataset 
This notebook prepares and anonymizes raw patient data for further analysis. It reads the individual patient CSV files from a specified folder, extracts the inspired-oxygen and measured SpO₂ columns together with each patient's hemoglobin (Hb) value, which is stored in the last row of the file, and tags every row with the patient ID taken from the file name. The notebook then combines all patient data into a single DataFrame, shuffles the order of the patients, and assigns new anonymous patient IDs. Finally, it splits the anonymized data by patient into training (70%) and test (30%) sets and saves both as CSV and Excel files for downstream machine learning tasks.

In [None]:
# Folder that holds the per-patient CSV files; the loading cell lists this
# directory. Replace the placeholder value with the real path before running.
folder_path = "path_to_folder"

In [28]:
import pandas as pd
import os
import re  # For formatting patient ID

# Directory containing the per-patient CSV files
data_dir = folder_path

# Patient code embedded in each file name: the letter K followed by three
# digits, matched case-insensitively and stored upper-case (e.g. "K202").
patient_id_pattern = re.compile(r'(k\d{3})', re.IGNORECASE)

# One DataFrame per successfully parsed patient file
data_list = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and a
# fixed order keeps the combined frame identical from run to run.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(data_dir, filename)

    id_match = patient_id_pattern.search(filename)
    if id_match is None:
        # Without an ID the rows would carry Patient_ID=None and then be
        # dropped silently by the later groupby("Patient_ID"); skip the file
        # here instead and say so.
        print(f"Skipping {filename}: no patient ID (K###) found in file name")
        continue
    patient_id = id_match.group(0).upper()

    # Read only the first two columns (extra columns are ignored) and skip
    # the file's own header row; the column names are set explicitly.
    temp_df = pd.read_csv(
        file_path,
        usecols=[0, 1],
        names=["Insp. O2 (%)", "SpO2 (%)"],
        skip_blank_lines=True,
        skiprows=1,
    )
    temp_df = temp_df.dropna(how='all')  # Remove completely empty rows
    temp_df = temp_df.dropna(axis=1, how='all')  # Remove completely empty columns

    # The Hb lookup below needs at least one row and both columns; fail with
    # the file name rather than an anonymous IndexError.
    if temp_df.empty or temp_df.shape[1] < 2:
        raise ValueError(
            f"{filename}: expected two data columns with the Hb value in the last row"
        )

    # The hemoglobin (Hb) value sits in the last row, second column
    hb_value = temp_df.iloc[-1, 1]

    # Drop the Hb row and attach Patient_ID and Hb. assign() returns a new
    # frame, so no column is written into a slice of the original.
    temp_df = temp_df.iloc[:-1].assign(Patient_ID=patient_id, Hb=hb_value)

    data_list.append(temp_df)

# pd.concat([]) would raise a generic "No objects to concatenate"; report the
# folder instead so the cause is obvious.
if not data_list:
    raise ValueError(f"No usable patient CSV files found in {data_dir!r}")

# Combine all patient data into a single DataFrame
combined_df = pd.concat(data_list, ignore_index=True)


In [29]:
import pandas as pd
import numpy as np

# Fix NumPy's global seed so the patient order produced below is reproducible
np.random.seed(42)

# Work on the combined measurements produced by the loading cell
df = combined_df

# Build one (Patient_ID, rows) pair per patient, then shuffle the pairs in place
grouped = [(patient_key, patient_rows) for patient_key, patient_rows in df.groupby("Patient_ID")]
np.random.shuffle(grouped)

# Stack the patients in shuffled order; the n-th patient in that order
# receives anonymous ID n (1-based), repeated once per row of that patient.
patient_frames = [pair[1] for pair in grouped]
shuffled_df = pd.concat(patient_frames, ignore_index=True)
rows_per_patient = [len(frame) for frame in patient_frames]
shuffled_df["Anon. Patient_ID"] = np.repeat(range(1, len(grouped) + 1), rows_per_patient)

# Order rows by anonymous ID, then by inspired O2 within each patient
shuffled_df = shuffled_df.sort_values(
    by=["Anon. Patient_ID", "Insp. O2 (%)"],
    ascending=[True, True],
)

# Move the anonymous ID to the first column and renumber the index
columns = ["Anon. Patient_ID"]
columns.extend(name for name in shuffled_df.columns if name != "Anon. Patient_ID")
shuffled_df = shuffled_df[columns].reset_index(drop=True)

# Persist the shuffled frame.
# NOTE(review): the original Patient_ID column is still present at this point,
# so this file links anonymous IDs to original IDs — confirm that is intended.
shuffled_df.to_csv("shuffled_data.csv", index=False)


In [30]:
# Preview the first 30 rows of the shuffled frame as the cell output.
# NOTE(review): shuffled_df still carries the original Patient_ID column, so
# the rendered output shows original IDs next to the anonymous ones — clear
# the output before sharing if that mapping is sensitive.
shuffled_df.head(30)

Unnamed: 0,Anon. Patient_ID,Insp. O2 (%),SpO2 (%),Patient_ID,Hb
0,1,20.5,86.1,K202,10.4
1,1,22.3,90.8,K202,10.4
2,1,23.0,94.8,K202,10.4
3,1,24.9,95.5,K202,10.4
4,1,25.5,95.4,K202,10.4
5,1,27.1,95.4,K202,10.4
6,1,27.8,95.2,K202,10.4
7,1,29.3,97.8,K202,10.4
8,2,17.0,86.5,K122,8.8
9,2,17.6,87.2,K122,8.8


In [31]:
# Rebind `df` to the shuffled, anonymized frame (it previously referred to
# combined_df); the train/test split cell reads from `df`.
df = shuffled_df

In [34]:
from sklearn.model_selection import train_test_split

# Split on patient IDs rather than on rows, so each patient's measurements
# end up entirely in one of the two sets (30% of the patients go to test).
unique_patient_ids = df['Patient_ID'].unique()
train_ids, test_ids = train_test_split(unique_patient_ids, test_size=0.3, random_state=42)

# Select each set's rows by patient membership, then remove the original ID
# column so the saved sets carry only the anonymous ID.
in_train = df['Patient_ID'].isin(train_ids)
in_test = df['Patient_ID'].isin(test_ids)
train_df = df.loc[in_train].drop(columns="Patient_ID")
test_df = df.loc[in_test].drop(columns="Patient_ID")

# Save both sets as Excel workbooks first, then as CSV files
for out_frame, out_name in ((train_df, "train_219.xlsx"), (test_df, "test_219.xlsx")):
    out_frame.to_excel(out_name, index=False)

for out_frame, out_name in ((train_df, "train_219.csv"), (test_df, "test_219.csv")):
    out_frame.to_csv(out_name, index=False)

In [33]:
# Count patients through the anonymous ID. The original "Patient_ID" column
# is dropped from train_df and test_df when the split is made, so indexing it
# here raises KeyError on a fresh Restart & Run All. Every patient has exactly
# one anonymous ID, so the counts are the same.

# Count unique patients in the train set
num_train_patients = train_df['Anon. Patient_ID'].nunique()

# Count unique patients in the test set
num_test_patients = test_df['Anon. Patient_ID'].nunique()

# Sanity check: no patient may appear in both sets
assert set(train_df['Anon. Patient_ID']).isdisjoint(test_df['Anon. Patient_ID']), \
    "a patient appears in both the train and the test set"

# Print the results
print(f"Number of patients in train set: {num_train_patients}")
print(f"Number of patients in test set: {num_test_patients}")


Number of patients in train set: 153
Number of patients in test set: 66
