In [178]:
import pandas as pd
import re

In [179]:
# Pipe-delimited log file; column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
raw_columns = ['patient name', 'temp (celsius)', 'BP', 'notes']
df = pd.read_csv(
    '../data/synthetic_clinical_logs.txt',
    sep='|',
    names=raw_columns,
)

In [180]:
# read_csv already returns a DataFrame, so no re-wrapping is needed;
# just preview the first raw rows.
df.head()

Unnamed: 0,patient name,temp (celsius),BP,notes
0,Patient: Erica Kennedy,Temp: 39.0°C,BP: 127/64,Notes: Mild cough
1,Patient: Kiara Fisher,Temp: --°C,BP: 127/62,Notes: headache and nausea
2,Patient: Douglas Benton,Temp: --°C,BP: 135/85,Notes: complained of fatigue
3,Patient: William Maldonado,Temp: --°C,BP: not taken,Notes: complained of fatigue
4,Patient: Nicole Velazquez,Temp: 36.8°C,BP: 142/71,Notes: chest pain


In [181]:
def clean_clinical_logs(df: pd.DataFrame) -> pd.DataFrame:
    """Strip field labels from raw clinical-log columns and parse numeric values.

    The frame is modified **in place** and the same object is returned, so
    callers may either use the return value or keep using ``df``.

    Transformations applied:

    * ``'patient name'``: the ``'Patient: '`` label is removed and surrounding
      whitespace stripped.
    * ``'temp (celsius)'``: the ``Temp:`` label, the ``°C`` unit and the ``--``
      placeholder are removed, then the remainder is converted to a number;
      anything that is not numeric becomes NaN.
    * ``'BP'``: the first two digit runs separated by non-digits are written to
      the new ``'bp_systolic'`` and ``'bp_diastolic'`` columns (NaN when the
      pattern is absent, e.g. ``'BP: not taken'``); ``'BP'`` is then dropped.
    * ``'notes'``: surrounding whitespace, the ``Notes:`` label and one optional
      pair of straight or curly double quotes are removed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with text columns ``'patient name'``, ``'temp (celsius)'``,
        ``'BP'`` and ``'notes'``.

    Returns
    -------
    pd.DataFrame
        The same object as ``df``, cleaned.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    Not safe to call twice on the same frame: after the first call
    ``'temp (celsius)'`` is numeric (so ``.str`` is unavailable) and ``'BP'``
    no longer exists.
    """
    # Literal replacement: be explicit, since the default for ``regex`` in
    # Series.str.replace differs between pandas versions.
    df['patient name'] = (
        df['patient name'].str.replace('Patient: ', '', regex=False).str.strip()
    )

    temp_text = (
        df['temp (celsius)']
        .str.replace(r'Temp:\s*', '', regex=True)
        .str.replace('°C', '', regex=False)
        .str.replace('--', '', regex=False)
        .str.strip()
    )
    # Empty strings (from the '--' placeholder) and other junk coerce to NaN.
    df['temp (celsius)'] = pd.to_numeric(temp_text, errors='coerce')

    # With two capture groups and expand=True, extract always returns a frame
    # with columns 0 and 1 aligned to df's index (NaN where nothing matched),
    # so no fallback is required.
    bp_parts = df['BP'].str.extract(r'(\d+)[^\d]+(\d+)', expand=True)
    df['bp_systolic'] = pd.to_numeric(bp_parts[0], errors='coerce')
    df['bp_diastolic'] = pd.to_numeric(bp_parts[1], errors='coerce')

    # In-place drop is deliberate: the function's contract is to mutate ``df``.
    df.drop(columns=['BP'], inplace=True)

    # Strip first so that padding around the field neither survives in the
    # result nor hides a closing quote from the ``$``-anchored pattern.
    df['notes'] = (
        df['notes']
        .str.strip()
        .str.replace(r'Notes:\s*["“]?(.*?)["”]?$', r'\1', regex=True)
    )

    return df

In [182]:
# Bind the returned frame explicitly instead of relying solely on the
# function's in-place mutation of `df`, then display it.
df = clean_clinical_logs(df)
df

Unnamed: 0,patient name,temp (celsius),notes,bp_systolic,bp_diastolic
0,Erica Kennedy,39.0,Mild cough,127.0,64.0
1,Kiara Fisher,,headache and nausea,127.0,62.0
2,Douglas Benton,,complained of fatigue,135.0,85.0
3,William Maldonado,,complained of fatigue,,
4,Nicole Velazquez,36.8,chest pain,142.0,71.0
...,...,...,...,...,...
195,Thomas Munoz,,shortness of breath,,
196,Matthew Salazar,37.2,headache and nausea,132.0,69.0
197,Brianna Pearson,36.1,headache and nausea,,
198,Mark Hernandez,37.5,headache and nausea,142.0,86.0


In [183]:
# Write the cleaned frame to disk; the row index is left out of the file.
output_path = "cleaned_clinical_logs.csv"
df.to_csv(output_path, index=False)