In [1]:
# %%
import os
import re
import pandas as pd

In [2]:
# Location of the raw radiology notes CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists.
fpath = (
    '/data/nuri/retracing/knn-transformers/'
    'preprocess_datasets/mimic_radio/old/radiology.csv'
)

# Load the whole table into memory; later cells read its 'text' column.
df = pd.read_csv(fpath)

In [3]:
import re
import pandas as pd

def parse_report_regex(text):
    """
    Clean a medical report and split it at its first 'impression:' header.

    Cleaning steps, in order:
    1. Strip surrounding whitespace and lowercase everything, so section
       headers such as 'FINDINGS:' come out as 'findings:'.
    2. Replace runs of underscores, asterisks, hashes, slashes, backslashes,
       pipes, angle brackets, tildes, carets, backticks and equals signs
       with a space. Colons and ordinary punctuation are kept.
    3. Collapse every run of whitespace (including newlines) to one space.

    Parameters
    ----------
    text : str
        Raw report text. Any non-string value (for example None, or the
        float NaN that stands in for a missing cell) is treated as an
        empty report instead of raising.

    Returns
    -------
    dict
        A dict with two string values:
        - "context": the cleaned text before the first 'impression:'
          header, or the whole cleaned text when there is no header.
        - "impression": 'impression: ' followed by everything after that
          first header (later 'impression:' headers stay inside it), or the
          bare label 'impression:' when there is no header.
    """
    # Missing / non-text cells have nothing to parse; return the same shape
    # a header-less empty report would produce.
    if not isinstance(text, str):
        return {"context": "", "impression": "impression:"}

    # Normalize case first. Because everything is lowercase from here on,
    # section headers need no separate lowercasing pass.
    text = text.strip().lower()

    # Drop markup-like characters, then squeeze whitespace left behind.
    text = re.sub(r'[_*#/\\|<>~^`=]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Split only once, at the first header: a report that repeats
    # 'impression:' (e.g. in an addendum) must not lose its impression.
    parts = re.split(r'\bimpression:\s*', text, maxsplit=1, flags=re.IGNORECASE)
    if len(parts) == 2:
        context = parts[0].strip()
        impression = "impression: " + parts[1].strip()
    else:
        # No header found: everything is context, impression stays empty.
        context = text.strip()
        impression = "impression:"

    return {"context": context, "impression": impression}


In [4]:
# A handful of hand-written reports used to eyeball the parser's output
test_records = [
    """EXAMINATION:  CHEST (PA AND LAT)
    INDICATION:  ___ with new onset ascites  // eval for infection
    TECHNIQUE:  Chest PA and lateral
    FINDINGS:  No focal consolidation or effusion.
    IMPRESSION:  No acute cardiopulmonary process.""",

    """EXAMINATION:  LIVER OR GALLBLADDER US (SINGLE ORGAN)
    INDICATION:  ___ year-old female with cirrhosis, jaundice.
    FINDINGS:  Liver is coarsened and nodular.
    IMPRESSION:  Nodular appearance of liver compatible with cirrhosis.""",

    """INDICATION:  ___ hcv cirrhosis c/b ascites, hi risk varices.
    IMPRESSION:  Successful uncomplicated ultrasound-guided paracentesis.""",

    """EXAMINATION:  PARACENTESIS
    INDICATION:  ___ ascites
    IMPRESSION:  Uneventful therapeutic paracentesis.""",
]

# Run the parser over each sample; one output row per report
df_clean = pd.DataFrame(list(map(parse_report_regex, test_records)))
df_clean

Unnamed: 0,context,impression
0,examination: chest (pa and lat) indication: wi...,impression: no acute cardiopulmonary process.
1,examination: liver or gallbladder us (single o...,impression: nodular appearance of liver compat...
2,"indication: hcv cirrhosis c b ascites, hi risk...",impression: successful uncomplicated ultrasoun...
3,examination: paracentesis indication: ascites,impression: uneventful therapeutic paracentesis.


In [5]:
# Parse every report, then build the two-column frame in a single step.
# Expanding the per-row dicts with .apply(pd.Series) constructs one Series
# per report, which is very slow on a large table; handing the list of
# dicts to the DataFrame constructor gives the same frame directly.
df_clean2 = pd.DataFrame(
    df['text'].apply(parse_report_regex).tolist(),
    index=df.index,                      # keep row alignment with df
    columns=['context', 'impression'],   # fixed column order, even if df is empty
)

# Inspect the first rows
df_clean2.head()


Unnamed: 0,context,impression
0,examination: chest (pa and lat) indication: wi...,impression: no acute cardiopulmonary process.
1,examination: liver or gallbladder us (single o...,impression: 1. nodular appearance of the liver...
2,"indication: hcv cirrhosis c b ascites, hiv on ...",impression: successful uncomplicated ultrasoun...
3,examination: ultrasound-guided paracentesis. i...,impression: uneventful therapeutic paracentesi...
4,examination: paracentesis indication: year old...,"impression: 4.75 l of slightly cloudy, blood t..."


In [7]:
# Save the first 50 raw rows as a small sample file
df.head(50).to_csv('test_notes.csv')

In [15]:
# Write the full cleaned table (context + impression) to disk
df_clean2.to_csv(path_or_buf='0_radiology_cleaned.csv')

In [8]:
# Save the first 50 cleaned rows as a small sample file
df_clean2.head(50).to_csv('test_notes2.csv')

In [19]:
# Show the full, untruncated impression text of the third row
df_clean2['impression'].iloc[2]

'impression: successful uncomplicated ultrasound guided diagnostic and therapeutic paracentesis yielding 1.5 l of serosanguineous fluid from the right lower quadrant. sample was sent to the lab as requested.'

In [16]:
# (rows, columns) of the cleaned table
df_clean2.shape 

(2321355, 2)