### Preprocess Eval Dataset

__05/19/2024__
- Corrects some mislabeled categories in `eval_dataset_web_05-15`
- Merges `eval_dataset_web_05-15` with `old_annotations` from the original labeling analysis with out-of-the-box LLMs

In [14]:
import polars as pl

# Widen the DataFrame repr: show up to 10 columns and 100 rows per table.
# Each setter returns the Config class itself, so the calls can be chained.
pl.Config.set_tbl_cols(10).set_tbl_rows(100)

polars.config.Config

In [23]:
# Load the web-annotated evaluation set and keep only rows that were actually labeled.
# `labeled` is parsed as a Boolean column by `read_csv`, so filter on it directly
# instead of comparing it with the string "true" (which depends on implicit
# bool -> string coercion and is not reliable across polars versions).
new_dataset = (
    pl.read_csv("../../data/eval_dataset_web_05-15-2024__19-46.tsv", separator="\t")
    .filter(pl.col("labeled"))
)
print(new_dataset.shape)
new_dataset.head()

(1663, 7)


filename,patient_id,finding,anatomic_classification,possible_secondary,autogenerated,labeled
str,str,str,str,str,bool,bool
"""mimic_cxr_reports/p17/p1750463…","""p17504630""","""The patient is status post cor…","""MISCELLANEOUS""",,False,True
"""mimic_cxr_reports/p17/p1750463…","""p17504630""","""The heart is mildly enlarged""","""HEART""","""HEART""",True,True
"""mimic_cxr_reports/p17/p1750463…","""p17504630""","""The cardiac, mediastinal and h…","""MEDIASTINUM AND HILA""","""MEDIASTINUM AND HILA""",True,True
"""mimic_cxr_reports/p17/p1750463…","""p17504630""","""Opacification at the left lung…","""LUNG/PLEURA/LARGE AIRWAYS""","""LUNG/PLEURA/LARGE AIRWAYS""",True,True
"""mimic_cxr_reports/p17/p1750463…","""p17504630""","""A small caliber chest tube at…","""LINES/TUBES/DRAINS""","""LINES/TUBES/DRAINS""",True,True


Some categories might be redundant

In [24]:
# List the distinct anatomic categories present in the new annotations.
new_dataset.get_column("anatomic_classification").unique()

anatomic_classification
str
"""STOMACH"""
"""ABDOMEN"""
"""POSITIONING/LIMITATIONS"""
"""BONE"""
"""MEDIASTINUM AND HILA"""
"""MISCELLANEOUS"""
"""CARDIAC/CARDIOMEDIASTINALSILHO…"
"""BOWEL"""
"""LINES/TUBES/DRAINS"""
"""POSITIONING"""


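We will merge `LUNG` into `LUNG/PLEURA/LARGE AIRWAYS`, `HEART` and `MEDIASTINUM AND HILA` into `CARDIAC/CARDIOMEDIASTINALSILHOUETTE`, `POSITIONING` into `POSITIONING/LIMITATIONS`, and `BONE` into `BONE AND SOFT TISSUE`.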

In [29]:
# Collapse redundant labels into their canonical category names.
# Labels that do not appear as a key are left unchanged.
label_merges = {
    "LUNG": "LUNG/PLEURA/LARGE AIRWAYS",
    "HEART": "CARDIAC/CARDIOMEDIASTINALSILHOUETTE",
    "POSITIONING": "POSITIONING/LIMITATIONS",
    "BONE": "BONE AND SOFT TISSUE",
    # This switch might need to be manually checked later
    "MEDIASTINUM AND HILA": "CARDIAC/CARDIOMEDIASTINALSILHOUETTE",
}
new_dataset = new_dataset.with_columns(
    pl.col("anatomic_classification").replace(label_merges)
)
new_dataset.get_column("anatomic_classification").unique()

anatomic_classification
str
"""LUNG/PLEURA/LARGE AIRWAYS"""
"""STOMACH"""
"""CARDIAC/CARDIOMEDIASTINALSILHO…"
"""GALLBLADDER"""
"""ABDOMEN"""
"""NECK"""
"""BONE AND SOFT TISSUE"""
"""LINES/TUBES/DRAINS"""
"""MISCELLANEOUS"""
"""SPLEEN"""


Now we merge with the old annotations.

In [45]:
# Load the annotations from the original labeling analysis (tab-separated).
old_dataset = pl.read_csv("../../data/old_annotations.tsv", separator="\t")
old_dataset.head(5)

filename,patient_id,finding,anatomic_classification,possible_secondary
str,str,str,str,str
"""/home/khans24/charit/anatomy_n…","""p10394761""","""PA and lateral chest views wer…","""MISCELLANEOUS""",
"""/home/khans24/charit/anatomy_n…","""p10394761""","""Analysis is performed in direc…","""MISCELLANEOUS""",
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is mild cardiac enlargem…","""HEART""",
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is a relative prominenc…","""HEART""",
"""/home/khans24/charit/anatomy_n…","""p10394761""","""The thoracic aorta is general…","""HEART""",


We will add the `labeled` and `autogenerated` columns to this dataset so that its columns match those of the new dataset.

In [50]:
# Add the two flag columns with constant values:
# every old row gets autogenerated=False and labeled=True.
old_dataset = old_dataset.with_columns(
    pl.lit(False).alias("autogenerated"),
    pl.lit(True).alias("labeled"),
)
old_dataset.head(5)

filename,patient_id,finding,anatomic_classification,possible_secondary,autogenerated,labeled
str,str,str,str,str,bool,bool
"""/home/khans24/charit/anatomy_n…","""p10394761""","""PA and lateral chest views wer…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""Analysis is performed in direc…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is mild cardiac enlargem…","""HEART""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is a relative prominenc…","""HEART""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""The thoracic aorta is general…","""HEART""",,False,True


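Perform the appropriate relabeling: inspect the labels present in the old annotations, then rename `HEART` to `CARDIAC/CARDIOMEDIASTINALSILHOUETTE`.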

In [54]:
# List the distinct anatomic categories present in the old annotations.
old_dataset.get_column("anatomic_classification").unique()

anatomic_classification
str
"""HEART"""
"""POSITIONING/LIMITATIONS"""
"""MISCELLANEOUS"""
"""BONE AND SOFT TISSUE"""
"""LUNG/PLEURA/LARGE AIRWAYS"""
"""BOWEL"""
"""LINES/TUBES/DRAINS"""


In [56]:
# Rename the old `HEART` label to the canonical cardiac category;
# every other label is kept as-is.
old_dataset = old_dataset.with_columns(
    pl.col("anatomic_classification").replace(
        {"HEART": "CARDIAC/CARDIOMEDIASTINALSILHOUETTE"}
    )
)
old_dataset.get_column("anatomic_classification").unique()

anatomic_classification
str
"""BONE AND SOFT TISSUE"""
"""MISCELLANEOUS"""
"""CARDIAC/CARDIOMEDIASTINALSILHO…"
"""BOWEL"""
"""POSITIONING/LIMITATIONS"""
"""LUNG/PLEURA/LARGE AIRWAYS"""
"""LINES/TUBES/DRAINS"""


In [57]:
# Stack the old annotations on top of the new ones (row-wise concatenation;
# both frames carry the same seven columns at this point).
final_df = pl.concat([old_dataset, new_dataset], how="vertical")
final_df.head(5)

filename,patient_id,finding,anatomic_classification,possible_secondary,autogenerated,labeled
str,str,str,str,str,bool,bool
"""/home/khans24/charit/anatomy_n…","""p10394761""","""PA and lateral chest views wer…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""Analysis is performed in direc…","""MISCELLANEOUS""",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is mild cardiac enlargem…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""There is a relative prominenc…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True
"""/home/khans24/charit/anatomy_n…","""p10394761""","""The thoracic aorta is general…","""CARDIAC/CARDIOMEDIASTINALSILHO…",,False,True


In [58]:
# Check the distinct categories in the merged dataset.
final_df.get_column("anatomic_classification").unique()

anatomic_classification
str
"""LINES/TUBES/DRAINS"""
"""STOMACH"""
"""BONE AND SOFT TISSUE"""
"""MISCELLANEOUS"""
"""BOWEL"""
"""ABDOMEN"""
"""POSITIONING/LIMITATIONS"""
"""NECK"""
"""CARDIAC/CARDIOMEDIASTINALSILHO…"
"""SPLEEN"""


In [61]:
# (rows, columns) of the merged dataset.
(final_df.height, final_df.width)

(2230, 7)

In [60]:
# Persist the merged evaluation set as a tab-separated file.
final_df.write_csv(
    file="../../data/eval_dataset_2000.tsv",
    separator="\t",
)