In [4]:
from data import *
from utils import *
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

ModuleNotFoundError: No module named 'tqdm'

# Preprocessing notebook for the MIMIC dataset

It is divided in the investigation of how to prerocess:
1. the tabular data
2. the images
3. the labels

## 1. **Tabular data**

**MIMIC-III Data Description**<br>
MIMIC-III is a relational database consisting of 26 tables. Tables are linked by identifiers which usually have the suffix ‘ID’. For example, SUBJECT_ID refers to a unique patient, HADM_ID refers to a unique admission to the hospital, and ICUSTAY_ID refers to a unique admission to an intensive care unit.

Charted events such as notes, laboratory tests, and fluid balance are stored in a series of ‘events’ tables. For example the OUTPUTEVENTS table contains all measurements related to output for a given patient, while the LABEVENTS table contains laboratory test results for a patient.

Tables prefixed with ‘D_’ are dictionary tables and provide definitions for identifiers. For example, every row of CHARTEVENTS is associated with a single ITEMID which represents the concept measured, but it does not contain the actual name of the measurement. By joining CHARTEVENTS and D_ITEMS on ITEMID, it is possible to identify the concept represented by a given ITEMID.

Developing the MIMIC data model involved balancing simplicity of interpretation against closeness to ground truth. As such, the model is a reflection of underlying data sources, modified over iterations of the MIMIC database in response to user feedback. Care has been taken to avoid making assumptions about the underlying data when carrying out transformations, so MIMIC-III closely represents the raw hospital data.

Broadly speaking, five tables are used to define and track patient stays: ADMISSIONS; PATIENTS; ICUSTAYS; SERVICES; and TRANSFERS. Another five tables are dictionaries for cross-referencing codes against their respective definitions: D_CPT; D_ICD_DIAGNOSES; D_ICD_PROCEDURES; D_ITEMS; and D_LABITEMS. The remaining tables contain data associated with patient care, such as physiological measurements, caregiver observations, and billing information.

In some cases it would be possible to merge tables—for example, the D_ICD_PROCEDURES and CPTEVENTS tables both contain detail relating to procedures and could be combined—but our approach is to keep the tables independent for clarity, since the data sources are significantly different. Rather than combining the tables within MIMIC data model, we suggest researchers develop database views and transforms as appropriate.

In [None]:
pd.read_csv('../dataset/CPTEVENTS.csv').head()

  pd.read_csv('../dataset/CPTEVENTS.csv').head()


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,COSTCENTER,CHARTDATE,CPT_CD,CPT_NUMBER,CPT_SUFFIX,TICKET_ID_SEQ,SECTIONHEADER,SUBSECTIONHEADER,DESCRIPTION
0,317,11743,129545,ICU,,99232,99232.0,,6.0,Evaluation and management,Hospital inpatient services,
1,318,11743,129545,ICU,,99232,99232.0,,7.0,Evaluation and management,Hospital inpatient services,
2,319,11743,129545,ICU,,99232,99232.0,,8.0,Evaluation and management,Hospital inpatient services,
3,320,11743,129545,ICU,,99232,99232.0,,9.0,Evaluation and management,Hospital inpatient services,
4,321,6185,183725,ICU,,99223,99223.0,,1.0,Evaluation and management,Hospital inpatient services,


## 2. **Images data**


The mimic-cxr-2.0.0-metadata.csv.gz file contains useful meta-data derived from the original DICOM files in MIMIC-CXR. The columns are:

* **dicom_id** - An identifier for the DICOM file. The stem of each JPG image filename is equal to the dicom_id.
* **PerformedProcedureStepDescription** - The type of study performed ("CHEST (PA AND LAT)", "CHEST (PORTABLE AP)", etc).
* **ViewPosition** - The orientation in which the chest radiograph was taken ("AP", "PA", "LATERAL", etc).
* **Rows** - The height of the image in pixels.
* **Columns** - The width of the image in pixels.
* **StudyDate** - An anonymized date for the radiographic study. All images from the same study will have the same date and time. Dates are anonymized, but chronologically consistent for each patient. Intervals between two scans have not been modified during de-identification.
* **StudyTime** - The time of the study in hours, minutes, seconds, and fractional seconds. The time of the study was not modified during de-identification.
* **ProcedureCodeSequence_CodeMeaning** - The human readable description of the coded procedure (e.g. "CHEST (PA AND LAT)". Descriptions follow Simon-Leeming codes [11].
* **ViewCodeSequence_CodeMeaning** - The human readable description of the coded view orientation for the image (e.g. "postero-anterior", "antero-posterior", "lateral").
* **PatientOrientationCodeSequence_CodeMeaning** - The human readable description of the patient orientation during the image acquisition. Three values are possible: "Erect", "Recumbent", or a null value (missing).

In [None]:
# Info from the images
info_jpg = pd.read_csv('../fake_data/mimic-cxr-2.0.0-metadata.csv')
info_jpg.head()

Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,10000032,50414267,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,10000032,50414267,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,10000032,53189527,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,10000032,53189527,CHEST (PA AND LAT),LATERAL,3056,2544,21800626,165500.312,CHEST (PA AND LAT),lateral,Erect
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,10000032,53911762,CHEST (PORTABLE AP),AP,2705,2539,21800723,80556.875,CHEST (PORTABLE AP),antero-posterior,


Note that "No Finding" is the absence of any of the 13 descriptive labels and a check that the text does not mention a specified set of other common findings beyond those covered by the descriptive labels. Each label column contains one of four values: 1.0, -1.0, 0.0, or missing. These labels have the following interpretation:

* **1.0** - The label was positively mentioned in the associated study, and is present in one or more of the corresponding images e.g. "A large pleural effusion"
* **0.0** - The label was negatively mentioned in the associated study, and therefore should not be present in any of the corresponding images e.g. "No pneumothorax."
* **-1.0** - The label was either: (1) Explicit uncertainty or (2) Ambiguous language
* **Missing** (empty element) - No mention of the label was made in the report

In [None]:
labels_data = pd.read_csv('../fake_data/mimic-cxr-2.0.0-chexpert.csv')
labels_data.head()

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,


In [None]:
def list_images(base_path):
    """
    Recursively lists all image files starting from the base path.
    Assumes that images have extensions typical for image files (e.g., .jpg, .jpeg, .png).
    """
    image_files = []
    for subdir, dirs, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                image_files.append(os.path.join(subdir, file))
    return image_files

image_files = list_images('../fake_data/files')
image_files

['../fake_data/files/p10/p10000764/s57375967/dcfeeac4-1597e318-d0e6736a-8b2c2238-47ac3f1b.jpg',
 '../fake_data/files/p10/p10000764/s57375967/b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f.jpg',
 '../fake_data/files/p10/p10000764/s57375967/096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4.jpg',
 '../fake_data/files/p10/p10000032/s56699142/ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c.jpg',
 '../fake_data/files/p10/p10000032/s53189527/e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c.jpg',
 '../fake_data/files/p10/p10000032/s53189527/2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab.jpg',
 '../fake_data/files/p10/p10000032/s53911762/68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714.jpg',
 '../fake_data/files/p10/p10000032/s53911762/fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818.jpg',
 '../fake_data/files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg',
 '../fake_data/files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
 '../fake_data/files/p11/p11000011/s5102

In [None]:
# Create a dictionary to hold the mapping
image_labels_mapping = {}

for image_path in image_files:
    # Extract subject_id and study_id from the file path
    parts = image_path.split(os.sep)
    subject_id = parts[-3][1:] 
    study_id = parts[-2][1:]

    # Find the corresponding row in the labels CSV
    labels_row = labels_data[
        (labels_data['subject_id'] == int(subject_id)) &
        (labels_data['study_id'] == int(study_id))
    ]

    # Assuming there is only one match, get the labels
    if not labels_row.empty:
        labels = labels_row.iloc[0].to_dict()
        # Add the image path and its labels to the mapping dictionary
        image_labels_mapping[image_path] = labels

image_labels_mapping[image_files[0]]    

{'subject_id': 10000764.0,
 'study_id': 57375967.0,
 'Atelectasis': nan,
 'Cardiomegaly': nan,
 'Consolidation': 1.0,
 'Edema': nan,
 'Enlarged Cardiomediastinum': nan,
 'Fracture': nan,
 'Lung Lesion': nan,
 'Lung Opacity': nan,
 'No Finding': nan,
 'Pleural Effusion': nan,
 'Pleural Other': nan,
 'Pneumonia': -1.0,
 'Pneumothorax': nan,
 'Support Devices': nan}

In [None]:
# Define your custom dataset class
class MedicalImagesDataset(Dataset):
    def __init__(self, data_dict, transform=None):
        self.data_dict = data_dict
        self.transform = transform
        self.paths = list(data_dict.keys())

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        image_path = self.paths[idx]
        image = Image.open(image_path).convert('RGB')  # Open the image file
        
        # Get the labels for the image
        labels = self.data_dict[image_path]
        
        # Apply the transformations to the image
        if self.transform:
            image = self.transform(image)

        # You might want to convert labels to a tensor or perform some other kind of preprocessing on them
        # For simplicity, we're just returning the 'Consolidation' label as an example
        label = labels['Consolidation']
        label_tensor = torch.tensor(label if not pd.isna(label) else 0, dtype=torch.float32)

        return image, label_tensor

# Instantiate the dataset
# dataset = MedicalImagesDataset(image_labels_mapping, transform=None)

# Create the DataLoader
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # Adjust batch_size as needed

: 