In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
from tqdm import tqdm

import pydicom
import numpy as np
import glob

# 1. Metadata object (highest level) (run once)

In [2]:
# Series descriptions of the training set (study_id, series_id, series_description).
df_train_series_descriptions = pd.read_csv(
    "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv"
)
# Entries of the training image folder, without the names that contain ".DS"
# (macOS .DS_Store artefacts).
folder_train_images = [
    entry
    for entry in os.listdir("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images")
    if '.DS' not in entry
]

def get_metadata_object(folder_images, df_series_descriptions,
                        images_root="/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images"):
    '''
    Build the study-level metadata dictionary for a folder of studies.

    Parameters
    ----------
    folder_images : iterable of str
        Names of the study sub-folders located directly under ``images_root``.
        Every name must be convertible with ``int``.
    df_series_descriptions : pandas.DataFrame
        Table with the columns ``study_id``, ``series_id`` and
        ``series_description``.
    images_root : str, optional
        Directory that contains the study sub-folders. Defaults to the
        training images folder; pass the test images folder (together with the
        test series descriptions) to build the metadata of the test set.

    Returns
    -------
    dict
        ``{study_id: {'study_id_folder_path': str,
                      'SeriesInstanceUIDs': [int, ...],
                      'SeriesDescriptions': [...]}}``.
        The two lists are aligned by position. A series that has no row in
        ``df_series_descriptions`` gets the description ``""``.
    '''

    # (study_id, series_id) -> description, built once so the DataFrame does not
    # have to be filtered again for every single series. setdefault keeps the
    # first row when the same pair is listed more than once.
    description_lookup = {}
    description_rows = df_series_descriptions[["study_id", "series_id", "series_description"]]
    for row_study_id, row_series_id, description in description_rows.itertuples(index=False, name=None):
        description_lookup.setdefault((row_study_id, row_series_id), description)

    metadata_object = {}
    for study_folder in tqdm(folder_images):
        study_id = int(study_folder)
        study_path = os.path.join(images_root, study_folder)

        # series sub-folders of this study, skipping names that contain ".DS"
        # (macOS .DS_Store artefacts); folder names are stored as integers
        series_ids = [int(name) for name in os.listdir(study_path) if '.DS' not in name]

        metadata_object[study_id] = {
            'study_id_folder_path': study_path,
            'SeriesInstanceUIDs': series_ids,
            'SeriesDescriptions': [description_lookup.get((study_id, series_id), "")
                                   for series_id in series_ids],
        }

    return metadata_object

# Study-level metadata of the training set.
metadata_object_train = get_metadata_object(
    folder_images=folder_train_images,
    df_series_descriptions=df_train_series_descriptions,
)


100%|██████████| 1975/1975 [00:08<00:00, 227.60it/s]


# 2. Metadata object for series for each study_id

### Remember: each image instance has its own metadata entry, holding its id and its parsed DICOM dataset

<br>

In [3]:
def get_series_metadata_object_given_study_id(metadata_object, study_id):
    """
    Collect the metadata of every series that belongs to one study.

    Returns a dict keyed by series_id. Each value holds the series description
    ('image_series_description') and the list 'image_files', which has one
    entry per .dcm file of the series, ordered by the integer value of the
    file name. An entry stores the file name without its extension
    ('SOPInstanceUID') and the dataset read by pydicom ('dicom_image_file').
    """

    study_entry = metadata_object[study_id]

    def instance_id_of(dcm_path):
        # file name without the directory part and without ".dcm"
        return dcm_path.split('/')[-1].replace('.dcm', '')

    series_lookup = {}

    for position, series_id in enumerate(study_entry["SeriesInstanceUIDs"]):

        # the description list is aligned by position with the series id list
        series_record = {'image_series_description': study_entry["SeriesDescriptions"][position],
                         'image_files': []}
        series_lookup[series_id] = series_record

        # every .dcm file directly inside the folder of this series
        study_folder = study_entry["study_id_folder_path"]
        dcm_paths = glob.glob(f"{study_folder}/{series_id}/*.dcm")

        # numeric order, so that "10.dcm" comes after "2.dcm"
        dcm_paths.sort(key=lambda path: int(instance_id_of(path)))

        for dcm_path in dcm_paths:
            series_record['image_files'].append(
                {"SOPInstanceUID": instance_id_of(dcm_path),        # id of the dicom image instance file
                 "dicom_image_file": pydicom.dcmread(dcm_path)}     # parsed content of the .dcm file
            )

    return series_lookup
    

# 3. Display normal images for one study_id

In [4]:
 def display_images_given_study_id(metadata_object, study_id): 
    '''
    inside there is another function specifically for this function
    '''
    
    # view images for this study_id = 4003253 for a particular series_description
    def display_images(image_files, series_description, max_images_per_row=5):

        # grid for display
        num_images = len(image_files)
        num_rows = (num_images + max_images_per_row - 1) // max_images_per_row  # ceiling division (ignore the remainder, extra)

        # subplot grid
        fig, axes = plt.subplots(nrows=num_rows, ncols=max_images_per_row)

        # flatten axes for easy looping if there are multiple rows
        if num_rows > 1:
            axes = axes.flatten()
        else:
            axes = [axes] # iterable for consistency

        # plot each image
        for idx, image_file in enumerate(image_files):
            ax = axes[idx]
            ax.imshow(image_file, cmap='gray') # Assuming grayscale for simplicity, change cmap as needed
            ax.axis('off')

        # turn off unused subplots
        for idx in range(num_images, len(axes)):
            axes[idx].axis("off")

        fig.suptitle(series_description, fontsize=12)
        plt.show()
        
        
        
    series_metadata_object = get_series_metadata_object_given_study_id(metadata_object, study_id)
    
    for series_id in series_metadata_object:

        series_description = series_metadata_object[series_id]["image_series_description"]

        # get the image_files for this particular series_id
        image_files = series_metadata_object[series_id]["image_files"]

        # get the dicom files to a list
        dicom_images = []
        for image_metadata in image_files:
            dicom_image = image_metadata['dicom_image_file'].pixel_array
            dicom_images.append(dicom_image)

        # display images for each series
        display_images(dicom_images, series_description)

# 4. Display images with their annotated pathology coordinates for one study_id

In [5]:
# Annotation coordinates (x, y) together with study_id, series_id, instance_number,
# condition and level.
coord_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')

# Derive the matching train_df column name for every annotation:
# "<condition> <level>", lower-cased, with spaces and slashes turned into "_".
cols = ['condition', 'level']
coord_df['m_condition'] = pd.Series(
    [' '.join(str(value) for value in row_values).lower()
     for row_values in zip(*(coord_df[col] for col in cols))],
    index=coord_df.index,
)
coord_df['m_condition'] = coord_df['m_condition'].str.replace(r'[ /]', '_', regex=True)

# Label table; the severity of an annotation is read from the column named by m_condition.
train_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')



def display_images_with_coord(metadata_object, study_id):
    '''
    Show every annotated image of one study with a red circle drawn around the
    annotated coordinate.

    Parameters
    ----------
    metadata_object : dict
        Study-level metadata, as returned by ``get_metadata_object``.
    study_id : int
        Study whose annotations are displayed.

    Reads the module-level frames ``coord_df`` (annotation coordinates with
    the derived ``m_condition`` column) and ``train_df`` (severity labels).
    The drawing itself is done by the nested ``display_image_with_coord``.
    '''

    def display_image_with_coord(center_coord, image_instance_dicom, title):
        '''
        Draw one circle on one image and show it.

        center_coord         : (x, y) integer pixel position of the circle centre
        image_instance_dicom : pixel array of the DICOM instance
        title                : text displayed above the image
        '''
        radius = 10
        color = (255, 0, 0)  # red: the image is converted to RGB before drawing
        thickness = 2

        # rescale the raw pixel values to 0-255 and store them as 8-bit, the
        # format the drawing and display steps below work on
        image_normalized = cv2.normalize(image_instance_dicom, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

        # expand the grayscale image to three RGB channels BEFORE drawing;
        # on a single-channel image the circle could not be coloured
        image_rgb = cv2.cvtColor(image_normalized, cv2.COLOR_GRAY2RGB)
        image_circle = cv2.circle(image_rgb, center_coord, radius, color, thickness)

        # display
        plt.imshow(image_circle)
        plt.axis('off')
        plt.title(title)
        plt.show()

    # all series (and their images) of the requested study
    series_meta = get_series_metadata_object_given_study_id(metadata_object, study_id)
    example_train_df = train_df[train_df.study_id == study_id]
    example_coord_entries_df = coord_df[coord_df.study_id == study_id]

    # label row of this study, looked up once; None when train_df has no row for it
    label_row = example_train_df.iloc[0] if len(example_train_df) > 0 else None

    for _, coord_entry in example_coord_entries_df.iterrows():

        center_coord = (int(coord_entry['x']), int(coord_entry['y']))

        # images of the series this annotation belongs to
        image_meta_instances_list = series_meta[coord_entry.series_id]["image_files"]
        image_instance_id = coord_entry.instance_number

        for image_instance in image_meta_instances_list:

            # only the instance the annotation refers to is displayed
            if int(image_instance["SOPInstanceUID"]) == int(image_instance_id):

                image_instance_dicom = image_instance["dicom_image_file"].pixel_array

                if label_row is not None:
                    severity = label_row[coord_entry.m_condition]
                else:
                    severity = 'unknown'
                title = f'image_instance_id: {image_instance_id} \n {coord_entry.m_condition} - severity: {severity}'

                display_image_with_coord(center_coord, image_instance_dicom, title)

# All code in one block (cells 1–5 combined)

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
from tqdm import tqdm

import pydicom
import numpy as np
import glob

# Series descriptions of the training set (study_id, series_id, series_description).
df_train_series_descriptions = pd.read_csv(
    "/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_series_descriptions.csv"
)
# Entries of the training image folder, without the names that contain ".DS"
# (macOS .DS_Store artefacts).
folder_train_images = [
    entry
    for entry in os.listdir("/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images")
    if '.DS' not in entry
]

def get_metadata_object(folder_images, df_series_descriptions,
                        images_root="/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images"):
    '''
    Build the study-level metadata dictionary for a folder of studies.

    Parameters
    ----------
    folder_images : iterable of str
        Names of the study sub-folders located directly under ``images_root``.
        Every name must be convertible with ``int``.
    df_series_descriptions : pandas.DataFrame
        Table with the columns ``study_id``, ``series_id`` and
        ``series_description``.
    images_root : str, optional
        Directory that contains the study sub-folders. Defaults to the
        training images folder; pass the test images folder (together with the
        test series descriptions) to build the metadata of the test set.

    Returns
    -------
    dict
        ``{study_id: {'study_id_folder_path': str,
                      'SeriesInstanceUIDs': [int, ...],
                      'SeriesDescriptions': [...]}}``.
        The two lists are aligned by position. A series that has no row in
        ``df_series_descriptions`` gets the description ``""``.
    '''

    # (study_id, series_id) -> description, built once so the DataFrame does not
    # have to be filtered again for every single series. setdefault keeps the
    # first row when the same pair is listed more than once.
    description_lookup = {}
    description_rows = df_series_descriptions[["study_id", "series_id", "series_description"]]
    for row_study_id, row_series_id, description in description_rows.itertuples(index=False, name=None):
        description_lookup.setdefault((row_study_id, row_series_id), description)

    metadata_object = {}
    for study_folder in tqdm(folder_images):
        study_id = int(study_folder)
        study_path = os.path.join(images_root, study_folder)

        # series sub-folders of this study, skipping names that contain ".DS"
        # (macOS .DS_Store artefacts); folder names are stored as integers
        series_ids = [int(name) for name in os.listdir(study_path) if '.DS' not in name]

        metadata_object[study_id] = {
            'study_id_folder_path': study_path,
            'SeriesInstanceUIDs': series_ids,
            'SeriesDescriptions': [description_lookup.get((study_id, series_id), "")
                                   for series_id in series_ids],
        }

    return metadata_object

# Study-level metadata of the training set.
metadata_object_train = get_metadata_object(
    folder_images=folder_train_images,
    df_series_descriptions=df_train_series_descriptions,
)





def get_series_metadata_object_given_study_id(metadata_object, study_id):
    """
    Collect the metadata of every series that belongs to one study.

    Returns a dict keyed by series_id. Each value holds the series description
    ('image_series_description') and the list 'image_files', which has one
    entry per .dcm file of the series, ordered by the integer value of the
    file name. An entry stores the file name without its extension
    ('SOPInstanceUID') and the dataset read by pydicom ('dicom_image_file').
    """

    study_entry = metadata_object[study_id]

    def instance_id_of(dcm_path):
        # file name without the directory part and without ".dcm"
        return dcm_path.split('/')[-1].replace('.dcm', '')

    series_lookup = {}

    for position, series_id in enumerate(study_entry["SeriesInstanceUIDs"]):

        # the description list is aligned by position with the series id list
        series_record = {'image_series_description': study_entry["SeriesDescriptions"][position],
                         'image_files': []}
        series_lookup[series_id] = series_record

        # every .dcm file directly inside the folder of this series
        study_folder = study_entry["study_id_folder_path"]
        dcm_paths = glob.glob(f"{study_folder}/{series_id}/*.dcm")

        # numeric order, so that "10.dcm" comes after "2.dcm"
        dcm_paths.sort(key=lambda path: int(instance_id_of(path)))

        for dcm_path in dcm_paths:
            series_record['image_files'].append(
                {"SOPInstanceUID": instance_id_of(dcm_path),        # id of the dicom image instance file
                 "dicom_image_file": pydicom.dcmread(dcm_path)}     # parsed content of the .dcm file
            )

    return series_lookup
    
    
    
    
    
    
 def display_images_given_study_id(metadata_object, study_id): 
    '''
    inside there is another function specifically for this function
    '''
    
    # view images for this study_id = 4003253 for a particular series_description
    def display_images(image_files, series_description, max_images_per_row=5):

        # grid for display
        num_images = len(image_files)
        num_rows = (num_images + max_images_per_row - 1) // max_images_per_row  # ceiling division (ignore the remainder, extra)

        # subplot grid
        fig, axes = plt.subplots(nrows=num_rows, ncols=max_images_per_row)

        # flatten axes for easy looping if there are multiple rows
        if num_rows > 1:
            axes = axes.flatten()
        else:
            axes = [axes] # iterable for consistency

        # plot each image
        for idx, image_file in enumerate(image_files):
            ax = axes[idx]
            ax.imshow(image_file, cmap='gray') # Assuming grayscale for simplicity, change cmap as needed
            ax.axis('off')

        # turn off unused subplots
        for idx in range(num_images, len(axes)):
            axes[idx].axis("off")

        fig.suptitle(series_description, fontsize=12)
        plt.show()
        
        
        
    series_metadata_object = get_series_metadata_object_given_study_id(metadata_object, study_id)
    
    for series_id in series_metadata_object:

        series_description = series_metadata_object[series_id]["image_series_description"]

        # get the image_files for this particular series_id
        image_files = series_metadata_object[series_id]["image_files"]

        # get the dicom files to a list
        dicom_images = []
        for image_metadata in image_files:
            dicom_image = image_metadata['dicom_image_file'].pixel_array
            dicom_images.append(dicom_image)

        # display images for each series
        display_images(dicom_images, series_description)
        
        

        
        
        
# Annotation coordinates (x, y) together with study_id, series_id, instance_number,
# condition and level.
coord_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_label_coordinates.csv')

# Derive the matching train_df column name for every annotation:
# "<condition> <level>", lower-cased, with spaces and slashes turned into "_".
cols = ['condition', 'level']
coord_df['m_condition'] = pd.Series(
    [' '.join(str(value) for value in row_values).lower()
     for row_values in zip(*(coord_df[col] for col in cols))],
    index=coord_df.index,
)
coord_df['m_condition'] = coord_df['m_condition'].str.replace(r'[ /]', '_', regex=True)

# Label table; the severity of an annotation is read from the column named by m_condition.
train_df = pd.read_csv('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train.csv')



def display_images_with_coord(metadata_object, study_id):
    '''
    Show every annotated image of one study with a red circle drawn around the
    annotated coordinate.

    Parameters
    ----------
    metadata_object : dict
        Study-level metadata, as returned by ``get_metadata_object``.
    study_id : int
        Study whose annotations are displayed.

    Reads the module-level frames ``coord_df`` (annotation coordinates with
    the derived ``m_condition`` column) and ``train_df`` (severity labels).
    The drawing itself is done by the nested ``display_image_with_coord``.
    '''

    def display_image_with_coord(center_coord, image_instance_dicom, title):
        '''
        Draw one circle on one image and show it.

        center_coord         : (x, y) integer pixel position of the circle centre
        image_instance_dicom : pixel array of the DICOM instance
        title                : text displayed above the image
        '''
        radius = 10
        color = (255, 0, 0)  # red: the image is converted to RGB before drawing
        thickness = 2

        # rescale the raw pixel values to 0-255 and store them as 8-bit, the
        # format the drawing and display steps below work on
        image_normalized = cv2.normalize(image_instance_dicom, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

        # expand the grayscale image to three RGB channels BEFORE drawing;
        # on a single-channel image the circle could not be coloured
        image_rgb = cv2.cvtColor(image_normalized, cv2.COLOR_GRAY2RGB)
        image_circle = cv2.circle(image_rgb, center_coord, radius, color, thickness)

        # display
        plt.imshow(image_circle)
        plt.axis('off')
        plt.title(title)
        plt.show()

    # all series (and their images) of the requested study
    series_meta = get_series_metadata_object_given_study_id(metadata_object, study_id)
    example_train_df = train_df[train_df.study_id == study_id]
    example_coord_entries_df = coord_df[coord_df.study_id == study_id]

    # label row of this study, looked up once; None when train_df has no row for it
    label_row = example_train_df.iloc[0] if len(example_train_df) > 0 else None

    for _, coord_entry in example_coord_entries_df.iterrows():

        center_coord = (int(coord_entry['x']), int(coord_entry['y']))

        # images of the series this annotation belongs to
        image_meta_instances_list = series_meta[coord_entry.series_id]["image_files"]
        image_instance_id = coord_entry.instance_number

        for image_instance in image_meta_instances_list:

            # only the instance the annotation refers to is displayed
            if int(image_instance["SOPInstanceUID"]) == int(image_instance_id):

                image_instance_dicom = image_instance["dicom_image_file"].pixel_array

                if label_row is not None:
                    severity = label_row[coord_entry.m_condition]
                else:
                    severity = 'unknown'
                title = f'image_instance_id: {image_instance_id} \n {coord_entry.m_condition} - severity: {severity}'

                display_image_with_coord(center_coord, image_instance_dicom, title)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 112)