# <p style='text-align: center'>Convert the DICOM datasets (RICORD and RSNA) into .png images and create the corresponding .csv spreadsheets with the metadata required for each included image</p>

In [1]:
import pydicom as pdcm
from pydicom.pixel_data_handlers import apply_modality_lut, apply_voi_lut
import pandas as pd
import cv2
import numpy as np
import glob
import os
import pyprog

In [2]:
# --- RICORD paths ---
# Root folder that is searched for per-patient subfolders containing .dcm files.
ricord_dicom_path = './dicom images/ricord/MIDRC-RICORD-1C/'
# Text file (one PNG file name per line) listing images to route to the 'pruned' folder.
ricord_unusable_path = './ricord_exclude'
# Output root: PNGs are written below it and the metadata CSV is written into it.
ricord_output_path = './dataset/ricord/'

# --- RSNA paths ---
# Folder holding the RSNA '<patientId>.dcm' files.
rsna_dicom_path = './dicom images/rsna/stage_2_train_images'
# CSV read for its 'patientId' and 'class' columns.
rsna_info_csv = './dicom images/rsna/stage_2_detailed_class_info.csv'
# Output root: PNGs are written below it and the metadata CSV is written into it.
rsna_output_path = './dataset/rsna/'

In [3]:
# Convert every AP/PA RICORD DICOM image to a cropped 8-bit PNG and collect one
# metadata row (patientid, finding, imagename) per image kept for the dataset.
os.makedirs(os.path.join(ricord_output_path, 'images'), exist_ok=True)
# Images on the exclusion list are still written, but into 'pruned'. cv2.imwrite
# does not create missing folders (it just returns False), so create it up front.
os.makedirs(os.path.join(ricord_output_path, 'pruned'), exist_ok=True)

# This list will prune unusable images from the set but not delete them... Some may find use of them
with open(ricord_unusable_path) as unusable_imgs_file:
    # Strip only the line terminator: slicing off the last character would
    # corrupt a final entry that has no trailing newline.
    unusable_imgs = [line.rstrip('\r\n') for line in unusable_imgs_file]

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
ricord_rows = []

# Calculate Target file count and create progress bar for convenience
file_count_target = sum(len(files) for _, _, files in os.walk(ricord_dicom_path))

prog = pyprog.ProgressBar(prefix='', suffix=' Processing ' + str(file_count_target) + ' more images...', total=file_count_target, progress_loc=3, complete_symbol="=", not_complete_symbol=" ", wrap_bar_prefix='|', wrap_bar_suffix='|')
prog.update()

for patient in sorted(glob.iglob(os.path.join(ricord_dicom_path, '**/'), recursive=False)):
    # The patient folder is named MIDRC-RICORD-1C-XXX-XXX; the patient ID is the
    # part after the third dash. Taking it from the folder itself avoids relying
    # on a fixed path depth or on '/' being the path separator.
    mrn = os.path.basename(os.path.normpath(patient)).split('-', 3)[-1]

    # The enumerate index is part of the output file name (and therefore of the
    # names in the exclusion list), so it must count every .dcm file, including
    # the ones that are skipped below.
    for index, filename in enumerate(sorted(glob.iglob(os.path.join(patient, '**/*.dcm'), recursive=True))):
        image_name = '{}-{}.png'.format(mrn, index)

        # Read Dataset
        dataset = pdcm.dcmread(filename)

        # Only AP and PA views are used for training. A dataset without a
        # ViewPosition cannot be confirmed as either, so it is skipped as well.
        # Checking before touching pixel_array avoids decoding unused images.
        if getattr(dataset, 'ViewPosition', None) not in ('AP', 'PA'):
            prog.set_total(prog.total - 1)
            continue

        image = dataset.pixel_array

        # MONOCHROME1 stores black-on-white and must be inverted to white-on-black.
        needs_inversion = dataset.PhotometricInterpretation == 'MONOCHROME1'

        # Dicom images may not be uint8 dtype we need, so convert and map values if necessary
        if image.dtype != np.uint8:
            # Apply LUT transformations to fix contrast on some images
            lut_arr = apply_modality_lut(image, dataset)
            # Cannot run voi_lut if array is float, so cast floating results back to integers.
            # NOTE(review): the cast to uint16 would wrap negative rescaled values - confirm
            # none of the source images produce them.
            if np.issubdtype(lut_arr.dtype, np.floating):
                lut_arr = lut_arr.astype(np.uint16)
            lut_arr = apply_voi_lut(lut_arr, dataset)

            # Work in float so the inversion below cannot overflow an integer type
            lut_arr = lut_arr.astype(float)

            if needs_inversion:
                # Negating (plus any constant) reverses the intensity order; the
                # min-max normalisation below rescales the result to [0,255].
                lut_arr = 1.0 - lut_arr

            # Normalize to [0,255] and convert to uint8
            image = cv2.normalize(lut_arr, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX).astype(np.uint8)
        elif needs_inversion:
            # invert uint8 value
            image = 255 - image

        # Crop to the bounding box of the non-black content; the median blur keeps
        # isolated bright pixels from enlarging the box.
        denoise_image = cv2.medianBlur(image, 25)
        _, binary_image = cv2.threshold(denoise_image, 0, 255, cv2.THRESH_BINARY)
        x, y, w, h = cv2.boundingRect(binary_image)
        crop = image[y:y + h, x:x + w]

        # Create the file in the output directory or in pruned if not good quality
        is_pruned = image_name in unusable_imgs
        target_dir = 'pruned' if is_pruned else 'images'
        try:
            # imwrite reports most failures by returning False instead of raising
            written = cv2.imwrite(os.path.join(ricord_output_path, target_dir, image_name), crop)
        except (cv2.error, OSError):
            written = False

        if not written:
            print('\nError: {} could not be created!'.format(image_name))
            continue

        # Only images that were actually written to 'images' get a metadata row
        if not is_pruned:
            ricord_rows.append({'patientid': mrn, 'finding': 'COVID-19', 'imagename': image_name})

        # Update progress bar
        prog.set_stat(prog.current_stat + 1)
        prog.set_suffix(' Processing ' + str(prog.total - prog.current_stat) + ' more images...   ')
        prog.update()

ricord_df = pd.DataFrame(ricord_rows, columns=['patientid', 'finding', 'imagename'])
ricord_df.to_csv(os.path.join(ricord_output_path, 'ricord_meta.csv'), index=False)

# Finish Progress bar and clean up stray characters
prog.set_suffix(' Finished!                         ')
prog.update()
prog.end()



In [6]:
# For RSNA we will obtain only 'Lung Opacity' or 'Normal'
# Each qualifying patient is converted once: the DICOM pixel data is written as a
# PNG and one metadata row (patientid, finding, imagename) is recorded.

os.makedirs(os.path.join(rsna_output_path, 'images'), exist_ok=True)

# Calculate Target file count and create progress bar for convenience
file_count_target = sum(len(files) for _, _, files in os.walk(rsna_dicom_path))
prog = pyprog.ProgressBar(prefix='', suffix=' Processing ' + str(file_count_target) + ' more images...', total=file_count_target, progress_loc=3, complete_symbol="=", not_complete_symbol=" ", wrap_bar_prefix='|', wrap_bar_suffix='|')
prog.update()

# Patient IDs already handled, in processing order
rsna_valid_targets = []
# Same IDs as a set, so the per-row "seen before?" test is O(1) instead of a list scan
rsna_seen_targets = set()

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
rsna_rows = []

rsna_info = pd.read_csv(rsna_info_csv)

for patient_id, patient_class in zip(rsna_info['patientId'], rsna_info['class']):
    # A patient that was already converted is ignored on any later row
    if patient_id in rsna_seen_targets:
        continue

    # Only grab pneumonia and normal cases
    if patient_class in ('Lung Opacity', 'Normal'):
        finding = 'Normal' if patient_class == 'Normal' else 'Pneumonia'
        image_name = '{}.png'.format(patient_id)

        rsna_valid_targets.append(patient_id)
        rsna_seen_targets.add(patient_id)

        dicom_image = pdcm.dcmread(os.path.join(rsna_dicom_path, '{}.dcm'.format(patient_id)))

        # imwrite signals failure by returning False; do not record a metadata
        # row for an image that does not exist on disk.
        if not cv2.imwrite(os.path.join(rsna_output_path, 'images', image_name), dicom_image.pixel_array):
            print('\nError: {} could not be created!'.format(image_name))
            continue

        rsna_rows.append({'patientid': patient_id, 'finding': finding, 'imagename': image_name})

        # Update progress bar
        prog.set_stat(prog.current_stat + 1)
        prog.set_suffix(' Processing ' + str(prog.total - prog.current_stat) + ' more images...   ')
        prog.update()

    else:
        # Any other class: count the row as handled without converting anything
        prog.set_stat(prog.current_stat + 1)

# Finish Progress bar and clean up stray characters
prog.set_suffix(' Finished!                         ')
prog.update()
prog.end()

rsna_df = pd.DataFrame(rsna_rows, columns=['patientid', 'finding', 'imagename'])
rsna_df.to_csv(os.path.join(rsna_output_path, 'rsna_meta.csv'), index=False)

