In [None]:
import os
import numpy as np
import pandas as pd

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

from tqdm.notebook import tqdm

from PIL import Image

# 1. Preprocessing

Convert each DICOM file to a resized PNG image.
Adapted from https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

In [None]:
#Util Methods
def read_xray(path):
    """Load a DICOM X-ray and return its pixels rescaled to 8-bit greyscale.

    Args:
        path: Location of the DICOM file to read.

    Returns:
        A numpy array of dtype ``uint8`` with the same shape as the decoded
        pixel data, stretched so the darkest pixel is 0 and the brightest is
        255. An image whose pixels are all equal comes back as all zeros.
    """
    # dcmread is the supported reader; read_file is a deprecated alias that
    # newer pydicom releases no longer provide.
    dicom_file = pydicom.dcmread(path)
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    data = apply_voi_lut(dicom_file.pixel_array, dicom_file)
    # MONOCHROME1 indicates that the greyscale ranges from bright to dark with ascending
    # pixel values, whereas MONOCHROME2 ranges from dark to bright with ascending pixel
    # values. Invert MONOCHROME1 so both end up dark-to-bright.
    if dicom_file.PhotometricInterpretation == 'MONOCHROME1':
        data = np.amax(data) - data
    # Shift so the minimum is 0, then scale by the remaining peak.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing would be 0/0 and yield NaN, which has no
        # defined uint8 value. Return a uniformly black image instead.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)

def resize(array, size):
    """Turn a pixel array into a PIL image rescaled to a ``size`` x ``size`` square.

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        size: Edge length, in pixels, used for both width and height.

    Returns:
        The resized PIL image.
    """
    square = (size, size)
    # LANCZOS is a high-quality downsampling filter.
    return Image.fromarray(array).resize(square, resample=Image.LANCZOS)

In [None]:
# Per-image metadata, collected for the training split only.
training_image_ids = []
dim_0 = []
dim_1 = []

for split in ['train','test']:
    load_dir = f'../input/vinbigdata-chest-xray-abnormalities-detection/{split}/'
    save_dir = f'/kaggle/tmp/{split}'
    # Create the output directory for this split (no error if it already exists).
    os.makedirs(save_dir, exist_ok = True)
    # Convert every file found in the split's input directory.
    for file in tqdm(os.listdir(load_dir)):
        image_id = file.replace('.dicom', '')
        xray = read_xray(os.path.join(load_dir, file))
        im = resize(xray, size = 512)
        # save_dir has no trailing slash, so the path must be joined rather than
        # concatenated; otherwise the PNG lands at '/kaggle/tmp/<split><id>.png'
        # beside the directory instead of inside it.
        im.save(os.path.join(save_dir, image_id + '.png'))

        if split == 'train':
            # Record the dimensions of the image before it was resized.
            training_image_ids.append(image_id)
            dim_0.append(xray.shape[0])
            dim_1.append(xray.shape[1])

In [None]:
! tar -zcf train.tar.gz -C "/kaggle/tmp/train"
! tar -zcf test.tar.gz -C "/kaggle/tmp/test"

In [None]:
# Save the training image ids alongside the two recorded image dimensions.
# NOTE(review): the column names are asymmetric ("dim_0" vs "dim1"); kept unchanged
# because readers of train_metadata.csv may depend on them — confirm before renaming.
df = pd.DataFrame(
    data={
        "image_id": training_image_ids,
        "dim_0": dim_0,
        "dim1": dim_1,
    }
)
df.to_csv(path_or_buf="train_metadata.csv", index=False)