In [None]:
# Install the gdcm package from the conda-forge channel; -y answers the confirmation prompt.
# Presumably required so pydicom can decode compressed DICOM pixel data -- TODO confirm.
!conda install gdcm -c conda-forge -y

## References
- https://www.kaggle.com/ammarnassanalhajali/siim-covid-19-convert-dcm-to-jpg-384-512-and-640px/execution 
- https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px


In [None]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map
import glob
from collections import namedtuple

In [None]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixel data as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Location of the DICOM file; handed to ``pydicom.dcmread``.
        voi_lut: When true, run the raw pixels through ``apply_voi_lut`` so the
            VOI LUT stored by the DICOM device (if any) produces the
            "human-friendly" view.
        fix_monochrome: When true, invert the image if its
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``; such X-rays
            otherwise look inverted.

    Returns:
        ``np.uint8`` array with the same shape as the decoded pixels, min-max
        scaled to the 0..255 range. An image whose pixels are all equal comes
        back as all zeros.
    """
    # dcmread is the supported reader; read_file is a deprecated alias of it.
    dicom = pydicom.dcmread(path)

    pixels = dicom.pixel_array
    if voi_lut:
        pixels = apply_voi_lut(pixels, dicom)

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels

    # Shift so the darkest pixel is 0, then stretch the brightest to 255.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak == 0:
        # Constant image: dividing by the zero range would yield NaNs (and an
        # undefined uint8 cast), so return a blank frame instead.
        return np.zeros_like(pixels, dtype=np.uint8)

    return (pixels / peak * 255).astype(np.uint8)

In [None]:
def resize(array, shape, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to ``shape``.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        shape: Target size, forwarded unchanged to PIL.
        keep_ratio: If true, use ``Image.thumbnail`` (in-place resize to the
            given bound); otherwise use ``Image.resize`` to the exact ``shape``.
        resample: PIL resampling filter used by either path.

    Returns:
        The resized ``PIL.Image.Image``.
    """
    image = Image.fromarray(array)

    if not keep_ratio:
        return image.resize(shape, resample)

    # thumbnail() mutates the image and returns None, so hand back the object itself.
    image.thumbnail(shape, resample)
    return image

In [None]:
# Per-image record: the array's first two dimensions before resizing, plus the
# file name without directory or extension.
ImageMeta = namedtuple("ImageMeta", ["height", "width", "fname"])

In [None]:
%%time
# Output size passed to resize() for every exported image.
SHAPE = (512, 512)
for split in ["train", "test"]:
    # glob yields matches in file-system order, which is not guaranteed to be
    # stable; sort so the metadata CSV rows come out identically on every run.
    filenames = sorted(glob.glob("/kaggle/input/siim-covid19-detection/{}/*/*/*.dcm".format(split)))
    SAVE_DIR = f"/kaggle/tmp/{split}"
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Redefined for each split; it reads the SAVE_DIR and SHAPE globals set above.
    def persist_image(path):
        """Convert one DICOM file to a resized JPEG inside SAVE_DIR.

        Args:
            path: Path of the source ``.dcm`` file.

        Returns:
            ImageMeta holding the pre-resize height and width of the decoded
            array and the file name without directory or extension.
        """
        xray = read_xray(path)
        # Record the original dimensions before the image is resized.
        height, width = xray.shape[:2]
        fname = os.path.splitext(os.path.basename(path))[0]
        jpg_fname = os.path.join(SAVE_DIR, "{}.jpg".format(fname))
        resize(xray, shape=SHAPE).save(jpg_fname)
        return ImageMeta(height, width, fname)

    # Convert the files in parallel with a progress bar; results keep the input order.
    split_imgs = process_map(persist_image, filenames, max_workers=8, chunksize=1)
    pd.DataFrame.from_records(split_imgs, columns=ImageMeta._fields).to_csv("/kaggle/working/{}_meta.csv".format(split), index=False)
    print("No. of Images in split {}: {}".format(split, len(split_imgs)))

In [None]:
# Pack the converted JPEGs into gzip-compressed tarballs named after SHAPE
# (IPython expands the {SHAPE[...]} expressions before running the command).
# -C switches into the image directory first, so archive paths are relative to it.
!tar -zcf train_{SHAPE[0]}x{SHAPE[1]}.tar.gz -C "/kaggle/tmp/train" .
!tar -zcf test_{SHAPE[0]}x{SHAPE[1]}.tar.gz -C "/kaggle/tmp/test" .