In [None]:
import os
import pandas as pd
import numpy as np

import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import cv2
from tqdm.auto import tqdm

from os import listdir, makedirs
from os.path import isfile, join

## Define paths

In [None]:
# Input locations: dataset root, the test DICOM folder and the sample submission CSV
path = '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection'
test_path = os.path.join(path, 'test')
sample_sub_file = os.path.join(path, 'sample_submission.csv')

# Output locations: everything produced by this notebook lives under output_path
output_path = '/kaggle/working'
output_images_path = os.path.join(output_path, 'test_images')
output_images_size_path = os.path.join(output_path, 'test_images_size')


In [None]:
# Create the JPG output folder; exist_ok makes re-running this cell harmless
os.makedirs(output_images_path, exist_ok=True)

## Read DICOM files

In [None]:
def read_xray(path, voi_lut = True, fix_monochrome = True, downscale_factor = 1, max_dim = None):
    """Read a DICOM X-ray and return it as an 8-bit image plus its original size.

    Reference: https://www.kaggle.com/raddar/vinbigdata-competition-jpg-data-2x-downsampled

    Parameters
    ----------
    path : str
        Path of the DICOM file to read.
    voi_lut : bool
        If True, apply the VOI LUT (when provided by the DICOM device) to turn
        the raw pixel data into a "human-friendly" view.
    fix_monochrome : bool
        If True, invert images whose PhotometricInterpretation is "MONOCHROME1",
        which would otherwise look inverted.
    downscale_factor : number
        When greater than 1, both dimensions are divided by this factor.
    max_dim : number or None
        When given, the image is resized so that its longest side equals
        ``max_dim``; CLAHE contrast enhancement and a conversion to a
        3-channel image are then applied.

    Returns
    -------
    (data, orig_size)
        ``data`` is a uint8 array; ``orig_size`` is the array shape before any
        resizing - (h, w).

    NOTE(review): CLAHE and the GRAY2RGB conversion only run when ``max_dim``
    is set, so the result has 3 channels in that case and is the plain
    normalized array otherwise. This looks unintentional; it is kept as-is so
    the return shape seen by existing callers does not change - confirm.
    """
    data_file = dicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(data_file.pixel_array, data_file)
    else:
        data = data_file.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and data_file.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Rescale intensities to the 0..255 range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0; skipping the division avoids 0/0 NaNs
        # and leaves such an image as all zeros.
        data = data / peak
    data = (data * 255.0).astype(np.uint8)

    orig_size = data.shape # Original size before resizing - (h,w)

    if downscale_factor > 1:
        new_shape = tuple(int(x / downscale_factor) for x in data.shape)
        # cv2.resize takes the target size as (width, height)
        data = cv2.resize(data, (new_shape[1], new_shape[0]))

    if max_dim is not None:
        # Scale so that the longest side becomes max_dim
        scale = max(data.shape) / max_dim
        new_shape = tuple(round(x / scale) for x in data.shape)
        data = cv2.resize(data, (new_shape[1], new_shape[0]))

        # Use CLAHE (Contrast Limited Adaptive Histogram Equalization) to improve contrast
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
        data = clahe.apply(data)
        data = cv2.cvtColor(data, cv2.COLOR_GRAY2RGB) # convert to 3 channel grayscale

    return data, orig_size

In [None]:
# The sample submission lists one row per test image; its image_id column
# gives the base names of the DICOM files to convert.
sample_sub_df = pd.read_csv(sample_sub_file)
image_id_column = sample_sub_df['image_id']
dicom_files = image_id_column.tolist()

## Read DICOM files and write JPG files

In [None]:
# Original image sizes are collected here so they can be saved to CSV later.
# The lists are rebuilt from scratch on every run of this cell.
image_hight_list = []
image_width_list = []

# Convert all DICOMs to JPG (reduced size) and record the original image sizes
for dicom_file in tqdm(dicom_files):
    dicom_path = join(test_path, dicom_file + '.dicom')
    # Read the image resized so its longest side is 512 px; the returned
    # (h, w) is the size before resizing, needed later for normalization.
    img, (img_h, img_w) = read_xray(dicom_path, max_dim = 512)
    image_path = join(output_images_path, dicom_file + '.jpg')

    # cv2.imwrite signals failure through its return value instead of raising,
    # so check it: otherwise a missing JPG would go unnoticed until later.
    result = cv2.imwrite(image_path, img)  # Write jpg file
    if not result:
        raise IOError('Failed to write image: ' + image_path)

    image_hight_list.append(img_h)
    image_width_list.append(img_w)

## Update sample_submission.csv with original image sizes

In [None]:
# Attach the original image sizes to the test dataframe (used for labeling
# calculations); rows are in the same order the images were processed.
sample_sub_df['image_h'] = image_hight_list
sample_sub_df['image_w'] = image_width_list

# Save the enriched table, without the index column, to the working directory
sample_sub_df.to_csv(path_or_buf="sample_submission.csv", index=False)