I've noticed that some of the min-max normalized images are too dark or too bright, with low contrast.  
In this notebook, I try to find a better method to normalize pixel_arrays.

In [None]:
!conda install gdcm -c conda-forge -y

In [None]:
from pathlib import Path
from tqdm import tqdm
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Directory holding the training DICOM files (Kaggle input path).
train_dir = Path('/kaggle/input/siim-covid19-detection/train')
# Every .dcm file found recursively below train_dir, in sorted path order.
filenames = sorted(train_dir.glob('**/*.dcm'))

## Is lookup table or windowing available at all?

In [None]:
def is_lut_available(ds):
    '''
    Return True if `ds` carries a usable VOI LUT or windowing parameters.

    Original from pydicom/util.py (apply_voi_lut): a VOI LUT counts as
    usable when the first item of VOILUTSequence has both LUTDescriptor and
    LUTData; windowing counts as usable when both WindowCenter and
    WindowWidth are present.

    Parameters:
        ds: a dataset supporting `in`, `.get(key, default)` and attribute
            access for 'VOILUTSequence' (e.g. a pydicom Dataset).

    Returns:
        bool: True when either source of VOI information is available.
    '''
    valid_voi = False
    if 'VOILUTSequence' in ds:
        # The typing-only cast from the pydicom source is dropped here: it
        # needed names this notebook never imports and did nothing at runtime.
        voi_items = ds.VOILUTSequence
        # An empty sequence has no first item to inspect.
        if voi_items:
            first_item = voi_items[0]
            valid_voi = (
                first_item.get('LUTDescriptor', None) is not None
                and first_item.get('LUTData', None) is not None
            )
    valid_windowing = (
        ds.get('WindowCenter', None) is not None
        and ds.get('WindowWidth', None) is not None
    )
    return valid_voi or valid_windowing

# Read only the header of every file (pixel data is skipped) and remember
# whether at least one of them offers a VOI LUT or window settings.
any_lut_available = False
for filename in tqdm(filenames):
    dcm = pydicom.dcmread(filename, stop_before_pixels=True)
    if is_lut_available(dcm):
        any_lut_available = True

if any_lut_available:
    print('At least one lut is available.')
else:
    print('There is no lut available.')

So I can't rely on lookup tables or pre-defined windows.

## Min-max normalization

In [None]:
# The preview montage is an N x N grid of tiles, each IMAGE_SIZE pixels square.
N = 10
IMAGE_SIZE = 128

def resize(img):
    """Return `img` resized to IMAGE_SIZE x IMAGE_SIZE via cv2.resize."""
    target_shape = (IMAGE_SIZE, IMAGE_SIZE)
    return cv2.resize(img, target_shape)

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and min-max scale its pixels to a uint8 image.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters:
        path: path of the DICOM file to read.
        voi_lut: when True, pass the pixel array through `apply_voi_lut`
            before scaling.
        fix_monochrome: when True, invert images whose
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        numpy array of dtype uint8 with values in 0..255. A constant image
        (zero dynamic range) is returned as all zeros.
    """
    # dcmread is the reader the rest of this notebook uses; read_file is a
    # legacy alias of it.
    dicom = pydicom.dcmread(path)
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift so the darkest pixel is 0, then stretch the brightest to 255.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Every pixel has the same value: dividing would produce NaN, so
        # return a uniformly black image instead.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
# Load the first N*N files with min-max scaling and shrink each to a thumbnail.
imgs = np.stack([resize(read_xray(filename)) for filename in tqdm(filenames[:N**2])])
# Arrange the thumbnails on an N x N grid: view the stack as
# (grid_row, grid_col, y, x), bring y next to grid_row and x next to grid_col,
# then flatten each pair into one image axis.
tile = (
    imgs.reshape((N, N, IMAGE_SIZE, IMAGE_SIZE))
    .transpose(0, 2, 1, 3)
    .reshape(N * IMAGE_SIZE, N * IMAGE_SIZE)
)
plt.figure(figsize=(20, 20))
plt.imshow(tile, cmap='gray')

Quite a few of the images are too dark or too bright.

## Too dark/bright images
I picked out a few samples of too dark/bright images for inspection.

In [None]:
import seaborn as sns
# Hand-picked files (paths relative to train_dir) whose min-max normalized
# image came out too dark or too bright.
dark_bright_filenames = ['1e96d5eb4c91/703a9f4c4ffb/7f1924880cf8.dcm', '1e96d5eb4c91/3928035a1e6d/3d12cb6aad8b.dcm', '1a53a4506f10/db8d01dc5e4f/95bf5f4f6153.dcm', '047b450939fd/76eb6433f31d/cc54237ef4db.dcm']

# For each sample: print the raw pixel range, then show the min-max
# normalized image next to a histogram of the raw pixel values.
for filename in dark_bright_filenames:
    dcm = pydicom.dcmread(train_dir / filename)
    # Raw stored pixel values, before any VOI LUT or inversion is applied.
    pixel_array = dcm.pixel_array
    print(filename)
    print('min, max =',pixel_array.min(), pixel_array.max())
    plt.figure(figsize=(10, 5))
    # Left panel: the image as produced by read_xray (which reads the file again).
    plt.subplot(1,2,1)
    plt.imshow(read_xray(train_dir / filename), cmap='gray')
    # Right panel: distribution of the raw pixel values.
    plt.subplot(1,2,2)
    sns.histplot(pixel_array.ravel())
    plt.show()


Apparently, these images contain some outliers in pixel values.

## Robust pixel_array scaling
The basic idea here is to exclude outlier pixels in the scaling process.

In [None]:
def robust_read_xray(path, voi_lut = True, fix_monochrome = True, multiplier = 2, p = .001):
    """Read a DICOM file and scale its pixels to [0, 1] while ignoring outliers.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Pixels farther than `multiplier` inter-quartile ranges from the median
    are treated as outliers. The scaling bounds are the `p` and `1 - p`
    quantiles of the remaining pixels; values outside the bounds are clipped.

    Parameters:
        path: path of the DICOM file to read.
        voi_lut: when True, pass the pixel array through `apply_voi_lut`
            before scaling.
        fix_monochrome: when True, invert images whose
            PhotometricInterpretation is "MONOCHROME1".
        multiplier: half-width of the inlier window around the median, in
            units of the inter-quartile range.
        p: lower tail quantile used as the black point; `1 - p` is used as
            the white point.

    Returns:
        floating-point numpy array with values in [0, 1]. If the inlier
        range is empty (e.g. a constant image), an all-zero float32 array
        is returned.
    """
    # dcmread is the reader the rest of this notebook uses; read_file is a
    # legacy alias of it.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Inlier window: median +/- multiplier * IQR (strict on both sides).
    q1, q2, q3 = np.quantile(data, [.25, .5, .75])
    half_width = multiplier * (q3 - q1)
    mask = ((q2 - half_width) < data) & (data < (q2 + half_width))
    inliers = data[mask]
    if inliers.size == 0:
        # IQR of zero makes the strict window empty; fall back to all pixels
        # rather than taking a quantile of nothing.
        inliers = data

    low, high = np.quantile(inliers, [p, 1 - p])
    span = high - low
    if not span > 0:
        # No dynamic range left to stretch; avoid dividing by zero.
        return np.zeros(data.shape, dtype=np.float32)

    data = (data.astype(np.float32) - low) / span
    data = np.clip(data, 0, 1)
    return data

In [None]:
# Load the first N*N files with the robust scaling and shrink each to a thumbnail.
imgs = np.stack([resize(robust_read_xray(filename)) for filename in tqdm(filenames[:N**2])])
# Arrange the thumbnails on an N x N grid: view the stack as
# (grid_row, grid_col, y, x), bring y next to grid_row and x next to grid_col,
# then flatten each pair into one image axis.
tile = (
    imgs.reshape((N, N, IMAGE_SIZE, IMAGE_SIZE))
    .transpose(0, 2, 1, 3)
    .reshape(N * IMAGE_SIZE, N * IMAGE_SIZE)
)
plt.figure(figsize=(20,20))
plt.imshow(tile, cmap='gray')

## Side by side comparison

In [None]:
# For each problem sample, show the min-max result (left) next to the
# robust-scaling result (right).
for filename in dark_bright_filenames:
    print(filename)
    # Left panel: plain min-max normalization.
    plt.subplot(1,2,1)
    plt.imshow(read_xray(train_dir / filename), cmap='gray')
    plt.title('min-max')
    # Right panel: outlier-robust normalization.
    plt.subplot(1,2,2)
    plt.imshow(robust_read_xray(train_dir / filename), cmap='gray')
    plt.title('robust')
    plt.show()

There may be better methods or parameters (`multiplier` and `p` in `robust_read_xray`) for normalizing the pixel_array.