# Data Preprocessing

In [None]:
# About 400 of the .dcm files use the Transfer Syntax UID
# "JPEG Lossless, Nonhierarchical, First-Order Prediction";
# GDCM must therefore be installed beforehand in order to decode them.
# The other files use "Explicit VR Little Endian", which is supported by pydicom alone.
# The GDCM package is available as a dataset: +Add data -> gdcm-conda-forge -> Add
# Unpack the bundled archive, then install the conda packages from ../working/gdcm
# (presumably the folder created by the extraction above - verify the archive layout).
!tar -xvf ../input/gdcm-conda-install/gdcm.tar
!conda install ../working/gdcm/conda-4.8.4-py37hc8dfbb8_2.tar.bz2
!conda install ../working/gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2
!conda install ../working/gdcm/libjpeg-turbo-2.0.3-h516909a_1.tar.bz2

In [None]:
import os
import glob
from tqdm import tqdm
from pathlib import Path

import pandas as pd
import numpy as np

# from pydicom.pixel_data_handlers.util import apply_voi_lut
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

from PIL import Image
import gdcm
import pydicom

import cv2
import random

In [None]:
# Root folder of the competition dataset
HOME = Path('/kaggle/input/siim-covid19-detection/')
# Annotation files, given relative to HOME
STUDY_ANN = Path('train_study_level.csv')
IMG_ANN = Path('train_image_level.csv')

# Folder holding the training .dcm files. Derived from HOME so that the dataset
# root is defined in one place only; kept as a plain string with a trailing
# separator, i.e. '/kaggle/input/siim-covid19-detection/train/'.
TRAIN = os.path.join(HOME, 'train', '')

In [None]:
# Load the image-level annotation table and preview its first three rows
annotation_csv = HOME / IMG_ANN
img_ann = pd.read_csv(annotation_csv)
img_ann.head(3)

In [None]:
# The equivalent of utils.py
def rescale_dcm(dcm_px):
    """Linearly rescale a pixel array to the 0-255 range as uint8.

    The minimum of the input is mapped to 0 and the maximum to 255;
    fractional results are truncated by the uint8 cast.

    Parameters
    ----------
    dcm_px : array-like of numbers
        Pixel values to rescale. Must contain at least one element.

    Returns
    -------
    numpy.ndarray
        Array of dtype uint8 with the same shape as the input. A constant
        input (max == min) yields an all-zero array.

    Raises
    ------
    ValueError
        If the input is empty (raised by the min/max reduction).
    """
    # Work in float64 so that the subtraction cannot wrap around for
    # narrow integer dtypes (e.g. int16 data spanning the full range).
    px = np.asarray(dcm_px, dtype=np.float64)
    lowest = px.min()
    value_range = px.max() - lowest

    # A flat image has no contrast to stretch; dividing by a zero range
    # would produce NaNs and an undefined uint8 cast.
    if value_range == 0:
        return np.zeros(px.shape, dtype=np.uint8)

    zero_one = (px - lowest) / value_range
    return (zero_one * 255).astype(np.uint8)

def apply_hist_equalization(array, clip_limit = 2, tile_grid_size = (8,8)):
    """Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to an image.

    Parameters
    ----------
    array : numpy.ndarray
        Image to equalize, passed unchanged to OpenCV's CLAHE.
        (In this notebook it is the uint8 output of `rescale_dcm`.)
    clip_limit : number, optional
        Forwarded as `clipLimit` to `cv2.createCLAHE`. Defaults to 2,
        the value previously hard-coded here.
    tile_grid_size : tuple of int, optional
        Forwarded as `tileGridSize` to `cv2.createCLAHE`. Defaults to (8, 8),
        the value previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        The equalized image returned by `clahe.apply`.
    """
    clahe = cv2.createCLAHE(clipLimit = clip_limit, tileGridSize = tile_grid_size)
    return clahe.apply(array)

In [None]:
# Overall mean and standard deviation of the pixel values, calculated
# beforehand; intended for standardising images to zero mean / unit std.
SAMPLE_MEAN: float = 134.0
SAMPLE_STD: float = 56.0

# Visual inspection of the dcm

The chosen transformations of the raw pixel arrays are:
- harmonize the meaning of each pixel value by choosing a baseline Photometric Interpretation (MONOCHROME1 images are inverted to match MONOCHROME2);
- scale the pixel values to the 0-255 range;
- apply CLAHE (Contrast Limited Adaptive Histogram Equalization);
    The purpose is to remove some of the noise in the images and to obtain a better view of the X-rays. Applying histogram equalization improves how the images look.
- standardize the values to have mean 0 and unit standard deviation (prepared, but currently not applied in the code below).

In [None]:
# Sample 6 images from the annotation file. The seed is fixed so that
# re-running the notebook shows the same six images.
sample_df = img_ann.sample(6, random_state = 42)

# Plot layout: every image owns a 2x2 group of axes
#   top row    : rescaled pixel array | histogram of its values
#   bottom row : CLAHE result         | histogram of its values
# Two groups fit side by side, so 6 images fill 6 rows x 4 columns.
fig, ax = plt.subplots(nrows = 6, ncols = 4, figsize = (25,30))

for pos, (_, image) in enumerate(tqdm(sample_df.iterrows(), total = len(sample_df))):
    # Top-left cell of this image's 2x2 group
    r = 2 * (pos // 2)
    c = 2 * (pos % 2)

    # Locate the file: <TRAIN>/<StudyInstanceUID>/<any sub-folder>/<image id>.dcm
    matches = glob.glob(os.path.join(TRAIN, 
                                     image.StudyInstanceUID, 
                                     "*",
                                     image.id.split("_")[0]+".dcm"))
    if not matches:
        # Fail with a message naming the image instead of a bare IndexError
        raise FileNotFoundError("No .dcm file found for image {}".format(image.id))
    dcm_path = matches[0]

    # Read the .dcm file and get its pixel array
    dcm = pydicom.dcmread(dcm_path)
    dcm_px = dcm.pixel_array

    # Harmonize the images to match MONOCHROME2 by inverting MONOCHROME1 ones
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        dcm_px = np.amax(dcm_px) - dcm_px

    # Rescale the values in the 0-255 range
    dcm_rescaled = rescale_dcm(dcm_px)

    # Histogram equalization
    cl_array = apply_hist_equalization(dcm_rescaled)

    # The arrays are single-channel, so draw them in grayscale rather than
    # with matplotlib's default false-colour map.
    ax[r, c].imshow(dcm_rescaled, cmap = "gray")
    ax[r, c].set_title(str(image.id))
    ax[r, 0].set_ylabel("Rescaled pixel array")
    ax[r, c+1].hist(dcm_rescaled.flatten(), bins = 100)

    ax[r+1, c].imshow(cl_array, cmap = "gray")
    ax[r+1, 0].set_ylabel("CLAHE")
    ax[r+1, c+1].hist(cl_array.flatten(), bins = 100)

# Transform data

In [None]:
# Folder (relative to the working directory) that receives the resized images
DESTINATION_RESIZED = 'resized_512_train'

# Create the folder on first run; otherwise just report that it is already there
if not os.path.exists(DESTINATION_RESIZED):
    os.makedirs(DESTINATION_RESIZED)
else:
    print("{} folder exists.".format(DESTINATION_RESIZED))

In [None]:
# os.rmdir(DESTINATION_RESIZED)
# import shutil
# shutil.rmtree(DESTINATION_RESIZED)

In [None]:
# Number of annotation rows to convert. 5 keeps the original behaviour of
# converting only the first five images; set to None to convert every row.
N_IMAGES = 5
images_to_convert = img_ann[0:N_IMAGES]

for _, image in tqdm(images_to_convert.iterrows(), total = len(images_to_convert)):
    # Locate the file: <TRAIN>/<StudyInstanceUID>/<any sub-folder>/<image id>.dcm
    matches = glob.glob(os.path.join(TRAIN, 
                                     image.StudyInstanceUID, 
                                     "*",
                                     image.id.split("_")[0]+".dcm"))
    if not matches:
        # Fail with a message naming the image instead of a bare IndexError
        raise FileNotFoundError("No .dcm file found for image {}".format(image.id))
    dcm_path = matches[0]

    # Read the .dcm file and get its pixel array
    dcm = pydicom.dcmread(dcm_path)
    dcm_px = dcm.pixel_array

    # Harmonize the images to match MONOCHROME2
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        dcm_px = np.amax(dcm_px) - dcm_px

    # Rescale the values in the 0-255 range
    dcm_rescaled = rescale_dcm(dcm_px)

    # Apply histogram equalization as a transformation step
    cl_array = apply_hist_equalization(dcm_rescaled)

    # NOTE: standardisation to zero mean / unit standard deviation is not
    # applied here; the equalized array is saved as-is.

    # Resize to 512x512. INTER_AREA is the interpolation OpenCV recommends
    # when shrinking an image; the default bilinear mode can alias.
    resized_array = cv2.resize(cl_array, (512,512), interpolation = cv2.INTER_AREA)

    # Save as <image id>.jpg inside the destination folder
    Image.fromarray(resized_array).save(os.path.join(DESTINATION_RESIZED, str(image.id) + ".jpg"))

In [None]:
# List the contents of the destination folder to check what was written
os.listdir(DESTINATION_RESIZED)

In [None]:
# Pack the folder of resized images into a gzip-compressed tar archive
!tar -czf train_images_512.tar.gz resized_512_train
# Report the size of the archive in human-readable units
!du -h train_images_512.tar.gz

In [None]:
# Show a link to the archive in the cell output
from IPython.display import FileLink
FileLink(r'train_images_512.tar.gz')

## Sources:

1. CLAHE: https://towardsdatascience.com/clahe-and-thresholding-in-python-3bf690303e40
2. @avinashrai for saving the transformed images