### Pretrained UNET Lung Segmentation Dataset

The objective of this notebook is quite self-explanatory:   
<center><h2><b>Create a Lung-Mask Segmented Dataset</b></h2></center>

### Why Lung Segmentation?

Most COVID-19 images are obtained hastily in the current situation.  
Hence, more often than not, they contain **Medical Noise**. Obviously, we can't blame doctors for this, as they are working hard day and night to treat patients.

Well then, **what can we, as Machine Learning Engineers, do about this?**

We can apply segmentation models to clean the dataset first! In this notebook, I apply my favourite model, the UNet, to segment and separate out the lung masks from each image. You can read more about UNets here: [Official Paper](https://arxiv.org/abs/1505.04597) or [Blog](https://towardsdatascience.com/understanding-semantic-segmentation-with-unet-6be4f42d4b47).

Also, I have a pretrained model ready for use here, borrowed from [Nikhil Pandey](https://www.kaggle.com/nikhilpandey360/lung-segmentation-from-chest-x-ray-dataset/output).

### Import Helper Functions

In [None]:
! conda install -c conda-forge gdcm -y

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')
import shutil 
import tensorflow as tf
%matplotlib inline


import matplotlib.pylab as pylab
import seaborn as sns
import pprint
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import wandb

import PIL
from PIL import Image
from colorama import Fore, Back, Style
# Running index used to build the names of the PNG files saved by the visualisation cells.
viz_counter=0

def create_dir(dir, v=1):
    """
    Creates a directory (and any missing parents) without throwing an error
    if the path already exists.

    dir : The directory to be created.
    v : Verbosity; any truthy value prints what happened.

    Returns 1 when the directory was created by this call, 0 when the path
    already existed.
    """
    # Try to create first and react to the failure, instead of testing
    # os.path.exists() beforehand: the check-then-create sequence can race
    # with another process creating the same path and crash in os.makedirs().
    try:
        os.makedirs(dir)
    except FileExistsError:
        if v:
            print("Directory already existed : ", dir)
        return 0
    if v:
        print("Created Directory : ", dir)
    return 1

# Module-level switches read by dicom_dataset_to_dict():
# voi_lut        - when true, the pixel data is passed through apply_voi_lut().
# fix_monochrome - when true, images whose PhotometricInterpretation is "MONOCHROME1" are inverted.
voi_lut=True
fix_monochrome=True

def _dicom_header_to_dict(dicom_header):
    """Convert the non-pixel elements of an already loaded pydicom Dataset into a plain dict."""
    dicom_dict = {}
    # Kept from the credited snippet.
    # NOTE(review): presumably this forces pydicom to fully parse lazily read
    # elements so that `.name` / `.value` are usable in the loop - confirm before removing.
    repr(dicom_header)
    for dicom_value in dicom_header.values():
        if dicom_value.tag == (0x7fe0, 0x0010):
            # discard pixel data
            continue
        if isinstance(dicom_value.value, dicom.dataset.Dataset):
            # Nested datasets are already in memory: convert them directly
            # rather than going through the file-reading entry point.
            dicom_dict[dicom_value.name] = _dicom_header_to_dict(dicom_value.value)
        else:
            dicom_dict[dicom_value.name] = _convert_value(dicom_value.value)
    return dicom_dict


def dicom_dataset_to_dict(filename):
    """Read a DICOM file and return its metadata plus an 8-bit image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    filename : Path of the DICOM file, handed to pydicom's dcmread().

    Returns a tuple (dicom_dict, modified_image_data):
      dicom_dict          - element name -> converted value, without the pixel
                            data and without the 'Pixel Representation' entry.
      modified_image_data - pixel array min-max scaled to 0..255 as np.uint8.

    Reads the module-level flags `voi_lut` and `fix_monochrome`.
    """
    dicom_header = dicom.dcmread(filename)

    #====== DICOM FILE DATA ======
    dicom_dict = _dicom_header_to_dict(dicom_header)
    # pop() instead of del: a file without this element must not raise KeyError.
    dicom_dict.pop('Pixel Representation', None)

    #====== DICOM IMAGE DATA ======
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; dividing would produce
    # NaNs and an undefined uint8 cast, so such an image stays all zeros.
    if peak > 0:
        data = data / peak
    modified_image_data = (data * 255).astype(np.uint8)

    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()

def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('ascii', 'replace')
        cv = _sanitise_unicode(s)
    elif t == dicom.valuerep.DSfloat:
        cv = float(v)
    elif t == dicom.valuerep.IS:
        cv = int(v)
    else:
        cv = repr(v)
    return cv


import os, fnmatch
def find(pattern, path):
    """Recursively collect the files under `path` whose name matches `pattern`.

    pattern : Shell-style wildcard understood by fnmatch (e.g. '*.dcm');
              this is glob syntax, not a regular expression.
    path    : Root directory of the search.

    Returns a list of matching file paths. The traversal is sorted so the
    order is the same on every run and file system, which keeps any seeded
    sampling from the returned list reproducible.
    """
    result = []
    for root, dirs, files in os.walk(path):
        # Sorting `dirs` in place makes os.walk descend in a fixed order.
        dirs.sort()
        for name in sorted(files):
            if fnmatch.fnmatch(name, pattern):
                result.append(os.path.join(root, name))
    return result


Let's get all the .dcm files.

In [None]:
# Root folder of the competition data; every DICOM file below it is collected.
FIND_FOLDER = "/kaggle/input/siim-covid19-detection"
dcm_files = find(pattern='*.dcm', path=FIND_FOLDER)
print(len(dcm_files), "Files Found.")

In [None]:
import random

# Fixed seed for the preview selection.
random.seed(52) # 42 # 2021
# random.sample draws without replacement, so the preview never shows the same
# file twice (random.choices could), and min() keeps the cell working when
# fewer than three files were found.
subset_dcm_files = random.sample(dcm_files, k=min(3, len(dcm_files))) # dcm_files[:3]
subset_dcm_files

In [None]:
# Restart the output-file numbering before the preview figures below are saved.
viz_counter=0

In [None]:
def props(arr):
    """Print the shape, maximum, minimum and dtype of an array on a single line."""
    summary = (
        "Shape :", arr.shape,
        "Maximum :", arr.max(),
        "Minimum :", arr.min(),
        "Data Type :", arr.dtype,
    )
    print(*summary)
for path in subset_dcm_files:
    dicom_dict, modified_image_data = dicom_dataset_to_dict(path)
    props(modified_image_data)
    # Show the same image side by side under two colour maps.
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
    for panel, colour_map in zip(ax, ("gray", "viridis")):
        panel.imshow(modified_image_data, cmap=colour_map)
        panel.axis('off')
    # Each figure is written to "<counter>.png" in the working directory.
    plt.savefig(f"{viz_counter}.png", dpi=300)
    viz_counter += 1
    plt.show()

### Define UNET Model

In [None]:
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras import backend as keras
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, LearningRateScheduler


def dice_coef(y_true, y_pred):
    """Smoothed Dice coefficient: (2 * sum(t * p) + 1) / (sum(t) + sum(p) + 1).

    Both tensors are flattened first; `keras` here is the Keras backend module
    (imported as `from keras import backend as keras`).
    """
    truth = keras.flatten(y_true)
    prediction = keras.flatten(y_pred)
    overlap = keras.sum(truth * prediction)
    total = keras.sum(truth) + keras.sum(prediction)
    return (2. * overlap + 1) / (total + 1)

def dice_coef_loss(y_true, y_pred):
    """Loss form of the Dice coefficient: its negation, so a higher overlap gives a lower loss."""
    score = dice_coef(y_true, y_pred)
    return -score

def unet(input_size=(256,256,1)):
    """Build an (uncompiled) U-Net style Keras model.

    input_size : Shape tuple passed to Input(); defaults to (256, 256, 1).
                 NOTE(review): the four 2x2 poolings presumably require the
                 spatial dimensions to be divisible by 16 so that the skip
                 concatenations line up - confirm.

    Structure, as built below:
      * four encoder stages of two 3x3 ReLU convolutions (32, 64, 128, 256
        filters) each followed by 2x2 max pooling,
      * a 512-filter stage of two 3x3 ReLU convolutions,
      * four decoder stages: a stride-2 transposed convolution concatenated
        (axis=3) with the matching encoder output, then two 3x3 ReLU convolutions,
      * a final 1x1 convolution with sigmoid activation and a single filter.

    Returns a keras Model mapping `inputs` to the sigmoid output.
    """
    inputs = Input(input_size)
    
    # ---- Encoder ----
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)

    # ---- Deepest stage (no pooling afterwards) ----
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)

    # ---- Decoder: upsample, concatenate with the encoder output of the same stage, convolve ----
    up6 = concatenate([Conv2DTranspose(256, (2, 2), strides=(2, 2), padding='same')(conv5), conv4], axis=3)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)

    up7 = concatenate([Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(conv6), conv3], axis=3)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)

    up8 = concatenate([Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(conv7), conv2], axis=3)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)

    up9 = concatenate([Conv2DTranspose(32, (2, 2), strides=(2, 2), padding='same')(conv8), conv1], axis=3)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)

    # Single-filter 1x1 convolution with sigmoid: one value in [0, 1] per spatial position.
    conv10 = Conv2D(1, (1, 1), activation='sigmoid')(conv9)

    return Model(inputs=[inputs], outputs=[conv10])

In [None]:
# Build the network for 512x512 single-channel inputs and compile it with the
# Dice loss; Dice coefficient and binary accuracy are tracked as metrics.
model = unet(input_size=(512,512,1))
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(optimizer=Adam(learning_rate=1e-5), loss=dice_coef_loss,
                  metrics=[dice_coef, 'binary_accuracy'])
model.summary()

### Load the Pretrained UNet Model

In [None]:
# HDF5 checkpoint holding the pretrained weights that are loaded into `model`.
model_weights_path = "/kaggle/input/unet-lung-segmentation-weights-for-chest-x-rays/cxr_reg_weights.best.hdf5"

# Replaces the freshly initialised weights of `model` with the ones stored in the checkpoint.
model.load_weights(model_weights_path)

In [None]:
# Shapes that you wish to resize to (Shape_X = rows, Shape_Y = columns).
Shape_X, Shape_Y = 512, 512

for path in subset_dcm_files:
    dicom_dict, modified_image_data = dicom_dataset_to_dict(path)
    # cv2.resize takes (width, height), i.e. the reverse of the array shape.
    resized_image_data = cv2.resize(modified_image_data, (Shape_Y, Shape_X))

    # Add batch and channel axes, then shift/scale the 8-bit values around zero.
    prep_unet_input_img_1 = resized_image_data.reshape(1, Shape_X, Shape_Y, 1)
    prep_unet_input_img = (prep_unet_input_img_1 - 127.0) / 127.0

    pred_img = model.predict(prep_unet_input_img)
    pred_img_preprocessed_1 = np.squeeze(pred_img)
    # Binarise the sigmoid output into a 0/1 mask.
    pred_img_preprocessed = (pred_img_preprocessed_1 * 255 > 127).astype(np.int8)

    # Keep only the pixels selected by the mask.
    res = cv2.bitwise_and(resized_image_data, resized_image_data, mask=pred_img_preprocessed)

    # Panels: resized input, predicted mask, masked result.
    fig, ax = plt.subplots(1, 3, figsize=(20, 12))
    for panel, picture in zip(ax, (resized_image_data, pred_img_preprocessed, res)):
        panel.imshow(picture, cmap="viridis")
        panel.axis('off')
    plt.savefig(f"{viz_counter}.png", dpi=300)
    viz_counter += 1
    # The masked image itself is stored under the next counter value.
    cv2.imwrite(f"{viz_counter}.png", res)
    viz_counter += 1
    plt.show()

In [None]:
# Per-image metadata, filled in parallel while the dataset is generated.
image_id = []
dim0 = []
dim1 = []
splits = []

for split in ['test', 'train']:
    save_dir = f'/kaggle/working/segmented_data/{split}/'
    print(split)
    os.makedirs(save_dir, exist_ok=True)

    # tqdm wraps os.walk, so the progress bar counts visited directories.
    for dirname, _, filenames in tqdm(os.walk(f'../input/siim-covid19-detection/{split}')):
        for file in filenames:
            stem, extension = os.path.splitext(file)
            if extension.lower() != '.dcm':
                # Anything that is not a DICOM file cannot be decoded below;
                # skip it instead of aborting the whole run.
                continue

            fpath = os.path.join(dirname, file)
            dicom_dict, modified_image_data = dicom_dataset_to_dict(fpath)
            resized_image_data = cv2.resize(modified_image_data,(Shape_Y,Shape_X)) # cv2 has this opposite

            # Add batch and channel axes, then shift/scale the 8-bit values around zero.
            prep_unet_input_img_1 = resized_image_data.reshape(1,Shape_X,Shape_Y,1)
            prep_unet_input_img = (prep_unet_input_img_1-127.0)/127.0
            pred_img = model.predict(prep_unet_input_img)
            pred_img_preprocessed_1 = np.squeeze(pred_img)
            # Binarise the sigmoid output into a 0/1 mask.
            pred_img_preprocessed = (pred_img_preprocessed_1*255>127).astype(np.int8)
            res = cv2.bitwise_and(resized_image_data,resized_image_data,mask = pred_img_preprocessed)

            # Swap only the extension: str.replace('dcm', 'png') would also
            # rewrite a "dcm" occurring elsewhere in the file name.
            save_path = os.path.join(save_dir, stem + '.png')
            if not cv2.imwrite(save_path,res):
                # cv2.imwrite reports failure through its return value only;
                # surface it and keep the metadata in sync with the files on disk.
                print("Could not write :", save_path)
                continue

            image_id.append(stem)
            # NOTE(review): these are the dimensions of the resized output
            # (`res`), not of the original DICOM - confirm that is what the
            # consumers of the metadata expect.
            dim0.append(res.shape[0])
            dim1.append(res.shape[1])
            splits.append(split)

# Progress figures kept from an earlier run (presumably tqdm output, test split first):
#   2475/?   07:34 | 5.38it/s
#   12386/?  36:51 | 8.13it/s
print("Generation Complete!")

In [None]:
# One row per generated image; the columns come from the lists filled during generation.
meta_columns = {
    'image_id': image_id,
    'dim0': dim0,
    'dim1': dim1,
    'split': splits,
}
df = pd.DataFrame.from_dict(meta_columns)
df.to_csv('meta.csv', index=False)

In [None]:
import os
import zipfile
import shutil

#taken from : https://www.kaggle.com/xhlulu/recursion-2019-load-resize-and-save-images

def zip_and_remove(path):
    """Archive every file below `path` into '<path>.zip', then delete `path`.

    path : Directory to archive. The zip file is written next to it, and the
           files are stored under the same paths that os.walk() yields.

    The archive is opened in a `with` block so it is always closed (and its
    index written) even when adding a file fails; in that case the exception
    propagates and the remaining directory tree is left in place.
    """
    with zipfile.ZipFile(f'{path}.zip', 'w', zipfile.ZIP_DEFLATED) as ziph:
        for root, dirs, files in os.walk(path):
            for file in files:
                file_path = os.path.join(root, file)
                ziph.write(file_path)
                # Each file is removed right after it has been archived
                # (presumably to keep peak disk usage low).
                os.remove(file_path)

    # Remove the now empty directory tree.
    shutil.rmtree(path)
# Archive the generated PNG tree into 'segmented_data.zip' and delete the folder.
# The path is relative, so it is resolved against the current working directory.
save_dir = 'segmented_data'
zip_and_remove(save_dir)