# AI_For_Healthcare_Ex2_Image_Segmentation

In this exercise, we tried to build a U-Net model for image segmentation of X-ray images.

In [None]:
import os
import sys
from glob import glob

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.image as image
import seaborn as sns
from tqdm.notebook import tqdm

import pydicom
from pydicom.data import get_testdata_files

# libraries For building the model
import torch
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, UpSampling2D, Concatenate
import cv2

# List the competition input directory so its contents show up in the cell output
print(os.listdir("../input/siim-acr-pneumothorax-segmentation"))
print()
# Put that directory first on the import path; mask_functions is imported right
# below (presumably mask_functions.py lives in this directory - confirm)
sys.path.insert(0, '../input/siim-acr-pneumothorax-segmentation')

from mask_functions import mask2rle, rle2mask
%matplotlib inline

In [None]:
from tensorflow.python.client import device_lib

# Print the devices TensorFlow reports for this machine
local_devices = device_lib.list_local_devices()
print(local_devices)



## Load information of the dataset

We actually used another dataset, which was attached to this notebook: https://www.kaggle.com/datasets/seesee/siim-train-test (it no longer exists). It has the same files, but with minor changes in the metadata and maybe also in the number of files.

In [None]:
# Collect the DICOM file paths of the train and test splits, each in sorted order
train_pattern = '../input/siim-acr-pneumothorax-segmentation-data/dicom-images-train/*/*/*.dcm'
test_pattern = '../input/siim-acr-pneumothorax-segmentation-data/dicom-images-test/*/*/*.dcm'

train_imgs_paths = glob(train_pattern)
train_imgs_paths.sort()
print("Train images -", len(train_imgs_paths))

test_imgs_paths = glob(test_pattern)
test_imgs_paths.sort()
print("Test images -", len(test_imgs_paths))

# Train paths first, followed by the test paths
file_paths = [*train_imgs_paths, *test_imgs_paths]

In [None]:
# Read the label table; the CSV header spells the column " EncodedPixels"
# (with a leading space), so it is renamed to the clean name here
data_df = pd.read_csv('../input/siim-acr-pneumothorax-segmentation-data/train-rle.csv')
data_df = data_df.rename(columns={" EncodedPixels": "EncodedPixels"})
data_df.head()

### The Images & metadata

In [None]:
# Show the first 8 images of the dataset side by side
n_samples = 8
fig = plt.figure(figsize=(15, 15))
ds = None
for q, file_path in enumerate(file_paths[:n_samples]):
    ds = pydicom.dcmread(file_path)
    plt.subplot(1, n_samples, q + 1)
    plt.title('Input Images')
    plt.imshow(ds.pixel_array, cmap='gray')
    plt.axis('off')

# See the metadata included in the image file (taken from the last image shown).
# Printed whenever at least one file was read, not only when a 9th file exists.
if ds is not None:
    print(ds)
    print()

### Get information from images

In [None]:
def get_metadata(dicom, df):
    """
    Collect the metadata of one DICOM file that has a row in the labeled data.

    The rows of ``df`` whose ``ImageId`` equals the file's ``SOPInstanceUID``
    are looked up, and the patient attributes from the DICOM header are
    combined with the run-length encodings found in those rows.

    Parameters
    ----------
    dicom : object
        Anything exposing ``SOPInstanceUID``, ``PatientSex``, ``PatientAge``
        and ``ViewPosition`` attributes (e.g. a dataset read by pydicom).
    df : pandas.DataFrame
        Label table with ``ImageId`` and ``EncodedPixels`` columns.

    Returns
    -------
    dict
        ``{}`` when no row of ``df`` matches the image. Otherwise a dict with
        the keys ``'patient sex'``, ``'patient age'``, ``'view position'``,
        ``'has pneumothorax'`` (bool) and ``'encoded pixels'`` (array holding
        every encoding listed for the image).
    """
    matching_image = df['ImageId'] == dicom.SOPInstanceUID
    # meaning we didn't find any matching image to df image id
    if not matching_image.any():
        return {}

    encoded_pixels = df.loc[matching_image, 'EncodedPixels'].values

    # "-1" marks an image without a mask. The value is stripped before the
    # comparison so that both " -1" (raw CSV value) and "-1" are recognised.
    has_pneumothorax = str(encoded_pixels[0]).strip() != '-1'

    return {
        'patient sex': dicom.PatientSex,
        'patient age': dicom.PatientAge,
        'view position': dicom.ViewPosition,
        'has pneumothorax': has_pneumothorax,
        'encoded pixels': encoded_pixels,
    }

In [None]:
all_metadata = []

for file_path in file_paths:
    # get_metadata only reads header attributes, so the pixel data is not loaded
    ds = pydicom.dcmread(file_path, stop_before_pixels=True)
    metadata = get_metadata(ds, data_df)
    # an empty dict means the file has no row in data_df and is skipped
    if metadata:
        all_metadata.append(metadata)

metadata_df = pd.DataFrame(all_metadata)
print(metadata_df.shape)
# kept as the last expression so the notebook actually displays the preview
metadata_df.head()

# Distribution of the data

We wanted to see the distribution of the data from the dcm files: the number of men and women, the age distribution, and how many people have pneumothorax. We used the metadata DataFrame that we created earlier.

In [None]:
# Share of images flagged with pneumothorax, and the complementary share
positive_flags = metadata_df['has pneumothorax'] == True
precent_pneumothorax = positive_flags.sum() / len(metadata_df)
precent_no_pneumothorax = 1 - precent_pneumothorax

labels = ['Pneumothorax', 'No Pneumothorax']
colors = sns.color_palette('pastel')

# Pie chart of the two shares, annotated with percentages
plt.pie(
    [precent_pneumothorax, precent_no_pneumothorax],
    labels=labels,
    colors=colors,
    autopct='%.2f%%',
)
plt.title("Precentage of Pneumothorax vs. No Pneumothorax")
plt.show();

In [None]:
# Count of patients per sex, colored by whether the image has pneumothorax
sex_plot = sns.displot(metadata_df, x='patient sex', hue='has pneumothorax', palette='flare')
sex_plot.set(title='Distribution of Female and Male with and without Pneumothorax');

In [None]:
# Number of rows per patient sex
count_genders = metadata_df['patient sex'].value_counts()

# Count rows per sex that are flagged with pneumothorax. Summing the boolean
# mask gives the row count directly and avoids positional indexing
# (`.count()[0]`) on a label-indexed Series.
with_pneumothorax = metadata_df['has pneumothorax'] == True
men_with_pneumothorax = ((metadata_df['patient sex'] == 'M') & with_pneumothorax).sum()
women_with_pneumothorax = ((metadata_df['patient sex'] == 'F') & with_pneumothorax).sum()

print("% of men with pneumothorax", round(men_with_pneumothorax/count_genders['M']*100, 2), '%')
print("% of women with pneumothorax", round(women_with_pneumothorax/count_genders['F']*100, 2), '%')

We can see that:
1. There are more male patients than female patients.
2. More men than women have been diagnosed with pneumothorax.

Among the patients in the dataset, the percentage with pneumothorax is about 22.5% for men and about 21.7% for women.

Preparing the data for feeding the model:

In [None]:
# Read the RLE table, fix the column name and strip whitespace from the encodings
rles_df = pd.read_csv('../input/siim-acr-pneumothorax-segmentation-data/train-rle.csv')
rles_df = rles_df.rename(columns={' EncodedPixels':'EncodedPixels'})
rles_df['EncodedPixels'] = [rle.strip() for rle in rles_df['EncodedPixels']]

# Keep only rows that carry a real encoding ('-1' means no mask) and gather
# all encodings of the same image into one list
positive_rows = rles_df[rles_df['EncodedPixels'] !='-1']
rles_df = positive_rows.groupby('ImageId')['EncodedPixels'].apply(list).reset_index()
print(len(rles_df))

# Dictionary: image id -> list of encodings for that image
masks = dict(zip(rles_df['ImageId'], rles_df['EncodedPixels']))
print(len(masks))

### Building the Learning Model
We experimented with different parameters and made small changes to the architecture.

In [None]:
# Parameters
# Side length (pixels) the generators resize to and the UNet input is built with
img_size = 128
# Number of samples per batch produced by the data generators
batch_size = 8
# NOTE(review): not referenced by any of the cells shown here - confirm it is still needed
k_size = 3
# Fractions handed to train_test_split: train_size for the training paths,
# test_size for the paths that are used as the validation set
train_size = 0.7
test_size = 0.05
# Whether the data generators shuffle their sample order
shuffle = True
# Channel count of the arrays produced by the data generators
channels = 1
# Number of epochs passed to the training call
epoch = 10
# Smoothing constant added to numerator and denominator in dice_coef
smooth = 1
# Verbosity level passed to the training call
verbose = 2

In [None]:
# Define a data generator
# The data generator will help us hold the images and the masks
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of resized images and their masks.

    Every batch is a pair ``(X, y)`` of float arrays shaped
    ``(n, img_size, img_size, channels)``. Both arrays are divided by 255
    before they are returned.

    Parameters
    ----------
    file_path_list : list of str
        Paths of the DICOM files served by this generator.
    labels : dict
        Maps an image id (the file name without its extension) to a list of
        RLE strings. Images missing from the mapping get an all-zero mask.
    batch_size : int
        Number of samples per batch; a trailing partial batch is dropped.
    img_size : int
        Side length images and masks are resized to.
    channels : int
        Size of the last axis of the returned arrays.
    shuffle : bool
        Whether the sample order is reshuffled at construction and after
        every epoch.
    """

    # Side length used when decoding an RLE string with rle2mask
    MASK_SIZE = 1024

    def __init__(self, file_path_list, labels, batch_size=32, img_size=256, channels=1, shuffle=True):
        super().__init__()
        self.file_path_list = file_path_list
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.channels = channels
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return len(self.file_path_list) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(X, y)`` pair."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # paths of the samples that belong to this batch
        file_path_list_temp = [self.file_path_list[k] for k in indexes]
        return self.__data_generation(file_path_list_temp)

    def on_epoch_end(self):
        """Rebuild the sample order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.file_path_list))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, file_path_list_temp):
        """Load, resize and normalize the images and masks of one batch."""
        # size the arrays by the paths actually received, so no
        # uninitialised rows from np.empty can ever be returned
        n_samples = len(file_path_list_temp)
        X = np.empty((n_samples, self.img_size, self.img_size, self.channels))
        y = np.empty((n_samples, self.img_size, self.img_size, self.channels))
        target_size = (self.img_size, self.img_size)

        for idx, file_path in enumerate(file_path_list_temp):
            # image id = file name without directory and extension
            image_id = os.path.splitext(os.path.basename(file_path))[0]
            rle_list = self.labels.get(image_id)

            pixels = pydicom.dcmread(file_path).pixel_array
            resized = np.array(cv2.resize(pixels, target_size), dtype=np.float64)
            X[idx,] = np.expand_dims(resized, axis=2)

            # if there is no mask create empty mask
            # notice we are starting of with 1024 because we need to use the rle2mask function
            mask = np.zeros((self.MASK_SIZE, self.MASK_SIZE))
            if rle_list is not None:
                for rle in rle_list:
                    decoded = rle2mask(rle, self.MASK_SIZE, self.MASK_SIZE).T
                    # union of the regions: overlapping regions keep the
                    # single-mask value instead of being added together
                    mask = np.maximum(mask, decoded)

            y[idx,] = np.expand_dims(cv2.resize(mask, target_size), axis=2)

        # normalize
        # NOTE(review): assumes 8-bit images and that rle2mask marks foreground
        # with 255 - confirm against mask_functions
        X = X / 255
        y = y / 255

        return X, y

In [None]:
# Keyword arguments shared by every DataGenerator built in this notebook
params = dict(
    img_size=img_size,
    batch_size=batch_size,
    channels=channels,
    shuffle=shuffle,
)

# Split the training image paths into a training part and a validation part
X_train, X_val = train_test_split(
    train_imgs_paths,
    test_size=test_size,
    train_size=train_size,
)
print(len(X_train))
print(len(X_val))

# One generator per split, both reading their masks from the same dictionary
train_gen = DataGenerator(X_train, masks, **params)
val_gen = DataGenerator(X_val, masks, **params)

In [None]:
# Plot the second sample of each of the first 8 training batches:
# image on the left, its mask on the right
sample_idx = 1
for batch_idx in range(8):
    batch_x, batch_y = train_gen[batch_idx]
    print(batch_x.shape, batch_y.shape)
    fig, (image_ax, mask_ax) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10,7))
    image_ax.imshow(batch_x[sample_idx,:,:,0],cmap='bone')
    mask_ax.imshow(batch_y[sample_idx,:,:,0],cmap='Blues')

In [None]:
# Helper functions
def down_block(x, filters, kernel_size=3, padding='same', strides=1, activation='relu'):
    """Down sampling block of our UNet: two convolutions, then 2x2 max pooling.

    Returns a tuple ``(features, pooled)``: the output of the second
    convolution and the same tensor after max pooling.
    """
    conv_options = dict(padding=padding, strides=strides, activation=activation)
    features = Conv2D(filters, kernel_size, **conv_options)(x)
    features = Conv2D(filters, kernel_size, **conv_options)(features)
    pooled = MaxPool2D(pool_size=(2, 2), strides=(2, 2))(features)
    return features, pooled

def up_block(x, skip, filters, kernel_size=3, padding='same', strides=1, activation='relu'):
    """Up sampling block of our UNet.

    Upsamples ``x`` by a factor of 2, concatenates the result with ``skip``
    and applies two convolutions. Returns the output of the second one.
    """
    conv_options = dict(padding=padding, strides=strides, activation=activation)
    upsampled = UpSampling2D(size=(2, 2))(x)
    merged = Concatenate()([upsampled, skip])
    features = Conv2D(filters, kernel_size, **conv_options)(merged)
    return Conv2D(filters, kernel_size, **conv_options)(features)

def bottleneck(x, filters, kernel_size=3, padding='same', strides=1, activation='relu'):
    """Bottleneck between the down sampling side and the up sampling side.

    Applies two convolutions to ``x`` and returns the result.
    """
    conv_options = dict(padding=padding, strides=strides, activation=activation)
    features = Conv2D(filters, kernel_size, **conv_options)(x)
    return Conv2D(filters, kernel_size, **conv_options)(features)

def UNet(img_size):
    """Construct the UNet from the blocks defined above.

    The network takes ``(img_size, img_size, 1)`` inputs, runs four down
    sampling blocks, a bottleneck and four up sampling blocks (each fed the
    matching down-block output as skip connection), and ends with a 1x1
    sigmoid convolution producing one output channel.

    Returns the assembled Keras ``Model``.
    """
    # number of filters per block; the last entry belongs to the bottleneck
    f = [16, 32, 64, 128, 256]
    encoder_filters = f[:-1]

    inputs = Input((img_size, img_size, 1))

    # encoder: remember every pre-pooling output for the skip connections
    skips = []
    current = inputs
    for n_filters in encoder_filters:
        skip, current = down_block(current, n_filters)
        skips.append(skip)

    current = bottleneck(current, f[-1])

    # decoder: walk back up, pairing each stage with the deepest unused skip
    for n_filters, skip in zip(reversed(encoder_filters), reversed(skips)):
        current = up_block(current, skip, n_filters)

    outputs = Conv2D(1, (1,1), padding='same', activation='sigmoid')(current)
    return Model(inputs, outputs)

def dice_coef(y_true, y_pred):
    """Return the smoothed Dice coefficient of ``y_true`` and ``y_pred``.

    Both inputs are flattened and the sums run over all of their elements,
    so one scalar is produced for the whole batch. The module-level
    ``smooth`` constant is added to numerator and denominator, which keeps
    the ratio defined when both inputs are all zeros.
    """
    # Plain reshape ops are used instead of creating new Flatten layers on
    # every call; y_true is cast so both operands share one dtype.
    y_pred_f = tf.reshape(y_pred, [-1])
    y_true_f = tf.reshape(tf.cast(y_true, y_pred_f.dtype), [-1])
    intersection = tf.reduce_sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)

def dice_coef_loss(y_true, y_pred):
    """Return the Dice loss, i.e. one minus the Dice coefficient."""
    score = dice_coef(y_true, y_pred)
    return 1.0 - score

In [None]:
# Build the network for the configured image size
model = UNet(img_size)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
# NOTE(review): 0.05 / 0.1 are far from the Adam defaults - worth revisiting
# given that training did not improve.
adam = tf.keras.optimizers.Adam(learning_rate=0.05, epsilon=0.1)
# Train on the Dice loss and report the Dice coefficient as metric
model.compile(optimizer=adam, loss=dice_coef_loss, metrics=[dice_coef])
model.summary()

In [None]:
# Model.fit accepts keras.utils.Sequence objects directly; fit_generator is deprecated
history = model.fit(train_gen, validation_data=val_gen, epochs=epoch, verbose=verbose)

The model did not really learn or improve, even when trained for more than 10 epochs.

In [None]:
# Shuffling is switched off for the test generator so that batch i contains the
# same files on every pass (evaluate, predict and later indexing stay aligned)
test_gen = DataGenerator(test_imgs_paths, masks, **{**params, 'shuffle': False})

# NOTE(review): `masks` is built from train-rle.csv; test images whose id is
# missing from it are scored against an all-zero mask - confirm this is intended.
# The generator already yields batches, so batch_size is not passed here.
results = model.evaluate(test_gen)


In [None]:
# The model is compiled with dice_coef as its only metric, so the second value
# is a Dice coefficient rather than an accuracy
print("test loss, test dice coefficient:", results)


In [None]:
# Run the model over the batches produced by test_gen and keep the predictions
y = model.predict(test_gen)
# Last expression of the cell: shows the shape of the prediction array
y.shape

In [None]:
# Visualize one sample from each of the first 20 test batches:
# 1st column - input image, 2nd column - predicted mask, 3rd column - actual mask
n = 1  # position of the displayed sample inside its batch
for i in range(min(20, len(test_gen))):
    x, y2 = test_gen[i]
    # Predict on this exact batch so the mask shown always belongs to the image
    # next to it, whatever order the generator serves its files in
    batch_pred = model.predict(x, verbose=0)
    fig, ax = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(10,7))
    ax[0].imshow(x[n,:,:,0],cmap='bone')
    ax[1].imshow(batch_pred[n,:,:,0],cmap='Blues')
    ax[2].imshow(y2[n,:,:,0],cmap='Reds')