In [None]:
import numpy as np
import pandas as pd
import os
import sys
import math
import cv2
import seaborn as sns
import tensorflow as tf

# To calculate accuracy measures and confusion matrix
from sklearn import metrics
# To get Recall and precision values
from sklearn.metrics import classification_report

# !pip install -q pydicom
# After installing pydicom. This is needed to load .dcm files
import pydicom
import pydicom as dcm
from pydicom import dcmread

from matplotlib import pyplot
import matplotlib.patches as patches

from skimage import measure
from skimage.transform import resize

import tensorflow.keras.utils as pltUtil
from tensorflow.keras.utils import Sequence

from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Concatenate, UpSampling2D, Conv2D, Reshape
from tensorflow.keras.models import Model

from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.applications.mobilenet import preprocess_input 

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
#from tensorflow.keras.optimizers import SGD

from tensorflow.keras.applications.resnet import ResNet50
from tensorflow.keras.applications.resnet import preprocess_input as resnetProcess_input

#from keras.preprocessing.image import ImageDataGenerator



# **Setting up project path**

In [None]:
# Load train set image metadata
# Root directory of the challenge input data; the CSV files and image folder below are resolved against it.
dataDirPath = '../input/rsna-pneumonia-detection-challenge/'

# Directory that is scanned for the training .dcm files.
Train_Image_path = dataDirPath + 'stage_2_train_images'

# Directory holding previously saved artefacts that later cells read back
# (model weights, training history and the prediction report).
SAVED_FILES_ROOT = '../input/reportfiles/'

**Creating a function to load the metadata from images**

In [None]:
class ImageMetadata():
    """Pairs a data-set directory with the name of one image file inside it."""

    def __init__(self, setName, file):
        # setName: data-set directory (train or test); file: image file name.
        self.setName, self.file = setName, file

    def imagePath(self):
        """Return the directory and file name joined into a single path."""
        directory, fileName = self.setName, self.file
        return os.path.join(directory, fileName)

    def __repr__(self):
        # An entry prints as its image path.
        return self.imagePath()
    

# function to load image metadata
def loadImageMetadata(dataSetName):
    """Build one ImageMetadata entry per DICOM file in a directory.

    Parameters
    ----------
    dataSetName : str
        Directory to scan (non-recursive).

    Returns
    -------
    numpy.ndarray
        Array of ImageMetadata objects, ordered by file name.
    """
    imageMetadata = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything indexed by position, such as the sample image) reproducible.
    for f in sorted(os.listdir(dataSetName)):
        # Check file extension. Allow only .dcm files (case-insensitive).
        ext = os.path.splitext(f)[1]
        if ext.lower() == '.dcm':
            imageMetadata.append(ImageMetadata(dataSetName, f))
    return np.array(imageMetadata)

**Getting Metadata Information**

In [None]:
# Collect one ImageMetadata entry per .dcm file found in the training image directory.
trainSetImageMetadata = loadImageMetadata(Train_Image_path)

# Number of image files found.
print("trainSetImageMetadata.shape : ", trainSetImageMetadata.shape)

# Printing an entry shows its image path (see ImageMetadata.__repr__).
print("Sample image path : ", trainSetImageMetadata[0])

**Insights from Metadata**

There are 26684 images in the training data.
These are DICOM images, which hold the pixel information as well as several tags such as patient ID, age, gender, etc.

**Create a function to load an image, and load a sample image**

In [None]:
def loadImage(path):
    """Read the DICOM file at `path` with pydicom and return the parsed result."""
    return pydicom.dcmread(path)

# Pick one entry from the metadata array, resolve its file path and read the DICOM file.
imgIndex = 4
imgPath = trainSetImageMetadata[imgIndex]
imgPath = imgPath.imagePath()
imgData = loadImage(imgPath)

# Display the decoded pixel data with the "bone" colour map.
pyplot.imshow(imgData.pixel_array, cmap=pyplot.cm.bone)

**Preparing Dataset with patient id and respective image paths**

In [None]:
# Wrap the ImageMetadata entries in a single-column frame; each "Path" cell holds an ImageMetadata object.
trainSetImageMetadata_df = pd.DataFrame(trainSetImageMetadata, columns=["Path"])
trainSetImageMetadata_df.head(2)

def getImgId(_imgData) :
    """Return the image id of an entry: its file name without directory or extension.

    Parameters
    ----------
    _imgData : object
        Anything whose ``str()`` is the image path (e.g. an ImageMetadata entry).

    Returns
    -------
    str
        The file name stem, e.g. ``"abc"`` for ``".../stage_2_train_images/abc.dcm"``.
    """
    # Use the path's last component rather than a fixed "/"-separated position,
    # so the result does not depend on how deep the data directory is nested.
    return os.path.splitext(os.path.basename(str(_imgData)))[0]

# Lookup table with one row per image: the id derived from the file name
# ("patientId") next to the metadata entry that yields its path ("imgPath").
imageIdPaths = pd.DataFrame({
    "patientId": trainSetImageMetadata_df["Path"].apply(getImgId),
    "imgPath": trainSetImageMetadata_df["Path"],
})

print("imageIdPaths", imageIdPaths.shape)
imageIdPaths.head(2)

# Exploratory Data Analysis on Train Labels and Detail Info CSV data sets

**Analyzing Detailed Classes CSV file**

Step 1 -> Define and read the Detail_Info CSV File 

In [None]:
# Path of the detailed class information CSV inside the input data directory.
classesPath =  dataDirPath + 'stage_2_detailed_class_info.csv'

# Read the CSV into a frame (columns used later: 'patientId' and 'class').
detailedClasses = pd.read_csv(classesPath)

# Preview the first two rows.
detailedClasses.head(2)

Step 2 -> Check for Missing Values


In [None]:
# Per column, count missing (True) versus present (False) values.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
detailedClasses.isna().apply(lambda column: column.value_counts())

No Missing Values found

Step 3 -> Checking the shape of data frame

In [None]:
# Print (rows, columns) of the detailed class table.
print("detailedClasses.shape : ", detailedClasses.shape, )

# File has 30227 rows and 2 columns - PatientID & Class

Step 3 -> Checking Unique Patients

In [None]:
# Number of distinct patient ids in the detailed class table.
print("Unique patientIds : ", detailedClasses['patientId'].nunique(), )

Total number of unique patients in data - 26684

**Observation - As we have total 30227 records and out of that 26684 are unique records, this shows presence of multiple records for some patients**

Step 4 -> Checking unique Classes

In [None]:
# Number of distinct class labels (the label previously said "patientIds").
print("Unique classes : ", detailedClasses['class'].nunique())

# Call unique() so the class values are printed instead of the bound method object.
print(detailedClasses['class'].unique())

3 Unique classes observed 
1 - No Lung Opacity/Not Normal,
2 - Normal,
3 - Lung Opacity

In [None]:
# Bar chart of the number of rows per class; bars are also coloured by class.
sns.countplot(x="class",hue="class",data=detailedClasses)

**Analyzing Train Labels Dataset**

Step 1 -> Reading the data 

In [None]:
# Path of the training labels CSV inside the input data directory.
labelsPath = dataDirPath + 'stage_2_train_labels.csv'

# Read the CSV into a frame (columns used later: patientId, x, y, width, height, Target).
trainLabels = pd.read_csv(labelsPath)

# Preview the first two rows.
trainLabels.head(2)

Step 2 -> Checking the missing values if any

In [None]:
# Per column, count missing (True) versus present (False) values.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
trainLabels.isna().apply(lambda column: column.value_counts())

**Observation - For around 20672 records the bounding box coordinates are not available, whereas for 9555 records they are available**

In [None]:
# Preview two rows whose Target is 0 to inspect their bounding-box columns.
trainLabels[trainLabels['Target']==0].head(2)

A few records have missing values in the x, y, width and height columns, but no missing values are observed in patientId and Target.

It is also observed that these missing values occur in the records with Target '0'.

The x, y, width and height columns hold the bounding box information for images where pneumonia is detected.

**Explanation of the missing values - These are not really missing values: bounding box coordinates are not expected for images where pneumonia is not detected (Target - '0')**

Hence concluding there are no missing values in this dataset as well

Step 3 -> Checking unique Patients

In [None]:
# Number of distinct patient ids in the training labels.
print("Unique patientIds : ", trainLabels['patientId'].nunique(), )

Total Unique patients found - 26684
This is same as the number of patients in Detailed CSV sheet hence both sheets share the information for same patients

Step 4 -> Checking unique Targets

In [None]:
# Number of distinct Target values (the labels previously said "patientIds").
print("Unique Targets : ", trainLabels['Target'].nunique())

# The distinct Target values themselves.
print("Target values : ", trainLabels['Target'].unique())

The train labels have only 2 target values [0 & 1]

**Conclusion - In Train labels only two target variables are present 0 & 1, where as in Detailed_Info sheet we have 3 classes.**

**As we have 3 Classes in Detailed_Info dataset and 2 Target Variables in Train_Labels, concatenating to get better insight into the data**

Step 1 -> Sorting both the datasets based on patientId

In [None]:
# Order both tables by patient id; the sorted frames are bound back to the same names.
trainLabels = trainLabels.sort_values("patientId")
detailedClasses = detailedClasses.sort_values("patientId")

Step 2 -> Concatenating the data

In [None]:
# Attach the class label to every label row by joining on patientId.
# pd.concat(axis=1) aligns rows on their index labels, not on position or patient id,
# so it is only correct when both CSVs happen to share the same row order; a key-based
# merge makes the pairing explicit.
# Assumes every patient has exactly one class - validate="many_to_one" raises a
# MergeError if that does not hold, instead of silently duplicating rows.
patientClasses = detailedClasses[["patientId", "class"]].drop_duplicates()
Combined_Data = trainLabels.merge(patientClasses, on="patientId", how="left", validate="many_to_one")
Combined_Data.head(3)

**Validating the concatenation results**

In [None]:
# (rows, columns) of the combined labels + class table.
Combined_Data.shape

In [None]:
# Per column, count missing (True) versus present (False) values in the combined table.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
Combined_Data.isna().apply(lambda column: column.value_counts())

In [None]:
# Missing-value counts restricted to rows with Target == 1.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
Combined_Data[Combined_Data["Target"] == 1].isna().apply(lambda column: column.value_counts())

In [None]:
# Missing-value counts restricted to rows with Target == 0.
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
Combined_Data[Combined_Data["Target"] == 0].isna().apply(lambda column: column.value_counts())

In [None]:
# Missing-value counts restricted to rows whose class is "Lung Opacity".
# Series.value_counts is used because the top-level pd.value_counts is deprecated.
Combined_Data[Combined_Data["class"] == "Lung Opacity"].isna().apply(lambda column: column.value_counts())

From Above Analysis our concatenated data is correct

## Prepare data for training


Step 1 -> 
* Convert data to only two classes, 'Normal' and 'Lung Opacity'
* Splitting the data in three parts, train, validation and test sets.


In [None]:
# Convert data to only two classes, 'Normal' and 'Lung Opacity'.
# The result is assigned back to the column: an in-place replace on the selected
# column (Combined_Data["class"].replace(..., inplace=True)) acts on an intermediate
# object and is not guaranteed to update the frame.
Combined_Data["class"] = Combined_Data["class"].replace("No Lung Opacity / Not Normal", "Normal")
Combined_Data.head(3)

In [None]:

# Positional split of the combined rows into train / validate / test.
# NOTE(review): the cut points 15000 and 25000 look chosen so that no patient straddles
# two sets - confirm with the "Last from ..." / "First from ..." prints below.
#
# NaN values are set to 0: x, y, width and height of zero mean "no bounding box".
# fillna(0) returns new frames, so the three sets are independent of Combined_Data;
# the previous in-place fillna ran on slices of Combined_Data.
train_CombinedData = Combined_Data[0:15000].fillna(0)
validate_CombinedData = Combined_Data[15000:25000].fillna(0)
test_CombinedData = Combined_Data[25000:].fillna(0)

print("train_CombinedData.shape : ", train_CombinedData.shape)
print("validate_CombinedData.shape : ", validate_CombinedData.shape)
print("test_CombinedData.shape : ", test_CombinedData.shape)

print("\nunique train patients : ", train_CombinedData["patientId"].nunique())
print("unique validate patients : ", validate_CombinedData["patientId"].nunique())
print("unique test patients : ", test_CombinedData["patientId"].nunique())

print("\nTotal unique patients : ", imageIdPaths["patientId"].nunique())
print("Total of unique train and test : ", train_CombinedData["patientId"].nunique() + validate_CombinedData["patientId"].nunique() + test_CombinedData["patientId"].nunique())

# iloc[-1] addresses the last row of each set without repeating the split sizes.
print("\nLast from train set : ", train_CombinedData.iloc[-1]["patientId"])
print("First from validate set : ", validate_CombinedData.iloc[0]["patientId"])
print("\nLast from validate set : ", validate_CombinedData.iloc[-1]["patientId"])
print("First from test set : ", test_CombinedData.iloc[0]["patientId"])

In [None]:
# Order the id/path table by patient id (in place) before slicing it by position.
imageIdPaths.sort_values("patientId", inplace=True)

# Positional split of the image table into train / validate / test.
# NOTE(review): 13163 and 21764 are presumably the cumulative numbers of unique patients
# in the train and train+validate label sets - verify against the unique-patient counts
# printed for the label split, otherwise the image sets and label sets disagree.
train_imageIdPaths = imageIdPaths[0:13163]
validate_imageIdPaths = imageIdPaths[13163:21764]
test_imageIdPaths = imageIdPaths[21764:26684]

print("train_imageIdPaths.shape : ", train_imageIdPaths.shape)
print("validate_imageIdPaths.shape : ", validate_imageIdPaths.shape)
print("test_imageIdPaths.shape : ", test_imageIdPaths.shape)

print("\nunique train patients : ", train_imageIdPaths["patientId"].nunique())
print("unique validate patients : ", validate_imageIdPaths["patientId"].nunique())
print("unique test patients : ", test_imageIdPaths["patientId"].nunique())

# The sum of the three per-set counts should equal the overall count if the sets are disjoint.
print("\nTotal unique patients : ", imageIdPaths["patientId"].nunique())
print("Total of unique train and test : ", train_imageIdPaths["patientId"].nunique() + validate_imageIdPaths["patientId"].nunique() + test_imageIdPaths["patientId"].nunique())

# Boundary rows of each set (13162 and 8600 are the last positions of the train and validate slices).
print("\nLast from train set : ", train_imageIdPaths.iloc[13162]["patientId"])
print("First from validate set : ", validate_imageIdPaths.iloc[0]["patientId"])
print("Last from validate set : ", validate_imageIdPaths.iloc[8600]["patientId"])
print("First from test set : ", test_imageIdPaths.iloc[0]["patientId"])

In [None]:
# Row counts per Target in the training set, split by class.
sns.countplot(x="Target",hue="class",data=train_CombinedData)

In [None]:
# Row counts per Target in the validation set, split by class.
sns.countplot(x="Target",hue="class",data=validate_CombinedData)

In [None]:
# Row counts per Target in the test set, split by class.
sns.countplot(x="Target",hue="class",data=test_CombinedData)

# Build UNet

In [None]:
# Side length (pixels) of the square images and masks fed to / produced by the model.
IMAGE_SIZE = 224

# Size used for the original images; bounding boxes are rescaled by IMAGE_SIZE / IMG_WIDTH.
IMG_WIDTH = 1024
IMG_HEIGHT = 1024

In [None]:
BATCH_SIZE = 10

class UNetTrainGenerator(Sequence):
    """Keras Sequence producing (images, masks) batches for U-Net training.

    Every row of the label table is one sample: the patient's image resized to
    IMAGE_SIZE x IMAGE_SIZE and a mask that is 1 inside that row's bounding box
    and 0 elsewhere (an all-zero box gives an empty mask).

    Parameters
    ----------
    _imageIdPaths : DataFrame with "patientId" and "imgPath" columns.
    _CombinedData : DataFrame with "patientId", "x", "y", "width", "height" columns.
    """

    def __init__(self, _imageIdPaths, _CombinedData):
        super().__init__()
        self.pids = _CombinedData["patientId"].to_numpy()
        self.imgIdPaths = _imageIdPaths
        # patientId -> image path, built once so a batch does not scan the whole frame
        # for every image. setdefault keeps the first path when an id occurs twice,
        # like the previous first-match lookup did.
        self._pathByPid = {}
        for _pid, _path in zip(_imageIdPaths["patientId"], _imageIdPaths["imgPath"]):
            self._pathByPid.setdefault(_pid, _path)
        self.coords = _CombinedData[["x", "y", "width", "height"]].to_numpy()
        # Resize Bounding box
        self.coords = self.coords * IMAGE_SIZE / IMG_WIDTH

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return math.ceil(len(self.coords) / BATCH_SIZE)

    def __getitem__(self, idx): # Get a batch
        """Return batch `idx` as (images, masks).

        images : float32 array (batch, IMAGE_SIZE, IMAGE_SIZE, 3), the preprocessed
                 single-channel image repeated in all three channels.
        masks  : array (batch, IMAGE_SIZE, IMAGE_SIZE) of 0/1 values.
        Raises KeyError when a patient id has no entry in the image table.
        """
        batch_slice = slice(idx * BATCH_SIZE, (idx + 1) * BATCH_SIZE)
        batch_coords = self.coords[batch_slice] # Image coords
        batch_pids = self.pids[batch_slice] # Image pids

        batch_images = np.zeros((len(batch_pids), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
        batch_masks = np.zeros((len(batch_pids), IMAGE_SIZE, IMAGE_SIZE))
        for _indx, _pid in enumerate(batch_pids):
            _imgData = loadImage(str(self._pathByPid[_pid])) # Read image
            img = _imgData.pixel_array

            # Resize image
            resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_AREA)

            # Preprocess once and broadcast the result into the three channels.
            channel = preprocess_input(np.array(resized_img, dtype=np.float32))
            batch_images[_indx] = channel[:, :, np.newaxis]

            # Box corners are truncated to whole pixels.
            x, y, width, height = (int(v) for v in batch_coords[_indx, :4])
            batch_masks[_indx][y:y+height, x:x+width] = 1

        return batch_images, batch_masks

In [None]:
# Batch generators over the train and validation splits (image table + label table each).
trainUNetDataGen = UNetTrainGenerator(train_imageIdPaths, train_CombinedData)
validateUNetDataGen = UNetTrainGenerator(validate_imageIdPaths, validate_CombinedData)

# len() of a generator is its number of batches, i.e. iterations per epoch.
print(len(trainUNetDataGen), "# of iterations in one train epoch")
print(len(validateUNetDataGen), "# of iterations in one validate epoch")

In [None]:
# To show image with mask
def showMaskedImage(_imageSet, _maskSet, _index) :
    """Display image `_index` of a batch multiplied by its mask.

    Parameters
    ----------
    _imageSet : array indexed as [sample][row, col, channel].
    _maskSet  : array indexed as [sample][row, col]; used as a per-pixel multiplier.
    _index    : position of the sample inside the batch.

    The product is computed into a new array. Writing it back into `_imageSet`
    would overwrite the caller's batch, so a later prediction on the same batch
    would run on the masked image instead of the real one.
    """
    maskedChannel = _maskSet[_index] * _imageSet[_index][:, :, 0]
    pyplot.imshow(maskedChannel, cmap=pyplot.cm.bone)


In [None]:
# Load the first training batch once; indexing the generator twice would read and
# preprocess the same images twice.
imageSet0, maskSet0 = trainUNetDataGen[0]
showMaskedImage(imageSet0, maskSet0, 5)

In [None]:

ALPHA = 1.0

def create_UNetModel(trainable=True):
    """Build a U-Net style segmentation model on top of an ImageNet MobileNet.

    The MobileNet output "conv_pw_13_relu" is repeatedly upsampled and concatenated
    with earlier MobileNet activations, upsampled one final time, and reduced to a
    single sigmoid channel reshaped to (IMAGE_SIZE, IMAGE_SIZE).

    trainable : value assigned to `trainable` on every MobileNet layer.
    Returns the assembled Keras Model.
    """
    backbone = MobileNet(input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), include_top=False, alpha=ALPHA, weights="imagenet")

    for backboneLayer in backbone.layers:
        backboneLayer.trainable = trainable

    # Skip connections, deepest first (14x14, 28x28, 56x56, 112x112 feature maps).
    skipLayerNames = ("conv_pw_11_relu", "conv_pw_5_relu", "conv_pw_3_relu", "conv_pw_1_relu")

    # Start from the last MobileNet activation (7x7) and walk back up.
    decoder = backbone.get_layer("conv_pw_13_relu").output
    for skipName in skipLayerNames:
        skip = backbone.get_layer(skipName).output
        decoder = Concatenate()([UpSampling2D()(decoder), skip])
    decoder = UpSampling2D(name="unet_last")(decoder) # upsample to 224x224

    # Per-pixel sigmoid output, then drop the channel axis.
    masks = Conv2D(1, kernel_size=1, activation="sigmoid", name="masks")(decoder)
    masks = Reshape((IMAGE_SIZE, IMAGE_SIZE))(masks)

    return Model(inputs=backbone.input, outputs=masks)

In [None]:
# IoU (Jaccard) loss over all pixels of the batch
def iou_loss(y_true, y_pred):
    """Return 1 - smoothed IoU, computed over the flattened batch.

    Both tensors are flattened, so intersection and union are taken over every
    pixel of every sample at once; 1 is added to numerator and denominator.
    """
    flatTrue = tf.reshape(y_true, [-1])
    flatPred = tf.reshape(y_pred, [-1])
    overlap = tf.reduce_sum(flatTrue * flatPred)
    union = tf.reduce_sum(flatTrue) + tf.reduce_sum(flatPred) - overlap
    return 1 - (overlap + 1.) / (union + 1.)

# mean iou as a metric
def mean_iou(y_true, y_pred):
    """Mean per-sample IoU between the ground-truth masks and the rounded predictions.

    Sums run over every axis except the batch axis, so each sample contributes one
    IoU value whether the tensors are (batch, H, W) or (batch, H, W, C). Reducing
    over axis 1 alone on a (batch, H, W) tensor yields one IoU per image column,
    and the many empty columns (IoU 1 after smoothing) inflate the mean.
    A smoothing term of 1 is added to numerator and denominator.
    """
    y_true = tf.cast(y_true, y_pred.dtype)
    y_pred = tf.round(y_pred)
    sampleAxes = list(range(1, len(y_pred.shape)))
    intersect = tf.reduce_sum(y_true * y_pred, axis=sampleAxes)
    union = tf.reduce_sum(y_true, axis=sampleAxes) + tf.reduce_sum(y_pred, axis=sampleAxes)
    return tf.reduce_mean((intersect + 1.) / (union - intersect + 1.))

In [None]:

# Adam with a 1e-4 learning rate. `learning_rate` replaces the deprecated `lr` alias;
# beta_1, beta_2, epsilon, decay and amsgrad were being passed at their default values
# and are omitted (`decay` is no longer accepted by newer optimizers).
adamOptimizer = Adam(learning_rate=1e-4)

# Save weights whenever the training loss improves; the loss value is embedded in the file name.
# The deprecated `period=1` argument is dropped - checking every epoch is the default.
# NOTE(review): all three callbacks monitor the training "loss", not "val_loss".
checkpoint = ModelCheckpoint("unetModel-{loss:.2f}.h5", monitor="loss", verbose=1, save_best_only=True,
                             save_weights_only=True, mode="min")
# Stop after 5 epochs without improvement of the loss.
stop = EarlyStopping(monitor="loss", patience=5, mode="min")
# Multiply the learning rate by 0.2 after 5 epochs without improvement, never below 1e-6.
reduce_lr = ReduceLROnPlateau(monitor="loss", factor=0.2, patience=5, min_lr=1e-6, verbose=1, mode="min")

In [None]:
# Set to True to continue training; with False the saved weights are only loaded.
trainUnetModel = False

EPOCHS = 3

# Previously saved weights that are loaded into the freshly built model.
WEIGHTS_FILE = SAVED_FILES_ROOT + "unetModel-0.73.h5"
UNetModel = create_UNetModel()
UNetModel.compile(loss=iou_loss, optimizer=adamOptimizer, metrics=[mean_iou, 'accuracy'])

UNetModel.load_weights(WEIGHTS_FILE)

if trainUnetModel:
    # create_UNetModel() already builds the model with trainable layers, so the
    # former "set every layer trainable" loop after compile() changed nothing and is omitted.
    # Model.fit accepts Sequence generators directly; fit_generator is deprecated.
    hist = UNetModel.fit(trainUNetDataGen,
                         epochs=EPOCHS,
                         validation_data=validateUNetDataGen,
                         callbacks=[checkpoint, reduce_lr, stop],
                         shuffle=True,
                         verbose=1)

    # Persist the per-epoch metrics dict (stored as a pickled 0-d object array).
    unet_history = np.array(hist.history)
    np.save("unetTrainHist-3", unet_history, allow_pickle=True)

In [None]:
## Read saved training history and plot curves

HISTORY_FILE = SAVED_FILES_ROOT + "unetTrainHist.npy"
# NOTE(review): allow_pickle=True unpickles the file content - only load trusted files.
unetSavedHistory = np.load(HISTORY_FILE, allow_pickle=True).item()

unetSavedHistoryDF = pd.DataFrame(unetSavedHistory)

# summarize history for loss (the 'val_*' series come from the validation data)
pyplot.plot(unetSavedHistoryDF['loss'])
pyplot.plot(unetSavedHistoryDF['val_loss'])
pyplot.title('model loss')
pyplot.ylabel('loss')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'], loc='best')
pyplot.show()
# summarize history for mean IOU
pyplot.plot(unetSavedHistoryDF['mean_iou'])
pyplot.plot(unetSavedHistoryDF['val_mean_iou'])
pyplot.title('model IOU')
pyplot.ylabel('IOU')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'], loc='best')
pyplot.show()
# The learning rate gets its own figure: the optimizer starts at 1e-4, so on the same
# axis as the IOU curves it would be an indistinguishable line at zero.
pyplot.plot(unetSavedHistoryDF['lr'])
pyplot.title('learning rate')
pyplot.ylabel('learning rate')
pyplot.xlabel('epoch')
pyplot.show()

In [None]:
# Render the model graph to "model.png", showing layer names and tensor shapes;
# nested models are not expanded.
pltUtil.plot_model(UNetModel,
                    to_file="model.png",
                    show_shapes=True,
                    show_layer_names=True,
                    expand_nested=False,
                    dpi=70)

In [None]:
# Check sample ground truth masked image and predicted masked image
# Load the first training batch once; indexing the generator twice would read and
# preprocess the same images twice.
imageSet0, maskSet0 = trainUNetDataGen[0]
print("Ground Truth Box/Mask")
showMaskedImage(imageSet0, maskSet0, 5)


In [None]:
# Predict on a freshly loaded copy of the first batch. imageSet0 may already have been
# overwritten with a ground-truth-masked image by the preceding display call, which
# would feed the ground truth into the prediction and distort the display.
predImageSet = trainUNetDataGen[0][0]
predMasks = UNetModel.predict(predImageSet)
print("Predicted Box/Mask")
showMaskedImage(predImageSet, predMasks, 5)

In [None]:
BATCH_SIZE = 10

class UNetTestGenerator(Sequence):
    """Keras Sequence producing evaluation batches with ground-truth details.

    Each batch is the tuple
    (images, masks, patient ids, coords at model size, classes, coords at original size),
    where both coords arrays have the columns x, y, width, height, Target.

    Parameters
    ----------
    _imageIdPaths : DataFrame with "patientId" and "imgPath" columns.
    _CombinedData : DataFrame with "patientId", "x", "y", "width", "height",
                    "Target" and "class" columns.
    """

    def __init__(self, _imageIdPaths, _CombinedData):
        super().__init__()
        self.pids = _CombinedData["patientId"].to_numpy()
        self.imgIdPaths = _imageIdPaths
        # patientId -> image path, built once; the first path wins for repeated ids.
        self._pathByPid = {}
        for _pid, _path in zip(_imageIdPaths["patientId"], _imageIdPaths["imgPath"]):
            self._pathByPid.setdefault(_pid, _path)
        self.coordsOrig = _CombinedData[["x", "y", "width", "height", "Target"]].to_numpy() #for (1024, 1024)
        self.classes = _CombinedData["class"]
        # Resize Bounding box: only the four geometry columns are rescaled. Scaling the
        # Target column as well turned every Target of 1 into 0 once truncated.
        self.coords = self.coordsOrig.astype(np.float64)   #for (224, 224)
        self.coords[:, :4] = self.coords[:, :4] * IMAGE_SIZE / IMG_WIDTH

    def __len__(self):
        """Number of batches (the last batch may be smaller)."""
        return math.ceil(len(self.coords) / BATCH_SIZE)

    def __getitem__(self, idx): # Get a batch
        """Return batch `idx`; raises KeyError when a patient id has no image entry."""
        batch_slice = slice(idx * BATCH_SIZE, (idx + 1) * BATCH_SIZE)
        # Copy so the whole-pixel truncation below does not write into self.coords.
        batch_coords = self.coords[batch_slice].copy() # Image coords for (224, 224)
        batch_coords[:, :4] = np.trunc(batch_coords[:, :4])
        batch_coordsOrig = self.coordsOrig[batch_slice] # Image coords for (1024, 1024)
        batch_pids = self.pids[batch_slice] # Image pids
        batch_classes = self.classes.iloc[batch_slice] # Image classes (positional slice)

        batch_images = np.zeros((len(batch_pids), IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
        batch_masks = np.zeros((len(batch_pids), IMAGE_SIZE, IMAGE_SIZE))
        for _indx, _pid in enumerate(batch_pids):
            _imgData = loadImage(str(self._pathByPid[_pid])) # Read image
            img = _imgData.pixel_array

            # Resize image
            resized_img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_AREA) #(224, 224)

            # Preprocess once and broadcast the result into the three channels.
            channel = preprocess_input(np.array(resized_img, dtype=np.float32))
            batch_images[_indx] = channel[:, :, np.newaxis]

            x, y, width, height = (int(v) for v in batch_coords[_indx, :4])
            batch_masks[_indx][y:y+height, x:x+width] = 1

        return batch_images, batch_masks, batch_pids, batch_coords, batch_classes, batch_coordsOrig  #for (224, 224) and (1024, 1024)

In [None]:
def iouFromCoords(boxA, boxB) :
    """Intersection over union of two boxes given as (x1, y1, x2, y2).

    Returns 0 when the boxes do not overlap, otherwise the intersection area
    divided by the combined area of both boxes minus the intersection.
    """
    # Corners of the overlap rectangle.
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    # A negative extent means no overlap along that axis, so it is clamped to zero.
    overlapWidth = max(right - left, 0)
    overlapHeight = max(bottom - top, 0)
    interArea = abs(overlapWidth * overlapHeight)
    if interArea == 0:
        return 0

    # Areas of the two boxes themselves.
    boxAreas = [abs((box[2] - box[0]) * (box[3] - box[1])) for box in (boxA, boxB)]

    return interArea / float(boxAreas[0] + boxAreas[1] - interArea)

In [None]:
# Set to True to run the model over the test set and write 'submission.csv'.
predictUnetModel = False

if predictUnetModel:

    print('Number of Test Samples :', len(test_CombinedData))
    # create test generator
    testUNetDataGen = UNetTestGenerator(test_imageIdPaths, test_CombinedData) #for (224, 224)
    # column names of the report: ground truth first, then the prediction
    submissionColumns = ['patientId', 'x', 'y', 'width', 'height', 'Target', 'class', 'x_pred', 'y_pred', 'width_pred',
                         'height_pred', 'predTarget', 'iou', 'class_pred']
    # Rows are collected in a list and turned into a frame once at the end.
    submissionRows = []
    iouThreshold = 0.3
    # Kept across batches so a patient whose rows straddle a batch boundary is
    # still evaluated only once (only the first row of each patient is used).
    prevPid = ""

    print("Predicting Batches ", end='')
    # Index the generator explicitly so the loop ends after len() batches.
    for batchIdx in range(len(testUNetDataGen)):
        batchImages, gtBatchMasks, batchPids, batchCoords, batchClasses, batchCoordsOrig = testUNetDataGen[batchIdx]
        print(".", end='')
        # predict batch of images
        batchPreds = UNetModel.predict(batchImages)    #for (224, 224)

        for pred, pid, gtClass, coords in zip(batchPreds, batchPids, batchClasses, batchCoordsOrig):
            if pid == prevPid:
                continue
            prevPid = pid

            # resize the predicted mask to the original image size; `coords` are the
            # ground-truth values at that size
            pred = resize(pred, (IMG_HEIGHT, IMG_WIDTH), mode='reflect')   #for (1024, 1024)

            # threshold the mask and split it into connected regions
            labelledPred = measure.label(pred > 0.5)

            # Ground-truth box as (x1, y1, x2, y2).
            gtBox = np.array([coords[0], coords[1], coords[2]+coords[0], coords[3]+coords[1]])

            # Keep the predicted region with the highest IOU against the ground truth.
            # (Looking rows up by label after sorting returned the first region found,
            # not the best one, and only one row per patient was ever written.)
            bestRegion = None
            for region in measure.regionprops(labelledPred):
                y, x, y2, x2 = region.bbox
                regionIOU = iouFromCoords(gtBox, np.array([x, y, x2, y2]))
                if bestRegion is None or regionIOU > bestRegion[0]:
                    bestRegion = (regionIOU, x, y, x2 - x, y2 - y)

            gtPart = [pid, coords[0], coords[1], coords[2], coords[3], coords[4], gtClass] # ground truth data
            if bestRegion is None:
                # No predicted region at all: Normal, IOU not applicable.
                predPart = [0, 0, 0, 0, 0, 'NA', "Normal"]
            elif bestRegion[0] > iouThreshold:
                bestIOU, bestX, bestY, bestWidth, bestHeight = bestRegion
                predPart = [int(bestX), int(bestY), int(bestWidth), int(bestHeight), 1, bestIOU, "Lung Opacity"]
            else:
                # Normal if IOU below threshold
                predPart = [0, 0, 0, 0, 0, bestRegion[0], "Normal"]
            submissionRows.append(gtPart + predPart)

    # save the report as csv file
    submissionDF = pd.DataFrame(submissionRows, columns=submissionColumns)
    submissionDF.to_csv('submission.csv', index=False)
    print("Prediction Complete!")

### **Confusion Matrix and Classification Report**

In [None]:
# Saved prediction report (ground truth and prediction per row).
REPORT_30_FILE = SAVED_FILES_ROOT + "submission_30.csv"
# Missing entries (e.g. NA IOU values) are set to zero right after reading.
report30IOU = pd.read_csv(REPORT_30_FILE).fillna(0)

In [None]:
# Ground-truth labels and predicted labels from the saved report.
y_30_test = report30IOU["Target"]
y_30_predicted = report30IOU["predTarget"]

# Confusion matrix and per-class precision/recall of predicted versus true Target.
print("Predictions above 30% IOU :\n")
print("Confusion Matrix:- \n", metrics.confusion_matrix(y_30_test, y_30_predicted), "\n")
print("Classification Report:- \n", metrics.classification_report(y_30_test, y_30_predicted))


**Show top predicted masks with bounding boxes.**

In [None]:
topNum = 6

# Sort on IOU to get higher IOUs on top
report30IOU.sort_values("iou", ascending=False, inplace=True)
# Get patientIds
topPids = report30IOU["patientId"].head(topNum)
topPidsAry = np.array(topPids)
# Get IOUs
topIOUs = report30IOU["iou"].head(topNum)
topIOUsAry = np.array(topIOUs)

# Images of the top IOU scored pids, stored as (rows, columns)
imageCollc = np.zeros((topNum, IMG_HEIGHT, IMG_WIDTH), np.float32) # (1024, 1024)

# Ground truth coordinates (rows follow the sorted report) and their masks.
# The builtin int replaces np.int, which no longer exists in current NumPy.
gtCoordCollc = report30IOU[["x", "y", "width", "height"]].to_numpy()  # (1024, 1024)
gtMaskCollc  = np.zeros((topNum, IMG_HEIGHT, IMG_WIDTH), int) # (1024, 1024)

# Predicted coordinates and their masks
predCoordCollc = report30IOU[["x_pred", "y_pred", "width_pred", "height_pred"]].to_numpy()  # (1024, 1024)
predMaskCollc  = np.zeros((topNum, IMG_HEIGHT, IMG_WIDTH), int)

# Get images, ground truth masks and prediction masks
for indx in range(0, topNum) :
    # Get images
    path = test_imageIdPaths[test_imageIdPaths["patientId"] == topPidsAry[indx]]["imgPath"].array[0]
    imgData = loadImage(str(path)) # Read image
    img = imgData.pixel_array
    imageCollc[indx][:,:] = preprocess_input(np.array(img[:,:], dtype=np.float32)) # Convert to float32 array

    # prepare ground truth masks
    x, y, width, height = (int(v) for v in gtCoordCollc[indx, :4])
    gtMaskCollc[indx][y:y+height, x:x+width] = 1   # (1024, 1024)

    # prepare predicted masks
    x_pred, y_pred, width_pred, height_pred = (int(v) for v in predCoordCollc[indx, :4])
    predMaskCollc[indx][y_pred:y_pred+height_pred, x_pred:x_pred+width_pred] = 1   # (1024, 1024)


# Show images and bounding boxes: two images per row, as many rows as topNum needs
plotRows = math.ceil(topNum / 2)
imageArea, axesArry = pyplot.subplots(plotRows, 2, figsize=(18, 6 * plotRows))
axesArry = axesArry.ravel()
for axidx in range(0, topNum) :
    axesArry[axidx].imshow(imageCollc[axidx][:, :], cmap=pyplot.cm.bone)
    # Title is set for every image, also when it has no predicted region.
    axesArry[axidx].set_title('IOU : '+str(topIOUsAry[axidx]))

    # ground truth bounding boxes (red), one per connected mask region
    gtComp = measure.label(gtMaskCollc[axidx][:, :] > 0.5)
    for region in measure.regionprops(gtComp):
        # retrieve x, y, height and width
        y1, x1, y2, x2 = region.bbox
        axesArry[axidx].add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor='r',
                                                    facecolor='none'))

    # predicted bounding boxes (blue), one per connected mask region
    predComp = measure.label(predMaskCollc[axidx][:, :] > 0.5)
    for region_pred in measure.regionprops(predComp):
        # retrieve x, y, height and width
        y1_pred, x1_pred, y2_pred, x2_pred = region_pred.bbox
        axesArry[axidx].add_patch(patches.Rectangle((x1_pred, y1_pred), x2_pred - x1_pred, y2_pred - y1_pred, linewidth=1, edgecolor='b',
                                                    facecolor='none'))


pyplot.show()