<img src="https://storage.googleapis.com/kaggle-competitions/kaggle/24800/logos/header.png?t=2020-12-17-19-26-15">
<center>
    <h1 style="color:red;font-weight:900;font-size:2.5em">VinBigData Chest X-ray Abnormalities Detection</h1>
    <h3>Automatically localize and classify thoracic abnormalities from chest radiographs</h3>
</center>
<br>
<br>
<hr>
<h2 style="color:blue;font-weight:600"> About Competition </h2>
<p>
    Radiologists diagnose and treat medical conditions using imaging techniques like CT and PET scans, MRIs, and, of course, X-rays. Yet, as it happens when working with such a wide variety of medical tools, radiologists face many daily challenges, perhaps the most difficult being the chest radiograph. The interpretation of chest X-rays can lead to medical misdiagnosis, even for the best practicing doctor. Computer-aided detection and diagnosis systems (CADe/CADx) would help reduce the pressure on doctors at metropolitan hospitals and improve diagnostic quality in rural areas.
</p>
<p>
    In this competition we are to predict the thoracic abnormalities in given X-Ray images and also locate those abnormalities. The data provided include:
    <ul>
    <li>Train and Test X-Ray images in folders <b style="font-weight:700">Train</b> and <b style="font-weight:700">Test</b>
    <li> sample submission file in sample_submission.csv
    <li> train dataframe in train.csv
    </ul>
</p>
<hr>
<br>
<a id="home"></a>
<div class="list-group" id="list-tab" role="tablist" style="background: rgb(49,114,163);
background: radial-gradient(circle, rgba(49,114,163,1) 0%, rgba(26,136,181,1) 15%, rgba(1,159,200,1) 52%, rgba(0,212,255,1) 60%, rgba(0,182,224,1) 64%, rgba(0,145,186,1) 69%, rgba(1,66,104,1) 82%, rgba(2,33,70,1) 95%, rgba(24,23,50,1) 100%);">
  <h3 class="list-group-item list-group-item-action active" data-toggle="list"  role="tab" aria-controls="home" style="background: rgb(49,114,163);
background: radial-gradient(circle, rgba(49,114,163,1) 0%, rgba(26,136,181,1) 15%, rgba(1,159,200,1) 52%, rgba(0,212,255,1) 60%, rgba(0,182,224,1) 64%, rgba(0,145,186,1) 69%, rgba(1,66,104,1) 82%, rgba(2,33,70,1) 95%, rgba(24,23,50,1) 100%);">Table of Contents</h3>
    <center>
  <a class="list-group-item list-group-item-action" data-toggle="list" href="#first" role="tab" aria-controls="profile">First Look at the Data<span class="badge badge-primary badge-pill">1</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#second" role="tab" aria-controls="profile">EDA<span class="badge badge-primary badge-pill">2</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#third" role="tab" aria-controls="profile">An Intuition of the Data<span class="badge badge-primary badge-pill">3</span></a>
  <a class="list-group-item list-group-item-action" data-toggle="list" href="#fourth" role="tab" aria-controls="messages">Data Preparation<span class="badge badge-primary badge-pill">4</span></a>
    <a class="list-group-item list-group-item-action" data-toggle="list" href="#fifth" role="tab" aria-controls="messages">Model Building and Training<span class="badge badge-primary badge-pill">5</span></a>
    </center>
</div>
<hr>
<h1 style="color:red">Note:</h1>
<h5 style="color:red">The utilities_x_ray module used here is a script that I have written (it can be found <a href="https://www.kaggle.com/bibhash123/utilities-x-ray">here</a>). It contains some functions for visualizing the X-ray images. The DICOM image reading pipeline is taken from <a href="https://www.kaggle.com/raddar/popular-x-ray-image-normalization-techniques">this notebook</a> by <a href="https://www.kaggle.com/raddar">@raddar</a>.</h5>

In [None]:
import numpy as np
import random
import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
import seaborn as sns
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
from utilities_x_ray import read_xray,showXray
from tqdm import tqdm
import pydicom

import warnings
warnings.filterwarnings("ignore")

from keras import preprocessing, layers
from keras.models import Sequential

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import os
import cv2
from PIL import Image
import pickle
from pathlib import Path

import imageio
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import os

In [None]:
# Import Libraries
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms, models
from torch.autograd import Variable

In [None]:
### Check whether a CUDA-capable GPU is visible to PyTorch, and print the
### installed PyTorch version together with the result.
use_cuda = torch.cuda.is_available()
print('Using PyTorch version:', torch.__version__, 'CUDA:', use_cuda)

<h1 style="display:inline"> <a id="first"> First Look at the data</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

## 1. DataFrames

In [None]:
# Load the training annotations and the sample submission from the
# competition's input directory.
df_train = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
ss = pd.read_csv('../input/vinbigdata-chest-xray-abnormalities-detection/sample_submission.csv')

In [None]:
# Preview the first rows of the training annotations.
df_train.head()

In [None]:
# First 10 entries of the image_id column; the image-loading loop iterates
# over these. NOTE(review): the slice is taken per annotation row, so repeated
# image ids are possible - confirm whether unique ids were intended.
ids = df_train.image_id[:10]
# `labels` and `nps` are appended to by the image-loading loop.
# NOTE(review): `imgs` and `im_label_set` are not used in the visible cells.
imgs = []
labels = []
nps = []
im_label_set = []
# NOTE(review): `n_images` is not referenced in the visible cells.
n_images = 100

# File names found in the competition's train and test folders.
train = os.listdir('../input/vinbigdata-chest-xray-abnormalities-detection/train')
test = os.listdir('../input/vinbigdata-chest-xray-abnormalities-detection/test')

# Directory prefixes (with trailing slash) used to build DICOM file paths.
test_img = ('../input/vinbigdata-chest-xray-abnormalities-detection/test/')
train_img = ('../input/vinbigdata-chest-xray-abnormalities-detection/train/')

<ul>
<li><code>image_id</code> - unique image identifier</li>
<li><code>class_name</code>&nbsp;- the name of the class of detected object (or "No finding")</li>
<li><code>class_id</code>&nbsp;- the ID of the class of detected object</li>
<li><code>rad_id</code>&nbsp;- the ID of the radiologist that made the observation</li>
<li><code>x_min</code>&nbsp;- minimum X coordinate of the object's bounding box</li>
<li><code>y_min</code>&nbsp;- minimum Y coordinate of the object's bounding box</li>
<li><code>x_max</code>&nbsp;- maximum X coordinate of the object's bounding box</li>
<li><code>y_max</code>&nbsp;- maximum Y coordinate of the object's bounding box</li>
</ul>

In [None]:
# Preview the first rows of the sample submission file.
ss.head()

The submission file must contain the image id and the prediction string in the format "a b (c,d,e,f)"<br>where
<ul>
    <li>a = predicted class ; 14 for no abnormality</li>
    <li>b= confidence</li>
    <li>(c,d,e,f) = (x_min,y_min,x_max,y_max)</li>
</ul>

## 2. Images

In [None]:
# Display one training radiograph, decoded by read_xray, with the 'bone' colormap.
plt.figure(figsize=(8,10))
plt.imshow(read_xray('../input/vinbigdata-chest-xray-abnormalities-detection/train/0108949daa13dc94634a7d650a05c0bb.dicom'),cmap=plt.cm.bone)

<h1 style="display:inline"><a id="second">EDA</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
# Headline statistics of the annotation table, printed one per line.
for template, value in (
    ("Number of rows in train dataframe: {}", len(df_train)),
    ("Number of Unique images in train set: {}", df_train.image_id.nunique()),
    ("Number of Classes: {}\n", df_train.class_name.nunique()),
    ("Class Names: {}", list(df_train.class_name.unique())),
):
    print(template.format(value))

In [None]:
# Count missing values per column and render the counts as a one-column
# table shaded with the 'viridis' colormap.
print("Null Values:")
df_train.isna().sum().to_frame().rename(columns={0:'Null Value count'}).style.background_gradient('viridis')

The number of null values is the same as the number of samples that do not have any abnormality.

### The Distribution of Classes
We can see there is a huge class imbalance. The number of negative examples is very high, and a few abnormalities have very few examples.

In [None]:
# Bar chart of the number of annotation rows per class_id.
# The column is passed through the `x` keyword: recent seaborn releases accept
# only `data` positionally, so a positional Series is no longer read as `x`.
plt.figure(figsize=(9,6))
sns.countplot(x=df_train["class_id"]);
plt.title("Class Distributions");

### Distribution of Radiologists

In [None]:
# Bar chart of the number of annotation rows per radiologist id.
# The column is passed through the `x` keyword: recent seaborn releases accept
# only `data` positionally, so a positional Series is no longer read as `x`.
plt.figure(figsize=(9,6))
sns.countplot(x=df_train["rad_id"]);
plt.title("rad_id Distributions");

<h1 style="display:inline"><a id="third"> An Intuition of the Data</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a><br><br>
<h5>Before proceeding further let us try and get an intuition of the data and what exactly we need to do.</h5>
<h5>In this competition we have been given 15000 images for training. Alongside them we have a dataframe containing the ground truths for the various abnormalities. Every sample in the dataframe contains:</h5>
  <ul>
      <li>the image id</li><li>the id of the radiologist who annotated it</li><li>the name of the corresponding class</li><li>the class id</li><li>the bounding box coordinates</li>
  </ul>
<b style="font-weight:700">Important points to be noted here are:</b>
<ul>
    <li>Each image may have multiple corresponding abnormalities. Therefore this is a multilabel prediction</li>
    <li>Bounding boxes for each image have been annotated by multiple radiologists. Therefore, for every sample we have multiple ground truths. A naive way to deal with this is to take the mean of the bounding box coordinates given by all radiologists for a particular abnormality.</li>
    <li>There is a significant class imbalance which is likely to affect the performance of models a lot.</li>
</ul>
<h4 style="font-weight:700">Information about dicom can be found: <a href="https://en.wikipedia.org/wiki/DICOM" style="font-size:1em">Here</a></h4>
<h4 style="font-weight:700">Procedure to extract DICOM metadata can be found in: <a href="https://www.kaggle.com/mrutyunjaybiswal/vbd-chest-x-ray-abnormalities-detection-eda" style="font-size:1em">this notebook</a></h4>

<h1 style="display:inline"><a id="fourth">Data Preparation</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
# Ordered class names: the distinct names sorted, with 'No finding' moved to
# the last position.
class_names = sorted(df_train.class_name.unique())
class_names.remove('No finding')
class_names.append('No finding')
# Map each list position to its class name; derived from the list itself
# instead of a hard-coded range(15).
classes = dict(enumerate(class_names))

In [None]:
def prepareDataFrame(train_df= df_train):
    """Collapse the per-annotation table into one row per image.

    Missing values are replaced by 0 first. For every image the result holds:

    * ``image_id`` - the image identifier;
    * ``label`` - a multi-hot NumPy vector with one slot per entry of the
      global ``class_names``, set to 1 for each ``class_id`` annotated on
      the image;
    * integer-named columns ``0 .. 4*(n_classes-1)-1`` - for each class
      except the last one, the rounded mean of ``x_min, y_min, x_max,
      y_max`` over all annotations of that class (0 where the class is
      absent).

    Images appear in order of first occurrence in ``train_df``. Rows are
    collected in a list and the frame is built once, which avoids the
    repeated full-table scans and row-by-row appends of a naive loop.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Annotation table with columns ``image_id``, ``class_id``, ``x_min``,
        ``y_min``, ``x_max`` and ``y_max``.

    Returns
    -------
    pandas.DataFrame
        One row per unique ``image_id`` with the columns described above.
    """
    train_df = train_df.fillna(0)
    n_classes = len(class_names)
    box_cols = ['x_min','y_min','x_max','y_max']
    cols = ['image_id','label']+list(range(4*(n_classes-1)))

    rows = []
    n_images = train_df.image_id.nunique()
    # sort=False keeps the images in order of first appearance.
    for image, df in tqdm(train_df.groupby('image_id', sort=False), total=n_images):
        label = np.zeros(n_classes)
        for cls in df.class_id.unique():
            label[int(cls)] = 1

        # Several radiologists may box the same class: average their boxes.
        bboxes_df = df.groupby('class_id')[box_cols].mean().round()

        bboxes_list = [0]*(4*n_classes)
        for cls_id, box in bboxes_df.iterrows():
            start = 4*int(cls_id)
            bboxes_list[start:start+4] = list(box.values)
        # The last four slots belong to the final class and are dropped.
        rows.append([image, label]+bboxes_list[:-4])
    return pd.DataFrame(rows, columns=cols)
train_df = prepareDataFrame()

In [None]:
# Preview the first two rows of the per-image table.
train_df.head(2)

In [None]:
# Inspect the prepared targets of a single image: the label vector (column 1)
# and the bounding-box values (remaining columns). The two pieces have
# different lengths, so the container must be an object array; current NumPy
# refuses to build a ragged array implicitly.
temp=train_df[train_df.image_id=='50a418190bc3fb1ef1633bf9678929b3']
np.array([temp.iloc[0,1],temp.iloc[0,2:].values], dtype=object)

In [None]:
### Check whether a CUDA-capable GPU is visible to PyTorch, and print the
### installed PyTorch version together with the result.
use_cuda = torch.cuda.is_available()
print('Using PyTorch version:', torch.__version__, 'CUDA:', use_cuda)                

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if use_cuda else "cpu")
device

In [None]:
class DataLoader:
    """Endlessly yield batches of resized X-ray images with their targets.

    File names are discovered with ``os.listdir(path)`` and shuffled once at
    construction. Each file is read with ``read_xray``, resized to 256x256
    and given an extra axis at position 2. Targets come from the row of
    ``train_df`` whose ``image_id`` equals the file name without ``.dicom``:
    column 1 is the class label and the remaining columns form the
    bounding-box vector.

    Note: the ``train_df`` default is bound when this class statement runs,
    so rebuilding the global ``train_df`` afterwards does not change it.
    """

    def __init__(self,path = None,train_df=train_df):
        """Store the directory, list and shuffle its files, keep the targets.

        path: directory containing the ``.dicom`` files.
        train_df: per-image target table (``image_id``, label, bbox columns).
        """
        self.path = path
        self.files = os.listdir(self.path)
        np.random.shuffle(self.files)
        self.df = train_df

    def read_image(self):
        """Yield ``(image, class_label, bbox_vector)`` for every listed file.

        Raises IndexError when a file has no matching row in the table.
        """
        for img in self.files:
            im_name = img.split('.dicom')[0]
            # os.path.join works whether or not `path` ends with a separator.
            image = read_xray(os.path.join(self.path, img))
            # The interpolation flag must be passed by keyword: the third
            # positional parameter of cv2.resize is `dst`, not the flag.
            image = cv2.resize(image,(256,256),interpolation=cv2.INTER_AREA)
            image = np.expand_dims(image,axis=2)
            temp = self.df[self.df.image_id==im_name]
            c_label,bb = temp.iloc[0,1],temp.iloc[0,2:].values.astype('float')
            yield image,c_label,bb

    def batch_generator(self,items,batch_size):
        """Group ``items`` into lists of ``batch_size`` elements.

        The final list may be shorter when the items do not divide evenly.
        """
        batch = []
        for item in items:
            batch.append(item)
            if len(batch) == batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    def flow(self,batch_size):
        """
        flow from given directory in batches
        ==========================================
        batch_size: size of the batch

        Yields ``(batch_images, (batch_class_labels, batch_bboxes))`` with
        every element stacked along a new first axis. The file list is
        replayed indefinitely in the order fixed at construction.

        Raises ValueError when the directory holds no files, because the
        endless loop would otherwise spin without ever yielding.
        """
        if not self.files:
            raise ValueError("no files found in {!r}".format(self.path))
        while True:
            for bat in self.batch_generator(self.read_image(),batch_size):
                # Transpose the list of (image, label, bbox) triples.
                images, c_labels, bboxes = zip(*bat)
                batch_images = np.stack(images,axis=0)
                batch_labels = (np.stack(c_labels,axis=0),np.stack(bboxes,axis=0))
                yield batch_images,batch_labels

In [None]:
# Create a loader over the training folder and an endless generator of
# 32-image batches.
dl = DataLoader('../input/vinbigdata-chest-xray-abnormalities-detection/train/')
train_set = dl.flow(batch_size=32)

<h1 style="display:inline"><a id="fifth">Model Building and Training</a></h1>&emsp;&emsp;&emsp;&emsp;&emsp;<a href="#home" style="color:blue"><img src="https://toppng.com/uploads/preview/light-blue-up-arrow-11550117759k4je61afsa.png" style="display:inline;width:2em;height:2em"></a>

In [None]:
# Load the DICOM file of every id in `ids`, convert it to an 8-bit 256x256
# array and collect it in `nps`, with its label collected in `labels`.
for n, id_ in enumerate(ids):
    dicom_path = train_img + id_ + '.dicom'
    # dcmread is the supported reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(dicom_path)
    data = apply_voi_lut(dicom.pixel_array, dicom)

    # Min-max scale to [0, 1]. A constant image has zero range, so the
    # division is skipped there instead of producing NaNs.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak

    data = (data * 255).astype(np.uint8)
    # Shrink to a third of the original size, then to the fixed 256x256.
    new_shape = tuple([int(x / 3) for x in data.shape])
    data = cv2.resize(data, (new_shape[1], new_shape[0]))
    im = Image.fromarray(data)

    new_im = im.resize((256,256))
    npdata = np.asarray(new_im)
    new = preprocessing.image.img_to_array(npdata)
    nps.append(new)

    # Label = value in column position 2 of the first annotation row of this
    # image (presumably class_id - confirm against the CSV column order).
    df = df_train[df_train.image_id == id_]
    label = df.iloc[0,2]
    labels.append(label)

In [None]:
# Stack the collected labels and image arrays into NumPy arrays with a
# leading sample axis, ready to be passed to Keras.

labels = np.stack(labels)
data = np.stack(nps)

In [None]:
# Re-create the loader over the training folder (the files are reshuffled)
# and an endless generator of 32-image batches.
dl = DataLoader('../input/vinbigdata-chest-xray-abnormalities-detection/train/')
train_set = dl.flow(batch_size=32)

In [None]:
def build_cnn(input_shape=(256, 256, 1), n_classes=15):
    """Build a small strided CNN classifier (not compiled).

    Architecture: three 3x3 convolutions (32, 64, 128 filters, stride 2,
    ReLU) with 2x2 max-pooling after the first two, then Flatten, a
    128-unit ReLU dense layer and a softmax output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input image. Defaults to a single-channel 256x256
        image, which is what the image arrays built in this file contain
        (``img_to_array`` on a 2-D image yields one trailing channel).
    n_classes : int
        Number of softmax output units. Defaults to 15.

    Returns
    -------
    tf.keras.models.Sequential
        The assembled model.
    """
    kl = tf.keras.layers
    model = tf.keras.models.Sequential()
    model.add(kl.Conv2D(32, (3, 3), activation='relu',strides=(2,2), input_shape=input_shape))
    model.add(kl.MaxPooling2D((2, 2)))
    model.add(kl.Conv2D(64, (3, 3), strides=(2,2),activation='relu'))
    model.add(kl.MaxPooling2D((2, 2)))
    model.add(kl.Conv2D(128, (3, 3), strides=(2,2),activation='relu'))
    model.add(kl.Flatten())
    model.add(kl.Dense(128, activation='relu'))
    model.add(kl.Dense(n_classes, activation='softmax'))
    return model

In [None]:
# Build CNN model
model = build_cnn()
# Compile the model with optimizer and loss function.
# The labels collected in this file are integer class ids (one scalar per
# image), not one-hot vectors, so the sparse variant of the cross-entropy
# loss is the one that matches their shape.
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',
metrics=['accuracy'])

In [None]:
# Split the stacked images and labels into train and test parts: 20% is held
# out, and the fixed random_state makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=34)

In [None]:
# Train silently (verbose=0) on the training split, then score the held-out
# split.
batch_size = 256
epochs = 50
history = model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=0)
# evaluate returns the loss followed by the compiled metrics (accuracy here).
score = model.evaluate(X_test, y_test, verbose=0)


In [None]:
# Fit a MultiLabelBinarizer on the collected labels.
# NOTE(review): `comp_labels` is not used in the visible cells.

comp_labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]

mlb = MultiLabelBinarizer()

# All labels are passed as one single sample, so the fitted classes_ are the
# distinct label values present in `labels`.
mlb.fit([labels])

In [None]:
# Show the classes learned by the binarizer to check which ones are present.

mlb.classes_

In [None]:
# Inspect the shape of the training image array.

X_train.shape

In [None]:
# Print the layer-by-layer summary of the model, including parameter counts.

model.summary()

In [None]:
# Save the CNN model to disk, at a path relative to the working directory,
# for later use.
model_path = "models/pneumiacnn"
model.save(filepath=model_path)

In [None]:
# Evaluate the model on the held-out split (20% of the images loaded from
# `ids`, which holds only the first 10 image_id entries).

model.evaluate(X_test, y_test)

In [None]:
# Render a diagram of the model's layers.

tf.keras.utils.plot_model(model)

In [None]:
# Assemble the submission table with one (image_id, PredictionString) row per
# entry of `results`.
# NOTE(review): `results` is not defined in the visible cells - confirm where
# the predictions are produced before running this cell.
test_df = pd.DataFrame(results, columns=['image_id', 'PredictionString'])
test_df.head()


In [None]:
# Write the submission file without the index column.
test_df.to_csv('submission.csv', index=False)