# Importing the required libraries

In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pydicom as pyd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import albumentations as A
import cv2
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import Sequence
from tensorflow.keras import layers

# Reading the input files

In [None]:
# Folder holding one DICOM file per patient.
images_path = '../input/rsna-pneumonia-detection-challenge/stage_2_train_images'

# Bounding-box labels first, detailed class info second.
train_labels_df, label_meta_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv',
        '../input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv',
    )
)

> The dataset is available as part of the RSNA Pneumonia Detection Challenge on Kaggle. The data has been added to this workspace; here we store the path to the image folder and load the two CSV files into pandas DataFrames.

In [None]:
# Preview the first ten bounding-box label rows.
train_labels_df.head(n=10)

> The stage_2_train_labels CSV contains the following information 
* patientId : For uniquely identifying the patient whose X-ray scan is in the image dataset
* Coordinates of the bounding boxes : The x_min, y_min, width and height of the rectangular bounding boxes that mark the inflammation leading to the diagnosis of pneumonia
* Target Class : the categorical output which tells if the patient has pneumonia or not.

Note that if the target is 0, then the coordinate columns carry NaN values

In [None]:
# Preview the first ten detailed-class rows.
label_meta_data.head(n=10)

> The stage_2_detailed_class_info CSV contains the following information
* patientId : For uniquely identifying the patient whose X-ray scan is in the image dataset
* class : detailed class info wherein there are three categories in picture

1.     Normal : No pneumonia (Target = 0)
2.     Lung Opacity : Pneumonia (Target = 1)
3.     No Lung Opacity / Not Normal : Inflammation not leading to Pneumonia (Target = 0)


In [None]:
# Compare row counts with the number of distinct patients in both label tables.
label_tables = (train_labels_df, label_meta_data)
for table_no, table in enumerate(label_tables, start=1):
    print(f'Size of Dataset {table_no}: ', table.shape)
for table_no, table in enumerate(label_tables, start=1):
    print(f'Number of Unique X-Rays in Dataset {table_no} : ', table['patientId'].nunique())


> Both datasets have the same number of rows (30,227), but only 26,684 unique patient IDs, which means some patient scans have more than one bounding box.

In [None]:
# Remove fully identical rows from both tables, then repeat the size report.
train_labels_df = train_labels_df.drop_duplicates()
label_meta_data = label_meta_data.drop_duplicates()

deduplicated_tables = (train_labels_df, label_meta_data)
for table_no, table in enumerate(deduplicated_tables, start=1):
    print(f'Size of Dataset {table_no}: ', table.shape)
for table_no, table in enumerate(deduplicated_tables, start=1):
    print(f'Number of Unique X-Rays in Dataset {table_no} : ', table['patientId'].nunique())

> This shows that there are no duplicates in the bounding box dataset

In [None]:
# Collect age and sex from every patient's DICOM header.
# Records are gathered in a list and turned into a DataFrame once, which avoids
# the slow row-by-row `.loc` growth of an initially empty frame.
patient_records = []
for id_ in tqdm(label_meta_data['patientId'], desc='Reading DICOM headers'):
    # Each file is read a single time; stop_before_pixels skips the image
    # payload because only header attributes are needed here.
    dicom_header = pyd.dcmread(os.path.join(images_path, id_ + '.dcm'),
                               stop_before_pixels=True)
    patient_records.append({
        'age': dicom_header.PatientAge,
        'sex': dicom_header.PatientSex,
        'patientId': id_,
    })

# Same columns and 0..n-1 index as before, built in one step.
patient_info_df = pd.DataFrame(patient_records, columns=['age', 'sex', 'patientId'])

> Extracting the age and gender of the patient for all the IDs specified in the detailed info CSV into a separate data frame

In [None]:
# Attach the DICOM-derived age/sex to the existing detailed-class metadata.
# NOTE: this rebinds `patient_info_df`, so re-running the cell merges again.
patient_info_df = label_meta_data.merge(patient_info_df, on="patientId")
patient_info_df

In [None]:
# Join the bounding-box labels with the per-patient class/age/sex information.
merged_data_info = train_labels_df.merge(patient_info_df, on="patientId")
merged_data_info.tail(n=10)

> Merging both the files into a single dataset to make sure that there are no inconsistent patient IDs across both the datasets

In [None]:
# Shape of the merged table followed by the per-column count of missing values.
null_counts = merged_data_info.isna().sum()
print(merged_data_info.shape, null_counts, sep='\n')

> The shape of the merged dataset is the same as the original bounding box dataset, and apart from the bounding-box coordinate columns there are no null values

In [None]:
# The age column is only cast to int64 in a later cell, so at this point it may
# still hold text. Comparing text is lexicographic ('9' > '10'), which would
# report the wrong extremes; convert to numbers for the min/max report.
ages_numeric = pd.to_numeric(merged_data_info['age'])
print('Minimum Age in the dataset:', ages_numeric.min())
print('Maximum Age in the dataset', ages_numeric.max())

# Strip stray whitespace from the column names before addressing columns by name.
merged_data_info.columns = merged_data_info.columns.str.strip()

# Integer-encode the categorical columns; `label` is the 3-class training target.
merged_data_info['sex'] = merged_data_info['sex'].replace({'M': 0, 'F': 1})
merged_data_info['label'] = merged_data_info['class'].replace(
    {'Normal': 0, 'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 2}
)


> The age can be treated as a continuous input. After stripping the white space at the beginning and end of the column names, we encode the categorical columns into integers with the following mapping. The label will be the new classification output column
* Male : 0
* Female : 1

* Normal : 0
* Lung Opacity : 1
* No Lung Opacity / Not Normal : 2

In [None]:
# Store age as a 64-bit integer so it can be used as a numeric feature.
merged_data_info = merged_data_info.astype({'age': 'int64'})

In [None]:
# Summarise column dtypes and non-null counts of the merged table.
merged_data_info.info()

> Other than the class description column and the patient ID column, all the others are of numerical datatype

> 

# Target Distribution

In [None]:
# Pie chart of how the detailed classes are distributed in the metadata table.
label_count = label_meta_data['class'].value_counts()
explode = (0.01, 0.01, 0.01)  # small gap between the three wedges

fig1, ax1 = plt.subplots(figsize=(5, 5))
ax1.pie(
    label_count.values,
    explode=explode,
    labels=label_count.index,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
ax1.set_title('Class Distribution')
plt.show()

> Only 22% of the patients have Lung opacity, which shows that the dataset is very much imbalanced. This is based on the unique patientIDs. Out of these patient X-ray scans each image may have one or more bounding boxes.

In [None]:
# lets take a look at our Target Distribution
label_count=merged_data_info['Target'].value_counts()
explode = (0.1,0.0)  

fig1, ax1 = plt.subplots(figsize=(5,5))
ax1.pie(label_count.values, explode=explode, labels=['Normal','Pneumonia'], autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal') 
plt.title('Target Distribution')
plt.show()

> We need to at least double the positive samples artificially to make it a balanced dataset

# Visualization of the Images

In [None]:
# Show a 3x3 grid of randomly chosen X-rays, titled with their target and class.
r = c = 3
fig = plt.figure(figsize=(15, 14))

# Work on one row per patient: a patient with several bounding boxes appears
# on several rows, which would otherwise bias the draw and allow repeats.
patient_rows = merged_data_info.drop_duplicates(subset='patientId')
sampled_rows = patient_rows.sample(n=min(r * c, len(patient_rows)))

grid_entries = zip(sampled_rows['patientId'], sampled_rows['Target'], sampled_rows['class'])
for i, (id_, target, detailed_class) in enumerate(grid_entries, start=1):
    # Read the X-ray pixels from the patient's DICOM file.
    img = pyd.dcmread(os.path.join(images_path, id_ + '.dcm')).pixel_array

    ax = fig.add_subplot(r, c, i)
    ax.imshow(img, cmap='gray')
    # Scalars are used for the comparison and the title, so the title is a
    # plain string instead of the repr of a one-element array.
    if target == 1:
        ax.set_title(f'Pneumonia Infected | {detailed_class}')
    else:
        ax.set_title(f'Normal Xray | {detailed_class}')
    ax.set_xticks([])
    ax.set_yticks([])

> This random set of 9 images shows various combinations of Target class and the detailed category

# Visualization of the areas of inflammation

In [None]:
# Draw one random pneumonia-positive X-ray with all of its bounding boxes.
positive_ids = merged_data_info.loc[merged_data_info['Target'] == 1, 'patientId'].values
id_ = np.random.choice(positive_ids)

img = pyd.dcmread(os.path.join(images_path, id_ + '.dcm')).pixel_array
boxes_fig, current_axis = plt.subplots(figsize=(15, 10))
current_axis.imshow(img, cmap='bone')

# Select the boxes from the same frame the mask is built from. Masking
# train_labels_df with a boolean Series derived from merged_data_info only
# works while both frames happen to share an identical index.
patient_mask = merged_data_info['patientId'] == id_
boxes = merged_data_info.loc[patient_mask, ['x', 'y', 'width', 'height']].values

for x, y, w, h in boxes:
    # (x, y) is the box's top-left corner; w and h are its width and height.
    current_axis.add_patch(plt.Rectangle((x, y), w, h,
                                         color='red', fill=False, linewidth=3))
    

In [None]:
# Row counts per Target value, split by the encoded sex column.
sns.countplot(data=merged_data_info, x='Target', hue='sex')

> It seems the representation of males is slightly greater than that of females, by roughly 2000 data points, for both positive and negative cases

In [None]:
# Build the train/validation split on one row per patient.
# merged_data_info has one row per bounding box, so a patient with several
# boxes appears several times. Splitting those rows directly can place the
# very same image in both training and validation (data leakage).
SPLIT_SEED = 42  # fixed seed so the split is reproducible across runs
patient_labels = merged_data_info.drop_duplicates(subset='patientId')

X = patient_labels['patientId'].values
y = tf.keras.utils.to_categorical(patient_labels['label'].values, num_classes=3)

# Stratify on the integer label so both splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=patient_labels['label'].values,
)
print('Samples in Training Data: ', len(X_train))
print('Samples in Validation Data: ', len(X_val))

In [None]:
# Batch size used by the data generators.
BATCH_SIZE = 32
# Images are resized to a square HEIGHT x WIDTH input.
HEIGHT = WIDTH = 224
# Single-channel (grayscale) input shape: (height, width, channels).
dims = (HEIGHT, WIDTH, 1)

In [None]:
class KustomGenerator(Sequence):
    """Keras ``Sequence`` that serves batches of resized X-rays read from DICOM files.

    Parameters
    ----------
    input_data : sequence of length 2
        ``input_data[0]`` holds the patient IDs (DICOM file names without the
        ``.dcm`` extension) and ``input_data[1]`` the matching targets.
    batch_size : int
        Number of samples per batch; the last batch of an epoch may be smaller.
    dims : tuple
        Per-image array shape as ``(height, width, channels)``.
    is_train : bool
        When True the sample order is reshuffled at the end of every epoch.
    """

    def __init__(self,input_data,batch_size=BATCH_SIZE,dims=(HEIGHT,WIDTH,1),is_train=True):
        # Let the Keras base class initialise itself.
        super().__init__()
        self.input_ids=input_data[0]
        self.input_targets=input_data[1]
        self.batch_size=batch_size
        self.dims=dims
        self.is_train=is_train
        # Build the initial (possibly shuffled) index order.
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reset the sample order, shuffling it when in training mode."""
        self.indexes=np.arange(len(self.input_ids))
        if self.is_train:
            np.random.shuffle(self.indexes)

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        # Ceiling division: flooring would silently drop the tail samples.
        return (len(self.input_ids) + self.batch_size - 1) // self.batch_size

    def __getitem__(self,index):
        """Return ``(images, targets)`` for batch number ``index``."""
        indexes=self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X_ids = [self.input_ids[i] for i in indexes]
        Y_=[self.input_targets[i] for i in indexes]

        X=self.__data_generation(X_ids)
        return X,np.array(Y_)

    def __data_generation(self,input_x):
        """Load, resize and scale the images for the given patient IDs."""
        # Size the buffer by the IDs actually requested so a short final batch
        # is not padded with all-zero images that have no matching target.
        tmp_imgs=np.zeros((len(input_x),*self.dims))

        for ix,patient_id in enumerate(input_x):
            # Read the pixel data from the patient's DICOM file.
            img=pyd.dcmread(os.path.join(images_path,patient_id+'.dcm')).pixel_array
            # cv2.resize expects (width, height) while dims is (height, width, ...).
            img=cv2.resize(img,(self.dims[1],self.dims[0]))
            # Add the trailing channel axis.
            img=np.expand_dims(img,2)

            # Augmentation is currently switched off; to enable it, apply
            # self.__augmentation(img) here when self.is_train is True.

            # Scale to [0, 1] -- assumes 8-bit pixel data; TODO confirm.
            tmp_imgs[ix]=img.astype('float')/255.

        return tmp_imgs

    def __augmentation(self,image):
        """Apply random flips and a random transpose to one image (currently unused)."""
        transform = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.Transpose(p=0.5),])
        t=transform(image=image)
        return t['image']

In [None]:
#Get Generator Object
train_gen=KustomGenerator([X_train,y_train])
val_gen=KustomGenerator([X_val,y_val],is_train=False)

In [None]:
# NOTE(review): everything below is a triple-quoted string, not executed code.
# It looks like an earlier model draft kept for reference: it calls `Input` and
# `Dense` without the `layers.` prefix used elsewhere in this notebook, and its
# `outputs` layer is built from `x` rather than from the pooled `base_feat`.
'''
inputs = layers.Input(shape=(HEIGHT, WIDTH, 1))
inp=Input(2)
x_=Dense(512,activation='relu')(inp)


x=layers.Conv2D(3,(5,5),1,padding='same')(inputs)
x=layers.LayerNormalization()(x)
x=layers.Activation('relu')(x)

base_feat = EfficientNetB0(include_top=False, weights='imagenet')
for layer in base_feat.layers:
    layer.trainable=True

base_feat=base_feat(x)    
base_feat=layers.GlobalAveragePooling2D()(base_feat)
outputs=layers.Dense(3,activation='sigmoid')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.summary()
'''

In [None]:
# Grayscale X-ray -> learned 3-channel projection -> EfficientNetB0 -> 3-way classifier.
inputs = layers.Input(shape=(HEIGHT, WIDTH, 1))

# A 5x5 convolution maps the single input channel to the three channels the
# ImageNet-pretrained backbone expects.
x = layers.Conv2D(3, (5, 5), strides=1, padding='same')(inputs)
x = layers.LayerNormalization()(x)
x = layers.Activation('relu')(x)

base_feat = EfficientNetB0(include_top=False, weights='imagenet')(x)
base_feat = layers.GlobalAveragePooling2D()(base_feat)
x = layers.Dense(1024, activation='relu')(base_feat)

# The targets are one-hot vectors over three mutually exclusive classes
# (built with to_categorical(..., num_classes=3)), so the head is a softmax
# trained with categorical cross-entropy. The previous sigmoid +
# binary_crossentropy pairing scored each class independently and made "acc"
# resolve to a per-element binary accuracy, which overstates performance.
outputs = layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
model.summary()

In [None]:
#Callbacks


In [None]:
# Train for a fixed number of epochs, validating on the hold-out generator.
EPOCH = 10
fit_options = dict(
    steps_per_epoch=len(train_gen),
    epochs=EPOCH,
    validation_data=val_gen,
    validation_steps=len(val_gen),
)
model.fit(train_gen, **fit_options)