### Importing the necessary libraries
We will be importing libraries at different stages of the program as well, but these are required to start the project

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import matplotlib.image as mpimg
import cv2
import pydicom as dicom
from keras.callbacks import ReduceLROnPlateau

### Data Preprocessing
In this section we pre-process the raw data to make it suitable for analysis and model creation.

In [2]:
# Root directory of the dataset; the other file paths in this notebook are built by appending to it
dataset_address = "../input/rsna-pneumonia-detection-challenge/"

In [3]:
# Load the training labels CSV into a dataframe.
# train_label holds the original patientId, the bounding-box coordinates and the
# Target value stating whether the patient has pneumonia or not.
labels_csv_path = dataset_address + "stage_2_train_labels.csv"
train_label = pd.read_csv(labels_csv_path)

# Show ten random rows as a quick sanity check
train_label.sample(10)

In [4]:
# img_to_numpy reads the DICOM image belonging to a patientId and returns it as a
# small 3-channel numpy array. No bounding box is drawn on the image.

def img_to_numpy(patientId, size=(102, 102)):
    """Load one training DICOM image and return it as a resized 3-channel array.

    The pixel data is PNG-encoded and decoded again in memory (instead of via a
    shared 'tempfile.png' on disk) so that OpenCV hands back a 3-channel colour
    image, which is then resized to `size`.

    Parameters
    ----------
    patientId : str
        File stem of the image inside ``stage_2_train_images/``.
    size : tuple of int, optional
        Target size passed to ``cv2.resize`` as ``dsize``. Defaults to
        ``(102, 102)``, which matches the ``input_shape`` of the model built
        later in this notebook.

    Returns
    -------
    numpy.ndarray
        The resized image with three channels.

    Raises
    ------
    ValueError
        If OpenCV fails to encode or decode the pixel data.
    """
    address = dataset_address + "stage_2_train_images/" + patientId + ".dcm"  # Address for the file
    ds = dicom.dcmread(address)

    # In-memory equivalent of imwrite() followed by imread(): same PNG codec,
    # but no file is left behind and nothing is shared between calls.
    encoded_ok, png_buffer = cv2.imencode('.png', ds.pixel_array)
    if not encoded_ok:
        raise ValueError("Could not PNG-encode the pixel data of " + address)

    img = cv2.imdecode(png_buffer, cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError("Could not decode the PNG data generated from " + address)

    # Resizing the image for easier computation while running the model;
    # cv2.resize already returns a new numpy array.
    return cv2.resize(src=img, dsize=size)

In [5]:
# Build df, the working dataframe used for model training and validation, from a
# reproducible sample of 1000 rows of train_label.
# NOTE(review): the labels file appears to hold one row per bounding box, so a
# patientId may occur several times - confirm. Duplicates are dropped first so the
# same image cannot end up in both the training and the validation split.
df = train_label.drop_duplicates(subset='patientId').sample(1000, random_state=1)

# From here on the 'patientId' column holds the image array returned by
# img_to_numpy instead of the id string.
df['patientId'] = df['patientId'].apply(img_to_numpy)

In [6]:
# Only the image and its Target value are used from here on, so the four
# bounding-box columns are removed from df in a single in-place call.
df.drop(columns=['x', 'y', 'width', 'height'], inplace=True)

In [7]:
# Showing a sample from df for validation
# df.sample(10)

##### Data separation into features and labels
We will create two objects: one holding the features (i.e. the NumPy arrays of the images) and the other holding the Target values.

In [8]:
# Features: everything in df except the label column
x_train_temp = df.drop(columns=['Target'])

# Labels: the Target column on its own
y_train_temp = df.loc[:, 'Target']

##### Data Visualization for y_train to understand the distribution of data

In [9]:
# Number of sampled rows per class (Target 0 = normal, Target 1 = pneumonia)
class_counts = [(y_train_temp == 0).sum(), (y_train_temp == 1).sum()]

# Explicit figure/axes so the chart carries its own title and axis labels
fig, ax = plt.subplots()
ax.bar(['Normal', 'Pneumonia'], class_counts, color=['orange', 'green'])
ax.set(title='Class distribution of the sampled data',
       xlabel='Target class',
       ylabel='Number of images')
plt.show()

In [11]:
# Converting the feature dataframe into a numpy array for easier access using index
x_train_temp = np.array(x_train_temp)

# The feature frame has a single column, so flatten it to 1D. Using -1 lets numpy
# infer the length, so this keeps working if the sample size is changed.
x_train_temp = x_train_temp.reshape(-1)

# Labels as a plain numpy array
y_train = np.array(y_train_temp)

In [12]:
# Combine the individual per-row image arrays held in x_train_temp into one
# numpy array with the row index as the leading axis.
x_train = np.array(list(x_train_temp))

In [13]:
# Hold out 30% of the samples as a validation set; random_state pins the split
# so it is the same on every run.
x_train_model, x_val, y_train_model, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=1,
)

### Data Generation
To reduce the risk of bias and overfitting, we generate artificial (augmented) images from our existing data.

In [14]:
# This library will allow us to generate a lot of images from our existing dataset
from keras.preprocessing.image import ImageDataGenerator

In [15]:
# data_generator is an instance of ImageDataGenerator; its flow() method is used
# later to produce randomly augmented batches while the model is being trained.
data_generator = ImageDataGenerator(
    rotation_range=90,
    # Shift values >= 1 are interpreted by Keras as a number of pixels
    width_shift_range=5.0,
    height_shift_range=5.0,
    # A single float f means the range [1 - f, 1 + f]; the previous value 1.5
    # therefore allowed zero and negative zoom factors. An explicit positive
    # range keeps the original upper bound of 1.5.
    zoom_range=[0.5, 1.5],
    fill_mode='nearest',
    horizontal_flip=True,
    vertical_flip=True,
    # Scale pixel values into [0, 1]; the previous factor 1.2 amplified them.
    # assumes the images are 8-bit (0-255), as decoded by OpenCV - TODO confirm
    rescale=1.0 / 255
)

# NOTE(review): rotations up to 90 degrees and vertical flips are strong
# augmentations for X-ray images - confirm they are intended.

# No data_generator.fit(...) call is needed: fit() only computes statistics for
# featurewise_center, featurewise_std_normalization and zca_whitening, none of
# which is enabled here. (It was also being given x_train, which still contains
# the validation samples.)

In [16]:
#Importing the required libraries to prepare the model
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout , BatchNormalization


# Binary CNN classifier for 102x102 three-channel images: one 32-filter
# convolution stage, three identical 64-filter stages, and a dense head that
# ends in a single sigmoid unit.
model = Sequential()

# Stage 1: 32 filters; this first layer also declares the input shape
model.add(Conv2D(32, (3, 3), strides=1, padding='same', activation='relu',
                 input_shape=(102, 102, 3)))
model.add(BatchNormalization())
model.add(MaxPool2D((2, 2), strides=2, padding='same'))

# Stages 2-4: 64 filters each, with light dropout ahead of the normalization
for stage_number in range(3):
    model.add(Conv2D(64, (3, 3), strides=1, padding='same', activation='relu'))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding='same'))

# Classification head
model.add(Flatten())
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))

# Single sigmoid output, hence binary cross-entropy
model.compile(loss='binary_crossentropy', optimizer="rmsprop", metrics=['accuracy'])
model.summary()

In [17]:
# The network ends in a single sigmoid unit, so it must be trained with binary
# cross-entropy; sparse categorical cross-entropy expects one output per class.
# The optimizer and metrics match the other compile calls in this notebook, and
# metrics is passed as a list.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [18]:
# (Re)compile the classifier: RMSprop optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer="rmsprop")

In [20]:
# Callback that multiplies the learning rate by 0.3 once validation accuracy has
# not improved for 2 epochs, never going below 1e-5; verbose=1 reports each change.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.3,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)

In [21]:
# Validation images must not be randomly augmented, otherwise val_accuracy (which
# drives the learning-rate callback) is noisy. They are only rescaled, with the
# same factor the training generator uses, and served in a fixed order.
validation_generator = ImageDataGenerator(rescale=data_generator.rescale)

# The batch size belongs to the generators: fit() must not receive batch_size when
# its input already yields batches. callbacks is passed as a list, and the
# returned History object is kept so the training curves can be inspected later.
history = model.fit(
    data_generator.flow(x_train_model, y_train_model, batch_size=32),
    epochs=100,
    validation_data=validation_generator.flow(x_val, y_val, batch_size=32, shuffle=False),
    callbacks=[learning_rate_reduction],
)