## Milestone 1 of Capstone project (RSNA-pneumonia-detection-challenge)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sns
import gc
import glob
import os
import cv2
import pydicom

import warnings
# Ignore every warning for the rest of the session; note this also hides deprecation notices.
warnings.simplefilter(action = 'ignore')

## Loading CSV files

In [None]:
# Load the two CSV tables of the challenge: the detailed class info and the train labels.
detailed_df = pd.read_csv('/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv')
train_df = pd.read_csv('/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv')

In [None]:
# (rows, columns) of detailed_df
detailed_df.shape

In [None]:
# (rows, columns) of train_df
train_df.shape

In [None]:
# Preview the first rows of detailed_df
detailed_df.head()

In [None]:
# Preview the first rows of train_df
train_df.head()

### Merging the data tables detailed_df and train_df

In [None]:
# Both tables can hold several rows for the same patientId, so merging them
# directly pairs every left copy with every right row (k * k rows for a patient
# with k rows in each table) only for drop_duplicates to discard most of them.
# Collapsing identical rows of detailed_df first yields the same rows in the
# same order without building the intermediate product; only the index labels
# differ (contiguous instead of gapped).
df = pd.merge(left = detailed_df.drop_duplicates(), right = train_df, how = 'left', on = 'patientId')
# Still drop exact duplicates in case train_df itself repeats a row.
df = df.drop_duplicates()
df.info()

### It is clearly evident that the above data table contains lots of null values.

## Summary on the values, types and null values:

In [None]:
# Column dtypes and non-null counts of the merged table
df.info()

In [None]:
# Number of null values per column
df.isnull().sum()

### Distribution of classes

The following output shows that nearly 2/3 of the records belong to patients who do not have pneumonia (target value = 0) and about 1/3 to patients who have pneumonia (target value = 1).

In [None]:
# Count of non-null patientId entries (i.e. rows) for each Target value
pd.pivot_table(df,index=["Target"], values=['patientId'], aggfunc='count')

# Alternative approach on the raw labels table:
# train_df['Target'].value_counts()

### Distribution of patients in each class
There are 9555 records in the '**Lung Opacity**' category, 11821 in the '**No Lung Opacity / Not Normal**' category and 8851 in the '**Normal**' category.

In [None]:
# Count of non-null patientId entries (i.e. rows) for each class
pd.pivot_table(df,index=["class"], values=['patientId'], aggfunc='count')

#### The classes "No Lung Opacity / Not Normal", "Normal", and "Lung Opacity" are in the proportion of 39%, 29% and 32% respectively.

In [None]:
# Pie chart of the share of rows per class, labelled with whole-number percentages
df["class"].value_counts().plot.pie(autopct='%1.0f%%', shadow=True, subplots=False)

#### It is also clear from the output below that the patients who do not have pneumonia do not have bounding box coordinates

In [None]:
# Non-null count of every remaining column, grouped by Target
pd.pivot_table(df,index=["Target"], aggfunc='count')

### Count of patients having a single row and of patients having more than one row

In [None]:
# Inner value_counts: rows per patient; outer value_counts: how many patients have each row count
df['patientId'].value_counts().value_counts()

### Patients who do not have pneumonia have only one record in the table

In [None]:
# Same rows-per-patient tally, restricted to the Target == 0 rows
df.loc[df['Target'] == 0, 'patientId'].value_counts().value_counts()

In [None]:
# Row count per class, split by Target
sns.countplot(x = 'class', hue = 'Target', data = df)

### Preprocessing - Filling the null values

In [None]:
# NOTE(review): fillna returns a new DataFrame and the result is not assigned here,
# so df itself keeps its NaN values; this cell only displays a zero-filled copy.
df.fillna(0.0)

### Correlation between the variables
There is a strong correlation between the height and width variables

In [None]:
# df also holds string columns (patientId, class). Since pandas 2.0, DataFrame.corr
# no longer drops them silently and raises instead, so correlate the numeric columns
# only. select_dtypes is used rather than numeric_only=True because it works on
# older pandas releases as well.
df.select_dtypes(include='number').corr()

In [None]:
# Scatter of box width against height with a fitted regression line
sns.jointplot(x = 'width', y = 'height', data = df, kind="reg")

### EDA with the header values from the dataframe

Creating a data frame with the header values from every DICOM file takes a long time as there are 30227 records. Hence, the EDA is done on a subset of 1000 randomly chosen records, keeping the same proportion of the classes.
(i.e.) The classes Not Normal, Normal and Lung Opacity are in the proportion 39%, 29% and 32% respectively.

* Number of rows of Not Normal class = 39% of 1000 = 390 rows
* Number of rows of Normal class = 29% of 1000 = 290 rows
* Number of rows of Lung Opacity class = 32% of 1000 = 320 rows


In [None]:
# Draw a 1000-row subset that keeps the 39% / 29% / 32% class proportions.
# A fixed seed makes the subset, and every figure derived from it, reproducible
# across runs.
SAMPLE_SEED = 42

df_Not_Normal = df[df['class']=='No Lung Opacity / Not Normal'].sample(n=390, random_state=SAMPLE_SEED)
df_Normal = df[df['class']=='Normal'].sample(n=290, random_state=SAMPLE_SEED)
df_Lunge_Opacity = df[df['class']=='Lung Opacity'].sample(n=320, random_state=SAMPLE_SEED)
frames = [df_Not_Normal, df_Normal, df_Lunge_Opacity]

# Stack the three per-class samples into one frame (original index labels are kept).
dicom_df = pd.concat(frames)

dicom_df.shape

In [None]:
def process_dicom_data(data_df):
    """Add DICOM header fields to every row of ``data_df``.

    For each distinct ``patientId`` the header of the matching training image
    is read, and its Modality, PatientAge, PatientSex, BodyPartExamined and
    ViewPosition values are written to all rows of that patient.

    Args:
        data_df: DataFrame with a ``patientId`` column. It is modified in place.

    Returns:
        The same ``data_df`` object with the five header columns filled in.
    """
    image_dir = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images'
    for pid in data_df['patientId'].unique():
        dcm_file = '%s/%s.dcm' % (image_dir, pid)
        # read_file is a deprecated alias of dcmread. Only header tags are used
        # here, so skip loading the pixel data to keep the pass fast.
        dcm_data = pydicom.dcmread(dcm_file, stop_before_pixels=True)
        # Select rows by the id that located the file rather than by the
        # PatientID tag, so a header / file-name mismatch cannot leave rows unfilled.
        idx = data_df['patientId'] == pid
        data_df.loc[idx, 'Modality'] = dcm_data.Modality
        data_df.loc[idx, 'PatientAge'] = pd.to_numeric(dcm_data.PatientAge)
        data_df.loc[idx, 'PatientSex'] = dcm_data.PatientSex
        data_df.loc[idx, 'BodyPartExamined'] = dcm_data.BodyPartExamined
        data_df.loc[idx, 'ViewPosition'] = dcm_data.ViewPosition

    return data_df

In [None]:
# Attach the DICOM header columns to the sampled rows
dicom_df = process_dicom_data(dicom_df)

In [None]:
# PatientAge is stored as float after the header pass: cast it to int,
# then replace the remaining NaN values with 0.0.
dicom_df = dicom_df.astype({"PatientAge": int}).fillna(0.0)
dicom_df.head()

#### The sample contains 995 unique patients

In [None]:
# Number of distinct values in each column of the sample
dicom_df.nunique()

### Now Visualizing the data along with their dicom header values

### Patient's age proportion in the detection

In [None]:
# Wide figure so that every age value gets its own readable bar group
plt.figure(figsize = (30, 10))
# Row count per PatientAge, split by Target
sns.countplot(x = 'PatientAge', hue = 'Target', data = dicom_df)

## Patient's gender proportion in the detection

In [None]:
# Row count per PatientSex, split by Target
sns.countplot(x = 'PatientSex', hue = 'Target', data = dicom_df)

### With respect to view proportion

In [None]:
# Row count per ViewPosition, split by Target (trailing ';' suppresses the cell's text output)
sns.countplot(x = 'ViewPosition', hue = 'Target', data = dicom_df);

In [None]:
# Remove the Target column from the sampled frame
dicom_df = dicom_df.drop(columns='Target')


In [None]:
# Encode the two categorical header fields as 0/1 indicator columns
# (PatientSex: 1 when the value contains "M"; ViewPosition: 1 when it contains "AP").
# No astype('category') step is needed: a bare astype call returns a new Series
# and changes nothing unless its result is assigned.
# na=False maps non-string entries (e.g. a 0.0 left by fillna) to 0; without it
# str.contains yields NaN for them, which np.where would treat as true.
dicom_df['PatientSex'] = np.where(dicom_df["PatientSex"].str.contains("M", na=False), 1, 0)
dicom_df['ViewPosition'] = np.where(dicom_df["ViewPosition"].str.contains("AP", na=False), 1, 0)

In [None]:
# Preview the sample after encoding PatientSex and ViewPosition
dicom_df.head()

## Apart from the correlation between the width and height, there is no strong correlation between the other variables in the dataframe

In [None]:
# dicom_df still contains string columns (e.g. patientId, class). Since pandas 2.0,
# DataFrame.corr raises on them instead of dropping them, so correlate the numeric
# columns only. select_dtypes works on older pandas releases as well.
dicom_df.select_dtypes(include='number').corr()

## Visualizing the dicom images

In [None]:
def show_dicom_image(data_df):
    """Plot the DICOM image of every row in ``data_df`` on a two-column grid.

    Each panel is titled with the patient id plus the age and sex read from
    the DICOM header.

    Args:
        data_df: DataFrame with a ``patientId`` column; one panel is drawn
            per row.
    """
    img_data = data_df.to_dict('records')
    # Two panels per grid row. Four input rows give the 2x2, 16x18 inch layout;
    # larger inputs get extra grid rows instead of overflowing the axes array.
    n_grid_rows = max(1, (len(img_data) + 1) // 2)
    f, ax = plt.subplots(n_grid_rows, 2, figsize=(16, 9 * n_grid_rows), squeeze=False)
    for i, data_row in enumerate(img_data):
        pid = data_row['patientId']
        dcm_file = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' % pid
        # read_file is a deprecated alias of dcmread.
        dcm_data = pydicom.dcmread(dcm_file)
        panel = ax[i // 2, i % 2]
        panel.imshow(dcm_data.pixel_array, cmap=plt.cm.bone)
        panel.set_title('ID: {}\n Age: {} Sex: {}'.format(
            data_row['patientId'], dcm_data.PatientAge, dcm_data.PatientSex))
    # Hide grid cells that received no image.
    for j in range(len(img_data), n_grid_rows * 2):
        ax[j // 2, j % 2].axis('off')

## Showing some random DICOM images of patients who have pneumonia

In [None]:
# Four randomly drawn rows with Target == 1
show_dicom_image(df[df['Target']==1].sample(n=4))

## Showing some random DICOM images of patients who do not have pneumonia, but belong to the class ***No Lung Opacity / Not Normal***

In [None]:
# Four randomly drawn rows with Target == 0 and class 'No Lung Opacity / Not Normal'
show_dicom_image(df[ (df['Target']==0) & (df['class']=='No Lung Opacity / Not Normal')].sample(n=4))

## Showing some random DICOM images of patients who do not have pneumonia and belong to the class ***Normal***

In [None]:
# Four randomly drawn rows with Target == 0 and class 'Normal'
show_dicom_image(df[ (df['Target']==0) & (df['class']=='Normal')].sample(n=4))

In [None]:
def show_dicome_with_boundingbox(data_df):
    """Plot the DICOM image of every row in ``data_df`` with its bounding boxes.

    Images are laid out on a two-column grid, each titled with the patient id
    plus the age and sex read from the DICOM header. For every panel, all
    boxes that ``data_df`` holds for that patient are overlaid in red; boxes
    of the patient that are not part of ``data_df`` are not drawn.

    Args:
        data_df: DataFrame with ``patientId``, ``x``, ``y``, ``width`` and
            ``height`` columns; one panel is drawn per row.
    """
    img_data = data_df.to_dict('records')
    # Two panels per grid row. Four input rows give the 2x2, 16x18 inch layout;
    # larger inputs get extra grid rows instead of overflowing the axes array.
    n_grid_rows = max(1, (len(img_data) + 1) // 2)
    f, ax = plt.subplots(n_grid_rows, 2, figsize=(16, 9 * n_grid_rows), squeeze=False)
    for i, data_row in enumerate(img_data):
        pid = data_row['patientId']
        dcm_file = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' % pid
        # read_file is a deprecated alias of dcmread.
        dcm_data = pydicom.dcmread(dcm_file)
        panel = ax[i // 2, i % 2]
        panel.imshow(dcm_data.pixel_array, cmap=plt.cm.bone)
        panel.set_title('ID: {}\n Age: {} Sex: {}'.format(
            data_row['patientId'], dcm_data.PatientAge, dcm_data.PatientSex))
        # Rows without coordinates (NaN) carry no box, so leave them out.
        boxes = data_df.loc[data_df['patientId'] == pid, ['x', 'y', 'width', 'height']].dropna()
        for x, y, width, height in boxes.itertuples(index=False, name=None):
            rectangle = Rectangle(xy=(x, y), width=width, height=height, color="red", alpha=0.1)
            panel.add_patch(rectangle)
    # Hide grid cells that received no image.
    for j in range(len(img_data), n_grid_rows * 2):
        ax[j // 2, j % 2].axis('off')

In [None]:
# Four randomly drawn rows with Target == 1, each shown with the box(es) contained in the sample
show_dicome_with_boundingbox(df[df['Target']==1].sample(n=4))

## Building the pneumonia detection model using CNN

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# NOTE(review): presumably the height/width the images are resized to for the CNN input; confirm where it is used.
IMAGE_SIZE = [224, 224]

# Directories holding the stage 2 training and test DICOM images
train_path = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/'
test_path = '/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_test_images/'