## Part 0: Pneumonia Exploratory Data Analysis

### Format of Data

- <b><i>patientId_</i></b>- A patientId. Each patientId corresponds to a unique image.
- <b><i>x_</i></b> - the upper-left x coordinate of the bounding box.
- <b><i>y_</i></b> - the upper-left y coordinate of the bounding box.
- <b><i>width_</i></b> - the width of the bounding box.
- <b><i>height_</i></b> - the height of the bounding box.
- <b><i>Target_</i></b> - the binary Target, indicating whether this sample has evidence of pneumonia. (1 = true, 0 = false)

### Notes

- A pneumonia opacity is a part of the lungs that looks lighter (more opaque) than the surrounding lung tissue on a radiograph and has a shape that indicates that pneumonia is (or may be) present.
- Since the goal is to detect & then draw a bounding box on each pneumonia opacity (where each image can have 0 or many), and the training set is already classified:
    - It can be analysed as a supervised learning statistical multilabel classification


In [None]:
# Imports
import cv2
import tqdm
import pydicom
import pylab as pl
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

import skimage
from skimage import feature, filters

import os
from os import listdir
from os.path import isfile, join

In [None]:
# Locations of the bounding-box labels and of the detailed class info
pathLabels = "../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
classInfoPath = "../input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv"

# Read both CSV files into DataFrames (labels first, then class info)
labels, classInfo = map(pd.read_csv, (pathLabels, classInfoPath))

## Part 0.1: Merge ClassInfo & Labels

In [None]:
# Left-join the labels onto the class info by patientId, then drop rows that
# are exact duplicates of an earlier row
merged = classInfo.merge(labels, how='left', on='patientId').drop_duplicates()

merged.head()

In [None]:
# Rows whose class is "No Lung Opacity / Not Normal" ...
a = merged[merged['class'].eq("No Lung Opacity / Not Normal")]

# ... and how their Target values are distributed
a['Target'].value_counts()

In [None]:
# Number of distinct values found in each column of the merged table
print(f"Unique features: \n{merged.nunique()}")

In [None]:
# Count rows per Target label and look each count up by its label.
# value_counts() orders by frequency, so unpacking it positionally would swap
# neg/pos if positives ever outnumbered negatives and would raise if only one
# label were present.
# NOTE: `merged` holds one row per box, so a patient with several boxes
# contributes several rows to the positive count.
targetCounts = merged["Target"].value_counts()
neg = targetCounts.get(0, 0)
pos = targetCounts.get(1, 0)

print(f'How many people have pneumonia vs do not: \n{targetCounts}\n')

print(f'This is roughly {round(pos/(pos+neg)*100, 2)}% having pneumonia within this training data')

##### We can see that over 20,000 people are classified as not having pneumonia, whereas around 10,000 people do

##### This tells us that we have an imbalanced dataset

##### To deal with this, we need to assign the class with fewer appearances (i.e. having pneumonia) a higher weight to even this imbalance out. We will do this by using a compute weight function, which we will use in our Modelling file

In [None]:
# Histogram of the Target column (0 = no pneumonia, 1 = pneumonia)
merged['Target'].hist()

In [None]:
# Histogram of how many rows fall under each value of the 'class' column
# (the trailing semicolon suppresses the notebook's echo of the return value)
merged['class'].hist();

##### It seems that the amount is spread evenly between the 3 classes with roughly 2/3s being either classified as Normal or no lung opacity/not normal

## Part 0.2: Some Numerical Analysis

### Check out pneumonia classifying boxes

In [None]:
# Keep only the box columns of rows without missing values, then derive the
# extra geometry columns in one chained step (assign returns a new frame)
boxNums = merged.dropna()[['x', 'y', 'width', 'height']].assign(
    # Opposite (lower-right) corner of each box
    x2=lambda b: b['x'] + b['width'],
    y2=lambda b: b['y'] + b['height'],
    # Centre point of each box
    xCentre=lambda b: b['x'] + b['width']/2,
    yCentre=lambda b: b['y'] + b['height']/2,
    # Area of each box
    boxArea=lambda b: b['width'] * b['height'],
)

In [None]:
# Preview the first three rows of the derived box table
boxNums.head(3)

In [None]:
# 2-D histograms of paired box measurements: upper-left corner, opposite
# corner, size and centre

columnPairs = (('x', 'y'), ('x2', 'y2'), ('width', 'height'), ('xCentre', 'yCentre'))
pairs = [(boxNums[first], boxNums[second]) for first, second in columnPairs]

# Grid position (row, column) of each plot, in the same order as `pairs`
axis = [(r, c) for r in range(2) for c in range(2)]

titles = ['X vs Y', 'X2 vs Y2', 'Width vs Height', 'X Centre vs Y Centre']

fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# Draw each pair in its cell and label it
for (r, c), (xs, ys), title in zip(axis, pairs, titles):
    axs[r, c].hist2d(xs, ys, bins=30)
    axs[r, c].set_title(title)

plt.show()

##### Taking a look at the heatmaps, we can see that the centres seem to have more density for x, x2 and y, y2

In [None]:
# Histogram of the bounding-box areas
boxNums['boxArea'].plot.hist(bins=25,
                             figsize=(14,4),
                             title='Area Distribution of boxes for a Positive target')

## Part 0.3: Let's take a look at the Dicom Images 

In [None]:
# Get two patients (one who has pneumonia & one who doesn't).
# Select them by their Target value instead of by hard-coded row labels, so the
# choice stays correct even if the row order or index of `merged` changes.
patient0 = merged.loc[merged['Target'] == 0, 'patientId'].iloc[0] # Doesn't have pneumonia
patient1 = merged.loc[merged['Target'] == 1, 'patientId'].iloc[0] # Has pneumonia

patients = [(patient0, "Doesn't Have Pneumonia"), (patient1, "Has Pneumonia")]
patients

In [None]:
# Plot the images side by side for visual comparison
imgsPath = "../input/rsna-pneumonia-detection-challenge/stage_2_train_images/"
fig, ax = plt.subplots(1, 2, figsize=(7,7))

# Pair each subplot with its (patient id, title) entry
for panel, (patientID, title) in zip(ax, patients):

    dcmFile = f"{imgsPath}{patientID}.dcm" # Path of this patient's DICOM file

    # dcmread is the supported reader; the read_file alias is deprecated and
    # no longer exists in pydicom 3.x
    dcmData = pydicom.dcmread(dcmFile)

    img = dcmData.pixel_array # Get the pixel array

    panel.imshow(img, cmap=pl.cm.gist_gray) # Plot
    panel.set_title(title) # Set title
    panel.axis('off') # Remove axis

##### We can see that the patient with pneumonia appears to have a much less opaque scan

In [None]:
# Every row recorded for the pneumonia example patient
p = merged[merged['patientId'].eq(patient1)]
p

##### Important Note: A given patientId may have multiple boxes if more than one area of pneumonia is detected (see above).
##### We can see that this particular patient (patient #5) has pneumonia, and has 2 entries, with 2 different boxes

In [None]:
# We want to create a data parser to group a patient's boxes with its image
def parseData(df, imgsDir=None):
    """Group each patient's bounding boxes together with their image path.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'patientId', 'Target', 'x', 'y', 'width'
        and 'height'; a patient may appear on several rows.
    imgsDir : str, optional
        Prefix placed in front of "<patientId>.dcm". When omitted, the
        notebook-level `imgsPath` is used, as before.

    Returns
    -------
    dict
        Maps each patientId to {'dicom': path, 'classifier': Target of the
        patient's first row, 'boxes': list of [x, y, width, height]}.
        Boxes are only collected for patients whose classifier is 1.
    """
    if imgsDir is None:
        imgsDir = imgsPath  # fall back to the notebook-level image directory

    newData = {}
    neededColumns = ['patientId', 'Target', 'x', 'y', 'width', 'height']

    # itertuples yields plain tuples and avoids building a Series per row,
    # which is what makes iterrows slow on large tables
    for patientID, target, x, y, width, height in df[neededColumns].itertuples(index=False, name=None):

        # If patient is not in the dict, add them
        entry = newData.get(patientID)
        if entry is None:
            entry = newData[patientID] = {
                'dicom': f"{imgsDir}{patientID}.dcm",
                'classifier': target,
                'boxes': []}

        # Add box if the patient has pneumonia
        if entry['classifier'] == 1:
            entry['boxes'].append([x, y, width, height])

    return newData

In [None]:
# Build the per-patient lookup (image path, classifier, boxes) from the merged table
parsedData = parseData(merged)

# Check patient 1 which we know has pneumonia
parsedData[patient1]

##### We can see that we have saved the file path, the classifier and the array of boxes for that person

##### Now check someone we know doesn't have pneumonia

In [None]:
# Parsed entry for the patient at row label 0 of the merged table
parsedData[merged['patientId'][0]]

### Create a function that allows us to draw the boxes over the images

In [None]:
"""
Credit for @peterchang77 for these 2 functions
"""

# This function will allow us to overlay a box
def overlayBox(im, box, rgb, stroke=1):
    """Draw a rectangular outline onto an image array, in place.

    Parameters
    ----------
    im : array indexed as im[row, column, ...]; it is modified in place.
    box : sequence of (x, y, width, height); each value is truncated to int.
    rgb : value assigned to every outline pixel.
    stroke : thickness of the outline in pixels.

    Returns
    -------
    The same `im` object that was passed in.
    """
    # --- Convert coordinates to integers and extract them
    x, y, width, height = (int(b) for b in box)
    x2 = x + width
    y2 = y + height

    im[y:y + stroke, x:x2] = rgb              # top edge
    # The bottom edge runs up to x2 + stroke so that it meets the right edge;
    # stopping at x2 left the bottom-right corner of the outline unpainted.
    im[y2:y2 + stroke, x:x2 + stroke] = rgb   # bottom edge
    im[y:y2, x:x + stroke] = rgb              # left edge
    im[y:y2, x2:x2 + stroke] = rgb            # right edge

    return im

def drawBox(data):
    """Show a patient's DICOM image with every one of its boxes outlined.

    Parameters
    ----------
    data : dict with a 'dicom' entry (path of the DICOM file) and a 'boxes'
        entry (iterable of [x, y, width, height]).
    """
    # dcmread is the supported reader; the read_file alias is deprecated and
    # no longer exists in pydicom 3.x
    d = pydicom.dcmread(data['dicom']) # Open and read the file
    im = d.pixel_array

    # Repeat the pixel array three times along a new last axis so that coloured
    # outlines can be drawn (assumes the pixel array is 2-D greyscale)
    im = np.stack([im] * 3, axis=2)

    # Add the boxes with random colours
    for box in data['boxes']:

        rgb = np.floor(np.random.rand(3) * 256).astype('int') # Random colour, components in 0..255

        im = overlayBox(im=im, box=box, rgb=rgb, stroke=6) # Overlay the box

    pl.imshow(im, cmap=pl.cm.gist_gray) # Show the image
    pl.axis('off') # Remove axis

In [None]:
# Show the pneumonia example patient's image with its boxes overlaid
drawBox(parsedData[patient1])

##### We can see that the opacities in the image were boxed, indicating the pneumonia

## Part 0.4: Viewing side-by-side of people having and not having Pneumonia

### Pneumonia = 0

In [None]:
# Get all patients with no pneumonia.
# A boolean mask replaces the Python-level iterrows() loop; unique() keeps
# first-seen order and guarantees one entry per patient.
patients0 = merged.loc[merged['Target'] == 0, 'patientId'].unique().tolist()

In [None]:
fig = plt.figure(figsize=(20, 10))

columns = 6
rows = 4

# Subplot positions are 1-based while the patient list is 0-based. Enumerating
# a slice from 1 keeps the first patient (index 0 used to be skipped) and cannot
# raise IndexError when fewer than rows*columns patients are available.
for position, pid in enumerate(patients0[:columns * rows], start=1):

    fig.add_subplot(rows, columns, position) # Add the subplot
    drawBox(parsedData[pid]) # Draw the image (and boxes, if any)

### Pneumonia = 1

In [None]:
# Get all patients with pneumonia.
# A patient with several boxes has several rows in `merged`; unique() lists
# each of them once (in first-seen order) so the same image is not repeated.
# The boolean mask also replaces the Python-level iterrows() loop.
patients1 = merged.loc[merged['Target'] == 1, 'patientId'].unique().tolist()

In [None]:
fig = plt.figure(figsize=(20, 10))

columns = 6
rows = 4

# Subplot positions are 1-based while the patient list is 0-based. Enumerating
# a slice from 1 keeps the first patient (index 0 used to be skipped) and cannot
# raise IndexError when fewer than rows*columns patients are available.
for position, pid in enumerate(patients1[:columns * rows], start=1):

    fig.add_subplot(rows, columns, position) # Add the subplot
    drawBox(parsedData[pid]) # Draw the image with its boxes

## Part 0.5: Some Feature Analysis for Standard Models

In [None]:
def readImage(pId):
    """Return the pixel array of the DICOM image recorded for patient `pId`.

    The file path is taken from the 'dicom' entry of `parsedData[pId]`.
    """
    path = parsedData[pId]['dicom']

    # dcmread is the supported reader; the read_file alias is deprecated and
    # no longer exists in pydicom 3.x
    return pydicom.dcmread(path).pixel_array

In [None]:
# Load one example image per outcome: the entry at index 1 of each patient list
noPne = readImage(patients0[1])
pne = readImage(patients1[1])

In [None]:
def printTwoImgs(img1, img2, title1, title2):
    """Show two images next to each other in greyscale, each with its own title."""

    fig, ax = plt.subplots(1, 2, figsize=(12,12))

    # Left panel gets (img1, title1), right panel gets (img2, title2)
    for panel, image, heading in zip(ax, (img1, img2), (title1, title2)):
        panel.imshow(image, cmap="gray")
        panel.set_title(heading)
        panel.axis('off')

In [None]:
# Show the two unprocessed example images side by side
printTwoImgs(noPne, pne, "No Pneumonia - Normal Image", "Pneumonia - Normal Image")

### Hist Equalisation

In [None]:
# Histogram-equalise both example images (no-pneumonia first, pneumonia second)
equ1, equ2 = (cv2.equalizeHist(image) for image in (noPne, pne))

printTwoImgs(equ1, equ2, "No Pneumonia", "Pneumonia") # Print images

### Image Sharpening

In [None]:
# 3x3 high-pass kernel: -1 everywhere with 9 at the centre
hpf_kernel = -np.ones((3, 3), dtype=int)
hpf_kernel[1,1] = 9

equalised = (equ1, equ2)

# Apply the high-pass kernel to both equalised images
imHP, imHP2 = (cv2.filter2D(image, -1, hpf_kernel) for image in equalised)

# Apply skimage's unsharp mask filter to both equalised images
imUS, imUS2 = (skimage.filters.unsharp_mask(image) for image in equalised)

printTwoImgs(imHP, imHP2, "No Pneu - High Pass", "Pneu - High Pass") # Print images

printTwoImgs(imUS, imUS2, "No Pneu - Unsharpen Mask", "Pneu - Unsharpen Mask") # Print images