## Part 1.0: Feature Extraction for Image Processing

In [None]:
# Imports
import os
import glob
import cv2
import pydicom
import skimage
import numpy as np
import pandas as pd 
from tqdm import tqdm
from os import listdir
from tqdm import tqdm_notebook
from os.path import isfile, join
from skimage import feature, filters

%matplotlib inline 
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

In [None]:
# Locations of the RSNA pneumonia-detection inputs, given relative to the
# notebook's working directory.
trainImagesPath = "../input/rsna-pneumonia-detection-challenge/stage_2_train_images"
testImagesPath = "../input/rsna-pneumonia-detection-challenge/stage_2_test_images"
labelsPath = "../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
classInfoPath = "../input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv"

labels = pd.read_csv(labelsPath) # Training labels; the 'patientId' column is used later to locate each training image
details = pd.read_csv(classInfoPath) # Detailed class info; not referenced again in the visible part of this file

## Part 1.1: Define Functions for Extracting Image Features

In [None]:
"""
@Description: This function gets the area of an image as a pixel count
@Input: Dicom image pixel array
@Output: Returns the number of non-zero elements in the pixel array
"""
def getImageArea(image):
    """Return the area of `image`, measured as its number of non-zero elements."""
    nonZeroCount = np.count_nonzero(image)
    return nonZeroCount

"""
@Description: This function gives us the equivalent diameter from the area
@Input: Takes the area of the given image
@Output: Returns the equivalent image diameter
"""
def getImageEquivalentDiameter(area):
    """Return the diameter of the circle whose area equals `area`, i.e. sqrt(4 * area / pi)."""
    fourTimesArea = area * 4
    return np.sqrt(fourTimesArea / np.pi)


"""
@Description: This function gets us the perimeter of an image
@Input: Dicom image pixel array of edges of image
@Output: Returns the number of non-zero elements in the perimeter pixel array
"""
def getImagePerimeter(image):
    """Return the perimeter, measured as the number of non-zero elements of the edge array `image`."""
    edgePixelCount = np.count_nonzero(image)
    return edgePixelCount

"""
@Description: This function gives us the irregularity in a given image
@Input: Perimeter and Area of the image
@Output: Returns the irregularity index
"""
def getImageIrregularity(perimeter, area):
    """Return the irregularity index 4 * pi * area / perimeter**2.

    Note the parameter order: the perimeter comes first, then the area.
    """
    numerator = area * 4 * np.pi
    denominator = perimeter ** 2
    return numerator / denominator

"""
@Description: This function gives us an image's Hu moments
@Input: Takes in a contour of the given image
@Output: Returns the log-scaled Hu moments, besides the 3rd and 7th
"""
def getImageHuMoments(contour):
    """Return the log-scaled Hu moments of `contour`, leaving out the 3rd and the last one.

    Each kept moment h is reported as -sign(h) * log10(|h|).
    """
    moments = cv2.moments(contour)
    allHu = cv2.HuMoments(moments).ravel().tolist()

    # Keep everything except index 2 (the third moment) and the final entry
    keptHu = allHu[:2] + allHu[3:-1]

    logScaled = []
    for value in keptHu:
        logScaled.append(-np.sign(value) * np.log10(np.abs(value)))
    return logScaled

## Part 1.2: Define Extract Features Function

In [None]:
"""
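@Description: This function will take an image and extract the features defined in Part 1.1, plus its mean and standard deviation

@Input: The pixel array of an image that has been read with pydicom

@Output: Returns the extracted features as a tuple: (mean, standard deviation, area, perimeter, irregularity, equivalent diameter, Hu moments)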

@Credit: This extraction function was borrowed from @suryathiru (https://www.kaggle.com/suryathiru/1-tradition-image-processing-feature-extraction/)
"""
def extractFeatures(image):
    """Extract intensity statistics and shape descriptors from one image.

    Pipeline: histogram equalisation -> 3x3 sharpening filter -> 7x7 Gaussian
    blur -> Otsu threshold -> Sobel edges -> external contours, of which the
    one with the most points is used for the Hu moments.

    @Input: Pixel array of the image. It is handed straight to
            cv2.equalizeHist (presumably 8-bit single-channel; confirm
            against the data being loaded).

    @Output: Tuple (mean, stdDev, area, perimeter, irregularity,
             equivalentDiameter, huMoments), where huMoments is the list
             returned by getImageHuMoments.

    @Raises: IndexError if no contour is found in the edge image.
    """
    mean = image.mean() # Mean
    stdDev = image.std() # Standard deviation
    equalized = cv2.equalizeHist(image) # Hist Equalisation

    # Sharpening kernel: -1 everywhere with 9 at the centre
    hpf_kernel = np.full((3, 3), -1)
    hpf_kernel[1, 1] = 9
    sharpened = cv2.filter2D(equalized, -1, hpf_kernel)

    # Blur, then let Otsu pick the binarisation threshold
    blurred = cv2.GaussianBlur(sharpened, (7, 7), 0)
    _, binarized = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    edges = skimage.filters.sobel(binarized) # Edge detection for binarized image

    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; index -2 is the contour list in both.
    contours = cv2.findContours((edges * 255).astype('uint8'),
                                cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if not contours:
        raise IndexError("extractFeatures: no contours found in the edge image")

    # Contour with the most points (the first one wins on ties)
    select_contour = max(contours, key=lambda c: c.shape[0])

    # Compute each measurement once and reuse it below
    area = getImageArea(binarized)
    perimeter = getImagePerimeter(edges)

    # Pass by keyword: getImageIrregularity takes (perimeter, area), and handing
    # it (area, perimeter) positionally silently computes the wrong ratio.
    irregularity = getImageIrregularity(perimeter=perimeter, area=area)

    # Return extracted features
    return (mean, stdDev, area, perimeter, irregularity,
            getImageEquivalentDiameter(area), getImageHuMoments(select_contour))

In [None]:
# Names of every regular file directly inside testImagesPath (sub-directories are skipped;
# no extension filter is applied, and the order is whatever listdir returns).
# NOTE(review): the test labels are built from a separate glob("*.dcm") listing (testFilepaths);
# anything that uses fileNames alongside those labels must make sure the two listings agree
# in order and content.
fileNames = [f for f in listdir(testImagesPath) if isfile(join(testImagesPath, f))] # Get test image filenames

## Part 1.3: Get Testing Data

In [None]:
"""
@Description: This function goes through the dicom metadata and returns 1 or 0
              depending on whether the 'SeriesDescription' of the image is 'view: PA' or not
              (the code checks the view position only; it does not look at any Pneumonia label)

@Inputs: A dataframe containing the metadata

@Output: Returns our test y
"""
def createY(df):
    """Build a 0/1 vector flagging rows whose 'SeriesDescription' is 'view: PA'.

    NOTE(review): this tests the view position only; nothing here reads a
    pneumonia label. Confirm this is the intended target.

    @Inputs: A dataframe with a 'SeriesDescription' column.

    @Output: 1-D float NumPy array with one entry per row of `df`, in row
             order: 1.0 where the description equals 'view: PA', else 0.0.
    """
    isViewPA = df['SeriesDescription'] == 'view: PA'

    # Convert by position rather than looking rows up by index label, so the
    # result is correct for any index (filtered, shuffled, non-integer, empty).
    return np.asarray(isViewPA, dtype=float)

In [None]:
def readDicomData(data):
    """Read the DICOM header of every file path in `data`.

    Pixel data is skipped (stop_before_pixels=True) to save memory, so the
    returned datasets carry metadata only.

    @Input: Iterable of DICOM file paths.

    @Output: List of the datasets that were read, in the same order as `data`.
    """
    # dcmread is pydicom's reader; the older read_file name is a deprecated
    # alias that newer pydicom releases no longer provide.
    return [pydicom.dcmread(filePath, stop_before_pixels=True) for filePath in tqdm(data)]

In [None]:
"""
@Description: This function parses the meta-data contained in a medical (dicom) image

@Inputs: Takes in the dicom image after it has been read

@Output: Returns the unpacked data and the group elements keywords
"""
def parseMetadata(dcm):
    """Flatten a read DICOM dataset into plain dictionaries.

    @Inputs: The dicom dataset after it has been read.

    @Output: A pair (unpackedData, groupElemToKeywords):
             unpackedData maps each element keyword to its value, and
             groupElemToKeywords maps each (group, elem) tag pair to its keyword.
             If a keyword occurs more than once, the last value seen is kept.
    """
    # Walk the dataset once up front: iterating forces the conversion from
    # lazy RawDataElement to DataElement before items() is used below.
    for _ in dcm:
        pass

    keywordToValue = {}
    tagToKeyword = {}

    for tag, element in dcm.items():
        name = element.keyword
        tagToKeyword[(tag.group, tag.elem)] = name
        keywordToValue[name] = element.value

    return keywordToValue, tagToKeyword


In [None]:
# Paths of every *.dcm file in the test image directory (order is whatever glob returns)
testFilepaths = glob.glob(f"{testImagesPath}/*.dcm") # Get test data file paths
testImages = readDicomData(testFilepaths) # Read the DICOM headers (no pixel data) for those paths

# parseMetadata returns a (metadata dict, tag->keyword dict) pair per image; zip(*...) regroups
# the pairs into one tuple of metadata dicts and one tuple of keyword maps.
# This unpacking needs at least one test image: with none, zip yields nothing to unpack.
testMetaDicts, testKeyword = zip(*[parseMetadata(x) for x in tqdm(testImages)])
test_df = pd.DataFrame.from_dict(data = testMetaDicts) # One row per image, in testFilepaths order

test_df['dataset'] = 'test' # Tag every row as belonging to the test set
# createY flags rows whose 'SeriesDescription' equals 'view: PA'; test_Y therefore follows testFilepaths order
test_Y = createY(test_df) # Call the create Y function to get our test Y

In [None]:
# Get testing images
featuresTest = []

# Iterate testFilepaths, the same glob listing that test_df and test_Y were built
# from, so featuresTest[i] always belongs to the same file as test_Y[i] and both
# have the same length. A separate directory listing could differ in order or
# include files that are not *.dcm.
for path in tqdm(testFilepaths): # Loop over test file paths

    # dcmread is pydicom's reader; the older read_file name is a deprecated alias
    image = pydicom.dcmread(path).pixel_array # Read file and get pixel array

    featuresTest.append(extractFeatures(image)) # Extract features & append to array

In [None]:
# Get training images
featuresTrain = []

# The same patientId may appear on several rows of labels. Remember the features
# already computed so each DICOM file is read and processed only once.
featuresByPatient = {}

# Loop over patient IDs; one entry is appended per row so featuresTrain stays
# aligned with labels. Rows sharing a patientId share the same feature tuple.
for patientId in tqdm(labels['patientId']):

    if patientId not in featuresByPatient:
        path = f"{trainImagesPath}/{patientId}.dcm" # Get path

        # dcmread is pydicom's reader; the older read_file name is a deprecated alias
        image = pydicom.dcmread(path).pixel_array # Read file and get pixel array

        featuresByPatient[patientId] = extractFeatures(image) # Extract features once per patient

    featuresTrain.append(featuresByPatient[patientId]) # Append this row's features

In [None]:
# Test table: one row per test image with the createY flag as 'Target'
testDf = pd.DataFrame(test_Y, columns = ['Target'])
# Attach the per-image feature tuples; the list must have exactly one entry per row of testDf
testDf['features'] = featuresTest

# Attach the per-row feature tuples to the training labels (one entry per row of labels)
labels['features'] = featuresTrain # Set features

In [None]:
# Preview the first three rows of the test feature table
testDf.head(3)

In [None]:
# Preview the first three rows of the training labels with their features
labels.head(3)

## Part 1.4: Save Data to CSV

In [None]:
# Write the test feature table to testImageFeatures.csv in the current working directory
testDf.to_csv('testImageFeatures.csv')

In [None]:
# Write the training labels (now including the 'features' column) to
# dicomImageFeatures.csv in the current working directory.
# labels already comes from pd.read_csv, so pd.DataFrame(labels) only re-wraps it.
df = pd.DataFrame(labels)
df.to_csv('dicomImageFeatures.csv')