In [None]:
#Importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from glob import glob
import os

In [None]:
# Load the three CSV files used below: the detailed class info, the training
# labels (targets and box coordinates) and the sample submission.
detail_class_info, train_label, samp_submission = map(pd.read_csv, (
    '../input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv',
    '../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv',
    '../input/rsna-pneumonia-detection-challenge/stage_2_sample_submission.csv',
))

In [None]:
# Preview the first five rows of the detailed class info table
detail_class_info.head(n=5)

In [None]:
# Preview the first five rows of the training labels table
train_label.head(n=5)

In [None]:
# Preview the first five rows of the sample submission table
samp_submission.head(n=5)

*Understanding the dataset*

The detail_class_info file lists the patient IDs together with the class assigned to each of them, i.e., "No Lung Opacity / Not Normal", "Normal" or "Lung Opacity".

The train_labels file lists the patient IDs together with a Target flag, and contains the bounding-box details (x, y, width, height) for the cases confirmed with Target = 1.

In [None]:
# Report how many rows the detailed class info file holds and how many
# distinct patients those rows cover.
class_rows = len(detail_class_info)
class_patients = detail_class_info['patientId'].nunique()
print("Details under Class Upload file")
print(class_rows, 'Number of Patient ID Loaded')
print(class_patients, 'Number of Patient cases available')

In [None]:
# Report how many rows the training label file holds and how many
# distinct patients those rows cover.
label_rows = len(train_label)
label_patients = train_label['patientId'].nunique()
print("Details under Train Label file")
print(label_rows, 'Number of Patient ID Loaded')
print(label_patients, 'Number of Patient cases available')

In [None]:
# Count the non-null entries of every column, per class
rows_by_class = detail_class_info.groupby('class')
rows_by_class.count()

In [None]:
# Count the patientId entries recorded for each Target value
train_label.groupby('Target')['patientId'].count()

In [None]:
# Total number of missing cells across all columns of the detailed class info table
missing_total = detail_class_info.isnull().sum().sum()
print("Number of missing values in Details class file: ", missing_total)

In [None]:
# Bar chart of the number of rows per class in the detailed class info table
class_sizes = detail_class_info.groupby('class').size()
class_sizes.plot(kind='bar', rot=0, color=['red', 'green', 'blue'])

In [None]:
# Bar chart of the number of training rows per Target value
target_sizes = train_label.groupby('Target').size()
target_sizes.plot(kind='bar', rot=0, color=['green', 'red'])

Insight on DCM files provided in Train and Test Folders


Medical images are stored in a special format known as DICOM files (*.dcm). They contain a combination of header metadata and a raw image array of pixel data. We will use a library called pydicom to read the files. To read the file for a given patientId, locate the matching file in the stage_2_train_images or stage_2_test_images folder and load it with pydicom.dcmread() (exposed as pydicom.read_file() in older pydicom releases):

In [None]:
import pydicom

In [None]:
# Read the DICOM file of one patient from the *train* image folder.
# The patient is the one stored under index label 4 of the training labels.
patID = train_label.loc[4, 'patientId']
# NOTE: despite its name, img_train_folder holds the path of a single .dcm file.
img_train_folder = '../input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' % patID
# dcmread is the supported reader; read_file is a deprecated alias of it.
dcm_data = pydicom.dcmread(img_train_folder)

# Printing the dataset shows its header elements
print(dcm_data)

In [None]:
# Take the pixel data of the dataset read above and report, one per line,
# its container type, element dtype and shape.
im = dcm_data.pixel_array
print(type(im), im.dtype, im.shape, sep='\n')

In [None]:
import pylab

# Show the pixel array with the 'gist_gray' colormap and hide the axes
pylab.imshow(im, cmap='gist_gray')
pylab.axis('off')

Exploring the Data and Labels

Any given patient may potentially have many boxes if there are several different suspicious areas of pneumonia. To collapse the current CSV file dataframe into a dictionary with unique entries, we will consider the following method:

In [None]:
def parse_data(df):
    """
    Collapse a label dataframe (one row per box) into a nested dictionary
    holding exactly one entry per patient:

      parsed = {

        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        },
        'patientId-01': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        }, ...

      }

    The dataframe must provide the columns 'patientId', 'Target', 'x', 'y',
    'width' and 'height'. The label of a patient is the 'Target' value of the
    first row seen for that patient. Each box is stored as
    [y, x, height, width] and is only recorded when that label equals 1.

    """
    # --- Column order in which the coordinates of a box are stored
    box_fields = ('y', 'x', 'height', 'width')

    parsed = {}
    for _, record in df.iterrows():
        patient = record['patientId']

        # --- Create the entry the first time this patient shows up
        entry = parsed.get(patient)
        if entry is None:
            entry = {
                'dicom': '../input/rsna-pneumonia-detection-challenge/stage_2_train_images/%s.dcm' % patient,
                'label': record['Target'],
                'boxes': []}
            parsed[patient] = entry

        # --- Record the box of this row when the patient is labelled 1
        if entry['label'] == 1:
            entry['boxes'].append([record[field] for field in box_fields])

    return parsed

In [None]:
# Build the per-patient dictionary from the training labels
parsed = parse_data(df=train_label)

In [None]:
# Inspect the parsed entry of a single patient
example_patient_id = '00436515-870c-4b36-a041-de91049b9ab4'
print(parsed[example_patient_id])

In [None]:
def draw(data):
    """
    Method to draw single patient with bounding box(es) if present

    :param data: dictionary with the keys 'dicom' (path of the DICOM file to
                 open) and 'boxes' (list of [y, x, height, width] boxes),
                 i.e. one entry of the dictionary built by parse_data()

    The image is shown with pylab; nothing is returned.

    """
    # --- Open DICOM file (dcmread replaces the deprecated read_file alias)
    d = pydicom.dcmread(data['dicom'])
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    # (assumes pixel_array is 2-D, so the channels end up on the last axis)
    im = np.stack([im] * 3, axis=2)

    # --- Add boxes with random color if present
    for box in data['boxes']:
        rgb = np.random.randint(0, 256, size=3)
        im = overlay_box(im=im, box=box, rgb=rgb, stroke=6)

    # --- No cmap is passed: imshow ignores colormaps for RGB input
    pylab.imshow(im)
    pylab.axis('off')

def overlay_box(im, box, rgb, stroke=1):
    """
    Method to overlay single box on image

    :param im: image array indexed as im[row, column, ...]; it is modified in
               place
    :param box: box coordinates as [y, x, height, width]; each value is
                truncated to an integer
    :param rgb: color assigned to every pixel of the outline
    :param stroke: thickness of the outline in pixels
    :return: the same array that was passed in as im

    The outline starts at (y, x); the bottom and right edges are drawn
    starting at y + height and x + width respectively.

    """
    # --- Convert coordinates to integers and extract them
    y1, x1, height, width = (int(b) for b in box)
    y2 = y1 + height
    x2 = x1 + width

    # --- Horizontal edges; the bottom edge extends to x2 + stroke so that the
    # --- bottom-right corner is filled and the outline is closed
    im[y1:y1 + stroke, x1:x2] = rgb
    im[y2:y2 + stroke, x1:x2 + stroke] = rgb

    # --- Vertical edges
    im[y1:y2, x1:x1 + stroke] = rgb
    im[y1:y2, x2:x2 + stroke] = rgb

    return im

In [None]:
# Display one patient together with the boxes stored for that patient
draw(data=parsed['00436515-870c-4b36-a041-de91049b9ab4'])

Data Setup

In [None]:
def formatMetadataString(val):
    """
    Return the text found between the first and the second colon of str(val),
    with every single quote removed.

    Whitespace surrounding that text is kept as is. An IndexError is raised
    when str(val) contains no colon.
    """
    fields = str(val).split(':')
    return fields[1].replace("'", '')


In [None]:
# NOTE: this import binds the name `glob` to the module, replacing the
# `glob` function imported in the first cell.
import glob

# One [patientId, age, gender, viewPos] record per training image
image_data = []

dataDir='../input/rsna-pneumonia-detection-challenge'
trainimg_folder='../input/rsna-pneumonia-detection-challenge/stage_2_train_images'

# trainimg_folder already starts with dataDir, so it is searched on its own:
# joining the two relative paths yields a directory that does not exist and
# the pattern would match no file.
trainfiles = glob.glob(os.path.join(trainimg_folder, "*.dcm"))

for f in trainfiles:
    # Only header elements are used below, so reading stops before the pixel data
    ds = pydicom.dcmread(f, stop_before_pixels=True)

    patientId = formatMetadataString(ds['PatientID'])
    age = formatMetadataString(ds['PatientAge'])
    gender = formatMetadataString(ds['PatientSex'])
    viewPos = formatMetadataString(ds['ViewPosition'])
    image_data.append([patientId, int(age), gender, viewPos])
    
