# I. EXPLORATORY DATA ANALYSIS
- load and study the train sets
- view the distribution of classes
- view DICOM metadata (which will eventually be dropped before the experiments)

In [None]:
#### uncomment below lines if running for first time
#### install gdcm and related libs which read medical dicom files
#!conda install -c conda-forge pillow -y
#!conda install -c conda-forge pydicom -y
#!conda install -c conda-forge gdcm -y
#!pip install pylibjpeg pylibjpeg-libjpeg

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import numpy as np 
import pandas as pd 
from pandas import DataFrame

from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import os
import pydicom
import glob
import cv2

from fastai.vision.all import *
from fastai.medical.imaging import *

from tqdm.notebook import tqdm
from pydicom.pixel_data_handlers.util import apply_voi_lut
from skimage import exposure

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Recursively print the full path of every file found under the input directory.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)


## There are two datasets - study level and image level
## Study level is a classification problem: an X-ray may have one of the 4 given classes
- Negative: No findings, clean lungs
- Typical: findings common in COVID-19
- Atypical: findings uncommon in COVID-19
- Indeterminate: findings may occur in COVID-19 patients but usually seen for other infections

Loading study level dataset ...

In [None]:
# Read the study-level labels and shorten the verbose class column names.
short_class_names = {
    'Negative for Pneumonia': 'Negative',
    'Typical Appearance': 'Typical',
    'Indeterminate Appearance': 'Indeterminate',
    'Atypical Appearance': 'Atypical',
}
train_study_df = pd.read_csv(
    '../input/siim-covid19-detection/train_study_level.csv'
).rename(columns=short_class_names)

# Collapse the one-hot class columns into a single label column "y_study".
# 'Typical' is the starting value; the remaining flags are applied in this
# order, so a later flag overrides an earlier one if several are set.
train_study_df['y_study'] = 'Typical'
for class_name in ('Negative', 'Indeterminate', 'Atypical'):
    train_study_df.loc[train_study_df[class_name] == 1, 'y_study'] = class_name

train_study_df.head(3)

In [None]:
# draws barchart for the given df
def plot_barchart(df,x,y,x_title,title,colors=None,text=None):
    """Draw and show a grouped Plotly bar chart of one or more columns of ``df``.

    Args:
        df: DataFrame holding the data to plot.
        x: Name of the column used for the x axis.
        y: Column name, or list of column names, drawn as bars.
        x_title: Label for the x axis; it is title-cased before display.
        title: Figure title.
        colors: Optional value forwarded to ``px.bar`` as ``color``.
        text: Optional value forwarded to ``px.bar`` as ``text``. When it is
            omitted every bar is annotated with the value of its ``y`` column.
    """
    # A bare column name must not be iterated character by character when the
    # default annotations are built, so normalise it to a one-element list.
    y_columns = [y] if isinstance(y, str) else list(y)

    fig = px.bar(x=x,
                 y=y,
                 text=text,
                 labels={x: x_title.title()},
                 data_frame=df,
                 color=colors,
                 barmode='group',
                 template="simple_white")

    if text is None:
        # zip() stops at the shorter sequence, so a figure with fewer traces
        # than y columns no longer raises IndexError.
        # NOTE(review): when `colors` splits the data into several traces, a
        # whole-column annotation may not line up with each trace's bars - confirm.
        for trace, column in zip(fig.data, y_columns):
            trace.text = df[column].values

    for trace in fig.data:
        trace.textposition = 'inside'
        # Legend entries: "some_column" -> "Some Column"
        trace.name = trace.name.replace('_',' ').title()

    fig.update_layout(title_text=title,
                      title_font_size=19,
                      title_font_family='Droid Serif',
                      width=400,
                      height=400)

    fig.update_yaxes(tickprefix="", showgrid=True)

    fig.show()

# draws piechart for the given df
def plot_piechart(df, y, c, title):
    """Show a matplotlib pie chart built from two columns of ``df``.

    Args:
        df: DataFrame holding the data.
        y: Name of the column whose values label the wedges.
        c: Name of the column whose values size the wedges.
        title: Title drawn above the chart.
    """
    wedge_labels = df[y]
    wedge_sizes = df[c]

    plt.figure(figsize=(10, 5))
    plt.pie(wedge_sizes, labels=wedge_labels, autopct="%1.1f%%")
    plt.title(title)
    plt.show()

In [None]:
# Count the studies in each class and expose the result as a two-column
# frame (y_study, counts).
study_class_sizes = train_study_df.groupby(['y_study']).size()
train_study_df1 = study_class_sizes.reset_index(name='counts')
train_study_df1

In [None]:
# Bar chart: number of studies per y_study class
plot_barchart(
    df=train_study_df1,
    x='y_study',
    y=['counts'],
    x_title='Classes',
    title='Count of Classes',
)

In [None]:
# Pie chart: share of each y_study class
plot_piechart(
    df=train_study_df1,
    y='y_study',
    c='counts',
    title='Class Frequency %',
)

## The image-level dataset is an object detection problem. An X-ray has one of the two classes below, depending on whether it has bounding boxes or not. There may be 0 or more bounding boxes for each image, each given in x1,y1,W,H format.
- none: For images having no bounding box info (usually which are also Negative at study level), they have "none" at image level. No bounding box is drawn so a dummy one-pixel format is used as "none 1 0 0 1 1"
- opacity: For images having bounding boxes, they have "opacity" at image level. Each bounding box info is given in a dict.

The label is a space-separated string made of a class, a confidence and four box coordinates, repeated once per box. For example, when an image has 2 bounding boxes (one on each lung):
opacity 1 789.28836 582.43035 1815.94498 2499.73327 opacity 1 2245.91208 591.20528 3340.5737 2352.75472

Loading image level dataset ...

In [None]:
# Load the image-level annotations and peek at the first rows.
image_level_csv = '../input/siim-covid19-detection/train_image_level.csv'
train_image_df = pd.read_csv(image_level_csv)
train_image_df.head(3)

In [None]:
# The image class is the first whitespace-separated token of the label string.
train_image_df['y_image'] = [
    label.split()[0] for label in train_image_df.label
]

# Count the images in each class as a two-column frame (y_image, counts).
image_class_sizes = train_image_df.groupby(['y_image']).size()
train_image_df1 = image_class_sizes.reset_index(name='counts')
train_image_df1

In [None]:
# Bar chart: number of images per y_image class
plot_barchart(
    df=train_image_df1,
    x='y_image',
    y=['counts'],
    x_title='Classes',
    title='Count of Classes',
)

In [None]:
# Pie chart: share of each y_image class
plot_piechart(
    df=train_image_df1,
    y='y_image',
    c='counts',
    title='Class Frequency %',
)

In [None]:
# Join the image-level rows with their study-level labels.

# The study id column carries a "_study" suffix; strip it to obtain the join key.
train_study_df['StudyInstanceUID'] = [
    study_id.replace('_study', '') for study_id in train_study_df['id']
]
# NOTE(review): train_study_df keeps its 'id' column here; if train_image_df
# also has an 'id' column pandas will suffix the pair as id_x / id_y -
# confirm whether 'id' should be dropped before merging.
train_image_df = pd.merge(train_image_df, train_study_df, on='StudyInstanceUID')
train_image_df.sample(3)

In [None]:
def plot_distribution(df, clss, col, title, a):
    """Draw a filled kernel-density estimate of one column on a given axis.

    Args:
        df: DataFrame holding the data.
        clss: Name of the column whose distribution is drawn.
        col: Colour of the curve and its fill.
        title: Title set on the axis.
        a: Matplotlib axis to draw on.
    """
    # `fill` is the current name of the option; seaborn deprecated `shade`
    # and announces it will stop being accepted.
    sns.kdeplot(df[clss], fill=True, ax=a, color=col)
    a.set_title(title,font="Serif", fontsize=12)
    # The title already names the class, so the x-axis label is removed.
    a.set(xlabel=None)

In [None]:
# One density panel per study-level class, laid out on a 2x2 grid
# (row-major: Negative, Typical / Indeterminate, Atypical).
fig, ax = plt.subplots(2, 2, figsize=(12, 8))
panel_specs = [
    ("Negative", "#00ff00"),
    ("Typical", "#4209ff"),
    ("Indeterminate", "#f72545"),
    ("Atypical", "#ffba08"),
]
for (class_column, panel_color), panel_axis in zip(panel_specs, ax.flat):
    plot_distribution(
        train_image_df,
        class_column,
        panel_color,
        f"{class_column} Distribution",
        panel_axis,
    )

In [None]:
# view Xrays
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Args:
        path: Path of the DICOM file.
        voi_lut: When true, pass the pixels through ``apply_voi_lut``.
        fix_monochrome: When true, invert images whose
            PhotometricInterpretation is "MONOCHROME1".

    Returns:
        A ``uint8`` array rescaled so the smallest pixel is 0 and the largest
        is 255. A constant image comes back as all zeros.
    """
    # dcmread is the current pydicom reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)
    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division
    # instead of producing NaNs.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data
        
    
def plot_img(img, size=(5, 5), is_rgb=True, title="", cmap='gray'):
    """Show a single image in its own matplotlib figure.

    Args:
        img: Image array handed to ``plt.imshow``.
        size: Figure size passed to ``plt.figure``.
        is_rgb: Accepted for API compatibility; not used by the function.
        title: Figure-level title.
        cmap: Colormap passed to ``plt.imshow``.
    """
    figure = plt.figure(figsize=size)
    plt.imshow(img, cmap=cmap)
    figure.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=5, is_rgb=True, title="", cmap='gray', img_size=(300,300)):
    """Show a list of images on a grid with ``cols`` columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Edge length of each grid cell, used to size the figure.
        is_rgb: Accepted for API compatibility; not used by the function.
        title: Figure-level title.
        cmap: Colormap passed to ``plt.imshow``.
        img_size: ``(width, height)`` every image is resized to with
            ``cv2.resize``, or ``None`` to keep the original sizes.
    """
    # Ceiling division: exactly as many rows as the images need. The previous
    # `len(imgs)//cols + 1` added a spare empty row whenever the count was a
    # multiple of `cols`. At least one row is kept so the figure height stays
    # positive for an empty list.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        fig.add_subplot(rows, cols, i+1)
        plt.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()

In [None]:
# Collect the training DICOM paths and preview the last four images.
train_dicom_dir = '../input/siim-covid19-detection/train'
dicom_paths = get_dicom_files(train_dicom_dir)
imgs = list(map(dicom2array, dicom_paths[-4:]))
plot_imgs(imgs)

In [None]:
# Preview the first four images of the collected DICOM paths.
imgs = list(map(dicom2array, dicom_paths[:4]))
plot_imgs(imgs)

In [None]:
# (A filter keeping only rows with non-null `boxes` used to sit here; it is disabled.)
# Study-level class columns: 1 negative & 3 positive classes.
class_names = ['Negative', 'Typical', 'Indeterminate', 'Atypical']
# Distinct rows of the one-hot class matrix.
class_matrix = train_image_df[class_names].to_numpy()
unique_classes = np.unique(class_matrix, axis=0)

In [None]:
# Display the distinct class rows computed with np.unique(..., axis=0).
unique_classes

[[0, 0, 0, 1],[0, 0, 1, 0],[0, 1, 0, 0]])
       
[[0, 0, 1], [0, 1, 0],[1, 0, 0]]      

In [None]:
from glob import glob


def _label_to_boxes(label, scale=1):
    """Parse an image-level label string into ``[x1, y1, x2, y2]`` boxes.

    The label is a space-separated run of 6-field records: class, confidence
    and four coordinates. Coordinates are divided by ``scale``. Records whose
    class is ``none`` are the dummy one-pixel placeholder used for images
    without findings, so they are skipped rather than drawn. A trailing
    incomplete record is ignored.
    """
    tokens = label.split()
    boxes = []
    for start in range(0, len(tokens) - 5, 6):
        if tokens[start] == 'none':
            continue
        boxes.append([float(tok) / scale for tok in tokens[start + 2:start + 6]])
    return boxes


imgs = []
# Study-level one-hot label (as a printed list) -> colour of the boxes drawn.
label2color = {
    '[1, 0, 0, 0]': [255,255,255], # Negative Appearance - white
    '[0, 1, 0, 0]': [66,9,255], # Typical Appearance - blue
    '[0, 0, 1, 0]': [247,37,69], # Indeterminate Appearance - red
    '[0, 0, 0, 1]': [255,186,8], # Atypical Appearance - yellow
}
thickness = 3  # outline width of the drawn boxes
scale = 6      # images and box coordinates are shrunk by this factor before drawing

for _, row in train_image_df.iloc[10:26].iterrows():
    study_id = row['StudyInstanceUID']
    # NOTE(review): this takes the first file found under the study folder;
    # if a study can hold several images the boxes of this row may belong to
    # a different file - confirm.
    img_path = glob(f'../input/siim-covid19-detection/train/{study_id}/*/*')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    # Replicate the grey channel three times so coloured boxes can be drawn.
    img = np.stack([img, img, img], axis=-1)

    claz = row[class_names].values
    color = label2color[str(claz.tolist())]

    bboxes = _label_to_boxes(row['label'], scale)
    for box in bboxes:
        x1, y1, x2, y2 = (int(coord) for coord in box)
        img = cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
    img = cv2.resize(img, (600,600))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

## Reading DICOM files

In [None]:
from tqdm import tqdm
import glob
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import pprint

# Module-level switches read by dicom_dataset_to_dict() when it renders pixels:
#   voi_lut        - pass the pixels through apply_voi_lut
#   fix_monochrome - invert images stored as MONOCHROME1
voi_lut, fix_monochrome = True, True

def _dataset_to_dict(dataset):
    """Convert the elements of a pydicom dataset into a ``{name: value}`` dict.

    Pixel data is left out. A value that is itself a dataset is converted
    recursively; every other value goes through ``_convert_value``.
    """
    # NOTE(review): repr() is kept from the credited snippet; it appears to
    # make pydicom materialise lazily-read elements before .values() is
    # walked - confirm before removing.
    repr(dataset)
    converted = {}
    for element in dataset.values():
        if element.tag == (0x7fe0, 0x0010):
            #discard pixel data
            continue
        if isinstance(element.value, dicom.dataset.Dataset):
            converted[element.name] = _dataset_to_dict(element.value)
        else:
            converted[element.name] = _convert_value(element.value)
    return converted


def dicom_dataset_to_dict(filename,func):
    """Read a DICOM file and return its header as a dict, optionally with the image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        filename: Path of the DICOM file.
        func: ``'metadata_df'`` returns only the header dict; any other value
            also returns the rendered image.

    Returns:
        ``dicom_dict`` when ``func == 'metadata_df'``, otherwise the tuple
        ``(dicom_dict, image)`` where ``image`` is a ``uint8`` array rescaled
        to 0..255 (all zeros for a constant image).

    The module-level ``voi_lut`` and ``fix_monochrome`` flags control how the
    pixels are rendered.
    """
    dicom_header = dicom.dcmread(filename)

    #====== DICOM FILE DATA ======
    # Nested datasets are handled by the helper; the previous recursive call
    # passed a Dataset where a filename was expected and omitted `func`.
    dicom_dict = _dataset_to_dict(dicom_header)

    # pop() tolerates files that do not carry this element.
    dicom_dict.pop('Pixel Representation', None)

    if func == 'metadata_df':
        return dicom_dict

    #====== DICOM IMAGE DATA ======
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division
    # instead of producing NaNs.
    if peak > 0:
        data = data / peak
    modified_image_data = (data * 255).astype(np.uint8)

    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()

def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('ascii', 'replace')
        cv = _sanitise_unicode(s)
    elif t == dicom.valuerep.DSfloat:
        cv = float(v)
    elif t == dicom.valuerep.IS:
        cv = int(v)
    else:
        cv = repr(v)
    return cv



In [None]:
train_directory = "../input/siim-covid19-detection/train/"

# For every row, take the first file found two levels below its study folder.
training_paths = [
    glob.glob(os.path.join(train_directory, sid +"/*/*"))[0]
    for sid in tqdm(train_image_df['StudyInstanceUID'])
]
train_image_df['path'] = training_paths

# Render the first image twice (grey and plasma colormaps), then dump its header.
for filename in train_image_df.path[0:1]:
    dic, img_array = dicom_dataset_to_dict(filename, 'fetch_both_values')
    fig, ax = plt.subplots(1, 2, figsize=[15, 8])
    for panel, colormap in zip(ax, (plt.cm.gray, plt.cm.plasma)):
        panel.imshow(img_array, cmap=colormap)
    plt.show()
    pprint.pprint(dic)