![COVID-19](https://images.pexels.com/photos/3992933/pexels-photo-3992933.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500)

<div style="color:white; display:fill; border-radius:5px; background-color:#5642C5;font-size:200%;font-family:Verdana; letter-spacing:0.5px"><p style="padding: 10px;color:white;">Introduction</p></div>

## This is a competition where we need to identify & localize COVID-19 abnormalities from the chest radiographs.
## This is an object detection and classification problem.

## Items to be classified (Performed on study level on training dataset, need to find the same for test dataset)
1. Negative for Pneumonia
2. Typical Appearance
3. Indeterminate Appearance
4. Atypical Appearance

Let's get started !!!

<div style="font-size:30px" class="alert alert-block alert-info"> Cool Imports</div>

In [None]:
pip install pylibjpeg pylibjpeg-libjpeg pydicom


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, glob, ast, cv2, sys
from pathlib import Path
import seaborn as sns
import pylibjpeg
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from collections import Counter
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
init_notebook_mode(connected=True)

from colorama import Fore, Back, Style
# Short aliases for the ANSI colour codes used in the printed summaries below.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
sr_ = Style.RESET_ALL
print(f"{y_}Folder Contents:{b_}\n")

# Preview only the first few input files. The limit is checked per file (not
# per directory) and the walk stops as soon as it is reached, so a directory
# holding thousands of files neither floods the output nor forces a full
# traversal of the dataset tree.
count = 1
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        if count >= 10:
            break
        print(os.path.join(dirname, filename))
        count += 1
    if count >= 10:
        break
print(f"{sr_}")

<div style="font-size:30px" class="alert alert-block alert-info"> Dataset Analytics</div>

In [None]:
# Count the CSV files and the study / series / image entries of each split.
dataset_root = '/kaggle/input/siim-covid19-detection'

files = glob.glob(f'{dataset_root}/*')
csv_file_count = sum(1 for file in files if file.endswith('.csv'))
print(f" {r_} Number of CSV Files : {b_}{csv_file_count}{sr_}")

# Training split: <root>/train/<study>/<series>/<image>.dcm
training_study_files = glob.glob(f'{dataset_root}/train/*')
training_series_files = glob.glob(f'{dataset_root}/train/*/*')
training_image_files = glob.glob(f'{dataset_root}/train/*/*/*.dcm')
print(f" {r_} Number of Training Study Files : {b_}{len(training_study_files)}{sr_}")
print(f" {r_} Number of Training Series Files : {b_}{len(training_series_files)}{sr_}")
print(f" {r_} Number of Training Image (.dcm) Files : {b_}{len(training_image_files)} \n")

# Testing split: same directory layout under <root>/test
testing_study_files = glob.glob(f'{dataset_root}/test/*')
testing_series_files = glob.glob(f'{dataset_root}/test/*/*')
testing_image_files = glob.glob(f'{dataset_root}/test/*/*/*.dcm')
print(f" {r_} Number of Testing Study Files : {b_}{len(testing_study_files)}{sr_}")
print(f" {r_} Number of Testing Series Files : {b_}{len(testing_series_files)}{sr_}")
print(f" {r_} Number of Testing Image (.dcm) Files : {b_}{len(testing_image_files)}{sr_}")

## Intuitively, we can think of each study as one patient (6064 studies), and a single patient can have multiple radiographs.
## That is why there are 6334 image (.dcm) files in total.

<div style="font-size:20px" class="alert alert-block alert-info"> Sneak peek at train_study_level.csv </div>

## train_study_level.csv - The train study-level metadata, with one row for each study, including correct labels.

In [None]:
# Load the study-level labels (one row per study).
tsl_df = pd.read_csv('/kaggle/input/siim-covid19-detection/train_study_level.csv')
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
tsl_df.info()
tsl_df.head()

In [None]:
# Distinct label combinations: unique rows across every column after the first (id) column.
label_columns = list(tsl_df.columns[1:])
unique_label_rows = np.unique(tsl_df[label_columns].values, axis=0)
print(f"{r_}Unique Elements : \n {b_}{unique_label_rows}{sr_}")

<div style="font-size:30px" class="alert alert-block alert-info"> Class Distribution</div>

In [None]:
# Per-class counts of 0 and 1 labels, drawn as two bar series.
columns = tsl_df.columns[1:]

# The first four columns after the id column are the class indicator columns.
x0 = list(columns[:4])
# Keep the counts numeric: they are summed by the histogram (histfunc="sum"),
# so passing them as strings only relied on implicit coercion by plotly.
y0 = [int((tsl_df[col] == 0).sum()) for col in x0]

x1 = list(columns[:4])
y1 = [int((tsl_df[col] == 1).sum()) for col in x1]

fig = go.Figure()
fig.add_trace(go.Histogram(histfunc="sum", y=y0, x=x0, name="0"))
fig.add_trace(go.Histogram(histfunc="sum", y=y1, x=x1, name="1"))
fig.update_layout(title_text='Class Distribution')
py.offline.iplot(fig)

# As expected, the distribution is slightly skewed: every class has more samples labelled 0 than samples labelled 1.

<div style="font-size:20px" class="alert alert-block alert-info"> Sneak peek at train_image_level.csv </div>

## train_image_level.csv - the train image-level metadata, with one row for each image, including both correct labels and any bounding boxes in a dictionary format. Some images in both test and train have multiple bounding boxes.

In [None]:
# Load the image-level metadata (one row per image, with its bounding boxes).
til_df = pd.read_csv('/kaggle/input/siim-covid19-detection/train_image_level.csv')
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
til_df.info()
til_df.head()

<div style="font-size:30px" class="alert alert-block alert-info"> Bounding Box Distribution</div>

## Object Detection format (x,y,w,h)
![coords](https://i.imgur.com/Ow9oPGx.png)

In [None]:
# Number of bounding boxes per image. A string entry is parsed into a list of
# box dicts; anything else (NaN) counts as zero boxes.
box_frequency = [
    len(ast.literal_eval(boxes)) if isinstance(boxes, str) else 0
    for boxes in til_df.boxes
]

# Tally how many images have each box count, then draw it as a donut chart.
box_count_tally = Counter(box_frequency)
pie_trace = go.Pie(
    labels=list(box_count_tally.keys()),
    values=list(box_count_tally.values()),
    textinfo='label+percent',
    insidetextorientation='radial',
    hole=.3,
)
fig = go.Figure(data=[pie_trace])
fig.update_layout(title_text='BBOX Distribution - Pie')
py.offline.iplot(fig)


In [None]:
# Same per-image box counts as a histogram.
bbox_histogram = go.Histogram(x=box_frequency, bingroup=1)
fig = go.Figure(data=bbox_histogram)
fig.update_layout(title_text='BBOX Distribution - Bar', barmode="overlay", bargap=0.1)
py.offline.iplot(fig)

In [None]:
# Split images into those without any box ("none") and those with at least one ("opacity").
boxes_per_image = np.asarray(box_frequency)
count_of_none = np.count_nonzero(boxes_per_image == 0)
count_of_opacity = np.count_nonzero(boxes_per_image != 0)

none_vs_opacity = go.Histogram(
    histfunc="sum",
    x=['none', 'opacity'],
    y=[count_of_none, count_of_opacity],
)
fig = go.Figure(data=[none_vs_opacity])
fig.update_layout(title_text='Distribution of None Vs Opacity')
py.offline.iplot(fig)

<div style="font-size:30px" class="alert alert-block alert-info"> Custom Functions</div>

In [None]:
## Functions

def get_image_id(path):
    """Return the image id for a DICOM path.

    Takes the text after the last '/' in ``path`` and replaces every
    '.dcm' occurrence in it with '_image'. A string without '/' or '.dcm'
    is returned unchanged.
    """
    file_name = path.rsplit('/', 1)[-1]
    return file_name.replace('.dcm', '_image')


def create_study_id(path):
    """Return the last '/'-separated component of ``path`` with '.dcm' replaced by '_image'.

    NOTE(review): despite its name, this produces the same '<name>_image'
    value as ``get_image_id``; confirm whether a study-level id was intended.
    """
    _, _, file_name = path.rpartition('/')
    return file_name.replace('.dcm', '_image')

def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return it as an 8-bit, 3-channel image array.

    Parameters
    ----------
    path : str
        Path of the .dcm file to read.
    voi_lut : bool, default True
        Apply the VOI LUT (if the DICOM device provides one) to turn the raw
        pixel data into a "human-friendly" view.
    fix_monochrome : bool, default True
        Invert the pixel values when PhotometricInterpretation is
        "MONOCHROME1", where the X-ray would otherwise look inverted.

    Returns
    -------
    numpy.ndarray
        uint8 array rescaled to 0-255 and converted with cv2.COLOR_GRAY2RGB.
    """
    # dcmread is the supported reader; read_file is a deprecated alias that
    # is no longer available in pydicom 3.x.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max scale to [0, 1]. A constant image has a zero range; leave it at
    # zero instead of dividing by zero and producing NaNs.
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    data = cv2.cvtColor(data, cv2.COLOR_GRAY2RGB)

    return data

def do_annotations(path, coords, color_codes):
    """Load the DICOM at ``path`` and draw one rectangle per entry of ``coords``.

    Each entry of ``coords`` is indexed by 'x', 'y', 'width' and 'height';
    ``color_codes`` is passed to cv2.rectangle as the line colour and the
    line thickness is 15. Returns the annotated image array.
    """
    annotated = dicom2array(path)
    for box in coords:
        top_left = (int(box['x']), int(box['y']))
        bottom_right = (int(box['x'] + box['width']), int(box['y'] + box['height']))
        cv2.rectangle(annotated, top_left, bottom_right, color_codes, 15)
    return annotated

def make_subplots(display_image, display_image_w_annotation, title):
    """Show plain and annotated images side by side, one pair per row.

    NOTE: this definition shadows ``plotly.subplots.make_subplots`` imported
    at the top of the notebook; the name is kept so existing calls still work.

    Parameters
    ----------
    display_image : list
        Images shown in the left column.
    display_image_w_annotation : list
        Images shown in the right column, paired by position with
        ``display_image``.
    title : str
        Label placed above the two columns.
    """
    n_rows = min(len(display_image), len(display_image_w_annotation))
    if n_rows == 0:
        # Nothing to draw; plt.subplots cannot build a grid with zero rows.
        print(f"No images to display for '{title}'.")
        return

    # squeeze=False keeps `ax` two-dimensional even for a single row; the
    # height scales with the row count (20 for the usual three rows).
    fig, ax = plt.subplots(n_rows, 2, sharex='col', squeeze=False,
                           figsize=(20, 20 * n_rows / 3),
                           gridspec_kw={'hspace': 0, 'wspace': 0.1})

    for row, (plain, annotated) in enumerate(zip(display_image, display_image_w_annotation)):
        ax[row, 0].imshow(cv2.resize(plain, (1024, 1024)))
        ax[row, 1].imshow(cv2.resize(annotated, (1024, 1024)))

    ax[0, 0].set_title(title)
    ax[0, 1].set_title(f"{title} - with annotations")

    plt.subplots_adjust(left=0.5)
    plt.show()
    
    

def prepare_sample(training_images_df, col_number, samples, nan_flag = False):
    """Collect up to ``samples`` random images of one class for display.

    Parameters
    ----------
    training_images_df : pandas.DataFrame
        Frame with a 'training_images_path' column, a 'boxes' column and the
        class indicator columns.
    col_number : int
        Positional index of the class indicator column; rows where it equals
        1 are the candidates.
    samples : int
        Maximum number of images to collect.
    nan_flag : bool, default False
        If True, no boxes are drawn (both lists hold the plain image) and the
        shuffled subset is returned as a fourth value. If False, only rows
        whose 'boxes' value is a string are used and the boxes are drawn on
        the second image.

    Returns
    -------
    tuple
        ``(display_image, display_image_w_annotation, title)``, plus
        ``df_subset`` as a fourth element when ``nan_flag`` is True. The lists
        hold fewer than ``samples`` images when the class does not have
        enough usable rows (previously the function returned None then).
    """
    df_subset = training_images_df[training_images_df.iloc[:, col_number] == 1].sample(frac=1)
    title = str(training_images_df.columns[col_number])

    # Box colour per class column; unknown columns fall back to the first
    # colour instead of raising KeyError.
    color_codes = {5: (255, 0, 0), 6: (0, 0, 255), 7: (0, 255, 0)}
    box_color = color_codes.get(col_number, (255, 0, 0))

    display_image, display_image_w_annotation = [], []
    for _, row in df_subset.iterrows():
        if len(display_image) >= samples:
            break
        path = row['training_images_path']
        if nan_flag:
            # Decode the DICOM once and reuse it for both columns.
            image = dicom2array(path)
            display_image.append(image)
            display_image_w_annotation.append(image.copy())
        elif isinstance(row['boxes'], str):  # skip rows without boxes (NaN)
            display_image.append(dicom2array(path))
            display_image_w_annotation.append(
                do_annotations(path, ast.literal_eval(row['boxes']), box_color))

    # Always return after the loop, so callers get a tuple even when the
    # subset is exhausted before (or exactly when) `samples` is reached.
    if nan_flag:
        return display_image, display_image_w_annotation, title, df_subset
    return display_image, display_image_w_annotation, title

<div style="font-size:20px" class="alert alert-block alert-info"> Merging two dataframes into a single one based on the foreign key</div>

In [None]:
# One row per training DICOM file, joined to its image-level metadata.
training_images_path = glob.glob('/kaggle/input/siim-covid19-detection/train/*/*/*.dcm')
training_images_df = pd.DataFrame(training_images_path, columns =['training_images_path'])

# Derive the join key ('<file name>_image') from each file path.
training_images_df['image_id'] = training_images_df['training_images_path'].map(get_image_id)
training_images_df = (
    training_images_df
    .merge(til_df, how='left', left_on='image_id', right_on='id')
    .drop(columns=['image_id', 'id'])
)
training_images_df.head()

In [None]:
# Build the study-level join key ('<StudyInstanceUID>_study') and attach the study labels.
training_images_df['StudyInstance'] = training_images_df['StudyInstanceUID'].map(get_image_id) + "_study"
training_images_df = (
    training_images_df
    .merge(tsl_df, how='left', left_on='StudyInstance', right_on='id')
    .drop(columns=['StudyInstance', 'id'])
)
training_images_df.info()
# Persist the merged table so it can be reused without repeating the merge.
training_images_df.to_csv('/kaggle/working/training_images_df.csv', index=False)
training_images_df.head()

<div style="font-size:30px" class="alert alert-block alert-info"> Image Analytics</div>

<div style="font-size:20px" class="alert alert-block alert-warning"> Samples based on the class --> Negative for Pneumonia</div>

In [None]:
# nan_flag=True: prepare_sample draws no boxes (both lists hold the plain image)
# and additionally returns the shuffled subset of rows whose column 4 equals 1.
display_image, display_image_w_annotation, title, df_subset = prepare_sample(training_images_df, col_number=4, samples = 3, nan_flag = True)
make_subplots(display_image, display_image_w_annotation, title)

<div style="font-size:20px" class="alert alert-block alert-warning"> Samples based on the class --> Typical Appearance</div>

In [None]:
# Three randomly ordered rows whose column 5 equals 1 and that have boxes; rows without boxes are skipped.
display_image, display_image_w_annotation, title = prepare_sample(training_images_df, col_number=5, samples = 3)
make_subplots(display_image, display_image_w_annotation, title)

<div style="font-size:20px" class="alert alert-block alert-warning"> Samples based on the class --> Indeterminate Appearance</div>

In [None]:
# Three randomly ordered rows whose column 6 equals 1 and that have boxes; rows without boxes are skipped.
display_image, display_image_w_annotation, title = prepare_sample(training_images_df, col_number=6, samples = 3)
make_subplots(display_image, display_image_w_annotation, title)

<div style="font-size:20px" class="alert alert-block alert-warning"> Samples based on the class --> Atypical Appearance</div>

In [None]:
# Three randomly ordered rows whose column 7 equals 1 and that have boxes; rows without boxes are skipped.
display_image, display_image_w_annotation, title = prepare_sample(training_images_df, col_number=7, samples = 3)
make_subplots(display_image, display_image_w_annotation, title)

<div style="font-size:20px" class="alert alert-block alert-danger"> Work in Progress </div>