# Competition
# SIIM-FISABIO-RSNA COVID-19 Detection
<font color = 'blue'>Identify and localize COVID-19 abnormalities on chest radiographs</font>

- Currently, COVID-19 can be diagnosed either via polymerase chain reaction (PCR), which detects genetic material from the virus, or via chest radiograph. However, it can take a few hours, and sometimes days, before the molecular test results are back.
- By contrast, chest radiographs can be obtained in minutes.

If successful, you'll help radiologists diagnose the millions of COVID-19 patients more confidently and quickly. This will also enable doctors to see the extent of the disease and help them make decisions regarding treatment. Depending upon severity, affected patients may need hospitalization, admission into an intensive care unit, or supportive therapies like mechanical ventilation. As a result of better diagnosis, more patients will quickly receive the best care for their condition, which could mitigate the most severe effects of the virus.

# Competition Rules

- CPU Notebook <= 9 hours run-time
- GPU Notebook <= 9 hours run-time
- Internet access disabled
- Freely & publicly available external data is allowed, including pre-trained models
- Submission file must be named submission.csv

# Evaluation Metric
- <font color = 'blue'>Standard PASCAL VOC 2010 **mean Average Precision (mAP)** at IoU > 0.5</font>

In [None]:
# Install the gdcm package from the conda-forge channel; -y skips the confirmation prompt.
# NOTE(review): presumably required so pydicom can decode compressed pixel data — confirm.
# NOTE(review): this step needs network access, while the rules listed above say
# Internet access is disabled — confirm how gdcm is provided at submission time.
!conda install gdcm -c conda-forge -y

In [None]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from tqdm import tqdm
from glob import glob
import gc

import cv2
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from IPython.display import display

# Notebook-wide plotting defaults: figure size, axes title size and seaborn palette.
plt.rcParams.update({
    "figure.figsize": (12, 8),
    'axes.titlesize': 16,
})
sns.set_palette('Set3_r')

import pydicom as dicom
import ast

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): presumably done to keep cell output uncluttered; it also hides
# deprecation warnings raised by the plotting libraries.
warnings.filterwarnings('ignore')

# List the entries of the competition input directory.
print(os.listdir('/kaggle/input/siim-covid19-detection/'))

# Remember when the run started (used by the final cell to report elapsed time)
# and print the current timestamp.
from time import time, strftime, gmtime
import datetime
start = time()
print(datetime.datetime.now())

In [None]:
# Root directory of the competition data. It ends with '/' because the cells
# below build file paths by plain string concatenation.
base_dir = '/kaggle/input/siim-covid19-detection/'

In [None]:
# Load train_study_level.csv, then show its shape and first rows.
train_label = pd.read_csv(f"{base_dir}train_study_level.csv")
print(train_label.shape)
train_label.head()

In [None]:
# Load train_image_level.csv, then show its shape and first rows.
train_img = pd.read_csv(f"{base_dir}train_image_level.csv")
print(train_img.shape)
train_img.head()

In [None]:
# Load sample_submission.csv, then show its shape and first rows.
sub = pd.read_csv(f"{base_dir}sample_submission.csv")
print(sub.shape)
sub.head()

In [None]:
# Collect every .dcm file found three directory levels below train/ and test/,
# then report how many were found in each split.
train_dicom = glob(f"{base_dir}train/*/*/*.dcm")
test_dicom = glob(f"{base_dir}test/*/*/*.dcm")
print("Number of train dicom files: {}".format(len(train_dicom)))
print("Number of test dicom files: {}".format(len(test_dicom)))

In [None]:
# Resolve the DICOM file that belongs to each image row.
# Taking the first .dcm found under the study directory gives every image of a
# multi-image study the same file, so boxes can end up drawn on the wrong scan.
# The lookup therefore also matches the file name against the image id.
# NOTE(review): assumes image ids look like '<name>_image' and the matching
# file is '<name>.dcm' — confirm against train_image_level.csv.
# A missing file still fails loudly with IndexError, as before.
train_img['dcm_path'] = [
    glob(f"{base_dir}train/{study_uid}/*/{image_id.split('_')[0]}.dcm")[0]
    for study_uid, image_id in zip(train_img['StudyInstanceUID'], train_img['id'])
]

In [None]:
# The study UID is the part of the study-level id before the first '_'.
train_label['StudyInstanceUID'] = [study_id.split('_')[0] for study_id in train_label['id']]
# Attach every train_label column except the first one to the image rows,
# joining on the study UID.
train_img = pd.merge(train_img, train_label.iloc[:, 1:], on = 'StudyInstanceUID')
print(train_img.shape)
train_img.head()

In [None]:
# Check whether any image carries more than one study-level class.
# The class columns are selected by name (the same train_label columns the
# count plots below iterate over) instead of by position: a positional slice of
# train_img would also pick up columns added by later cells when this cell is
# re-run.
train_img[train_label.columns[1:5]].sum(axis = 1).value_counts()

# EDA

In [None]:
# One count plot per study-level class column, laid out on a 2x2 grid.
plt.figure(figsize = (16, 12))
plt.suptitle('Countplot of Classes')
for i, c in enumerate(train_label.columns[1:5]):
    plt.subplot(2, 2, i + 1)
    # Pass the column by keyword: seaborn >= 0.12 interprets the first
    # positional argument as `data`, not as the x variable.
    ax = sns.countplot(data = train_label, x = c)
    for p in ax.patches:
        # Centre each count above its bar; ':.0f' keeps the label an integer
        # even when the bar height is stored as a float.
        ax.annotate(f"{p.get_height():.0f}",
                    (p.get_x() + p.get_width() / 2, p.get_height()),
                    ha = 'center', va = 'bottom')

- There can be multiple bounding boxes (BB) for a given image
- We create a feature holding the number of BBs for each image

In [None]:
# Number of bounding boxes per image.
# Every sixth whitespace-separated token of the label string, starting with the
# first, is an entry's class name. Only 'opacity' entries are counted: a 'none'
# entry marks an image without findings and must contribute 0 boxes, whereas
# dividing the token count by 6 reported 1.0 for such images.
train_img['bb_num'] = train_img['label'].apply(lambda x: x.split()[::6].count('opacity'))
plt.title('Number of Bounding Boxes per Image')
ax = sns.countplot(data = train_img, x = 'bb_num')
for p in ax.patches:
    # Centre each count above its bar; ':.0f' keeps the label an integer.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

In [None]:
def get_label(x):
    """Turn a list of per-box class names into a display category.

    Returns 'None' for ['none'], 'Opacity' for ['opacity'], 'Double Opacity'
    for any two-element list and 'More than 2 Opacity' for longer lists.
    Any other input (an empty list, or a single unrecognised name) yields
    the Python value None.
    """
    count = len(x)
    if count > 2:
        return 'More than 2 Opacity'
    if count == 2:
        return 'Double Opacity'
    if count == 1:
        name = x[0]
        if name == 'none':
            return 'None'
        if name == 'opacity':
            return 'Opacity'
    # Nothing matched: keep the implicit "no category" result.
    return None

In [None]:
# Distribution of images over the label categories produced by get_label.
plt.title('Countplot of Label Categories')  # fixed typo: was 'Cateogries'
# Every sixth token of the label string, starting with the first, is a class name.
train_img['label_category'] = train_img['label'].apply(lambda x: x.split()[::6]).apply(get_label)
ax = sns.countplot(data = train_img, x = 'label_category')
for p in ax.patches:
    # Centre each count above its bar; ':.0f' keeps the label an integer
    # even when the bar height is stored as a float.
    ax.annotate(f"{p.get_height():.0f}",
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha = 'center', va = 'bottom')

__Display Sample Dicom__

In [None]:
# Read the DICOM file referenced by the row labelled 0 and display the dataset.
sample = dicom.dcmread(train_img.loc[0, 'dcm_path'])
sample

In [None]:
#train_img['gender'] = train_img['dcm_path'].apply(lambda x: dicom.dcmread(x).PatientSex)
#ax = sns.countplot(data = train_img, x = 'gender')
#for p in ax.patches:
#       ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))

# Display Sample X-Ray Images

In [None]:
def display_dicom_images(idx):
    """Plot the X-rays of the given ``train_img`` rows with their boxes.

    Parameters
    ----------
    idx : iterable
        Index labels of the ``train_img`` rows to show. Nine labels fill a
        3x3 grid; other counts get as many three-column rows as needed and
        any unused panels are hidden.

    Each panel is titled with the row's ``label_category``. For rows whose
    category is not 'None', every entry of the row's ``boxes`` value is
    drawn as a red rectangle outline. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    idx = list(idx)
    n_cols = 3
    n_rows = max(1, -(-len(idx) // n_cols))  # ceiling division
    # Height scales with the row count; 3 rows give the original (15, 10).
    f, ax = plt.subplots(n_rows, n_cols, figsize = (15, 10 * n_rows / 3), squeeze = False)
    f.subplots_adjust(hspace = .1, wspace = .1)
    ax = ax.flatten()

    for i, ind in enumerate(idx):
        dicom_path = train_img['dcm_path'].loc[ind]
        label = train_img['label_category'].loc[ind]
        dicom_file = dicom.dcmread(dicom_path)
        img = dicom_file.pixel_array
        # MONOCHROME1 means the lowest stored value is displayed as white, so
        # flip the intensities to get the usual rendering with a gray colormap.
        if getattr(dicom_file, 'PhotometricInterpretation', None) == 'MONOCHROME1':
            img = img.max() - img
        ax[i].imshow(img, cmap = 'gray')
        ax[i].set_xticklabels([])
        ax[i].set_yticklabels([])
        ax[i].set_title(f"{label}", fontsize = 10)
        if label != 'None':
            # `boxes` is stored as the text of a list of dicts with the keys
            # x, y, width and height.
            boxes = ast.literal_eval(train_img['boxes'].loc[ind])
            for bb in boxes:
                outline = matplotlib.patches.Rectangle((bb['x'], bb['y']),
                                                       bb['width'],
                                                       bb['height'],
                                                       edgecolor = 'red',
                                                       facecolor = 'none')
                ax[i].add_patch(outline)

    # Hide panels that received no image.
    for unused in ax[len(idx):]:
        unused.axis('off')
    plt.show()

In [None]:
# Pick nine distinct rows; sampling without replacement keeps the same image
# from appearing twice in one grid.
idx = np.random.choice(train_img.index, 9, replace = False)
display_dicom_images(idx)

In [None]:
# Pick another nine distinct rows; sampling without replacement keeps the same
# image from appearing twice in one grid.
idx = np.random.choice(train_img.index, 9, replace = False)
display_dicom_images(idx)

# WIP

In [None]:
# Print the wall-clock time since `start` was recorded, formatted as HH:MM:SS.
finish = time()
elapsed = gmtime(finish - start)
print(strftime("%H:%M:%S", elapsed))