In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import glob
import plotly as py
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import pydicom as dicom
import matplotlib.pylab as plt
import matplotlib
import cv2
import ast
import json
import os

In this competition, we are identifying and localizing COVID-19 abnormalities on chest radiographs. This is an object detection and classification problem.

For each test image, you will be predicting a bounding box and class for all findings. If you predict that there are no findings, you should create a prediction of "none 1 0 0 1 1" ("none" is the class ID for no finding, and this provides a one-pixel bounding box with a confidence of 1.0).

Further, for each test study, you should make a determination within the following labels:

'Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance'

**train_study_level.csv**

- id - unique study identifier
- Negative for Pneumonia - 1 if the study is negative for pneumonia, 0 otherwise
- Typical Appearance - 1 if the study has this appearance, 0 otherwise
- Indeterminate Appearance - 1 if the study has this appearance, 0 otherwise
- Atypical Appearance - 1 if the study has this appearance, 0 otherwise

**train_image_level.csv**

- id - unique image identifier
- boxes - bounding boxes in easily-readable dictionary format
- label - the correct prediction label for the provided bounding boxes

In [None]:
# Load the study-level labels (an id column followed by the four appearance flags)
# and preview the first two rows.
train_study = pd.read_csv("../input/siim-covid19-detection/train_study_level.csv")
train_study.head(2)

In [None]:
train_study.info()

In [None]:
columns = train_study.columns[1:] # ignore id

# For every label column, count the studies flagged 0 and the studies flagged 1.
# The counts stay integers so the "sum" histogram aggregates real numbers
# instead of digit strings.
x0 = list(columns)
y0 = [int((train_study[col] == 0).sum()) for col in columns]

x1 = list(columns)
y1 = [int((train_study[col] == 1).sum()) for col in columns]

# One trace per flag value; each label column becomes one category on the x axis.
fig = go.Figure()
fig.add_trace(go.Histogram(histfunc="sum", y=y0, x=x0, name="0"))
fig.add_trace(go.Histogram(histfunc="sum", y=y1, x=x1, name="1"))
py.offline.iplot(fig)

In [None]:
# Load the image-level annotations (id, boxes, label, ...) and preview the first two rows.
train_img = pd.read_csv("../input/siim-covid19-detection/train_image_level.csv")
train_img.head(2)

In [None]:
train_img['label'][0]

In [None]:
train_img['boxes'][0]

In [None]:
train_img.info()

In [None]:
# count of missing values in each column
train_img.isnull().sum()

In [None]:
# Load the sample submission file and preview the first two rows.
sample_sub= pd.read_csv('../input/siim-covid19-detection/sample_submission.csv')
sample_sub.head(2)

## Merge the train image and train study data on their UIDs, and add each image's file path to the merged DataFrame

In [None]:
# Relative locations of the train and test DICOM directories.
# NOTE(review): no cell visible here reads these two names (the glob cell below
# uses absolute /kaggle/input paths) — confirm whether they are still needed.
train_path = '../input/siim-covid19-detection/train'
test_path = '../input/siim-covid19-detection/test'

In [None]:
# Collect every .dcm file found three directory levels below each split folder.
train_imgs, test_imgs = (
    glob.glob(f'/kaggle/input/siim-covid19-detection/{split}/*/*/*.dcm')
    for split in ('train', 'test')
)

# Report how many files were found per split, with one example path each.
for split, found in (('train', train_imgs), ('test', test_imgs)):
    print(f'Total {split} images', len(found), f'\nSample {split} image', found[0])

In [None]:
def get_image_id(path):
    """Return the image id for a DICOM path.

    The id is the last '/'-separated component of `path` with every
    '.dcm' occurrence replaced by '_image'.
    """
    _, _, file_name = path.rpartition('/')
    return file_name.replace('.dcm', '_image')

In [None]:
# One row per globbed training DICOM file, holding its path.
training_images_df = pd.DataFrame({'training_images_path': train_imgs})

In [None]:
# Derive each row's image id from its file path.
training_images_df['image_id'] = training_images_df['training_images_path'].map(get_image_id)

# Attach the image-level annotations; the left join keeps every globbed file
# even when it has no matching annotation row.
training_images_df = training_images_df.merge(
    train_img,
    how='left',
    left_on='image_id',
    right_on='id',
)

training_images_df.head(3)

In [None]:
# 'id' duplicates 'image_id' after the merge, so only 'image_id' is kept.
training_images_df = training_images_df.drop(columns=['id'])
training_images_df.head(2)

#### Merge train img df with train study df

In [None]:
# Builds a one-column frame of training image paths from the globbed file list.
# NOTE(review): `training_images` is not referenced by any other cell visible
# here — confirm it is still needed.
training_images = pd.DataFrame(train_imgs, columns =['training_images_path'])

In [None]:
# The join key is the StudyInstanceUID with a "_study" suffix, matched against
# train_study's 'id' column. It is built with vectorised string concatenation:
# the UID is not a file path, so the path helper is not involved, and a missing
# UID simply yields a missing key instead of raising.
study_key = training_images_df['StudyInstanceUID'] + "_study"

# Left join keeps every image row; the temporary key and the duplicated study
# 'id' are dropped once the label columns have been attached.
training_images_df = (
    training_images_df
    .assign(StudyInstance=study_key)
    .merge(train_study, left_on='StudyInstance', right_on='id', how='left')
    .drop(['StudyInstance', 'id'], axis=1)
)
training_images_df.info()

In [None]:
training_images_df.head(3)

#### Save the merged training DataFrame to the output directory

In [None]:
# Write the merged frame to the working directory; index=False keeps the row
# index out of the CSV.
training_images_df.to_csv('./merged_train_df.csv', index=False)

In [None]:
training_images_df['training_images_path'][0]

In [None]:
#### Checking for duplicate ID

In [None]:
# dup_ids= training_images_df.groupby("StudyInstanceUID").count().reset_index()
# dup_ids

In [None]:
# no_dup = dup_ids[dup_ids["image_id"]==1] 
# no_dup

In [None]:
#training_images_df[training_images_df.StudyInstanceUID == 'fa9ea207e240']

In [None]:
training_images_df.head()

In [None]:
# Split the merged frame on the 'Negative for Pneumonia' flag:
# 0 -> study is not negative, 1 -> study is negative.
negative_flag = training_images_df['Negative for Pneumonia']
neg_pnemonia_no = training_images_df.loc[negative_flag.eq(0)]
neg_pnemonia_yes = training_images_df.loc[negative_flag.eq(1)]

In [None]:
neg_pnemonia_no.info()

In [None]:
neg_pnemonia_yes.info()

In [None]:
neg_pnemonia_no[neg_pnemonia_no['boxes'].isna()]

A total of 304 images that are NOT negative for pneumonia (i.e. that show pneumonia) have 'none' labels.

In [None]:
neg_pnemonia_yes[neg_pnemonia_yes['boxes'].isna()]

All images that are negative for pneumonia (i.e. no pneumonia, which is treated here as equivalent to not having COVID-19) have 'none' labels.

#### Image Analysis

In [None]:
training_images_df.loc[1, 'boxes']

In [None]:
# The 'boxes' cell is stored as text; literal_eval turns row 1's text into the
# Python object that the plotting cell below iterates over.
# NOTE(review): assumes row 1 actually has boxes — literal_eval cannot parse a
# missing (NaN) value; confirm.
boxes = ast.literal_eval(training_images_df.loc[1, 'boxes'])
print(boxes)

In [None]:
# Visualize single img with boxes

# Show the second globbed training image with its bounding boxes outlined in red.
image_path = train_imgs[1]
ds = dicom.dcmread(image_path).pixel_array
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for box in boxes:
    print('box', box)
    anchor = (box['x'], box['y'])
    outline = matplotlib.patches.Rectangle(
        anchor, box['width'], box['height'],
        edgecolor='r', facecolor='none', linewidth=1.5,
    )
    ax.add_patch(outline)
ax.imshow(ds, cmap='gray')
plt.show()

In [None]:
training_images_df.head(2)

In [None]:
training_images_df.columns

In [None]:
def visualization(class_name,df,color):
    """Show up to nine annotated images that belong to `class_name` in a 3x3 grid.

    Parameters
    ----------
    class_name : str
        Name of a 0/1 label column in `df` (e.g. 'Typical Appearance');
        rows whose flag equals 1 are displayed.
    df : pandas.DataFrame
        Frame with the columns 'training_images_path', 'boxes',
        'StudyInstanceUID' and `class_name`. It is not modified.
    color : str
        Matplotlib colour used for the bounding-box outlines.

    Returns
    -------
    None
        The figure is displayed as a side effect.
    """
    # Filter into a new frame: an in-place dropna would permanently remove the
    # un-annotated rows from the caller's DataFrame.
    annotated = df.dropna()
    # Keep rows that actually carry the requested class (flag == 1).
    records = annotated[annotated[class_name] == 1].iloc[:9].reset_index(drop=True)

    fig, axes = plt.subplots(3, 3, figsize=(20, 16))
    fig.subplots_adjust(hspace=.1, wspace=.1)
    axes = axes.ravel()

    # Hide the panels that stay unused when fewer than nine records match.
    for unused_ax in axes[len(records):]:
        unused_ax.axis('off')

    for idx, row in records.iterrows():
        ax = axes[idx]
        # Read the DICOM file that belongs to this row, not a leftover global path.
        img = dicom.dcmread(row['training_images_path']).pixel_array

        # 'boxes' is stored as text in the CSV-derived frame; accept
        # already-parsed lists as well.
        boxes = row['boxes']
        if isinstance(boxes, str):
            boxes = ast.literal_eval(boxes)

        for box in boxes:
            outline = matplotlib.patches.Rectangle((box['x'], box['y']),
                                                   box['width'], box['height'],
                                                   ec=color, fc='none', lw=2.)
            ax.add_patch(outline)

        ax.imshow(img, cmap='gray')
        ax.set_title(row['StudyInstanceUID'].split(' ')[0])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

    plt.show()

### Indeterminate Appearance 

In [None]:
visualization('Indeterminate Appearance',training_images_df,'g')

### Typical Appearance

In [None]:
visualization('Typical Appearance',training_images_df,'r')

### Atypical Appearance

In [None]:
visualization('Atypical Appearance',training_images_df,'b')

#### ************ *** EDA DONE ***************** ** ####

### Data Preparation

#### Replacing NaN boxes with [{'x': 0, 'y': 0, 'width': 1, 'height': 1}]

In [None]:
# Reload the merged frame written earlier in this notebook as
# './merged_train_df.csv'; no cell writes a file named 'training_images_df.csv'.
df = pd.read_csv('./merged_train_df.csv')

In [None]:
df.head()

In [None]:
# Placeholder annotation: a single box at (0, 0) with width 1 and height 1.
none_ = [{'x': 0, 'y': 0, 'width': 1, 'height': 1}]

In [None]:
# Write the placeholder into row 0 with a single label-based setter.
# Chained assignment (df.boxes[0] = ...) may act on a temporary object and
# leave df unchanged.
df.at[0, 'boxes'] = none_

In [None]:
# Row 0 may hold an already-parsed list or the textual form read from the CSV.
# Only text needs literal_eval; routing text through json.dumps first would
# hand the very same string back.
raw_boxes = df.loc[0, 'boxes']
boxes = ast.literal_eval(raw_boxes) if isinstance(raw_boxes, str) else raw_boxes
print(boxes)

In [None]:
# Visualize single img with box coordinate - 0,0,1,1

# Show the first globbed training image with the boxes parsed above outlined in red.
image_path = train_imgs[0]
ds = dicom.dcmread(image_path).pixel_array
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 4))
for box in boxes:
    print('box', box)
    anchor = (box['x'], box['y'])
    outline = matplotlib.patches.Rectangle(
        anchor, box['width'], box['height'],
        edgecolor='r', facecolor='none',
    )
    ax.add_patch(outline)
ax.imshow(ds, cmap='gray')
plt.show()

### Creating train_df with class labels and boxes

In [None]:
# Order of the label columns inside each one-hot vector.
class_columns = [
    "Negative for Pneumonia",
    "Typical Appearance",
    "Indeterminate Appearance",
    "Atypical Appearance",
]


def _row_to_one_hot(row):
    """Pack the four label flags of one row into a numpy array."""
    return np.array([row[name] for name in class_columns])


df["one_hot"] = df.apply(_row_to_one_hot, axis=1)

In [None]:
df.head(2)

In [None]:
# Maps a position in the one-hot vector to its class name.
classes_dict = dict(enumerate((
    "Negative for Pneumonia",
    "Typical Appearance",
    "Indeterminate Appearance",
    "Atypical Appearance",
)))

In [None]:
# argmax gives the position of the largest entry of each one-hot vector; that
# position is the key into classes_dict.
df["Class"] = [classes_dict[np.argmax(vector)] for vector in df["one_hot"]]
df["Class"].value_counts()

In [None]:
# The four flag columns and the helper vector are now summarised by "Class".
superseded_columns = [
    "Negative for Pneumonia",
    "Typical Appearance",
    "Indeterminate Appearance",
    "Atypical Appearance",
    "one_hot",
]
df = df.drop(columns=superseded_columns)
df.head(1)

In [None]:
# Replace nan boxes with one pixel coordinates
# Replace nan boxes with one pixel coordinates.
# The result is assigned back to the column: an in-place fillna on a column
# selection can act on a temporary object and leave df unchanged.
df["boxes"] = df["boxes"].fillna("[{'x':0,'y':0,'width':1,'height':1}]")

In [None]:
# Parse the textual box lists into Python objects. Values that are not text
# (e.g. an already-parsed list) pass through unchanged. Wrapping the text in
# json.dumps first would make literal_eval return the original string again.
df["boxes"] = df["boxes"].map(lambda x : ast.literal_eval(x) if isinstance(x, str) else x)

In [None]:
df.info()

In [None]:
# index=False keeps the positional index out of the file, so reading it back
# does not add an extra 'Unnamed: 0' column (same convention as the merged
# CSV saved earlier in this notebook).
df.to_csv('train_df_after_eda.csv', index=False)

In [None]:
df['training_images_path'][0]

In [None]:
df['training_images_path'][1]

In [None]:
pd.read_csv('./train_df_after_eda.csv')