In [None]:
import pandas as pd
import numpy as np
import ast

import re
import os

In [None]:
ls ../input/siim-covid19-resized-to-512px-jpg/meta.csv

In [None]:
FOLDER = '../input/siim-covid19-detection'

In [None]:
# train set: the study-level and image-level CSV tables, read from the dataset
# root configured in FOLDER so the location is defined in a single place.
train_study_df = pd.read_csv(os.path.join(FOLDER, 'train_study_level.csv'))
train_image_df = pd.read_csv(os.path.join(FOLDER, 'train_image_level.csv'))

In [None]:
train_study_df.head()

In [None]:
train_image_df.head()

# 1. Basic info

In [None]:
# Distinct studies and distinct image ids referenced by the image-level training table.
n_train_studies = train_image_df['StudyInstanceUID'].nunique()
n_train_images = train_image_df['id'].nunique()
print(f'# studies in train set: {n_train_studies}')
print(f'# images in train set: {n_train_images}')

In [None]:
# test set
def get_df_from_folder(phase='test', folder=None):
    """Build an image-to-study table by walking ``<folder>/<phase>/<study>/<series>/<file>``.

    Parameters
    ----------
    phase : str
        Sub-folder of the dataset root to scan (default ``'test'``).
    folder : str, optional
        Dataset root. When omitted, the notebook-level ``FOLDER`` constant is used.

    Returns
    -------
    pandas.DataFrame
        One row per file found, with columns ``id`` (file name without a trailing
        ``.dcm``, followed by ``'_image'``) and ``StudyInstanceUID`` (study folder
        name followed by ``'_study'``). Rows are ordered by study, series and file name.
    """
    root = os.path.join(FOLDER if folder is None else folder, phase)
    extension = '.dcm'

    ret_studies = []
    ret_images = []

    # os.listdir returns entries in arbitrary order; sorting keeps the table reproducible.
    for study in sorted(os.listdir(root)):
        study_dir = os.path.join(root, study)
        if not os.path.isdir(study_dir):
            # stray files next to the study folders are not studies
            continue
        for sr in sorted(os.listdir(study_dir)):
            series_dir = os.path.join(study_dir, sr)
            if not os.path.isdir(series_dir):
                continue
            for image in sorted(os.listdir(series_dir)):
                # strip only a trailing extension, never a '.dcm' in the middle of the name
                stem = image[:-len(extension)] if image.endswith(extension) else image
                ret_studies.append(study + '_study')
                ret_images.append(stem + '_image')
    return pd.DataFrame({'id': ret_images, 'StudyInstanceUID': ret_studies})

In [None]:
test_image_df = get_df_from_folder('test')

In [None]:
# Sample submission, read from the dataset root configured in FOLDER.
# The cell displays how many of its ids end in 'image'.
sam_sub = pd.read_csv(os.path.join(FOLDER, 'sample_submission.csv'))
sam_sub['id'].str.endswith('image').sum()

In [None]:
test_image_df.id.isin(sam_sub[sam_sub['id'].map(lambda x: x.endswith('image'))].id).mean()

In [None]:
# Distinct studies and distinct image ids discovered in the test folder.
n_test_studies = test_image_df['StudyInstanceUID'].nunique()
n_test_images = test_image_df['id'].nunique()
print(f'# studies in test set: {n_test_studies}')
print(f'# images in test set: {n_test_images}')

In [None]:
# Per-image metadata shipped with the resized dataset; the four columns are
# renamed positionally and the table is then split by its `split` column.
wh_df = pd.read_csv('../input/siim-covid19-resized-to-512px-jpg/meta.csv')
wh_df.columns = ['image_id', 'height', 'width', 'split']
wh_train, wh_test = (wh_df[wh_df['split'] == phase] for phase in ('train', 'test'))

In [None]:
n_unique_studies = train_image_df['StudyInstanceUID'].nunique()
print('Number of unique study:', n_unique_studies)
# Sanity check: the study-level table lists as many distinct ids as the
# image-level table references distinct studies.
assert train_study_df['id'].nunique() == n_unique_studies

In [None]:
# Images per study in the train set (a study may contain multiple images).
train_img_cnt_study = train_image_df['StudyInstanceUID'].value_counts()
n_train_multi_image = train_img_cnt_study.gt(1).sum()
print('#study havin more than 1 images in train set', n_train_multi_image)
print('Max number of images a study can have in train set', train_img_cnt_study.max())

In [None]:
# Images per study in the test set.
test_img_cnt_study = test_image_df['StudyInstanceUID'].value_counts()
n_test_multi_image = test_img_cnt_study.gt(1).sum()
print('#study havin more than 1 images in test set', n_test_multi_image)
print('Max number of images a study can have in test set', test_img_cnt_study.max())

In [None]:
# Confirm that bounding-box labels only use two categories (opacity / none):
# reduce every label string to the set of lowercase words it contains and
# tabulate the distinct sets.
label_word_sets = train_image_df['label'].apply(lambda text: str(set(re.findall('[a-z]+', text))))
label_word_sets.value_counts()

In [None]:
# NOTE(review): scratch arithmetic. The two operands presumably come from the
# value counts displayed by the previous cell; confirm, and consider deriving
# the total from the data instead of hard-coding the numbers.
4294+2040

In [None]:
# Append '_study' to the study ids (presumably so they match the ids of the
# study-level table they are joined with later on; verify against that table).
# Ids that already carry the suffix are left untouched, so re-running this cell
# does not produce '..._study_study'.
_study_uid = train_image_df['StudyInstanceUID']
train_image_df['StudyInstanceUID'] = _study_uid.where(
    _study_uid.str.endswith('_study'), _study_uid + '_study'
)

In [None]:
print('Counting #study for each classification label')
# Column-wise totals of every column except the id.
label_totals = train_study_df.drop(columns='id').sum()
print(label_totals)

# 2. Generate object detection (OD) annotations

In [None]:
# Flatten the per-image box lists into one row per box.
# `boxes` holds the textual form of a list of dicts (parsed with ast.literal_eval);
# images whose `boxes` value is missing contribute no rows here.
box_records = []
for image_id, boxes in zip(train_image_df['id'], train_image_df['boxes']):
    if pd.notna(boxes):
        for box in ast.literal_eval(boxes):
            box_records.append({**box, 'id': image_id})

# Build the frame once instead of concatenating one small (or empty) frame per
# image. The explicit column list keeps `anns` well-formed even when no image
# has any box, so the cells that read x / y / width / height still work.
anns = pd.DataFrame(box_records, columns=['x', 'y', 'width', 'height', 'id'])

In [None]:
# Convert (x, y, width, height) boxes to corner form and tag every box row
# with the 'opacity' class (id 1).
anns = anns.assign(
    x_min=anns['x'],
    y_min=anns['y'],
    x_max=anns['x'] + anns['width'],
    y_max=anns['y'] + anns['height'],
    class_name='opacity',
    class_id=1,
)

In [None]:
# Right-join on the full list of image ids so that every training image appears
# at least once, including those that have no box rows.
all_image_ids = train_image_df[['id']]
anns = pd.merge(anns, all_image_ids, how='right', on='id')

In [None]:
# Rows with missing class fields get the 'none' class (id 0); class_id is then
# stored as an integer column.
anns = anns.fillna({'class_id': 0, 'class_name': 'none'}).astype({'class_id': int})

In [None]:
# The corner columns supersede the original x / y / width / height columns.
anns = anns.drop(columns=['x', 'y', 'width', 'height'])

In [None]:
# image_id is the id with the literal '_image' removed.
anns['image_id'] = anns['id'].str.replace('_image', '', regex=False)

In [None]:
# Attach the train-split metadata columns of wh_train to every annotation row.
# merge() defaults to an inner join, so annotation rows whose image_id is absent
# from wh_train are dropped here.
anns_wh = anns.merge(wh_train, on='image_id')

In [None]:
anns_wh.head()

In [None]:
anns_wh = anns_wh[['image_id', 'width', 'height', 'class_id', 'class_name', 'x_min', 'y_min', 'x_max', 'y_max']]

In [None]:
# Side length of the square frame the box coordinates are rescaled to.
SIZE = 512

# Work on an owned copy so the column assignments below never target a slice.
anns_wh = anns_wh.copy()

# Scale box corners from the image's own pixel grid to the SIZE x SIZE frame.
# At this point `width` / `height` still hold the per-image values merged in
# from the metadata table.
for col in ['x_min', 'x_max']:
    anns_wh[col] = anns_wh[col] / anns_wh['width'] * SIZE

for col in ['y_min', 'y_max']:
    anns_wh[col] = anns_wh[col] / anns_wh['height'] * SIZE

# From here on every row describes a SIZE x SIZE image.
anns_wh['width'] = SIZE
anns_wh['height'] = SIZE

In [None]:
# An image counts as positive when the class ids of its rows sum to more than zero.
has_non_none_box = anns_wh.groupby('image_id')['class_id'].sum().gt(0)
print('#images having at least 1 bbox not none:', has_non_none_box.sum())

In [None]:
anns_wh.to_csv('bbox_train_meta_512.csv', index=False)

In [None]:
# Test-set metadata: keep the image ids and record the resized frame size.
# .assign() builds a new frame, so nothing is written into a slice of wh_df
# (the in-place column assignments triggered pandas' SettingWithCopyWarning).
wh_test = wh_test[['image_id']].assign(width=SIZE, height=SIZE)

In [None]:
wh_test.to_csv('bbox_test_meta_512.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import cv2

# Colour triples used by draw_boxes, which picks one entry per box by indexing
# this list with the row's class_id.
# NOTE(review): the tuples are handed to cv2 drawing calls on an array loaded by
# cv2.imread and the result is shown with plt.imshow, so the channel order the
# author intended (RGB vs BGR) is unclear; confirm.
color_palletes = [
    (123, 141, 245),
    (123, 193, 239),
    (230, 193, 239),
    (230, 79, 239),
    (71, 79, 239),
    (71, 170, 90),
    (201, 170, 38),
    (201, 47, 46),
    (201, 240, 219),
    (0, 43, 255),
    (0, 255, 255),
    (102, 114, 41),
    (255, 242, 235),
    (54, 0, 0)
]


def draw_boxes(img_id, meta_df, img_folder, plot_rad=False, figsize=(10,15), return_img=False):
    """Draw the annotated boxes of one image and either show or return the result.

    Parameters
    ----------
    img_id : str
        Value matched against ``meta_df.image_id``; also the stem of the JPEG file.
    meta_df : pandas.DataFrame
        Must provide ``image_id, class_id, class_name, x_min, y_min, x_max, y_max``.
        ``width`` / ``height`` are additionally printed when ``return_img`` is False,
        and ``rad_id`` is read when ``plot_rad`` is True.
    img_folder : str
        Folder that contains ``<img_id>.jpg``.
    plot_rad : bool
        If True, append ``'-' + rad_id`` to the class name written next to each box.
    figsize : tuple
        Figure size used when the image is plotted.
    return_img : bool
        If True, return the annotated array instead of printing the rows and plotting.

    Returns
    -------
    numpy.ndarray or None
        The annotated image when ``return_img`` is True, otherwise ``None``.

    Raises
    ------
    FileNotFoundError
        If the image file cannot be read.
    """
    rows = meta_df[meta_df.image_id == img_id]
    img_path = img_folder + '/' + img_id + '.jpg'
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising
        raise FileNotFoundError('Could not read image: ' + img_path)

    if not return_img:
        print(rows[['class_name', 'x_min', 'y_min', 'x_max', 'y_max', 'width', 'height']])

    corner_cols = ['x_min', 'y_min', 'x_max', 'y_max']
    for _, row in rows.iterrows():
        if pd.isna(row[corner_cols]).any():
            # rows without coordinates have nothing to draw, and int(NaN) would raise
            continue
        x1, y1, x2, y2 = (int(row[col]) for col in corner_cols)
        cls_id, cls_name = int(row['class_id']), row['class_name']

        # wrap around so class ids beyond the palette length still get a colour
        c = color_palletes[cls_id % len(color_palletes)]
        cv2.rectangle(img, (x1, y1), (x2, y2), c, 2)
        # str() keeps the concatenation valid for non-string rad_id values
        caption = cls_name + '-' + str(row['rad_id']) if plot_rad else cls_name
        cv2.putText(img, caption, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 0.7, c, 2)

    if return_img:
        return img
    plt.figure(figsize=figsize)
    plt.imshow(img)
    
def show_values_on_bars(axs, h_v="v", space=0.4):
    """Annotate every bar of one or more axes with its integer-truncated length.

    Parameters
    ----------
    axs : axes-like object or numpy.ndarray of such objects
        Each object must expose ``patches`` (the bars) and ``text``.
    h_v : {"v", "h"}
        ``"v"``: write each bar's height, centred horizontally at the bar's top.
        ``"h"``: write each bar's width, ``space`` to the right of the bar's end.
        Any other value leaves the axes untouched.
    space : float
        Horizontal offset applied in ``"h"`` mode only.

    Notes
    -----
    Bars whose measured extent is NaN are skipped, since ``int(NaN)`` would raise.
    """
    def _show_on_single_plot(ax):
        if h_v not in ("v", "h"):
            return
        for p in ax.patches:
            extent = p.get_height() if h_v == "v" else p.get_width()
            if np.isnan(extent):
                # nothing meaningful to print for a bar without a finite length
                continue
            _y = p.get_y() + p.get_height()
            if h_v == "v":
                _x = p.get_x() + p.get_width() / 2
                ax.text(_x, _y, int(extent), ha="center")
            else:
                _x = p.get_x() + p.get_width() + float(space)
                ax.text(_x, _y, int(extent), ha="left")

    if isinstance(axs, np.ndarray):
        for idx, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)
        

In [None]:
im_folder = '../input/siim-covid19-resized-to-512px-jpg/train'
draw_boxes('ffd9b6cf2961', anns_wh, im_folder)

In [None]:
import seaborn as sns
# value_counts() gives, for each image id with class_id == 1 rows, how many such
# rows it has; the count plot then tallies how many images share each count.
boxes_per_image = anns_wh[anns_wh.class_id==1].image_id.value_counts()
# Pass the values as x= explicitly: the first positional parameter of
# sns.countplot is `data` in seaborn >= 0.12, so a positional Series is no
# longer interpreted as the variable to count.
ax = sns.countplot(x=boxes_per_image)
plt.xlabel('k')
plt.ylabel('Number of images having k bboxes')
plt.title('How many bboxes in 1 images (only consider postive image)')
show_values_on_bars(ax)
plt.show()

# 3. The relationship between classification labels and object detection labels

In [None]:
# Map each image id to the study id stored alongside it.
study_lookup = dict(zip(train_image_df['id'], train_image_df['StudyInstanceUID']))

In [None]:
# Add the study id of every annotation row (looked up via '<image_id>_image'),
# then join the study-level columns on that id.
anns_wh_study = anns_wh.assign(
    StudyInstanceUID=[study_lookup[image_id + '_image'] for image_id in anns_wh['image_id']]
)
study_labels = train_study_df.rename(columns={'id': 'StudyInstanceUID'})
anns_wh_study = anns_wh_study.merge(study_labels, on='StudyInstanceUID')

In [None]:
anns_wh_study[anns_wh_study['Negative for Pneumonia']==1]

In [None]:
anns_wh_study[anns_wh_study.class_id==1]['Negative for Pneumonia'].max()

In [None]:
CLF_COLS = ['Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

In [None]:
anns_wh_study.head()

In [None]:
print('Number of images (clf label = x | the image does not have opacity bbox(es)):')
# One row per image among the class_id == 0 rows, then total each label column.
images_without_box = anns_wh_study.query('class_id == 0').drop_duplicates(subset='image_id')
print(images_without_box[CLF_COLS].sum().rename('x'))

In [None]:
print('Number of images (clf label = x | the image has opacity bbox(es)):')
# One row per image among the class_id == 1 rows, then total each label column.
images_with_box = anns_wh_study.query('class_id == 1').drop_duplicates(subset='image_id')
print(images_with_box[CLF_COLS].sum().rename('x'))

In [None]:
print('Number of images (the image has/does not have opacity bbox(es)) | clf label = Negative for Pneumonia:')
# One row per image whose study carries this label, tallied by class_id.
negative_images = anns_wh_study.loc[anns_wh_study['Negative for Pneumonia'].eq(1)].drop_duplicates(subset='image_id')
print(negative_images['class_id'].value_counts().sort_index())

In [None]:
print('Number of images (the image has/does not have opacity bbox(es)) | clf label = Typical Appearance:')
# One row per image whose study carries this label, tallied by class_id.
typical_images = anns_wh_study.loc[anns_wh_study['Typical Appearance'].eq(1)].drop_duplicates(subset='image_id')
print(typical_images['class_id'].value_counts().sort_index())

In [None]:
print('Number of images (the image has/does not have opacity bbox(es)) | clf label = Indeterminate Appearance:')
# One row per image whose study carries this label, tallied by class_id.
indeterminate_images = anns_wh_study.loc[anns_wh_study['Indeterminate Appearance'].eq(1)].drop_duplicates(subset='image_id')
print(indeterminate_images['class_id'].value_counts().sort_index())

In [None]:
print('Number of images (the image has/does not have opacity bbox(es)) | clf label = Atypical Appearance:')
# One row per image whose study carries this label, tallied by class_id.
atypical_images = anns_wh_study.loc[anns_wh_study['Atypical Appearance'].eq(1)].drop_duplicates(subset='image_id')
print(atypical_images['class_id'].value_counts().sort_index())

In [None]:
train_image_df

In [None]:
# Per study, count the images whose `boxes` field is present, and display the
# largest such count.
print('How many images with opacity bboxes on train set for each study? Maximum number is:')
image_has_boxes = train_image_df['boxes'].notnull()
image_has_boxes.groupby(train_image_df['StudyInstanceUID']).sum().max()