### **install external packages (offline, from the pydicom-conda-helper dataset)**

In [None]:
HELPER_DIR = '/kaggle/input/pydicom-conda-helper/'

!conda install {HELPER_DIR+'libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'libgcc-ng-9.3.0-h2828fa1_19.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'gdcm-2.8.9-py37h500ead1_1.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'conda-4.10.1-py37h89c1867_0.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'certifi-2020.12.5-py37h89c1867_1.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'openssl-1.1.1k-h7f98852_0.tar.bz2'} -c conda-forge -y -q

### **import dependencies**

In [None]:
import os, zipfile
import cv2
import plotly.express as px
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from kaggle_secrets import UserSecretsClient
import pydicom
import wandb

from pathlib import Path

### **configuration and initialization**

In [None]:
# Competition input directory plus the working and temp locations.
SIIM_COVID19_DETECTION_DIR = '/kaggle/input/siim-covid19-detection/'
WORKING_DIR = '/kaggle/working/'
TEMP_DIR = '/kaggle/temp/'

# Where the DICOM files are read from and where the converted JPEGs go.
INPUT_DIR = f'{SIIM_COVID19_DETECTION_DIR}train/'
OUTPUT_DIR = f'{WORKING_DIR}data/'

# Annotation tables shipped with the competition data.
TRAIN_IMAGE_LEVEL_PATH = f'{SIIM_COVID19_DETECTION_DIR}train_image_level.csv'
TRAIN_STUDY_LEVEL_PATH = f'{SIIM_COVID19_DETECTION_DIR}train_study_level.csv'

# Side length (pixels) of the square output images; WIDTH/HEIGHT mirror it.
IMG_SIZE = 512
WIDTH = IMG_SIZE
HEIGHT = IMG_SIZE
# Number of boxed images sampled for the W&B visualisation.
N_IMAGES_WANDB = 42


# Interpolation flag handed to cv2.resize when the images are rescaled.
INTERPOLATION = cv2.INTER_LANCZOS4

In [None]:
# Create the JPEG output directory (and any missing parents); no error if it already exists.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [None]:
# Read the secret named WANDB_API_KEY through the Kaggle secrets client and
# export it as an environment variable, so the key is never written in the notebook.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")
os.environ['WANDB_API_KEY'] = secret_value_0

# No key is passed explicitly here; presumably wandb reads the environment
# variable set above — confirm against the wandb documentation.
wandb.login()

### **load csv file**

In [None]:
# Load the image-level and the study-level annotation tables.
df_train_image_level, df_train_study_level = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_IMAGE_LEVEL_PATH, TRAIN_STUDY_LEVEL_PATH)
)

### **first look**

In [None]:
# Show five randomly drawn rows of the image-level table (no seed is passed).
df_train_image_level.sample(5)

In [None]:
# Summary statistics of the image-level table.
df_train_image_level.describe()

In [None]:
# Show five randomly drawn rows of the study-level table (no seed is passed).
df_train_study_level.sample(5)

In [None]:
# Summary statistics of the study-level table.
df_train_study_level.describe()

### **merge study-level and image-level dataframes, add image path**

In [None]:
# Vectorized string operations replace the former row-wise apply(axis=1) lambdas.

# Keep only the part of each image id that precedes the first underscore.
df_train_image_level['id'] = df_train_image_level['id'].str.split('_').str[0]
# Location of the JPEG written for each image by the conversion step.
df_train_image_level['path'] = OUTPUT_DIR + df_train_image_level['id'] + '.jpg'
# First space-separated token of the label string.
df_train_image_level['image_level'] = df_train_image_level['label'].str.split(' ').str[0]

# Same suffix stripping for the study ids.
df_train_study_level['id'] = df_train_study_level['id'].str.split('_').str[0]
# Columns are renamed by position: the first one becomes the merge key
# 'StudyInstanceUID' (assumes the CSV column order is id followed by the four
# class columns — TODO confirm against the file).
df_train_study_level.columns = ['StudyInstanceUID', 'Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

In [None]:
# Left join on the study id: every image row is kept and receives the
# study-level columns of its study.
df = pd.merge(
    df_train_image_level,
    df_train_study_level,
    how="left",
    on='StudyInstanceUID',
)
df.sample(3)

In [None]:
# Count once, then report total / unboxed / boxed image rows.
n_images = len(df)
n_without_boxes = df['boxes'].isna().sum()

print(f"Number of images in trainset: {n_images}")
print(f"Number of images in trainset ( without boxes): {n_without_boxes}")
print(f"Number of images in trainset ( with boxes): {n_images - n_without_boxes}")

In [None]:
# The four study-level class columns, in class-id order.
labels = df.loc[:, [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]]

In [None]:
# Bar chart of the per-class column sums; legend and axis titles are blanked.
class_totals = labels.sum()
fig = px.bar(class_totals, title="<b>Distribution images by classes</b>")
fig.update_layout(xaxis_title="", yaxis_title="", showlegend=False)

fig.show()

In [None]:
# Class id of each row = position of the largest value among the four class columns.
df['study_level'] = labels.values.argmax(axis=1)
df.sample(3)

In [None]:
# Split the rows by whether the `boxes` value is missing and plot the shares.
boxes_missing = df['boxes'].isna()
no_bb = int(boxes_missing.sum())
has_bb = int((~boxes_missing).sum())

px.pie(
    names=["with boxes", "without boxes"],
    values=[has_bb, no_bb],
    title="<b>Distribution images by boxes</b>",
)

In [None]:
# `&` binds tighter than `==`, so the comparison is evaluated on its own first;
# writing `a & b == 1` would compare the AND-ed result instead of the column.
is_negative_study = df['Negative for Pneumonia'] == 1

no_bb = df[df['boxes'].isna() & is_negative_study].shape[0]
has_bb = df[df['boxes'].notna() & is_negative_study].shape[0]

px.pie(names=["with boxes", "without boxes"],
       values=[has_bb, no_bb],
       title="<b>Distribution images by boxes for negative study</b>")

In [None]:
# `&` binds tighter than `==`: the former `a & b == 0` was evaluated as
# `(a & b) == 0`, which selects every row where the AND is false instead of the
# rows of non-negative studies. Build the study mask first, then combine.
is_positive_study = df['Negative for Pneumonia'] == 0

no_bb = df[df['boxes'].isna() & is_positive_study].shape[0]
has_bb = df[df['boxes'].notna() & is_positive_study].shape[0]

px.pie(names=["with boxes", "without boxes"],
       values=[has_bb, no_bb],
       title="<b>Distribution images by boxes for positive study</b>")

In [None]:
# Study-level class name -> integer class id (ids follow the listed order).
label_to_class_id = dict(zip(
    (
        'Negative for Pneumonia',
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance',
    ),
    range(4),
))

# Inverse lookup: integer class id -> class name.
class_id_to_label = {
    class_id: label for label, class_id in label_to_class_id.items()
}

### **get path dicom files**

In [None]:
# First walk only counts the files so the progress bar knows its total.
total = sum(len(names) for _, _, names in os.walk(INPUT_DIR))

# Second walk records every file under INPUT_DIR as a Path.
path_dicom_files = []
with tqdm(total=total) as pbar:
    for dirname, _, filenames in os.walk(INPUT_DIR):
        for file in filenames:
            path_dicom_files.append(Path(dirname, file))
            pbar.update(1)

### **rescale all train images to IMG_SIZE (512x512 px) and save them as jpg / record original width and height, then export df**

In [None]:
# Run the decode / invert / normalise / resize steps on one specific image id.
# `img` stays None when that id is not among the collected files.
img = None
for p in tqdm(path_dicom_files):
    img_name = p.name[:-4]  # file name without its last four characters
    if img_name != '039159f7b61b':
        continue

    print(True)
    dcm = pydicom.dcmread(p)
    img = dcm.pixel_array
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        img = cv2.bitwise_not(img)
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    img = cv2.resize(img, (WIDTH, HEIGHT), interpolation=INTERPOLATION)

In [None]:
# Original image dimensions, filled in per image by the loop below; rows whose
# id matches no processed file keep NaN.
df.loc[:, "width"] = np.nan
df.loc[:, "height"] = np.nan


for p in tqdm(path_dicom_files):
    dcm = pydicom.dcmread(p)
    img = dcm.pixel_array
    img_name = p.parts[-1][0:-4]  # file name without its last four characters

    # Exact id match (the ids in `df` were already stripped of their suffix);
    # a regex/substring search per file is neither needed nor safe.
    index = df.index[df['id'] == img_name]
    # The array is indexed (rows, columns): shape[0] is the height and
    # shape[1] the width. These were previously stored the other way round,
    # which skews bounding-box scaling for non-square images.
    df.loc[index, 'width'] = img.shape[1]
    df.loc[index, 'height'] = img.shape[0]

    # MONOCHROME1 images are bitwise-inverted before normalisation.
    if dcm.PhotometricInterpretation == "MONOCHROME1":
        img = cv2.bitwise_not(img)
    # Min-max stretch to 0..255 as 8-bit, then resize to the target size.
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    img = cv2.resize(img, (WIDTH, HEIGHT), interpolation=INTERPOLATION)

    cv2.imwrite(OUTPUT_DIR+img_name+'.jpg', img)

# NOTE: image 039159f7b61b (or 920d7ef35702) was reported to raise an error in
# this loop; the cause is not recorded here.

    

In [None]:
# Persist the merged table (now including width/height) without the index column.
df.to_csv(f'{WORKING_DIR}meta.csv', index=False)

### **df images with boxes**

In [None]:
# Rows that have a `boxes` value, re-indexed from 0; `df` itself is left untouched.
opacity_df = df.dropna(subset=["boxes"]).reset_index(drop=True)

In [None]:
# Show five randomly drawn rows of the boxed-image table (no seed is passed).
opacity_df.sample(5)

In [None]:
# Summary statistics of the boxed-image table.
opacity_df.describe()

### **convert train image boxes to wandb image for visualization**

In [None]:
def get_bbox(row):
    """Parse the box coordinates out of ``row.label``.

    The label is read as space-separated tokens in groups of six. The first
    two tokens of each group are skipped and the remaining four are converted
    to floats. A trailing group with fewer than six tokens is not returned.

    Args:
        row: object whose ``label`` attribute holds the label string.

    Returns:
        A list of four-float lists, one per complete group.
    """
    boxes = []
    current = []
    for position, token in enumerate(row.label.split(' ')):
        field = position % 6
        if field < 2:
            # Fields 0 and 1 of every group are not coordinates.
            continue
        current.append(float(token))
        if field == 5:
            # Sixth token closes the group.
            boxes.append(current)
            current = []

    return boxes

In [None]:
def scale_bbox(row, bboxes, img_size=None):
    """Rescale boxes from original-image pixels to the resized square image.

    Args:
        row: object exposing ``width`` and ``height`` of the original image.
        bboxes: iterable of ``[xmin, ymin, xmax, ymax]`` sequences in
            original-image pixels.
        img_size: side length of the resized image. When omitted, the
            notebook-level ``IMG_SIZE`` is used, as before.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax]`` lists of ints. Each scaled
        value is rounded to four decimals and then truncated by ``int``.

    Raises:
        ValueError: if a scaled coordinate is NaN (for example when
            ``row.width`` or ``row.height`` is NaN), since ``int`` cannot
            convert it.
    """
    if img_size is None:
        img_size = IMG_SIZE

    scale_x = img_size / row.width
    scale_y = img_size / row.height
    # x coordinates sit at positions 0 and 2, y coordinates at 1 and 3.
    scales = (scale_x, scale_y, scale_x, scale_y)

    return [
        [int(np.round(bbox[k] * scales[k], 4)) for k in range(4)]
        for bbox in bboxes
    ]

In [None]:
def wandb_bbox(image, bboxes, true_label, class_id_to_label):
    """Wrap an image and its boxes in a ``wandb.Image`` with a ground-truth overlay.

    Args:
        image: image data handed straight to ``wandb.Image``.
        bboxes: iterable of ``[minX, minY, maxX, maxY]`` sequences.
        true_label: class id shared by every box; also the key used to look
            up the caption in ``class_id_to_label``.
        class_id_to_label: mapping from class id to display name.

    Returns:
        A ``wandb.Image`` whose ``boxes`` argument has a single
        ``"ground_truth"`` group.
    """
    all_boxes = [
        {
            "position": {
                "minX": bbox[0],
                "minY": bbox[1],
                "maxX": bbox[2],
                "maxY": bbox[3],
            },
            "class_id": int(true_label),
            "box_caption": class_id_to_label[true_label],
            "domain": "pixel",
        }
        for bbox in bboxes
    ]

    ground_truth = {"box_data": all_boxes, "class_labels": class_id_to_label}
    return wandb.Image(image, boxes={"ground_truth": ground_truth})

In [None]:
# Draw N_IMAGES_WANDB random boxed images; no random_state is passed, so the
# selection can differ from run to run.
sampled_opacity_df = opacity_df.sample(N_IMAGES_WANDB).reset_index(drop=True)

# Open the W&B run that receives the images.
run = wandb.init(project='project8-kaggle-covid19')

wandb_bbox_list = []
for i in tqdm(range(sampled_opacity_df.shape[0])):
    row = sampled_opacity_df.loc[i]
    # NOTE(review): cv2.imread returns None instead of raising when the file
    # cannot be read; that case is not checked here.
    image = cv2.imread(row.path)
    # Parse the boxes from the label, then scale them to the resized image.
    bboxes = get_bbox(row)
    scale_bboxes = scale_bbox(row, bboxes)
    # Every box of an image is tagged with the image's study-level class id.
    true_label = row.study_level
    wandb_bbox_list.append(wandb_bbox(image, 
                                      scale_bboxes, 
                                      true_label, 
                                      class_id_to_label))
    
# All images are logged in a single call under the "radiograph" key.
wandb.log({"radiograph": wandb_bbox_list})

run.finish()

# Bare expression: the run object becomes the cell's displayed output.
run

### **ref** 

* https://www.kaggle.com/xhlulu
* https://www.kaggle.com/yujiariyasu
* https://www.kaggle.com/ayuraj
* https://www.kaggle.com/dschettler8845   
....