In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import cv2

In [None]:
# Offline install of the GDCM conda package: copy the archive from the attached
# input dataset, unpack it, and install the bundled build without network access.
# NOTE(review): presumably needed so pydicom can decode compressed pixel data — confirm.
!cp /kaggle/input/gdcm-conda-install/gdcm.tar .
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

In [None]:
# Root folder of the competition input data; the CSV and image folder paths
# used in the following cells are all built from it.
root_path = "/kaggle/input/siim-covid19-detection/"

In [None]:
# Image-level table (keyed by `id`; later joined to the study table on `StudyInstanceUID`).
df_image = pd.read_csv(os.path.join(root_path, "train_image_level.csv"))
# Study-level table (its `id` column is renamed to `StudyInstanceUID` further down).
df_study = pd.read_csv(os.path.join(root_path, "train_study_level.csv"))

# Load Data

In [None]:
# Folders that are walked in the next cell to collect the train / test file paths.
train_path = os.path.join(root_path, "train")
test_path = os.path.join(root_path, "test")

In [None]:
# Recursively list every file below the train / test folders as a full path.
train_filenames = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(train_path)
    for name in names
]
test_filenames = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(test_path)
    for name in names
]

In [None]:
# Lookup tables "<file stem>_image" -> full file path; the key format matches
# the `id` column of the image-level CSV so it can be used with Series.map.
train_dict = {
    full_path.split('/')[-1].replace('.dcm', '_image'): full_path
    for full_path in train_filenames
}
test_dict = {
    full_path.split('/')[-1].replace('.dcm', '_image'): full_path
    for full_path in test_filenames
}

In [None]:
# Strip the `_image` suffix first and re-add it only for the lookup. This keeps
# the cell idempotent: on a re-run the ids no longer carry the suffix, and
# mapping them directly would turn every path into NaN.
df_image['id'] = df_image['id'].str.replace('_image', '', regex=False)
df_image['path'] = (df_image['id'] + '_image').map(train_dict)

# Fail early with a readable message instead of an AttributeError on a NaN path below.
assert df_image['path'].notna().all(), "some image ids have no matching file under train_path"

# Path relative to the train folder; computed with relpath so it does not
# depend on how many components root_path happens to have.
df_image['simplified_path'] = df_image['path'].apply(lambda x: os.path.relpath(x, train_path))

# Align the study table with the image table: same key name, no `_study` suffix.
df_study = df_study.rename(columns={'id':'StudyInstanceUID'})
df_study['StudyInstanceUID'] = df_study['StudyInstanceUID'].str.replace('_study', '', regex=False)

In [None]:
# Attach the study-level labels to every image row; images whose study is
# missing from the study table are dropped by the inner join.
df_train = pd.merge(
    df_image,
    df_study,
    on='StudyInstanceUID',
    how='inner',
)

In [None]:
# Fixed column layout: identifiers, detection annotations, the four study-level
# class columns, then the file paths.
columns_reordered = [
    'id', 'StudyInstanceUID',
    'boxes', 'label',
    'Negative for Pneumonia', 'Typical Appearance',
    'Indeterminate Appearance', 'Atypical Appearance',
    'path', 'simplified_path',
]
df_train = df_train.loc[:, columns_reordered]

## Create data

### Split and store images

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled split of the image paths with a fixed seed.
# NOTE(review): the split is per image, not per study — if a study can contain
# several images they may land on both sides of the split; confirm whether that matters.
x_train, x_valid = train_test_split(
    df_train["path"].tolist(),
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# One output folder per split; exist_ok=True makes the cell safe to re-run.
for _split_dir in ("train", "valid", "test"):
    os.makedirs(os.path.join("dataset_det", _split_dir), exist_ok=True)

### Convert to png

In [None]:
import os

from PIL import Image
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Per-split list of original image sizes, filled by save_dcm_as_png in
# conversion order and read back by index when the boxes are rescaled.
orig_shapes = {split_name: [] for split_name in ("train", "valid", "test")}

In [None]:
import numpy as np
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Load a DICOM file and return its pixels as an 8-bit array scaled to 0..255.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters
    ----------
    path
        Location of the DICOM file, handed to ``pydicom.dcmread``.
    voi_lut : bool
        When true, the raw pixels are passed through ``apply_voi_lut``
        (VOI LUT, if provided by the DICOM device, gives a "human-friendly" view).
    fix_monochrome : bool
        When true, images whose ``PhotometricInterpretation`` is
        ``"MONOCHROME1"`` are inverted so they do not look like negatives.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array, min-max scaled to the range 0..255. A constant image
        yields an all-zero array.
    """
    # dcmread is the current name of the reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)

    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # Do the arithmetic in float: with narrow signed integer pixel types the
    # subtractions below could wrap around (e.g. int16 spanning -30000..30000).
    data = np.asarray(data, dtype=np.float64)

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0; skip the division instead of producing NaNs.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [None]:
def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image scaled to fit a ``size`` x ``size`` box.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    With ``keep_ratio`` false the image is resized to exactly (size, size);
    otherwise ``Image.thumbnail`` is used with (size, size) as the bounding box.
    ``resample`` is forwarded to PIL as the resampling filter. Returns the PIL image.
    """
    target = (size, size)
    picture = Image.fromarray(array)

    if not keep_ratio:
        return picture.resize(target, resample)

    # thumbnail() works in place and returns None, so hand back the same object.
    picture.thumbnail(target, resample)
    return picture

In [None]:
def save_dcm_as_png(source, dest, mode = "train", size = 832):
    """Convert one DICOM file into a ``size`` x ``size`` image saved at ``dest``.

    The pre-resize dimensions are appended to the global ``orig_shapes[mode]``
    as ``(shape[1], shape[0])``; the label code later unpacks that pair as
    (orig_w, orig_h) to rescale the bounding boxes.
    """
    pixels = read_xray(source)

    n_rows, n_cols = pixels.shape[0], pixels.shape[1]
    orig_shapes[mode].append((n_cols, n_rows))

    resize(pixels, size).save(dest)

In [None]:
# Start from an empty shape list so that re-running (or resuming) this cell
# keeps orig_shapes["train"][i] paired with x_train[i]; create_labels_for_det
# relies on that positional match when it rescales the boxes.
orig_shapes["train"].clear()

for x in tqdm(x_train, desc="train -> png"):
    save_dcm_as_png(x,
                    os.path.join("dataset_det/train",
                                 x.split("/")[-1][:-3] + "png"),
                    mode = "train")

In [None]:
# Start from an empty shape list so that re-running (or resuming) this cell
# keeps orig_shapes["valid"][i] paired with x_valid[i]; create_labels_for_det
# relies on that positional match when it rescales the boxes.
orig_shapes["valid"].clear()

for x in tqdm(x_valid, desc="valid -> png"):
    save_dcm_as_png(x,
                    os.path.join("dataset_det/valid",
                                 x.split("/")[-1][:-3] + "png"),
                    mode = "valid")

In [None]:
# Test-set conversion is switched off by default (it used to be parked in a
# string literal, which only echoed the code as cell output). Set the flag to
# True to convert the test images as well.
CONVERT_TEST_SET = False

if CONVERT_TEST_SET:
    orig_shapes["test"].clear()
    for x in tqdm(test_filenames, desc="test -> png"):
        save_dcm_as_png(x, os.path.join("dataset_det/test",
                                        x.split("/")[-1][:-3] + "png"),
                       mode = "test")

In [None]:
# Duplicate the converted images for the classification dataset.
# -T treats dataset_class as the destination itself, so a re-run refreshes it
# instead of nesting a second copy at dataset_class/dataset_det.
!cp -RT dataset_det dataset_class

### Create Labels

In [None]:
import csv
import math

#### Image level - Detection

In [None]:
def isNaN(string):
    """Return the result of ``string != string``.

    For a float NaN this is True (NaN never equals itself); for ordinary
    strings and numbers it is False.
    """
    differs_from_itself = string != string
    return differs_from_itself

#csv format
def create_labels_for_det(input_size = 832, mode = "train"):
    """Write the detection labels of one split to ``./dataset_det/<mode>.csv``.

    For every file of the split, the ``boxes`` string of the matching
    ``df_train`` row is parsed and each box is rescaled from the original
    image size to ``input_size`` x ``input_size``. One header-less CSV row
    ``path, xmin, ymin, xmax, ymax, class`` is written per box, with class
    always ``"opacity"``. Images whose ``boxes`` value is NaN get no row and
    their PNG is deleted from ``dataset_det/<mode>/``.

    Parameters
    ----------
    input_size : int
        Edge length of the resized square images the boxes are mapped onto.
    mode : str
        ``"train"`` (uses ``x_train``) or ``"valid"`` (uses ``x_valid``).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"valid"``.
    KeyError
        If a file of the split has no row in ``df_train``.

    Notes
    -----
    ``orig_shapes[mode][i]`` must hold the original (width, height) of the
    i-th file of the split, i.e. the images must have been converted in the
    same order as ``x_train`` / ``x_valid``.
    """
    import ast  # only needed here, to parse the `boxes` strings safely

    if mode == "train":
        filenames = x_train
    elif mode == "valid":
        filenames = x_valid
    else:
        raise ValueError('mode must be "train" or "valid", got {!r}'.format(mode))

    csv_columns = ["path", "xmin", "ymin", "xmax", "ymax", "class"]
    csv_file = "./dataset_det/{}.csv".format(mode)

    # Build the id -> boxes lookup once instead of scanning df_train for every
    # image; setdefault keeps the first row per id, like the former `[0]`.
    boxes_by_id = {}
    for image_id, raw in zip(df_train["id"], df_train["boxes"]):
        boxes_by_id.setdefault(image_id, raw)

    labels = []
    for i, x in enumerate(filenames):
        _id = x.split("/")[-1][:-4]
        raw_boxes = boxes_by_id[_id]

        if isNaN(raw_boxes):
            # No boxes: drop the image from the detection set. Tolerate an
            # already-deleted file so the function can be called again.
            try:
                os.remove("dataset_det/{}/{}.png".format(mode, _id))
            except FileNotFoundError:
                pass
            continue

        orig_w, orig_h = orig_shapes[mode][i]

        # literal_eval only accepts Python literals, unlike eval which would
        # execute arbitrary expressions found in the CSV.
        for box in ast.literal_eval(raw_boxes):
            w = (box["width"] / orig_w) * input_size
            h = (box["height"] / orig_h) * input_size

            xmin = (box["x"] / orig_w) * input_size
            ymin = (box["y"] / orig_h) * input_size

            labels.append({
                "path": "./{}/{}.png".format(mode, _id),
                "xmin": xmin,
                "ymin": ymin,
                "xmax": xmin + w,
                "ymax": ymin + h,
                "class": "opacity",
            })

    try:
        # newline='' is what the csv module asks for when opening files for a writer.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            # No header row is written.
            writer.writerows(labels)
    except IOError as err:
        print("I/O error while writing {}: {}".format(csv_file, err))

In [None]:
# Writes dataset_det/valid.csv and dataset_det/train.csv. Side effect: PNGs of
# images without boxes are deleted from dataset_det. Needs the conversion cells
# to have run first, because the box rescaling reads orig_shapes.
create_labels_for_det(mode = "valid")
create_labels_for_det(mode = "train")

In [None]:
# Single-row class mapping for the detection dataset ("opacity" -> id 0);
# `d` and `csv_columns` are consumed by the next cell, which writes classes.csv.
d = [{"class": "opacity", "id": 0}]
csv_columns = list(d[0])
d

In [None]:
# Write the class-name -> id mapping next to the detection labels (no header row).
try:
    # newline='' is what the csv module asks for when opening files for a writer.
    with open("./dataset_det/classes.csv", 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        writer.writerows(d)
except IOError as err:
    # Keep the notebook running as before, but say what actually went wrong.
    print("I/O error while writing ./dataset_det/classes.csv: {}".format(err))

#### Study level - Classification

In [None]:
# Study-level class columns, named explicitly. The former positional slice
# `list(df_train.columns)[4:8]` gave these same four names only when the
# column-reordering cell had been run beforehand.
classes = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
classes

In [None]:
def create_labels_for_class(mode = "train"):
    """Write the classification labels of one split to ``./dataset_class/<mode>.csv``.

    For every file of the split, the values of the ``classes`` columns in the
    matching ``df_train`` row are compared and the name of the column holding
    the largest value (first one on ties) becomes the label. One header-less
    CSV row ``path, class`` is written per image.

    Parameters
    ----------
    mode : str
        ``"train"`` (uses ``x_train``) or ``"valid"`` (uses ``x_valid``).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"valid"``.
    KeyError
        If a file of the split has no row in ``df_train``.
    """
    if mode == "train":
        filenames = x_train
    elif mode == "valid":
        filenames = x_valid
    else:
        raise ValueError('mode must be "train" or "valid", got {!r}'.format(mode))

    csv_columns = ["path", "class"]
    csv_file = "./dataset_class/{}.csv".format(mode)

    # Resolve the class of every image id in one pass instead of filtering
    # df_train once per class column for every image; the first row per id
    # wins, like the former `[0]`.
    class_by_id = {}
    for image_id, row in zip(df_train["id"], df_train[classes].to_numpy()):
        if image_id not in class_by_id:
            class_by_id[image_id] = classes[int(np.argmax(row))]

    labels = []
    for x in filenames:
        _id = x.split("/")[-1][:-4]
        labels.append({
            "path": "./{}/{}.png".format(mode, _id),
            "class": class_by_id[_id],
        })

    try:
        # newline='' is what the csv module asks for when opening files for a writer.
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            # No header row is written.
            writer.writerows(labels)
    except IOError as err:
        print("I/O error while writing {}: {}".format(csv_file, err))

In [None]:
# Writes dataset_class/train.csv and dataset_class/valid.csv (one "path, class" row per image).
create_labels_for_class(mode = "train")
create_labels_for_class(mode = "valid")

### Prepare zip file for download

In [None]:
# Gather both datasets under ./dataset for zipping.
# -p keeps the cell re-runnable: plain `mkdir` errors out when ./dataset already exists.
!mkdir -p dataset
!mv ./dataset_det ./dataset
!mv ./dataset_class ./dataset

In [None]:
# Ship the original competition label CSVs alongside the converted images.
!cp ../input/siim-covid19-detection/train_image_level.csv ./dataset
!cp ../input/siim-covid19-detection/train_study_level.csv ./dataset

In [None]:
# NOTE(review): train_image_level.csv was already copied by the previous cell;
# this repeat looks redundant and can probably be removed.
!cp ../input/siim-covid19-detection/train_image_level.csv ./dataset

In [None]:
# Bundle everything under ./dataset into one archive, linked for download below.
!zip -r dataset_siim_covid.zip ./dataset

<a href="./dataset_siim_covid.zip"> Download File </a>