In [None]:
! conda install -c conda-forge gdcm -y;

In [None]:
import sys
sys.path.append("../input/timmeffnetv2")

import platform
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import cv2
import pydicom
import gdcm
import glob
import gc
from math import ceil
import matplotlib.pyplot as plt
from pydicom.pixel_data_handlers.util import apply_voi_lut
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.simplefilter('ignore')

In [None]:
# Image-level CSV shipped with the competition data.
train_image = pd.read_csv(
    filepath_or_buffer="../input/siim-covid19-detection/train_image_level.csv"
)
# Study-level CSV shipped with the competition data.
train_study = pd.read_csv(
    filepath_or_buffer="../input/siim-covid19-detection/train_study_level.csv"
)

In [None]:
TRAIN_DIR = "../input/siim-covid19-detection/train/"
# Fixed seed so the shuffled row order is the same on every Restart & Run All.
SHUFFLE_SEED = 42

# The study table's `id` carries a "_study" suffix; strip it to obtain the
# key that is used to join against the image table.
train_study['StudyInstanceUID'] = train_study['id'].str.replace('_study', '', regex=False)
train = train_image.merge(train_study, on='StudyInstanceUID')

# Resolve one file path per study: the first entry (in sorted order, so the
# choice does not depend on filesystem listing order) found two directory
# levels below the study folder. Each study is globbed only once even when
# it appears on several rows.
path_by_study = {}
for instance_id in tqdm(train['StudyInstanceUID'].unique()):
    matches = sorted(glob.glob(os.path.join(TRAIN_DIR, instance_id, "*", "*")))
    if not matches:
        # Fail with the offending study id instead of a bare IndexError.
        raise FileNotFoundError(
            "No files found under {root} for study {study}".format(
                root=TRAIN_DIR, study=instance_id
            )
        )
    path_by_study[instance_id] = matches[0]

paths = [path_by_study[instance_id] for instance_id in train['StudyInstanceUID']]
train['path'] = paths

# The merge suffixes the two original `id` columns; neither is needed below.
train = train.drop(columns=['id_x', 'id_y'])

# Shuffle the rows reproducibly and renumber the index.
train = train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
train.head()

In [None]:
# Display (rows, columns) of the merged and shuffled frame as the cell output.
train.shape

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixel data rescaled to 8-bit.

    Args:
        path: Location of the DICOM file to read.
        voi_lut: When True, pass the raw pixels through ``apply_voi_lut``
            (VOI LUT, if provided by the DICOM device, transforms raw data
            into a "human-friendly" view).
        fix_monochrome: When True, invert the image if its
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``, since such
            X-rays may otherwise look inverted.

    Returns:
        A ``numpy.uint8`` array with the same shape as the pixel data,
        min-max scaled to the range 0..255. An image whose pixels are all
        equal yields an all-zero array.
    """
    # `dcmread` is the supported reader; `read_file` is a deprecated alias
    # that was removed in pydicom 3.0.
    dicom = pydicom.dcmread(path)

    pixels = dicom.pixel_array
    if voi_lut:
        pixels = apply_voi_lut(pixels, dicom)

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels

    # Shift so the darkest pixel is 0, then scale the brightest to 255.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak == 0:
        # Constant image: dividing by zero would produce NaNs whose cast to
        # uint8 is undefined, so return a well-defined black image instead.
        return np.zeros(pixels.shape, dtype=np.uint8)

    pixels = pixels / peak
    return (pixels * 255).astype(np.uint8)

In [None]:
# Create the directory that receives the generated .hdf5 files;
# exist_ok=True makes re-running this cell a no-op.
os.makedirs('/kaggle/working/output/', exist_ok=True)

In [None]:
import h5py

In [None]:
# Disabled single-sample round-trip check, kept as a bare string literal so
# that none of it executes: it converts row 0, writes it to test.hdf5, then
# reopens the file and compares the stored image and label with the inputs.
'''
idx = 0
image_id = train['StudyInstanceUID'].values[idx]
image_path = train['path'].values[idx]
image = dicom2array(image_path)
image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
image = cv2.resize(image, (512, 512))
label = train[train['StudyInstanceUID'] == image_id].values.tolist()[0][3:7]
example = h5py.File('/kaggle/working/output/'+'test.hdf5', 'w')
example.create_dataset("img", data=image)
example.create_dataset("label", data=label)
example.close()
example = h5py.File('/kaggle/working/output/'+'test.hdf5', 'r')
display(np.mean(image == example['img'][:]))
display(label == example['label'][:])
'''


In [None]:
# Convert every row of `train` into an HDF5 file named after its
# StudyInstanceUID, holding a 512x512 RGB copy of the image ("img") and the
# values of the columns at positions 3..6 of `train` ("label").
study_ids = train['StudyInstanceUID'].values
image_paths = train['path'].values

# The label written for a study is taken from the first row of `train` that
# carries that study id. Build that lookup once instead of filtering the
# whole frame on every iteration.
first_label_by_study = {}
for study_id, row_labels in zip(study_ids, train.iloc[:, 3:7].values.tolist()):
    first_label_by_study.setdefault(study_id, row_labels)

# Iterate over the actual number of rows rather than a hard-coded count, so
# the loop neither stops early nor runs past the end of the frame.
for image_id, image_path in tqdm(zip(study_ids, image_paths), total=len(train)):
    image = dicom2array(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    image = cv2.resize(image, (512, 512))
    label = first_label_by_study[image_id]
    # The context manager closes the file even if writing a dataset raises.
    with h5py.File('/kaggle/working/output/'+'{x}.hdf5'.format(x=image_id), 'w') as example:
        example.create_dataset("img", data=image)
        example.create_dataset("label", data=label)

In [None]:
from zipfile import ZipFile
import os
from os.path import basename
# Pack every file found under the output directory into data.zip. Entries
# are stored under their bare file name, so the archive has no folders.
with ZipFile('data.zip', mode='w') as archive:
    output_files = (
        os.path.join(root, name)
        for root, _dirs, names in os.walk('/kaggle/working/output/')
        for name in names
    )
    for file_path in output_files:
        archive.write(file_path, arcname=basename(file_path))