In [None]:
import numpy as np 
import pandas as pd 
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2
import matplotlib.pyplot as plt
from pathlib import Path
import os
import albumentations as A
import mlcrate as mlc
import shutil
import json

## How to create a Kaggle Dataset larger than 20 GB
- Example dataset produced with this approach: https://www.kaggle.com/ksmcg90/miccai-brain-256

## Add Kaggle Secrets to Notebook

In [None]:
from kaggle_secrets import UserSecretsClient

# Copy the Kaggle API credentials from the notebook's secrets into the
# environment; each secret is stored under an env var of the same name.
user_secrets = UserSecretsClient()
for secret_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    os.environ[secret_name] = user_secrets.get_secret(secret_name)

# The account name is reused later to build the dataset id.
username = os.environ["KAGGLE_USERNAME"]

## Read Xray and Resize Functions

In [None]:
## Function modified from https://www.kaggle.com/lucamtb/brain-tumor-very-basice-inference
def read_xray(path, voi_lut=True, fix_monochrome=True, normalize=False):
    """Read a DICOM file and return its pixel data as a NumPy array.

    Args:
        path: Location of the DICOM file; passed straight to ``pydicom.dcmread``.
        voi_lut: When True the pixels go through ``apply_voi_lut``; when False
            the raw ``pixel_array`` is used.
        fix_monochrome: When True, images whose ``PhotometricInterpretation``
            is ``"MONOCHROME1"`` are inverted (``max - pixel``).
        normalize: When True the image is min-max scaled to floats in
            ``[0, 1]``. When False the pixels are divided by 256 and cast to
            ``uint8``.

    Returns:
        The processed pixel array: floating point when ``normalize`` is True,
        ``uint8`` otherwise. A constant image normalizes to all zeros.
    """
    # dcmread is the current pydicom reader; read_file was its deprecated alias.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, the image may look inverted - fix that.
    # getattr: a file without the tag is simply left as it is.
    if fix_monochrome and getattr(dicom, "PhotometricInterpretation", None) == "MONOCHROME1":
        data = np.amax(data) - data

    if normalize:
        data = data - np.min(data)
        peak = np.max(data)
        # A constant (e.g. completely blank) slice has peak == 0; dividing by
        # it would turn the whole image into NaN. Dividing by 1 instead keeps
        # the float result type and yields an all-zero image.
        data = data / (peak if peak > 0 else 1)
    else:
        # NOTE(review): the /256 scaling presumably assumes pixel values that
        # span a 16-bit range; scans with a much smaller range come out very
        # dark - confirm this is intended.
        data = (data / 256).astype(np.uint8)

    return data

In [None]:
def resize_function(image_size, original_dir=None, new_dir=None):
    """Build a worker that resizes one DICOM file and saves it as ``.npy``.

    Args:
        image_size: Edge length of the square output; the longest image side
            is scaled to this size and the image is then padded to
            ``(image_size, image_size)``.
        original_dir: Root directory the input files live under.
        new_dir: Root directory for the outputs. When both ``original_dir``
            and ``new_dir`` are given, each file keeps its path relative to
            ``original_dir`` but is written under ``new_dir``.

    Returns:
        A function ``resized(path)`` that reads ``path`` with ``read_xray``,
        applies the resize/pad transform, creates the destination directory
        if needed and writes the array with ``np.save``. It returns None.
    """
    shape = (image_size, image_size)
    transform = A.Compose([
        A.LongestMaxSize(image_size),
        # border_mode=0 corresponds to OpenCV's constant-border mode.
        A.PadIfNeeded(*shape, border_mode=0),
    ])

    def resized(path):
        image = read_xray(path)
        image = transform(image=image)['image']

        # Swap only the file suffix; a plain string replace would also rewrite
        # '.dcm' appearing in any directory name along the way.
        new_path = Path(path).with_suffix('.npy')
        if original_dir is not None and new_dir is not None:
            try:
                # Re-root only the leading directory, not every occurrence of
                # the text elsewhere in the path.
                new_path = Path(new_dir) / new_path.relative_to(original_dir)
            except ValueError:
                # The file is not under original_dir: keep the output next to
                # the source, which is what the string replace used to do
                # when nothing matched.
                pass

        new_path.parent.mkdir(exist_ok=True, parents=True)
        np.save(new_path, image)

    return resized

In [None]:
# Root of the competition input data that is scanned for DICOM and CSV files.
DATA_DIR = Path('../input/rsna-miccai-brain-tumor-radiogenomic-classification')

# Edge length, in pixels, of the square images written to the new dataset.
IM_SIZE = 256

## Initialize Dataset and add Metadata

In [None]:
# Staging directory for the new dataset; the image size is part of its name.
SAVE_DIR = f'/kaggle/tmp/resized-{IM_SIZE}'

# Create it (and any missing parents) up front; an existing directory is fine.
Path(SAVE_DIR).mkdir(parents=True, exist_ok=True)

# Also publish the location as an environment variable for shell commands.
os.environ['SAVE_DIR'] = SAVE_DIR

In [None]:
# Initialise SAVE_DIR as a Kaggle dataset folder. The next cell expects this
# to have produced SAVE_DIR/dataset-metadata.json, which it then edits.
!kaggle datasets init -p $SAVE_DIR

In [None]:
# Load the dataset metadata, set the title and id for this image size, and
# write the file back in place.
metadata_file = f'{SAVE_DIR}/dataset-metadata.json'

with open(metadata_file) as metadata_in:
    data = json.loads(metadata_in.read())

dataset_title = f"miccai-brain-{IM_SIZE}"
# The id has the form "<kaggle username>/<dataset title>".
data.update(title=dataset_title, id=f"{username}/{dataset_title}")

with open(metadata_file, 'w') as metadata_out:
    metadata_out.write(json.dumps(data))

## Copy the CSV files in case you want to use the dataset in Google Colab
- The dataset can then be downloaded there with: `kaggle datasets download -d ksmcg90/miccai-brain-256`

In [None]:
# Mirror every top-level CSV of the input data into the staging directory,
# keeping the file name unchanged.
for path in DATA_DIR.glob('*.csv'):
    new_path = os.path.join(str(SAVE_DIR), path.name)
    shutil.copy(path, new_path)

In [None]:
# Worker that converts one DICOM under DATA_DIR into a resized .npy under SAVE_DIR.
resizing_func = resize_function(
    image_size=IM_SIZE,
    original_dir=str(DATA_DIR),
    new_dir=str(SAVE_DIR),
)

In [None]:
# Every DICOM file anywhere below DATA_DIR, collected eagerly into a list.
paths = [dcm_path for dcm_path in DATA_DIR.rglob('*.dcm')]

In [None]:
# Apply the resize worker to every DICOM path through mlcrate's SuperPool.
# NOTE(review): presumably a multi-process pool sized to the available cores -
# confirm against the mlcrate documentation.
pool = mlc.SuperPool()
# The worker writes its output to disk and returns nothing, so `results`
# carries no data of its own.
results = pool.map(resizing_func, paths)

## Create or Update Dataset

In [None]:
# Upload the staged SAVE_DIR folder as a new Kaggle dataset.
# NOTE(review): per the Kaggle CLI help, -u creates the dataset as public and
# --dir-mode tar uploads each sub-directory as a tar archive - confirm against
# the installed CLI version.
! kaggle datasets create -p $SAVE_DIR -u --dir-mode tar
# Alternative for publishing a new version of a dataset that already exists:
#! kaggle datasets version -p $SAVE_DIR -m "Full Sample" --dir-mode tar