In [None]:
!pip install -qU python-gdcm pydicom pylibjpeg

In [None]:
import glob
import json
import os

import cv2
import gdcm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
import seaborn as sns
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [None]:
# glob returns files in arbitrary order; sort so that slices such as
# train_images[:10] select the same files on every run.
train_images = sorted(
    glob.glob("/kaggle/input/rsna-breast-cancer-detection/train_images/*/*.dcm")
)

len(train_images)  # 54706

In [None]:
def crop_image(img, show=True):
    """Black out everything outside the largest bright region and crop to it.

    Args:
        img: 2-D ``uint8`` grayscale image (it is combined with a ``uint8``
            mask via ``cv2.bitwise_and``).
        show: When true, display the result with matplotlib.

    Returns:
        The masked image cropped to the bounding box of the largest contour,
        after scaling the box's lower coordinates by 0.99 and its upper
        coordinates by 1.01. If no pixel exceeds the threshold, ``img`` is
        returned unchanged.
    """
    # Binarize the image: pixels above 20 become 255, the rest 0.
    bin_pixels = cv2.threshold(img, 20, 255, cv2.THRESH_BINARY)[1]

    # Outer contours of the binarized image.
    contours, _ = cv2.findContours(bin_pixels, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    if not contours:
        # Nothing is brighter than the threshold, so there is no region to
        # crop to; max() on an empty sequence would raise ValueError.
        if show:
            plt.imshow(img, cmap="gray")
        return img

    # Keep only the largest contour.
    contour = max(contours, key=cv2.contourArea)

    # Filled mask of that contour; bitwise_and zeroes every pixel outside it.
    mask = np.zeros(img.shape, np.uint8)
    cv2.drawContours(mask, [contour], -1, 255, cv2.FILLED)
    out = cv2.bitwise_and(img, mask)

    # Bounding box of the contour (contour points are stored as (x, y)).
    y1, y2 = np.min(contour[:, :, 1]), np.max(contour[:, :, 1])
    x1, x2 = np.min(contour[:, :, 0]), np.max(contour[:, :, 0])

    # Widen the box slightly. The margin is 1% of each coordinate value, so it
    # grows with distance from the origin; slicing clamps at the image border.
    x1 = int(0.99 * x1)
    x2 = int(1.01 * x2)
    y1 = int(0.99 * y1)
    y2 = int(1.01 * y2)

    cropped = out[y1:y2, x1:x2]
    if show:
        plt.imshow(cropped, cmap="gray")

    return cropped

In [None]:
# Name of the Kaggle dataset to publish, and the local folder it is staged in.
DATASET_NAME = 'RSNA-cropped-png-1344x960-test'
SAVE_FOLDER = "/kaggle/working/" + DATASET_NAME

In [None]:
os.makedirs(SAVE_FOLDER, exist_ok=True)
os.makedirs(SAVE_FOLDER + '/images/', exist_ok=True)

with open('/kaggle/input/myjson/kaggle.json') as f:
    kaggle_creds = json.load(f)
    
os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

!kaggle datasets init -p '{SAVE_FOLDER}'

with open(f'{SAVE_FOLDER}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)
    
dataset_meta['id'] = f'fabiendaniel/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME

with open(f'{SAVE_FOLDER}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)
print(dataset_meta)

!cp '{SAVE_FOLDER}'/dataset-metadata.json '{SAVE_FOLDER}'/meta.json
!ls '{SAVE_FOLDER}'

!kaggle datasets create -u -p '{SAVE_FOLDER}'

In [None]:
# Geometry of the saved images, in pixels.
TARGET_HEIGHT, TARGET_WIDTH = 1344, 960
N_CHANNELS = 1
INPUT_SHAPE = (TARGET_HEIGHT, TARGET_WIDTH, N_CHANNELS)
# Height divided by width of the target image (1344 / 960 = 1.4).
TARGET_HEIGHT_WIDTH_RATIO = TARGET_HEIGHT / TARGET_WIDTH
def _min_max_scale(arr):
    """Linearly rescale ``arr`` to [0, 1] and return it as float64.

    The arithmetic is done in float64 so ``arr - arr.min()`` cannot overflow a
    narrow integer dtype, and a constant array maps to all zeros instead of
    dividing by zero.
    """
    arr = np.asarray(arr, dtype=np.float64)
    lo, hi = arr.min(), arr.max()
    if hi == lo:
        return np.zeros_like(arr)
    return (arr - lo) / (hi - lo)


def process(f, save_folder="", extension="png"):
    """Convert one DICOM file into a cropped, padded, resized 8-bit image.

    Steps: read the pixel data, scale it to 0-255, invert MONOCHROME1 images,
    mirror the image when its right edge is brighter than its left edge, trim
    a 2% border, crop with ``crop_image``, zero-pad to the target aspect ratio
    and resize to ``TARGET_WIDTH`` x ``TARGET_HEIGHT``.

    Args:
        f: Path to a ``.dcm`` file laid out as ``.../<patient>/<image>.dcm``.
        save_folder: Folder containing an ``images`` sub-folder; the result is
            written to ``{save_folder}/images/{patient}_{image}.{extension}``.
        extension: File extension handed to ``cv2.imwrite``.

    Returns:
        None. The image is written to disk.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the file was not written.
    """
    patient = os.path.basename(os.path.dirname(f))
    image = os.path.splitext(os.path.basename(f))[0]

    dicom = pydicom.dcmread(f)
    img = np.uint8(_min_max_scale(dicom.pixel_array) * 255)

    if dicom.PhotometricInterpretation == "MONOCHROME1":
        # Invert the 8-bit image. `1 - img` would wrap around in uint8
        # arithmetic instead of inverting.
        img = 255 - img

    # Mirror horizontally when the right 10% strip is brighter than the left
    # 10% strip, so the brighter side ends up on the left.
    h0, w0 = img.shape
    strip = int(w0 * 0.10)
    if img[:, -strip:].sum() > img[:, :strip].sum():
        img = np.flip(img, axis=1)

    # Trim 2% from every side. Explicit end indices keep the slice valid even
    # when the margin rounds down to 0 (`img[0:-0]` would be empty).
    margin_h, margin_w = int(h0 * 2e-2), int(w0 * 2e-2)
    img = img[margin_h:h0 - margin_h, margin_w:w0 - margin_w]

    # show=False: this runs in background workers for many files, where
    # plotting every crop has no use.
    img = crop_image(img, show=False)

    # Zero-pad to the target aspect ratio: on the right when the crop is too
    # narrow, evenly on top and bottom otherwise.
    h, w = img.shape
    if (h / w) > TARGET_HEIGHT_WIDTH_RATIO:
        pad = int(h / TARGET_HEIGHT_WIDTH_RATIO - w)
        img = np.pad(img, [[0, 0], [0, pad]])
    else:
        pad = int(0.50 * (w * TARGET_HEIGHT_WIDTH_RATIO - h))
        img = np.pad(img, [[pad, pad], [0, 0]])

    # Resize (cv2 expects the size as (width, height)).
    img = cv2.resize(img, (TARGET_WIDTH, TARGET_HEIGHT), interpolation=cv2.INTER_AREA)

    # Stretch the result to the full 8-bit range before saving.
    img = _min_max_scale(img)
    out_path = save_folder + f"/images/{patient}_{image}.{extension}"
    if not cv2.imwrite(out_path, (img * 255).astype(np.uint8)):
        # imwrite signals failure through its return value only.
        raise OSError(f"could not write {out_path}")

In [None]:
# Convert a small sample of the training images in 4 parallel workers.
# The extension is passed literally: no EXTENSION constant is defined in this
# notebook, and the later glob over the output folder looks for "*.png".
_ = Parallel(n_jobs=4)(
    delayed(process)(uid, save_folder=SAVE_FOLDER, extension="png")
    for uid in tqdm(train_images[:10])
)

In [None]:
from datetime import datetime

# Timestamp (YYYYMMDD-HHMMSS, local time) used as the dataset version note.
version_name = f"{datetime.now():%Y%m%d-%H%M%S}"
print(version_name)

In [None]:
# List the PNG files currently present in the output folder.
png_pattern = os.path.join(SAVE_FOLDER, "images", "*.png")
output_images = glob.glob(png_pattern)

len(output_images)

In [None]:
!kaggle datasets version -m {version_name} -p "{SAVE_FOLDER}"  -r tar -r zip