<a href="https://colab.research.google.com/github/samilarinc/engineerdocs/blob/main/Preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Data

In [None]:
# Interactive Google Cloud sign-in for this runtime (each command prompts for authorization).
# NOTE(review): presumably both logins are needed so the gsutil copy in the next cell
# can read the bucket — confirm whether the application-default login is actually required.
!gcloud auth login
!gcloud auth application-default login

In [None]:
# Copy the `sakso` folder from the `engineerdocs` bucket into the current working directory.
# -r copies recursively; -m and -q are gsutil's parallel-transfer and quiet options.
!gsutil -m -q cp -r gs://engineerdocs/sakso .

# Import Libraries

In [372]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import os
import pydicom as dicom
from tqdm import tqdm

from skimage import filters
from time import perf_counter

# Preprocess

In [378]:
def img_crop(gray_img, kernel_size, max_value=4095):
    """Crop a grayscale image to the bounding box of its largest foreground region.

    The image is inverted against ``max_value``, binarised with Otsu's
    threshold, and the morphological gradient of that mask is taken with a
    rectangular structuring element. The bounding rectangle of the external
    contour with the largest area defines the crop window.

    Args:
        gray_img: 2-D grayscale NumPy array.
            NOTE(review): the default ``max_value`` suggests 12-bit pixel
            data stored in a wider integer dtype — confirm against the source
            files.
        kernel_size: ``(width, height)`` of the rectangular structuring
            element used for the morphological gradient.
        max_value: Value the image is inverted against before thresholding.
            Defaults to 4095, the constant that was previously hard-coded.

    Returns:
        A copy of the cropped region of ``gray_img`` with its original dtype
        and pixel intensities.

    Raises:
        ValueError: If no contour is found (for example, a uniform image).
    """
    inverted = max_value - gray_img
    mask = (inverted > filters.threshold_otsu(inverted)).astype(np.uint8)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    outline = cv2.morphologyEx(mask, cv2.MORPH_GRADIENT, kernel)
    contours, _ = cv2.findContours(outline, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        # max() on an empty sequence raises the same exception type, but with
        # a message that says nothing about the image.
        raise ValueError("img_crop: no contour found; cannot determine crop region")

    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)

    # Crop the untouched input rather than re-inverting the inverted crop:
    # cv2.bitwise_not flips every bit of the dtype, so it only undoes
    # ``max_value - x`` when max_value is the dtype's maximum. For 16-bit data
    # and max_value=4095 it returned shifted intensities instead.
    return gray_img[y:y + h, x:x + w].copy()

def img_normalize(img):
    """Min-max scale an image to the range [0, 1].

    Args:
        img: NumPy array of any numeric dtype.

    Returns:
        Floating-point array with the same shape as ``img``. Floating inputs
        keep their dtype; every other dtype is promoted to float64. A constant
        image (max == min) maps to all zeros instead of NaNs.
    """
    # Work in floating point so integer inputs cannot wrap around when the
    # minimum is subtracted (e.g. signed 8-bit data spanning -128..127).
    work = img if np.issubdtype(img.dtype, np.floating) else img.astype(np.float64)
    lowest = work.min()
    span = work.max() - lowest
    if span == 0:
        # Avoid 0/0: a flat image carries no contrast to stretch.
        return np.zeros_like(work)
    return (work - lowest) / span

def img_clahe(img, clip, tile):
    """Run OpenCV's CLAHE (contrast-limited adaptive histogram equalisation) on ``img``.

    Args:
        img: Image handed to the CLAHE object's ``apply`` method.
        clip: Forwarded to ``cv2.createCLAHE`` as ``clipLimit``.
        tile: Forwarded to ``cv2.createCLAHE`` as ``tileGridSize``.

    Returns:
        The equalised image returned by ``apply``.
    """
    equalizer = cv2.createCLAHE(clipLimit=clip, tileGridSize=tile)
    return equalizer.apply(img)

def img_downsample(img, method, kernel_size):
    """Resize ``img`` to ``kernel_size`` with the named interpolation method.

    Args:
        img: Image array accepted by ``cv2.resize``.
        method: Interpolation name: ``'nearest'``, ``'linear'``, ``'cubic'``,
            ``'area'`` or ``'lanczos'``.
        kernel_size: Target size, passed to ``cv2.resize`` as ``dsize``
            (``(width, height)``). Despite the name this is the output size,
            not a filter kernel; the name is kept for existing callers.

    Returns:
        The resized image.

    Raises:
        ValueError: If ``method`` is not one of the supported names. Unknown
            methods used to return ``None`` silently, which only failed later
            in the pipeline.
    """
    interpolations = {
        'nearest': cv2.INTER_NEAREST,
        'linear': cv2.INTER_LINEAR,
        'cubic': cv2.INTER_CUBIC,
        'area': cv2.INTER_AREA,
        'lanczos': cv2.INTER_LANCZOS4,
    }
    if method not in interpolations:
        raise ValueError(
            f"Unsupported downsample method {method!r}; "
            f"expected one of {sorted(interpolations)}"
        )
    return cv2.resize(img, kernel_size, interpolation=interpolations[method])

def preprocess(
    img,
    do_downsample=False,
    do_normalize=False,
    down_method='lanczos',
    down_size=(1024, 1024),
    clip=2.0,
    tile=(8, 8),
    crop_kernel=(100, 100),
):
    """Run the preprocessing pipeline: crop, CLAHE, then optional resize and normalisation.

    Args:
        img: Input image, passed to ``img_crop``.
        do_downsample: When true, resize the result with ``img_downsample``.
        do_normalize: When true, rescale the result with ``img_normalize``.
        down_method: Interpolation name forwarded to ``img_downsample``.
        down_size: Target size forwarded to ``img_downsample``.
        clip: Clip limit forwarded to ``img_clahe``.
        tile: Tile grid size forwarded to ``img_clahe``.
        crop_kernel: Structuring-element size forwarded to ``img_crop``.

    Returns:
        The processed image after the enabled stages have run.
    """
    # Cropping and contrast equalisation always run, in that order.
    cropped = img_crop(img, crop_kernel)
    result = img_clahe(cropped, clip, tile)

    if do_downsample:
        result = img_downsample(result, down_method, down_size)

    return img_normalize(result) if do_normalize else result

In [386]:
# Each sub-folder of 'sakso' is read as one case containing the four views
# named in `files`; sorting makes the processing order deterministic.
folder_names = sorted(os.listdir('sakso'))
files = ('LCC', 'LMLO', 'RCC', 'RMLO')

# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever 'processed' was left over from an earlier (or interrupted) run.
os.makedirs('processed', exist_ok=True)

for i in tqdm(folder_names):
    path = os.path.join('sakso', i)
    if not os.path.isdir(path):
        # Skip stray files in the data folder; only directories hold cases.
        continue
    for j in files:
        img_path = os.path.join(path, j + '.dcm')
        ds = dicom.dcmread(img_path)
        # Second positional argument enables downsampling with preprocess's defaults.
        img = preprocess(ds.pixel_array, True)
        np.save(os.path.join('processed', f'{i}_{j}.npy'), img)

  6%|▌         | 87/1445 [01:19<20:43,  1.09it/s]


KeyboardInterrupt: ignored

In [385]:
# Remove the `processed` output directory and everything in it.
# NOTE(review): run this only when a clean rebuild is wanted — executed after the
# processing cell, it discards the .npy files that cell just saved.
!rm -rf processed