In [None]:
import os
import glob
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from PIL import Image
from pathlib import Path

#print(os.getcwd())

"""   Replace folder path for your computer  """

# Source dataset root: image_list() joins the 'JPEGImages' / 'SegmentationClassPNG'
# sub-folder names onto this path.
root_dir = '/home/SDMSegmentation/PathologyDataset_original'
# Output root: make_labelled_dataset() creates its 'labelled' output folders below this path.
save_root_dir = '/home/SDMSegmentation/PathologyDataset'

# Make Image Path List

In [None]:
def image_list(path: str):
    """Return the full paths of every entry in a dataset sub-folder of ``root_dir``.

    Args:
        path: Sub-folder of the module-level ``root_dir`` to list. It must contain
            either ``'JPEGImages'`` or ``'SegmentationClassPNG'``.

    Returns:
        list[str]: ``os.path.join(root_dir, path, name)`` for each directory entry,
        sorted by name so the result does not depend on the file-system's
        listing order.

    Raises:
        ValueError: If ``path`` is not a string naming one of the supported
            sub-folders (previously this case silently returned ``None``).
    """
    supported = ('JPEGImages', 'SegmentationClassPNG')
    if not isinstance(path, str) or not any(tag in path for tag in supported):
        raise ValueError(f"path must contain one of {supported}, got {path!r}")

    target_dir = os.path.join(root_dir, path)
    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    return [os.path.join(target_dir, name) for name in sorted(os.listdir(target_dir))]

In [None]:
# Full paths of every entry in <root_dir>/JPEGImages
jpegimage_path = image_list(path='JPEGImages')
#print(jpegimage_path)
# Full paths of every entry in <root_dir>/SegmentationClassPNG
classpng_path = image_list(path='SegmentationClassPNG')

# Train & Valid & Test Split

In [None]:
# Each entry is the beginning of a file name, ending in '.', and is matched by
# substring against full paths in split_data(). The trailing '.' keeps
# '..._1.' from also matching '..._10.', '..._11.', and so on.
train_list = ['S21-3099_x100_1.', 'S21-3099_x100_3.', 'S21-3099_x100_6.', 'S21-3099_x100_7.',
              'S21-3099_x100_8.', 'S21-3099_x100_9.', 'S21-3099_x100_10.', 'S21-3099_x100_11.',
              'S21-3099_x100_12.', 'S21-3099_x100_13.', 'S21-3191_x100_1.', 'S21-3191_x100_2.',
              'S21-3191_x100_4.', 'S21-3191_x100_5.', 'S21-3191_x100_6.', 'S21-3191_x100_7.',
              'S21-3193_x100_2.', 'S21-3193_x100_3.', 'S21-3194_x100_3.', 'S21-3195_x100_3.']

valid_list = ['S21-3191_x100_3.', 'S21-3191_x100_8.', 'S21-3193_x100_1.', 'S21-3194_x100_2.',
              'S21-3195_x100_1.', 'S21-3195_x100_4.', 'S21-3195_x100_6.']

# NOTE(review): 'S21-3191_x100_4.' is listed both here and in train_list, so that
# image ends up in the training AND the test split - confirm which one it belongs to.
test_list = ['S21-3099_x100_4.', 'S21-3099_x100_5.', 'S21-3193_x100_4.',
             'S21-3194_x100_1.', 'S21-3195_x100_2.', 'S21-3195_x100_5.', 'S21-3191_x100_4.']

def split_data(data_list: list,
               data_type: str = None,
               path: list = None):
    """Select the file paths that belong to one dataset split.

    Args:
        data_list: Name fragments identifying the files of the split.
        data_type: ``'input'`` or ``'label'``. Both are filtered the same way;
            the value only states which kind of path list is being filtered.
        path: Candidate file paths to search.

    Returns:
        list[str]: Every element of ``path`` that contains a fragment from
        ``data_list`` as a substring, ordered by fragment first and then by
        position in ``path``. The image and label lists of a split therefore
        line up index by index only when each fragment matches exactly one
        file in each list.

    Raises:
        ValueError: If ``data_type`` is neither ``'input'`` nor ``'label'``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if data_type not in ('input', 'label'):
        raise ValueError(f"data_type must be 'input' or 'label', got {data_type!r}")

    return [file_path for fragment in data_list for file_path in path if fragment in file_path]

In [None]:
# Input image paths for the train / valid / test splits
train_data_list, valid_data_list, test_data_list = [
    split_data(split_names, 'input', jpegimage_path)
    for split_names in (train_list, valid_list, test_list)
]

# Class-label paths for the same three splits, in the same order
train_label_list, valid_label_list, test_label_list = [
    split_data(split_names, 'label', classpng_path)
    for split_names in (train_list, valid_list, test_list)
]

# Make All DataSet

### patch generation-train

In [None]:
def start_points(image_size: int = None,
                 split_size: int = 256,
                 overlap: float = 0.):
    """Compute the start offsets of the patches along one image axis.

    Offsets advance by ``int(split_size * (1 - overlap))``. The final offset is
    always ``image_size - split_size`` so the last patch ends exactly at the
    image border (it may overlap its neighbour more than ``overlap``).

    Args:
        image_size: Length of the axis in pixels.
        split_size: Patch length in pixels.
        overlap: Fraction of ``split_size`` shared by consecutive patches.

    Returns:
        list[int]: Ascending start offsets. ``[0]`` when the axis is not longer
        than one patch.

    Raises:
        ValueError: If the resulting stride is not positive (for example
            ``overlap >= 1``), which previously caused an endless loop.
    """
    stride = int(split_size * (1 - overlap))
    if stride <= 0:
        raise ValueError(
            f"split_size={split_size} and overlap={overlap} give a non-positive stride ({stride})"
        )

    # A single patch already covers the whole axis: avoid a duplicated or
    # negative start offset.
    if image_size <= split_size:
        return [0]

    last_start = image_size - split_size
    points = list(range(0, last_start, stride))
    points.append(last_start)
    return points


def decode_segmap(label_mask: np.array):
    """Turn a class-index mask into an RGB image using ``get_pathology_labels``.

    Class indices 0, 1 and 2 (background, benign, malignant) are replaced by
    their palette colour; any other value is left unchanged in all three
    channels. The input mask is not modified.

    Returns:
        np.ndarray: ``uint8`` array with the three channels stacked along a
        new last axis.
    """
    palette = get_pathology_labels()
    # One working copy of the mask per colour channel (R, G, B)
    channels = [label_mask.copy() for _ in range(3)]

    for class_id in range(3):
        selected = label_mask == class_id
        for channel, value in zip(channels, palette[class_id]):
            channel[selected] = value

    return np.uint8(np.dstack(channels))

    
def get_pathology_labels():
    """Return the RGB colour of each class, one row per class index."""
    background = [0, 0, 0]
    benign = [255, 0, 0]
    malignant = [0, 255, 0]

    return np.asarray([background, benign, malignant])

      
def make_labelled_dataset(overlap_ratio: float = 0.,
                          split_size: int = 256,
                          data_path: str = None,
                          class_path: str = None,
                          mode: str = 'train'):
    """Cut one image/label pair into square patches and save them under ``save_root_dir``.

    For every patch three files are written below ``<save_root_dir>/labelled/<mode>/``:

    * ``npy/<stem>_[x_y].npy``       - dict ``{'input': ..., 'label': ...}`` stored with
      ``np.save`` (a pickled object, so read it back with ``allow_pickle=True``),
    * ``png/<stem>_[x_y].png``       - the RGB input patch,
    * ``png_label/<stem>_[x_y].png`` - the label patch colourised by ``decode_segmap``.

    ``<stem>`` is the file name of ``data_path`` without its extension, and
    ``x`` / ``y`` are the patch's top-left pixel offsets.

    Args:
        overlap_ratio: Fraction of ``split_size`` shared by neighbouring patches;
            forwarded to ``start_points``.
        split_size: Patch edge length in pixels.
        data_path: Path of the input image (converted to RGB on load).
        class_path: Path of the matching label image.
        mode: Name of the split sub-folder the patches are written to.

    Raises:
        ValueError: If the label image does not have the same height and width
            as the input image.
    """
    # Join the path components one by one: a literal '\\' is a separator only on
    # Windows, and the output location has to follow `save_root_dir` and `mode`.
    split_dir = os.path.join(save_root_dir, 'labelled', mode)
    npy_dir = os.path.join(split_dir, 'npy')
    png_dir = os.path.join(split_dir, 'png')
    png_label_dir = os.path.join(split_dir, 'png_label')
    for out_dir in (npy_dir, png_dir, png_label_dir):
        os.makedirs(out_dir, exist_ok=True)

    root_save_name = Path(data_path).stem

    # The context managers close the image files once the pixels are copied out.
    with Image.open(data_path) as input_image:
        image_array = np.array(input_image.convert('RGB'))
    with Image.open(class_path) as output_label:
        label_array = np.array(output_label)

    img_h, img_w, _ = image_array.shape
    if label_array.shape[:2] != (img_h, img_w):
        raise ValueError(
            f"Label {class_path!r} has shape {label_array.shape[:2]} but image "
            f"{data_path!r} has shape {(img_h, img_w)}"
        )

    # Revise Class Label: values above the three known classes become background
    label_array[label_array > 2] = 0
    unique, total = np.unique(label_array, return_counts=True)
    print(f"Unique Label: {unique} | Total Count: {total}")
    print("=="*30)

    x_points = start_points(img_w, split_size=split_size, overlap=overlap_ratio)
    y_points = start_points(img_h, split_size=split_size, overlap=overlap_ratio)

    for y in y_points:
        for x in x_points:
            splited_input_array = image_array[y: y + split_size, x: x + split_size]
            splited_label_array = label_array[y: y + split_size, x: x + split_size]
            patch_name = f'{root_save_name}_[{x}_{y}]'

            # Save Input & Label Npy
            npy_path = os.path.join(npy_dir, f'{patch_name}.npy')
            np.save(npy_path, {'input': splited_input_array,
                               'label': splited_label_array})
            print(npy_path)

            # Save Input Png
            splited_img = Image.fromarray(splited_input_array)
            splited_img.save(os.path.join(png_dir, f'{patch_name}.png'), format='png')

            # Save Label Png
            splited_label_img = Image.fromarray(decode_segmap(splited_label_array))
            splited_label_img.save(os.path.join(png_label_dir, f'{patch_name}.png'), format='png')

In [None]:
# Make Labelled DataSet with Train
# zip() stops at the shorter list, which would silently drop or misalign image/label pairs.
if len(train_data_list) != len(train_label_list):
    raise ValueError(
        f"train split has {len(train_data_list)} images but {len(train_label_list)} labels"
    )

# zip() has no length, so give tqdm the total to get a real progress bar.
for input_path, label_path in tqdm(zip(train_data_list, train_label_list), total=len(train_data_list)):
    make_labelled_dataset(overlap_ratio=0.75, data_path=input_path, class_path=label_path, mode='train')

### patch generation-valid

In [None]:
def start_points(image_size: int = None,
                 split_size: int = 256,
                 overlap: float = 0.):
    """Compute the start offsets of the patches along one image axis.

    Offsets advance by ``int(split_size * (1 - overlap))``. The final offset is
    always ``image_size - split_size`` so the last patch ends exactly at the
    image border (it may overlap its neighbour more than ``overlap``).

    Args:
        image_size: Length of the axis in pixels.
        split_size: Patch length in pixels.
        overlap: Fraction of ``split_size`` shared by consecutive patches.

    Returns:
        list[int]: Ascending start offsets. ``[0]`` when the axis is not longer
        than one patch.

    Raises:
        ValueError: If the resulting stride is not positive (for example
            ``overlap >= 1``), which previously caused an endless loop.
    """
    stride = int(split_size * (1 - overlap))
    if stride <= 0:
        raise ValueError(
            f"split_size={split_size} and overlap={overlap} give a non-positive stride ({stride})"
        )

    # A single patch already covers the whole axis: avoid a duplicated or
    # negative start offset.
    if image_size <= split_size:
        return [0]

    last_start = image_size - split_size
    points = list(range(0, last_start, stride))
    points.append(last_start)
    return points


def decode_segmap(label_mask: np.array):
    """Turn a class-index mask into an RGB image using ``get_pathology_labels``.

    Class indices 0, 1 and 2 (background, benign, malignant) are replaced by
    their palette colour; any other value is left unchanged in all three
    channels. The input mask is not modified.

    Returns:
        np.ndarray: ``uint8`` array with the three channels stacked along a
        new last axis.
    """
    palette = get_pathology_labels()
    # One working copy of the mask per colour channel (R, G, B)
    channels = [label_mask.copy() for _ in range(3)]

    for class_id in range(3):
        selected = label_mask == class_id
        for channel, value in zip(channels, palette[class_id]):
            channel[selected] = value

    return np.uint8(np.dstack(channels))

    
def get_pathology_labels():
    """Return the RGB colour of each class, one row per class index."""
    background = [0, 0, 0]
    benign = [255, 0, 0]
    malignant = [0, 255, 0]

    return np.asarray([background, benign, malignant])

      
def make_labelled_dataset(overlap_ratio: float = 0.,
                          split_size: int = 256,
                          data_path: str = None,
                          class_path: str = None,
                          mode: str = 'train'):
    """Cut one image/label pair into square patches and save them under ``save_root_dir``.

    For every patch three files are written below ``<save_root_dir>/labelled/<mode>/``:

    * ``npy/<stem>_[x_y].npy``       - dict ``{'input': ..., 'label': ...}`` stored with
      ``np.save`` (a pickled object, so read it back with ``allow_pickle=True``),
    * ``png/<stem>_[x_y].png``       - the RGB input patch,
    * ``png_label/<stem>_[x_y].png`` - the label patch colourised by ``decode_segmap``.

    ``<stem>`` is the file name of ``data_path`` without its extension, and
    ``x`` / ``y`` are the patch's top-left pixel offsets.

    Args:
        overlap_ratio: Fraction of ``split_size`` shared by neighbouring patches;
            forwarded to ``start_points``.
        split_size: Patch edge length in pixels.
        data_path: Path of the input image (converted to RGB on load).
        class_path: Path of the matching label image.
        mode: Name of the split sub-folder the patches are written to.

    Raises:
        ValueError: If the label image does not have the same height and width
            as the input image.
    """
    # Join the path components one by one: a literal '\\' is a separator only on
    # Windows, and the output location has to follow `save_root_dir` and `mode`
    # (this copy used to write to a hard-coded '.../labelled/valid/...' path).
    split_dir = os.path.join(save_root_dir, 'labelled', mode)
    npy_dir = os.path.join(split_dir, 'npy')
    png_dir = os.path.join(split_dir, 'png')
    png_label_dir = os.path.join(split_dir, 'png_label')
    for out_dir in (npy_dir, png_dir, png_label_dir):
        os.makedirs(out_dir, exist_ok=True)

    root_save_name = Path(data_path).stem

    # The context managers close the image files once the pixels are copied out.
    with Image.open(data_path) as input_image:
        image_array = np.array(input_image.convert('RGB'))
    with Image.open(class_path) as output_label:
        label_array = np.array(output_label)

    img_h, img_w, _ = image_array.shape
    if label_array.shape[:2] != (img_h, img_w):
        raise ValueError(
            f"Label {class_path!r} has shape {label_array.shape[:2]} but image "
            f"{data_path!r} has shape {(img_h, img_w)}"
        )

    # Revise Class Label: values above the three known classes become background
    label_array[label_array > 2] = 0
    unique, total = np.unique(label_array, return_counts=True)
    print(f"Unique Label: {unique} | Total Count: {total}")
    print("=="*30)

    x_points = start_points(img_w, split_size=split_size, overlap=overlap_ratio)
    y_points = start_points(img_h, split_size=split_size, overlap=overlap_ratio)

    for y in y_points:
        for x in x_points:
            splited_input_array = image_array[y: y + split_size, x: x + split_size]
            splited_label_array = label_array[y: y + split_size, x: x + split_size]
            patch_name = f'{root_save_name}_[{x}_{y}]'

            # Save Input & Label Npy
            npy_path = os.path.join(npy_dir, f'{patch_name}.npy')
            np.save(npy_path, {'input': splited_input_array,
                               'label': splited_label_array})
            print(npy_path)

            # Save Input Png
            splited_img = Image.fromarray(splited_input_array)
            splited_img.save(os.path.join(png_dir, f'{patch_name}.png'), format='png')

            # Save Label Png
            splited_label_img = Image.fromarray(decode_segmap(splited_label_array))
            splited_label_img.save(os.path.join(png_label_dir, f'{patch_name}.png'), format='png')

In [None]:
# Make Labelled DataSet with Valid
# zip() stops at the shorter list, which would silently drop or misalign image/label pairs.
if len(valid_data_list) != len(valid_label_list):
    raise ValueError(
        f"valid split has {len(valid_data_list)} images but {len(valid_label_list)} labels"
    )

# zip() has no length, so give tqdm the total to get a real progress bar.
for input_path, label_path in tqdm(zip(valid_data_list, valid_label_list), total=len(valid_data_list)):
    make_labelled_dataset(overlap_ratio=0., data_path=input_path, class_path=label_path, mode='valid')

### patch generation-test

In [None]:
def start_points(image_size: int = None,
                 split_size: int = 256,
                 overlap: float = 0.):
    """Compute the start offsets of the patches along one image axis.

    Offsets advance by ``int(split_size * (1 - overlap))``. The final offset is
    always ``image_size - split_size`` so the last patch ends exactly at the
    image border (it may overlap its neighbour more than ``overlap``).

    Args:
        image_size: Length of the axis in pixels.
        split_size: Patch length in pixels.
        overlap: Fraction of ``split_size`` shared by consecutive patches.

    Returns:
        list[int]: Ascending start offsets. ``[0]`` when the axis is not longer
        than one patch.

    Raises:
        ValueError: If the resulting stride is not positive (for example
            ``overlap >= 1``), which previously caused an endless loop.
    """
    stride = int(split_size * (1 - overlap))
    if stride <= 0:
        raise ValueError(
            f"split_size={split_size} and overlap={overlap} give a non-positive stride ({stride})"
        )

    # A single patch already covers the whole axis: avoid a duplicated or
    # negative start offset.
    if image_size <= split_size:
        return [0]

    last_start = image_size - split_size
    points = list(range(0, last_start, stride))
    points.append(last_start)
    return points


def decode_segmap(label_mask: np.array):
    """Turn a class-index mask into an RGB image using ``get_pathology_labels``.

    Class indices 0, 1 and 2 (background, benign, malignant) are replaced by
    their palette colour; any other value is left unchanged in all three
    channels. The input mask is not modified.

    Returns:
        np.ndarray: ``uint8`` array with the three channels stacked along a
        new last axis.
    """
    palette = get_pathology_labels()
    # One working copy of the mask per colour channel (R, G, B)
    channels = [label_mask.copy() for _ in range(3)]

    for class_id in range(3):
        selected = label_mask == class_id
        for channel, value in zip(channels, palette[class_id]):
            channel[selected] = value

    return np.uint8(np.dstack(channels))

    
def get_pathology_labels():
    """Return the RGB colour of each class index: black, red, green."""
    black = [0, 0, 0]
    red = [255, 0, 0]
    green = [0, 255, 0]

    return np.asarray([black, red, green])

      
def make_labelled_dataset(overlap_ratio: float = 0.,
                          split_size: int = 256,
                          data_path: str = None,
                          class_path: str = None,
                          mode: str = 'train'):
    """Cut one image/label pair into square patches and save them under ``save_root_dir``.

    For every patch three files are written below ``<save_root_dir>/labelled/<mode>/``:

    * ``npy/<stem>_[x_y].npy``       - dict ``{'input': ..., 'label': ...}`` stored with
      ``np.save`` (a pickled object, so read it back with ``allow_pickle=True``),
    * ``png/<stem>_[x_y].png``       - the RGB input patch,
    * ``png_label/<stem>_[x_y].png`` - the label patch colourised by ``decode_segmap``.

    ``<stem>`` is the file name of ``data_path`` without its extension, and
    ``x`` / ``y`` are the patch's top-left pixel offsets.

    Args:
        overlap_ratio: Fraction of ``split_size`` shared by neighbouring patches;
            forwarded to ``start_points``.
        split_size: Patch edge length in pixels.
        data_path: Path of the input image (converted to RGB on load).
        class_path: Path of the matching label image.
        mode: Name of the split sub-folder the patches are written to.

    Raises:
        ValueError: If the label image does not have the same height and width
            as the input image.
    """
    # Join the path components one by one: a literal '\\' is a separator only on
    # Windows, and the output location has to follow `save_root_dir` and `mode`
    # (this copy used to write to a hard-coded '.../labelled/test/...' path).
    split_dir = os.path.join(save_root_dir, 'labelled', mode)
    npy_dir = os.path.join(split_dir, 'npy')
    png_dir = os.path.join(split_dir, 'png')
    png_label_dir = os.path.join(split_dir, 'png_label')
    for out_dir in (npy_dir, png_dir, png_label_dir):
        os.makedirs(out_dir, exist_ok=True)

    root_save_name = Path(data_path).stem

    # The context managers close the image files once the pixels are copied out.
    with Image.open(data_path) as input_image:
        image_array = np.array(input_image.convert('RGB'))
    with Image.open(class_path) as output_label:
        label_array = np.array(output_label)

    img_h, img_w, _ = image_array.shape
    if label_array.shape[:2] != (img_h, img_w):
        raise ValueError(
            f"Label {class_path!r} has shape {label_array.shape[:2]} but image "
            f"{data_path!r} has shape {(img_h, img_w)}"
        )

    # Revise Class Label: values above the three known classes become background
    label_array[label_array > 2] = 0
    unique, total = np.unique(label_array, return_counts=True)
    print(f"Unique Label: {unique} | Total Count: {total}")
    print("=="*30)

    x_points = start_points(img_w, split_size=split_size, overlap=overlap_ratio)
    y_points = start_points(img_h, split_size=split_size, overlap=overlap_ratio)

    for y in y_points:
        for x in x_points:
            splited_input_array = image_array[y: y + split_size, x: x + split_size]
            splited_label_array = label_array[y: y + split_size, x: x + split_size]
            patch_name = f'{root_save_name}_[{x}_{y}]'

            # Save Input & Label Npy
            npy_path = os.path.join(npy_dir, f'{patch_name}.npy')
            np.save(npy_path, {'input': splited_input_array,
                               'label': splited_label_array})
            print(npy_path)

            # Save Input Png
            splited_img = Image.fromarray(splited_input_array)
            splited_img.save(os.path.join(png_dir, f'{patch_name}.png'), format='png')

            # Save Label Png
            splited_label_img = Image.fromarray(decode_segmap(splited_label_array))
            splited_label_img.save(os.path.join(png_label_dir, f'{patch_name}.png'), format='png')

In [None]:
# Make Labelled DataSet with Test
# zip() stops at the shorter list, which would silently drop or misalign image/label pairs.
if len(test_data_list) != len(test_label_list):
    raise ValueError(
        f"test split has {len(test_data_list)} images but {len(test_label_list)} labels"
    )

# zip() has no length, so give tqdm the total to get a real progress bar.
for input_path, label_path in tqdm(zip(test_data_list, test_label_list), total=len(test_data_list)):
    make_labelled_dataset(overlap_ratio=0.75, data_path=input_path, class_path=label_path, mode='test')