In [11]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image
from collections import Counter
import pickle
import random
import shutil
# Seed Python's `random` module so that later calls such as `random.shuffle` give the same result on every run.
random.seed(42)

In [12]:
# --- Configuration -----------------------------------------------------------
base_path = '..'
generate_all_datset_annots = True  # True: use every dataset; False: dataset 1 only
num_folds = 5

# Source folders for the tiles and their annotations.
if generate_all_datset_annots:
    input_imgs_path = f'{base_path}/all_dataset_files/all_dataset_imgs'
    input_annots_path = f'{base_path}/all_dataset_files/all_dataset_annotations'
else:
    input_imgs_path = f'{base_path}/dataset1_files/all_dataset1_imgs'
    input_annots_path = f'{base_path}/dataset1_files/all_dataset1_annotations'
validation_imgs = set()

# --- Per-fold output folders -------------------------------------------------
# WARNING: any existing output folder for a fold is deleted and recreated empty.
for i in range(num_folds):
    if generate_all_datset_annots:
        output_imgs_path = f'{base_path}/all_dataset_files/all_dataset_imgs_margins_{i}'
        temp_input_imgs_path = f'{base_path}/all_dataset_files/all_dataset_imgs_margins_temp_{i}'
        output_annots_path = f'{base_path}/all_dataset_files/all_dataset_annotations_margins_{i}'
    else:
        output_imgs_path = f'{base_path}/dataset1_files/all_dataset1_imgs_margins_{i}'
        temp_input_imgs_path = f'{base_path}/dataset1_files/all_dataset1_imgs_margins_temp_{i}'
        output_annots_path = f'{base_path}/dataset1_files/all_dataset1_annotations_margins_{i}'

    fold_dirs = (output_imgs_path, temp_input_imgs_path, output_annots_path)
    # Remove all stale folders first, then create the fresh ones.
    for fold_dir in fold_dirs:
        if os.path.exists(fold_dir):
            shutil.rmtree(fold_dir)
    for fold_dir in fold_dirs:
        os.mkdir(fold_dir)

In [13]:
def get_count_type(tiles_dict):
  """Count annotations of each type for every tile.

  Args:
    tiles_dict: iterable of tile records; each record is indexed with
      'id' and 'annotations', and every annotation with 'type'.

  Returns:
    A list with one dict per tile, holding the tile 'id' and the number of
    'blood_vessel', 'glomerulus' and 'unsure' annotations.

  Raises:
    KeyError: if an annotation type is not one of the three counted types.
  """
  def _tally(tile):
    counts = {'id': tile['id'], 'blood_vessel': 0, 'glomerulus': 0, 'unsure': 0}
    for annotation in tile['annotations']:
      kind = annotation['type']
      counts[kind] = counts[kind] + 1
    return counts

  return [_tally(tile) for tile in tiles_dict]
def calculate_area(coordinates):
    """Return the area of a polygon using the shoelace formula.

    Args:
        coordinates: indexable sequence of (x, y) vertex pairs, in order
            around the polygon (either orientation).

    Returns:
        The absolute area, or 0 when fewer than three vertices are given.
    """
    vertex_count = len(coordinates)
    if vertex_count < 3:
        return 0

    twice_signed_area = 0
    for idx in range(vertex_count):
        x_cur, y_cur = coordinates[idx]
        # The modulo wraps the last vertex back to the first one, closing the polygon.
        x_next, y_next = coordinates[(idx + 1) % vertex_count]
        twice_signed_area += (x_cur * y_next) - (x_next * y_cur)

    return abs(twice_signed_area / 2)
# helper function for data visualization
def visualize(**images):
    """Plot images in one row.

    Each keyword argument becomes one panel: the keyword name (underscores
    replaced by spaces, title-cased) is the panel title and the value is
    passed to `plt.imshow`.
    """
    panel_count = len(images)
    plt.figure(figsize=(30, 30))
    for position, name in enumerate(images, start=1):
        plt.subplot(1, panel_count, position)
        # Hide the axis ticks; only the picture and its title are of interest.
        plt.xticks([])
        plt.yticks([])
        plt.title(name.replace('_', ' ').title())
        plt.imshow(images[name])
    plt.show()

In [14]:
# polygons.jsonl holds one JSON document per line; read the raw lines first,
# then decode each line into a Python object.
with open(f'{base_path}/polygons.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

tiles_dicts = [json.loads(json_str) for json_str in json_list]

In [15]:
tile_df = pd.read_csv(f'{base_path}/tile_meta.csv')
# Per-tile annotation counts, joined onto the tile metadata by 'id'.
# The inner join keeps only tiles whose id appears in both tables.
annotated_ids_with_info = get_count_type(tiles_dicts)
annotated_ids_with_info_df = pd.DataFrame(annotated_ids_with_info)
tile_df_annotated = tile_df.merge(annotated_ids_with_info_df, on='id', how='inner')
tile_df_annotated.head()

Unnamed: 0,id,source_wsi,dataset,i,j,blood_vessel,glomerulus,unsure
0,0006ff2aa7cd,2,2,16896,16420,8,1,0
1,00168d1b7522,2,2,14848,14884,1,1,0
2,0033bbc76b6b,1,1,10240,43008,3,0,1
3,003504460b3a,3,2,8192,11776,7,0,0
4,004daf1cbe75,3,2,6144,11264,10,1,0


In [16]:
import random
random.seed(42)  # re-seed here so the shuffles below are the same every time this cell runs
tgt_wsis = [1,2,3,4]
wsi_dicts = []
wsi_tile_ids = []
for cur_tgt_wsi in tgt_wsis:
    # Rows of the current WSI; limited to dataset 1 unless all datasets are requested.
    wsi_mask = tile_df_annotated['source_wsi'] == cur_tgt_wsi
    if not generate_all_datset_annots:
        wsi_mask = (tile_df_annotated['dataset'] == 1) & wsi_mask
    wsi_rows = tile_df_annotated.loc[wsi_mask]

    # Lookup tables: (i, j) position -> tile id, and tile id -> (i, j) position.
    pos_dict = {(tile_i, tile_j): tile_id for tile_i, tile_j, tile_id in wsi_rows[['i', 'j', 'id']].values}
    reverse_pos_dict = {tile_id: position for position, tile_id in pos_dict.items()}

    # Shuffled tile ids of this WSI.
    wsi_ids = wsi_rows['id'].tolist()
    random.shuffle(wsi_ids)
    wsi_tile_ids.append(wsi_ids)
    wsi_dicts.append((pos_dict, reverse_pos_dict))

In [18]:
# Sorted tile ids: annotated tiles of dataset 1 only, and annotated tiles of every dataset.
dataset_1_tile_ids = sorted(tile_df_annotated.loc[tile_df_annotated['dataset']==1, 'id'].tolist())
all_dataset_tile_ids = sorted(tile_df_annotated['id'].tolist())

In [17]:
def load_img(img_dir, img_id, is_grayscale=False):
    """Load `<img_dir>/<img_id>.png` as a NumPy array.

    Args:
        img_dir: directory containing the PNG file.
        img_id: file name without the `.png` extension.
        is_grayscale: when true the file is read with `cv2.IMREAD_GRAYSCALE`;
            otherwise it is read with OpenCV's default flag and converted
            from BGR to RGB channel order.

    Returns:
        The decoded image array.

    Raises:
        FileNotFoundError: if OpenCV could not read the file (missing or
            unreadable). `cv2.imread` signals this by returning None rather
            than raising, which previously surfaced later as an unrelated error.
    """
    img_path = f"{img_dir}/{img_id}.png"
    if is_grayscale:
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    else:
        img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image (missing or unreadable): {img_path}")
    if not is_grayscale:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

def create_hor_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, is_top, is_train, margin_size=32, is_mask=False, tile_size=512):
    """Build the top or bottom margin strip of a tile from its three neighbours in the adjacent row.

    The strip is `margin_size` rows high and `tile_size + 2 * margin_size`
    columns wide: a corner from the diagonal-left neighbour, a full-width band
    from the neighbour directly above/below, and a corner from the
    diagonal-right neighbour. Any neighbour that is not in `dataset_pos_dict`
    contributes zeros.

    Args:
        input_imgs_dir: directory neighbours are normally loaded from.
        temp_input_imgs_dir: directory used instead when `is_train` is true and
            the neighbour's id is in the module-level `validation_imgs` set.
        img_id: id of the current tile (not used by this function).
        img_i, img_j: position of the current tile; `i` varies between the
            left/right neighbours and `j` between the rows above/below.
        dataset_pos_dict: mapping from `(i, j)` to tile id.
        is_top: True for the margin above the tile (row at `j - tile_size`),
            False for the margin below it (row at `j + tile_size`).
        is_train: see `temp_input_imgs_dir`.
        margin_size: margin thickness in pixels.
        is_mask: when true, images are loaded as single-channel grayscale;
            otherwise as 3-channel images.
        tile_size: side length of a tile and spacing of neighbouring
            positions (previously hard-coded as 512).

    Returns:
        The horizontally concatenated strip as a uint8 array (assuming the
        loaded tiles are `tile_size` wide and uint8 - TODO confirm).
    """
    is_grayscale = bool(is_mask)

    neighbor_j = img_j - tile_size if is_top else img_j + tile_size
    # Use the rows that actually touch the current tile: the LAST rows of the
    # tiles above, but the FIRST rows of the tiles below. (The bottom margin
    # used to copy the last rows as well, i.e. the far edge of the neighbour.)
    row_slice = slice(-margin_size, None) if is_top else slice(None, margin_size)

    # (neighbour i, strip width, columns of the neighbour that face the tile)
    segments = [
        (img_i - tile_size, margin_size, slice(-margin_size, None)),  # diagonal-left: right-most columns
        (img_i, tile_size, slice(None)),                              # directly above/below: all columns
        (img_i + tile_size, margin_size, slice(None, margin_size)),   # diagonal-right: left-most columns
    ]

    pieces = []
    for neighbor_i, width, col_slice in segments:
        blank_shape = (margin_size, width) if is_grayscale else (margin_size, width, 3)
        piece = np.zeros(blank_shape, dtype='uint8')
        if (neighbor_i, neighbor_j) in dataset_pos_dict:
            cur_img_id = dataset_pos_dict[(neighbor_i, neighbor_j)]
            img_load_dir = temp_input_imgs_dir if (is_train and cur_img_id in validation_imgs) else input_imgs_dir
            cur_img = load_img(img_load_dir, cur_img_id, is_grayscale=is_grayscale)
            piece = cur_img[row_slice, col_slice]
        pieces.append(piece)

    return cv2.hconcat(pieces)

def create_vert_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, is_left, is_train, margin_size=32, is_mask=False, tile_size=512):
    """Build the left or right margin strip of a tile from its horizontal neighbour.

    The strip is `tile_size` rows high and `margin_size` columns wide. If the
    neighbour is not in `dataset_pos_dict` the strip is all zeros.

    Args:
        input_imgs_dir: directory the neighbour is normally loaded from.
        temp_input_imgs_dir: directory used instead when `is_train` is true and
            the neighbour's id is in the module-level `validation_imgs` set.
        img_id: id of the current tile (not used by this function).
        img_i, img_j: position of the current tile.
        dataset_pos_dict: mapping from `(i, j)` to tile id.
        is_left: True for the margin left of the tile (neighbour at
            `i - tile_size`), False for the margin right of it (`i + tile_size`).
        is_train: see `temp_input_imgs_dir`.
        margin_size: margin thickness in pixels.
        is_mask: when true, images are loaded as single-channel grayscale;
            otherwise as 3-channel images.
        tile_size: side length of a tile and spacing of neighbouring
            positions (previously hard-coded as 512).

    Returns:
        The margin strip as an array (a slice of the neighbour image, or zeros).
    """
    is_grayscale = bool(is_mask)

    neighbor_i = img_i - tile_size if is_left else img_i + tile_size
    neighbor_j = img_j
    # Use the columns that actually touch the current tile: the RIGHT-most
    # columns of the left neighbour, but the LEFT-most columns of the right
    # neighbour. (The right margin used to copy the right-most columns too.)
    col_slice = slice(-margin_size, None) if is_left else slice(None, margin_size)

    blank_shape = (tile_size, margin_size) if is_grayscale else (tile_size, margin_size, 3)
    middle_piece = np.zeros(blank_shape, dtype='uint8')
    if (neighbor_i, neighbor_j) in dataset_pos_dict:
        cur_img_id = dataset_pos_dict[(neighbor_i, neighbor_j)]
        img_load_dir = temp_input_imgs_dir if (is_train and cur_img_id in validation_imgs) else input_imgs_dir
        cur_img = load_img(img_load_dir, cur_img_id, is_grayscale=is_grayscale)
        middle_piece = cur_img[:, col_slice]
    return middle_piece

def get_layered_img(input_imgs_dir, temp_input_imgs_dir, img, img_id, img_i, img_j, dataset_pos_dict, is_train, margin_size=32, is_mask=False):
    """Surround a tile with margins taken from its neighbouring tiles.

    The top and bottom strips come from `create_hor_margins`, the left and
    right strips from `create_vert_margins`; all arguments except `img` are
    forwarded to those helpers unchanged.

    Args:
        input_imgs_dir, temp_input_imgs_dir: directories neighbours are loaded from.
        img: the already-loaded current tile (centre of the result).
        img_id: id of the current tile.
        img_i, img_j: position of the current tile.
        dataset_pos_dict: mapping from `(i, j)` to tile id.
        is_train: forwarded to the margin helpers.
        margin_size: margin thickness in pixels.
        is_mask: forwarded to the margin helpers (grayscale vs. 3-channel).

    Returns:
        `img` with the left/right margins attached horizontally and the
        top/bottom margins attached vertically. `img` must match the margins
        in height, channel count and dtype for the OpenCV concatenations to
        succeed.
    """
    top_margin = create_hor_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, True, is_train, margin_size=margin_size, is_mask=is_mask)
    bottom_margin = create_hor_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, False, is_train, margin_size=margin_size, is_mask=is_mask)
    left_margin = create_vert_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, True, is_train, margin_size=margin_size, is_mask=is_mask)
    right_margin = create_vert_margins(input_imgs_dir, temp_input_imgs_dir, img_id, img_i, img_j, dataset_pos_dict, False, is_train, margin_size=margin_size, is_mask=is_mask)

    # Middle row first (left | tile | right), then stack top / middle / bottom.
    middle_piece = cv2.hconcat([left_margin, img, right_margin])
    return cv2.vconcat([top_margin, middle_piece, bottom_margin])

In [24]:
import math
# Tiles per fold, rounded up so that `num_folds` folds cover every tile.
fold_size = math.ceil(len(all_dataset_tile_ids) / num_folds)
fold_size

327

In [20]:
i=0  # index of the fold whose validation slice is built below
# `num_wsis` is not assigned anywhere in the visible notebook, which makes this
# cell fail with NameError on a fresh run; derive it from the per-WSI id lists.
num_wsis = len(wsi_tile_ids)
# Each WSI contributes an equal share of every validation fold.
per_wsi_fold_size = fold_size / num_wsis
cur_validaton_slice = []
for wsi_tile_id in wsi_tile_ids:
  # Take this WSI's i-th window of its shuffled tile ids (clamped to the list length).
  cur_validaton_slice += wsi_tile_id[int(i*per_wsi_fold_size):min(int((i+1)*per_wsi_fold_size), len(wsi_tile_id))]
# Everything that is not in the validation slice is used for training.
cur_training_slice = list(set(dataset_1_tile_ids) - set(cur_validaton_slice)) if not generate_all_datset_annots else list(set(all_dataset_tile_ids) - set(cur_validaton_slice))
# TODO: unfinished per-WSI loop kept from the original cell.
# for idx, cur_tgt_wsi in enumerate(tgt_wsis):
#     pos_dict, reverse_pos_dict = wsi_dicts[idx]
#     for img_id in reverse_pos_dict.keys():
print(cur_validaton_slice)

NameError: name 'fold_size' is not defined