In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image
from collections import Counter
import pickle
import shutil

In [2]:
# --- Run configuration -------------------------------------------------------
base_path = '..'
num_folds = 5
generate_masks = False
should_dilate = False
use_merged_dataset = False
generate_all_datset_annots = True
get_whole_dataset_into_fold = False
pick_validation_from_file = False

# Which dataset folder / file-name prefix to work on.
if generate_all_datset_annots:
    base_data_dir = 'all_dataset_files'
    base_data_name = 'all_dataset'
else:
    base_data_dir = 'dataset1_files'
    base_data_name = 'all_dataset1'

# Input locations differ between the merged and the plain dataset variants.
if use_merged_dataset:
    input_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs_merged'
    input_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_cleaned'
else:
    input_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs'
    input_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations'
vis_output_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_vis'


def _recreate_dir(dir_path):
    """Remove ``dir_path`` with everything in it (if present) and create it empty."""
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    os.mkdir(dir_path)


# WARNING: everything below wipes previously generated output directories.
_recreate_dir(vis_output_imgs_dir)

for i in range(num_folds):
    train_modes = ['train', 'validation']
    for train_mode in train_modes:
        if use_merged_dataset:
            output_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs_merged_{train_mode}_{i}'
            output_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_{train_mode}_{i}'
            output_masks_dir = f'{base_path}/{base_data_dir}/{base_data_name}_masks_merged_{train_mode}_{i}'
        else:
            output_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs_{train_mode}_{i}'
            output_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_{train_mode}_{i}'
            output_masks_dir = f'{base_path}/{base_data_dir}/{base_data_name}_masks_{train_mode}_{i}'
        _recreate_dir(output_imgs_dir)
        _recreate_dir(output_annots_dir)
        # Mask folders are only needed when masks will actually be generated.
        if generate_masks:
            _recreate_dir(output_masks_dir)

In [3]:
def get_count_type(tiles_dict):
  """Count annotations per type for every tile.

  Args:
    tiles_dict: Iterable of tile records, each with an ``'id'`` and a list of
      ``'annotations'`` whose items carry a ``'type'`` key.

  Returns:
    A list with one dict per tile: ``'id'`` plus the number of
    ``'blood_vessel'``, ``'glomerulus'`` and ``'unsure'`` annotations.

  Raises:
    KeyError: If an annotation has a type other than the three above.
  """
  def _tally(tile):
    counts = dict(id=tile['id'], blood_vessel=0, glomerulus=0, unsure=0)
    for annotation in tile['annotations']:
      label = annotation['type']
      counts[label] = counts[label] + 1
    return counts

  return [_tally(tile) for tile in tiles_dict]

# helper function for data visualization
def visualize(**images):
    """Show the given images side by side in a single row.

    Every keyword argument becomes one panel: the keyword, with underscores
    replaced by spaces and title-cased, is used as the panel title and the
    value is handed to ``plt.imshow``. Axis ticks are hidden.
    """
    panel_count = len(images)
    plt.figure(figsize=(30, 30))
    for position, label in enumerate(images, start=1):
        plt.subplot(1, panel_count, position)
        plt.xticks([])
        plt.yticks([])
        plt.title(label.replace('_', ' ').title())
        plt.imshow(images[label])
    plt.show()

In [4]:
# polygons.jsonl holds one JSON record per line; keep the raw lines and the
# parsed records side by side.
with open(f'{base_path}/polygons.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

tiles_dicts = [json.loads(json_str) for json_str in json_list]

In [5]:
# Tile metadata as shipped in tile_meta.csv.
tile_df = pd.read_csv(f'{base_path}/tile_meta.csv')
# Per-tile annotation counts (by type) for the tiles listed in polygons.jsonl.
annotated_ids_with_info = get_count_type(tiles_dicts)
annotated_ids_with_info_df = pd.DataFrame(annotated_ids_with_info)
# Inner join on the tile id: only tiles with both metadata and annotations remain.
tile_df_annotated = tile_df.merge(annotated_ids_with_info_df, how='inner', on='id')
tile_df_annotated.head()

Unnamed: 0,id,source_wsi,dataset,i,j,blood_vessel,glomerulus,unsure
0,0006ff2aa7cd,2,2,16896,16420,8,1,0
1,00168d1b7522,2,2,14848,14884,1,1,0
2,0033bbc76b6b,1,1,10240,43008,3,0,1
3,003504460b3a,3,2,8192,11776,7,0,0
4,004daf1cbe75,3,2,6144,11264,10,1,0


In [6]:
import math
def load_img(img_dir, img_id):
  """Read ``{img_dir}/{img_id}.png`` and return it in RGB channel order.

  Args:
    img_dir: Directory that contains the PNG file.
    img_id: File name without the ``.png`` extension.

  Returns:
    The array produced by ``cv2.imread`` with its channels converted from
    BGR to RGB.

  Raises:
    FileNotFoundError: If OpenCV could not read the file (missing or
      undecodable).
  """
  img_path = f"{img_dir}/{img_id}.png"
  img = cv2.imread(img_path)
  # cv2.imread reports failure by returning None rather than raising, which
  # would otherwise surface as an opaque cvtColor error.
  if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
  return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def add_border(image, border_size, border_color):
    """Place ``image`` on a larger black canvas and draw a frame on the canvas edge.

    Args:
        image: Array whose first two dimensions are height and width; it is
            pasted into a 3-channel uint8 canvas.
        border_size: Margin, in pixels, added on every side; also used as the
            line thickness of the frame rectangle.
        border_color: Colour passed to ``cv2.rectangle`` for the frame.

    Returns:
        The new uint8 canvas of shape
        ``(height + 2 * border_size, width + 2 * border_size, 3)``.
    """
    src_h, src_w = image.shape[:2]
    canvas_h = src_h + 2 * border_size
    canvas_w = src_w + 2 * border_size
    canvas = np.zeros((canvas_h, canvas_w, 3), dtype=np.uint8)
    # Paste the original pixels into the centre of the canvas.
    rows = slice(border_size, border_size + src_h)
    cols = slice(border_size, border_size + src_w)
    canvas[rows, cols] = image
    # Outline along the outermost pixels of the canvas.
    cv2.rectangle(canvas, (0, 0), (canvas_w - 1, canvas_h - 1), border_color, border_size)
    return canvas

def create_coco_annots(all_coords, pair_format=True):
  """Turn polygons into COCO-style annotation dicts.

  Args:
    all_coords: Iterable of polygons. With ``pair_format=True`` each polygon
      is a sequence of ``[x, y]`` pairs; otherwise it is a flat
      ``[x0, y0, x1, y1, ...]`` sequence.
    pair_format: Whether the polygons are already given as point pairs.

  Returns:
    One dict per polygon with ``'segmentation'`` (a single flattened point
    list), ``'bbox'`` as ``[min_x, min_y, width, height]`` and a
    ``'category_id'`` that is always 0.
  """
  def _to_annotation(points):
    if not pair_format:
      points = [[x, y] for x, y in zip(points[::2], points[1::2])]
    flattened = []
    for pair in points:
      flattened.extend(pair)
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    left, top = min(xs), min(ys)
    right, bottom = max(xs), max(ys)
    return {
      'segmentation': [flattened],
      'bbox': [left, top, right - left, bottom - top],
      'category_id': 0
    }

  return [_to_annotation(polygon) for polygon in all_coords]

def get_annotated_img(imgs_dir, annots_dir, base_img_id, is_yolo=False):
  """Render a tile with its annotations drawn on top of it.

  Loads ``{imgs_dir}/{base_img_id}.png`` through ``load_img`` and the pickled
  annotation list ``{annots_dir}/{base_img_id}.pkl``. For every annotation the
  first segmentation polygon is filled with (0, 255, 0) and its ``'bbox'``
  (``[x, y, width, height]``) is outlined with (255, 0, 0); the result is
  framed with ``add_border``.

  Args:
    imgs_dir: Directory holding the tile PNG.
    annots_dir: Directory holding the annotation pickle.
    base_img_id: File stem shared by the PNG and the pickle.
    is_yolo: Unused; kept so existing calls keep working.

  Returns:
    The annotated image returned by ``add_border``.
  """
  annotated_image_coco = load_img(imgs_dir, base_img_id)
  # NOTE: pickle.load can execute arbitrary code; only open trusted files.
  with open(f'{annots_dir}/{base_img_id}.pkl', 'rb') as f:
    tgt_annots = pickle.load(f)
  for tgt_annot in tgt_annots:
    flat_points = tgt_annot['segmentation'][0]
    # cv2.fillPoly expects integer (int32) vertices; the explicit cast also
    # covers annotations stored with float coordinates.
    polygon = np.array(list(zip(flat_points[::2], flat_points[1::2])), dtype=np.int32)
    cv2.fillPoly(annotated_image_coco, pts=[polygon], color=(0,255,0))
    box_x, box_y, box_w, box_h = tgt_annot['bbox']
    top_left = (int(box_x), int(box_y))
    bottom_right = (int(box_x + box_w), int(box_y + box_h))
    annotated_image_coco = cv2.rectangle(annotated_image_coco, top_left, bottom_right, (255,0,0), 2)
  annotated_image_coco = add_border(annotated_image_coco, 5, (0,0,255))
  return annotated_image_coco

In [7]:
import random
# Fixed seed so the shuffles below (and therefore the folds) are reproducible.
random.seed(42)
is_dataset_1 = tile_df_annotated['dataset'] == 1
# Ids are sorted first so the seeded shuffle does not depend on row order.
dataset_1_tile_ids = sorted(tile_df_annotated.loc[is_dataset_1, 'id'].values)
all_dataset_tile_ids = sorted(tile_df_annotated['id'].values)
tgt_wsis = [1, 2, 3, 4]
wsi_tile_ids = []
for tgt_wsi in tgt_wsis:
    in_wsi = tile_df_annotated['source_wsi'] == tgt_wsi
    if not generate_all_datset_annots:
        # Restrict the per-WSI lists to dataset 1 when only that dataset is used.
        in_wsi = is_dataset_1 & in_wsi
    wsi_ids = list(tile_df_annotated.loc[in_wsi, 'id'].values)
    random.shuffle(wsi_ids)
    wsi_tile_ids.append(wsi_ids)
# The order of these shuffle calls determines the resulting permutations.
random.shuffle(dataset_1_tile_ids)
random.shuffle(all_dataset_tile_ids)

In [8]:
# Sanity check: number of annotated tile ids overall, number from dataset 1,
# and number of per-WSI id lists.
len(all_dataset_tile_ids), len(dataset_1_tile_ids), len(wsi_tile_ids)

(1633, 422, 4)

In [9]:
# Peek at the first ten dataset-1 tile ids after the seeded shuffle.
dataset_1_tile_ids[:10]

['61f8b27a48e6',
 '44c66d51c40e',
 '3e01bb974a1b',
 'e3c41ad419a2',
 '7e97ecdac5d2',
 'a373ae26f4f0',
 'fc6def641612',
 '4ca084aec87b',
 '936f9bfb3966',
 'da774b6f9cd1']

In [10]:
# Show the input directories resolved from the configuration cell.
input_imgs_dir, input_annots_dir

('../all_dataset_files/all_dataset_imgs',
 '../all_dataset_files/all_dataset_annotations')

In [11]:
# Populate the per-fold train/validation directories created by the config cell.
#
# Validation tiles are always sliced from the shuffled dataset-1 ids (so the
# fold size is derived from dataset 1 only); training tiles are every other id
# in the selected pool (all datasets when generate_all_datset_annots is set).
fold_size = int(math.ceil(len(dataset_1_tile_ids) / num_folds))
print(f'Fold size is {fold_size}')
# Not used in this cell; kept because it is part of the notebook's shared state.
shifting_thresholds = [0.15,0.30,0.45,0.60,0.75,0.90]

dataset_prefix = f'{base_path}/{base_data_dir}/{base_data_name}'
merged_suffix = '_merged' if use_merged_dataset else ''
training_pool = all_dataset_tile_ids if generate_all_datset_annots else dataset_1_tile_ids


def _copy_tiles(tile_ids, imgs_dst_dir, annots_dst_dir):
  """Copy each tile's PNG and annotation pickle from the input dirs into a fold's dirs."""
  for tile_id in tile_ids:
    shutil.copy(f'{input_imgs_dir}/{tile_id}.png', f'{imgs_dst_dir}/{tile_id}.png')
    shutil.copy(f'{input_annots_dir}/{tile_id}.pkl', f'{annots_dst_dir}/{tile_id}.pkl')


for i in range(num_folds):
  validation_imgs_dir = f'{dataset_prefix}_imgs{merged_suffix}_validation_{i}'
  validation_annots_dir = f'{dataset_prefix}_annotations{merged_suffix}_validation_{i}'
  train_imgs_dir = f'{dataset_prefix}_imgs{merged_suffix}_train_{i}'
  train_annots_dir = f'{dataset_prefix}_annotations{merged_suffix}_train_{i}'

  # NOTE: the validation variable keeps its historical spelling
  # (`cur_validaton_slice`) because other cells use that name.
  if pick_validation_from_file:
    # Previously this branch stored the ids under a differently spelled name,
    # so the copy step below never saw them.
    with open(f'{base_path}/{base_data_dir}/validation_img_ids_fold_{i}.pkl', 'rb') as f:
      cur_validaton_slice = pickle.load(f)
    with open(f'{base_path}/{base_data_dir}/train_img_ids_fold_{i}.pkl', 'rb') as f:
      cur_training_slice = pickle.load(f)
    print(len(cur_validaton_slice), len(cur_training_slice))
  elif get_whole_dataset_into_fold:
    # Every tile of the target WSIs goes into both splits.
    # NOTE(review): this used to reference `wsi_1_ids + wsi_2_ids`, which are
    # not defined in this notebook; the flattened `wsi_tile_ids` is assumed to
    # be the intended replacement — confirm.
    whole_dataset_ids = [tile_id for wsi_ids in wsi_tile_ids for tile_id in wsi_ids]
    cur_validaton_slice = list(whole_dataset_ids)
    cur_training_slice = list(whole_dataset_ids)
  else:
    print('Creating validation from wsi tiles...')
    # Slicing clamps at the end of the list, so the last fold may be shorter.
    cur_validaton_slice = dataset_1_tile_ids[i * fold_size:(i + 1) * fold_size]
    held_out = set(cur_validaton_slice)
    # Order-preserving, de-duplicated difference so the split is deterministic.
    cur_training_slice = [tile_id for tile_id in dict.fromkeys(training_pool) if tile_id not in held_out]
    print(len(cur_validaton_slice), len(cur_training_slice))

  print('Saving validation images and annotations...')
  _copy_tiles(cur_validaton_slice, validation_imgs_dir, validation_annots_dir)

  print('Saving training images and annotations...')
  _copy_tiles(cur_training_slice, train_imgs_dir, train_annots_dir)

Fold size is 85
Creating validation from wsi tiles...
85 1548
Saving validation images and annotations...
Saving training images and annotations...
Creating validation from wsi tiles...
85 1548
Saving validation images and annotations...
Saving training images and annotations...
Creating validation from wsi tiles...
85 1548
Saving validation images and annotations...
Saving training images and annotations...
Creating validation from wsi tiles...
85 1548
Saving validation images and annotations...
Saving training images and annotations...
Creating validation from wsi tiles...
82 1551
Saving validation images and annotations...
Saving training images and annotations...


In [None]:
# Inspect fold 0: how many training tiles come from each WSI, and which
# category ids occur in the training annotations.
i = 0
if use_merged_dataset:
    validation_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_validation_{i}'
    train_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_train_{i}'
else:
    validation_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_validation_{i}'
    train_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_train_{i}'
files = os.listdir(train_annots_dir)
wsi_counts = [0 for _ in wsi_tile_ids]
categories = set()
for fi in files:
    with open(f'{train_annots_dir}/{fi}', 'rb') as f:
        val_annots = pickle.load(f)
    categories = categories | {ann['category_id'] for ann in val_annots}
    img_id = fi.split('.pkl')[0]
    # A tile is counted for every WSI list that contains its id.
    for idx, wsi_tile_id in enumerate(wsi_tile_ids):
        wsi_counts[idx] += int(img_id in wsi_tile_id)
wsi_counts, categories

In [None]:
# Alternative builder for a single fold (fold 0): the validation set is taken
# evenly from every WSI list, and the training copy step can optionally dilate
# the annotation polygons (should_dilate) and copy the shifted tiles of the
# merged dataset (use_merged_dataset).
# NOTE(review): this cell uses `dataset_1`, `get_adjacent_tiles`,
# `calculate_area`, `wsi_1_ids` and `wsi_2_ids`, none of which are defined in
# the cells visible in this notebook — confirm they exist before running it on
# a fresh kernel.
fold_size = int(math.ceil(len(dataset_1_tile_ids) / num_folds))
print(f'Fold size is {fold_size}')
# Suffixes of the pre-generated shifted tiles ("<id>_<shift>_<threshold>").
shifting_thresholds = [0.15,0.30,0.45,0.60,0.75,0.90]
# NOTE(review): prints the fold size a second time.
print(f'Fold size: {fold_size}')
for i in range(1):
  validation_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs_merged_validation_{i}' if use_merged_dataset else f'{base_path}/{base_data_dir}/{base_data_name}_imgs_validation_{i}'
  validation_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_validation_{i}' if use_merged_dataset else f'{base_path}/{base_data_dir}/{base_data_name}_annotations_validation_{i}'
  train_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_imgs_merged_train_{i}' if use_merged_dataset else f'{base_path}/{base_data_dir}/{base_data_name}_imgs_train_{i}'
  train_annots_dir = f'{base_path}/{base_data_dir}/{base_data_name}_annotations_merged_train_{i}' if use_merged_dataset else f'{base_path}/{base_data_dir}/{base_data_name}_annotations_train_{i}'
  
  if pick_validation_from_file:
    # Validation ids come from two pickles, each indexed by fold number;
    # the training set is every other dataset-1 tile.
    with open(f'{base_path}/dataset1_files/validation_wsi_1_fold_img_ids.pkl', 'rb') as f:
        wsi_1_validation_img_ids = pickle.load(f)
    with open(f'{base_path}/dataset1_files/validation_wsi_2_fold_img_ids.pkl', 'rb') as f:
        wsi_2_validation_img_ids = pickle.load(f)
    wsi_validation_img_ids = wsi_1_validation_img_ids[i] + wsi_2_validation_img_ids[i]
    cur_validaton_slice = wsi_validation_img_ids
    cur_training_slice = list(set(dataset_1_tile_ids) - set(cur_validaton_slice))
    print(len(cur_validaton_slice), len(cur_training_slice))
    print(cur_validaton_slice, cur_training_slice)
  else:
#     cur_validaton_slice = dataset_1_tile_ids[int(i*fold_size):min(int((i+1)*fold_size), len(dataset_1_tile_ids))]
#     cur_training_slice = dataset_1_tile_ids[:int(i*fold_size)] + dataset_1_tile_ids[min(int((i+1)*fold_size), len(dataset_1_tile_ids)):]
    if get_whole_dataset_into_fold:
        # NOTE(review): wsi_1_ids / wsi_2_ids are not defined in the visible cells.
        cur_validaton_slice = wsi_1_ids + wsi_2_ids
        cur_training_slice = wsi_1_ids + wsi_2_ids
    else:
#         cur_validaton_slice = wsi_1_ids[int(i*(fold_size/2)):min(int((i+1)*(fold_size/2)), len(wsi_1_ids))] + wsi_2_ids[int(i*(fold_size/2)):min(int((i+1)*(fold_size/2)), len(wsi_2_ids))]
#         cur_training_slice = list(set(dataset_1_tile_ids) - set(cur_validaton_slice)) if not generate_all_datset_annots else list(set(all_dataset_tile_ids) - set(cur_validaton_slice))
        # Temporary changes below
        # Take an equal share (fold_size / number of WSIs) of the i-th fold
        # from each WSI's shuffled id list.
        cur_validaton_slice = []
        num_wsis = float(len(wsi_tile_ids))
        for wsi_tile_id in wsi_tile_ids:
            cur_validaton_slice += wsi_tile_id[int(i*(fold_size/num_wsis)):min(int((i+1)*(fold_size/num_wsis)), len(wsi_tile_id))]
        cur_training_slice = list(set(dataset_1_tile_ids) - set(cur_validaton_slice)) if not generate_all_datset_annots else list(set(all_dataset_tile_ids) - set(cur_validaton_slice))
  
  print('Saving validation images and annotations...')
  for validation_img_id in cur_validaton_slice:
      shutil.copy(f'{input_imgs_dir}/{validation_img_id}.png', f'{validation_imgs_dir}/{validation_img_id}.png')
      shutil.copy(f'{input_annots_dir}/{validation_img_id}.pkl', f'{validation_annots_dir}/{validation_img_id}.pkl')
  
  # Disabled: variant that also copied the shifted tiles into the validation set.
#   print('Creating adjacent tile dict for the current validation set...')
#   dataset_1_wsi_1_pos_dict = {(x[0], x[1]): x[-1] for x in dataset_1.loc[(dataset_1['source_wsi']==1) & (dataset_1['id'].isin(cur_validaton_slice)), ['i', 'j', 'id']].values}
#   dataset_1_wsi_2_pos_dict = {(x[0], x[1]): x[-1] for x in dataset_1.loc[(dataset_1['source_wsi']==2) & (dataset_1['id'].isin(cur_validaton_slice)), ['i', 'j', 'id']].values}
#   reverse_dataset_1_wsi_1_pos_dict = {v:k for k,v in dataset_1_wsi_1_pos_dict.items()}
#   reverse_dataset_1_wsi_2_pos_dict = {v:k for k,v in dataset_1_wsi_2_pos_dict.items()}
#   adjacent_tile_dict = dict()
#   for i,j in dataset_1_wsi_1_pos_dict.keys():
#     adjacent_tile_dict = get_adjacent_tiles(i, j, dataset_1_wsi_1_pos_dict[(i,j)], adjacent_tile_dict, dataset_1_wsi_1_pos_dict)
#   for i,j in dataset_1_wsi_2_pos_dict.keys():
#     adjacent_tile_dict = get_adjacent_tiles(i, j, dataset_1_wsi_2_pos_dict[(i,j)], adjacent_tile_dict, dataset_1_wsi_2_pos_dict)
  
#   print('Saving validation images and annotations...')
#   for validation_img_id in adjacent_tile_dict.keys():
#     shutil.copy(f'{input_imgs_dir}/{validation_img_id}.png', f'{validation_imgs_dir}/{validation_img_id}.png')
#     shutil.copy(f'{input_annots_dir}/{validation_img_id}.pkl', f'{validation_annots_dir}/{validation_img_id}.pkl')
#     validation_adjacent_tiles = adjacent_tile_dict[validation_img_id]
#     valid_shifts = []
#     if validation_adjacent_tiles['R'] is not None:
#       valid_shifts.append('R')
#     if validation_adjacent_tiles['B'] is not None:
#       valid_shifts.append('B')
#     if validation_adjacent_tiles['B'] is not None and validation_adjacent_tiles['R'] is not None and validation_adjacent_tiles['BR'] is not None:
#       valid_shifts.append('BR')
#     if validation_adjacent_tiles['B'] is not None and validation_adjacent_tiles['L'] is not None and validation_adjacent_tiles['BL'] is not None:
#       valid_shifts.append('BL')
#     for valid_shift in valid_shifts:
#       for shifting_threshold in shifting_thresholds:
#         shutil.copy(f'{input_imgs_dir}/{validation_img_id}_{valid_shift}_{shifting_threshold}.png', f'{validation_imgs_dir}/{validation_img_id}_{valid_shift}_{shifting_threshold}.png')
#         shutil.copy(f'{input_annots_dir}/{validation_img_id}_{valid_shift}_{shifting_threshold}.pkl', f'{validation_annots_dir}/{validation_img_id}_{valid_shift}_{shifting_threshold}.pkl')
  
  print('Creating adjacent tile dict for the current training set...')
  # Map (i, j) tile position -> tile id for the training tiles of WSI 1 and WSI 2.
  # NOTE(review): `dataset_1` is not defined in the visible cells; presumably a
  # DataFrame with 'source_wsi', 'id', 'i' and 'j' columns — confirm.
  dataset_1_wsi_1_pos_dict = {(x[0], x[1]): x[-1] for x in dataset_1.loc[(dataset_1['source_wsi']==1) & (dataset_1['id'].isin(cur_training_slice)), ['i', 'j', 'id']].values}
  dataset_1_wsi_2_pos_dict = {(x[0], x[1]): x[-1] for x in dataset_1.loc[(dataset_1['source_wsi']==2) & (dataset_1['id'].isin(cur_training_slice)), ['i', 'j', 'id']].values}
  reverse_dataset_1_wsi_1_pos_dict = {v:k for k,v in dataset_1_wsi_1_pos_dict.items()}
  reverse_dataset_1_wsi_2_pos_dict = {v:k for k,v in dataset_1_wsi_2_pos_dict.items()}
  adjacent_tile_dict = dict()
  # NOTE(review): the loops below rebind `i`, the fold index of the enclosing
  # loop; the fold directory names were already built above, so they are not
  # affected.
  for i,j in dataset_1_wsi_1_pos_dict.keys():
    adjacent_tile_dict = get_adjacent_tiles(i, j, dataset_1_wsi_1_pos_dict[(i,j)], adjacent_tile_dict, dataset_1_wsi_1_pos_dict)
  for i,j in dataset_1_wsi_2_pos_dict.keys():
    adjacent_tile_dict = get_adjacent_tiles(i, j, dataset_1_wsi_2_pos_dict[(i,j)], adjacent_tile_dict, dataset_1_wsi_2_pos_dict)
  
  print('Saving training images and annotations...')
  for train_img_id in cur_training_slice:
    shutil.copy(f'{input_imgs_dir}/{train_img_id}.png', f'{train_imgs_dir}/{train_img_id}.png')
    if should_dilate:
        # Re-write the annotations with every polygon dilated.
        with open(f'{input_annots_dir}/{train_img_id}.pkl', 'rb') as f:
            annotations = pickle.load(f)
        cur_tile_coco_annots = []
        for annot in annotations:
            coords = [[x,y] for x,y in zip(annot['segmentation'][0][::2], annot['segmentation'][0][1::2])]
            # assumes 512x512 tiles — TODO confirm
            mask = np.zeros((512,512), dtype=np.uint8)
            cv2.fillPoly(mask, pts=[np.array(coords)], color=1)
            # Two dilation passes with a 3x3 rectangular kernel.
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            dilated_mask = cv2.dilate(mask, kernel, iterations=2)
            contours, _ = cv2.findContours(dilated_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            # Only the first external contour is kept as the new polygon.
            dilated_coords = contours[0].reshape(-1, 2)
            coords = dilated_coords
            segmentations = [[pt for pair in coords for pt in pair]]
            # NOTE(review): calculate_area is not defined in the visible cells.
            segmentation_area = calculate_area(coords)
            min_x = min(coords, key=lambda x: x[0])[0]
            max_x = max(coords, key=lambda x: x[0])[0]
            min_y = min(coords, key=lambda x: x[1])[1]
            max_y = max(coords, key=lambda x: x[1])[1]
            segmentation_bbox = [min_x, min_y, max_x-min_x, max_y-min_y]
            category_id = annot['category_id']
            cur_tile_coco_annots.append({
              'segmentation': segmentations,
              'area': segmentation_area,
              'bbox': segmentation_bbox,
              'category_id': category_id
            })
        with open(f'{train_annots_dir}/{train_img_id}.pkl', 'wb') as f:
            pickle.dump(cur_tile_coco_annots, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        shutil.copy(f'{input_annots_dir}/{train_img_id}.pkl', f'{train_annots_dir}/{train_img_id}.pkl')
    if use_merged_dataset:
        # Copy the shifted tiles as well, but only for shift directions whose
        # neighbouring tiles are present in adjacent_tile_dict.
        train_adjacent_tiles = adjacent_tile_dict[train_img_id]
        valid_shifts = []
        if train_adjacent_tiles['R'] is not None:
          valid_shifts.append('R')
        if train_adjacent_tiles['B'] is not None:
          valid_shifts.append('B')
        if train_adjacent_tiles['B'] is not None and train_adjacent_tiles['R'] is not None and train_adjacent_tiles['BR'] is not None:
          valid_shifts.append('BR')
        if train_adjacent_tiles['B'] is not None and train_adjacent_tiles['L'] is not None and train_adjacent_tiles['BL'] is not None:
          valid_shifts.append('BL')
        for valid_shift in valid_shifts:
          for shifting_threshold in shifting_thresholds:
            shutil.copy(f'{input_imgs_dir}/{train_img_id}_{valid_shift}_{shifting_threshold}.png', f'{train_imgs_dir}/{train_img_id}_{valid_shift}_{shifting_threshold}.png')
            shutil.copy(f'{input_annots_dir}/{train_img_id}_{valid_shift}_{shifting_threshold}.pkl', f'{train_annots_dir}/{train_img_id}_{valid_shift}_{shifting_threshold}.pkl')

In [None]:
# Split the fold-0 validation images of dataset 1 into "listed in wsi_1_ids"
# versus everything else.
validation_imgs = os.listdir(f'{base_path}/dataset1_files/all_dataset1_imgs_validation_0')
wsi_1_validation_imgs = sum(1 for img in validation_imgs if img.split('.png')[0] in wsi_1_ids)
wsi_2_validation_imgs = len(validation_imgs) - wsi_1_validation_imgs
print(wsi_1_validation_imgs, wsi_2_validation_imgs)

In [None]:
def load_img(img_dir, img_id):
  """Read ``{img_dir}/{img_id}.png`` and return it in RGB channel order.

  NOTE: a function with the same name is already defined in an earlier cell;
  this definition shadows it once this cell runs.

  Args:
    img_dir: Directory that contains the PNG file.
    img_id: File name without the ``.png`` extension.

  Returns:
    The array produced by ``cv2.imread`` with its channels converted from
    BGR to RGB.

  Raises:
    FileNotFoundError: If OpenCV could not read the file (missing or
      undecodable).
  """
  img_path = f"{img_dir}/{img_id}.png"
  img = cv2.imread(img_path)
  # cv2.imread reports failure by returning None rather than raising, which
  # would otherwise surface as an opaque cvtColor error.
  if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")
  return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
def get_annotated_mask(imgs_dir, annots_dir, base_img_id):
  """Rasterise a tile's annotation polygons into a single mask.

  The tile image is loaded only to obtain its height and width. Every
  annotation's first segmentation polygon from
  ``{annots_dir}/{base_img_id}.pkl`` is filled with the value 1 on a zero
  array of that size (NumPy's default float dtype).

  Args:
    imgs_dir: Directory holding the tile PNG.
    annots_dir: Directory holding the annotation pickle.
    base_img_id: File stem shared by the PNG and the pickle.

  Returns:
    The ``(height, width)`` mask array.
  """
  tile_rgb = load_img(imgs_dir, base_img_id)
  mask_height, mask_width, _ = tile_rgb.shape
  with open(f'{annots_dir}/{base_img_id}.pkl', 'rb') as f:
    tgt_annots = pickle.load(f)
  img_mask = np.zeros((mask_height, mask_width))
  for tgt_annot in tgt_annots:
    flat_points = tgt_annot['segmentation'][0]
    polygon = np.array([[px, py] for px, py in zip(flat_points[0::2], flat_points[1::2])])
    cv2.fillPoly(img_mask, pts=[polygon], color=1)
  return img_mask

In [None]:
import time
# Write one mask PNG per tile for fold 0 (train and validation).
# The directory names are derived from the same configuration values the
# config cell uses when it creates the mask folders; the previous hard-coded
# 'dataset1_files/all_dataset1_*_merged_*' paths only matched one specific
# configuration.
if generate_masks:
  train_modes = ['train', 'validation']
  dataset_prefix = f'{base_path}/{base_data_dir}/{base_data_name}'
  merged_suffix = '_merged' if use_merged_dataset else ''
  for i in range(1):
    start_time = time.time()
    for train_mode in train_modes:
      orig_img_dir = f'{dataset_prefix}_imgs{merged_suffix}_{train_mode}_{i}'
      orig_annots_dir = f'{dataset_prefix}_annotations{merged_suffix}_{train_mode}_{i}'
      output_mask_dir = f'{dataset_prefix}_masks{merged_suffix}_{train_mode}_{i}'
      # Writing into a missing directory would otherwise fail without an exception.
      os.makedirs(output_mask_dir, exist_ok=True)
      all_orig_imgs = os.listdir(orig_img_dir)
      for orig_img in all_orig_imgs:
        orig_img_id = orig_img.split('.png')[0]
        annotated_mask = get_annotated_mask(orig_img_dir, orig_annots_dir, orig_img_id)
        mask_path = f'{output_mask_dir}/{orig_img_id}.png'
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(mask_path, annotated_mask):
          raise IOError(f'Could not write mask: {mask_path}')
    print(f'Finished generating masks for fold {i} in {float(time.time()-start_time)/60} minutes')

In [None]:
# Visual spot check: draw the annotations of one (shifted) training tile of
# the merged dataset-1 fold 0.
tgt_img_id = 'b8db704134ac_BR_0.75'
train_mode = 'train'
i = 0
spot_check_imgs_dir = f'{base_path}/dataset1_files/all_dataset1_imgs_merged_{train_mode}_{i}'
spot_check_annots_dir = f'{base_path}/dataset1_files/all_dataset1_annotations_merged_{train_mode}_{i}'
orig_img = get_annotated_img(spot_check_imgs_dir, spot_check_annots_dir, tgt_img_id)
# img_mask = cv2.imread(f'{base_path}/dataset1_files/all_dataset1_masks_merged_{train_mode}_{i}/{tgt_img_id}.png', cv2.IMREAD_GRAYSCALE)
plt.imshow(orig_img)
plt.show()
# plt.imshow(img_mask)
# plt.show()

In [None]:
# Load a single annotation pickle for manual inspection.
# NOTE(review): absolute, machine-specific path.
sample_annot_path = '/home/ec2-user/hubmap-hacking-the-human-vasculature/dataset1_files/all_dataset1_annotations_merged_train_0/bd090bb1b654.pkl'
with open(sample_annot_path, 'rb') as f:
    data = pickle.load(f)

In [None]:
# Collect the annotation files of the merged fold-0 training set that contain
# no annotations at all.
# NOTE(review): absolute, machine-specific path.
merged_train_annots_dir = '/home/ec2-user/hubmap-hacking-the-human-vasculature/dataset1_files/all_dataset1_annotations_merged_train_0'
annot_files = os.listdir(merged_train_annots_dir)
no_mask_annots = []
for annot_file in annot_files:
    with open(f'{merged_train_annots_dir}/{annot_file}', 'rb') as f:
        data = pickle.load(f)
    if not len(data):
        no_mask_annots.append(annot_file)

In [None]:
# Delete every tile without annotations (pickle first, then the image) from the
# merged fold-0 training set. Destructive: files are removed from disk.
for no_mask_annot in no_mask_annots:
    img_id = no_mask_annot.split('.pkl')[0]
    stale_paths = (
        f'/home/ec2-user/hubmap-hacking-the-human-vasculature/dataset1_files/all_dataset1_annotations_merged_train_0/{img_id}.pkl',
        f'/home/ec2-user/hubmap-hacking-the-human-vasculature/dataset1_files/all_dataset1_imgs_merged_train_0/{img_id}.png',
    )
    for stale_path in stale_paths:
        if os.path.exists(stale_path):
            os.remove(stale_path)
