In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image
from collections import Counter
import pickle
import shutil
from shapely.geometry import Polygon

In [2]:
# Root directory that all input and output paths in this notebook are built from.
base_path = '..'
# True: use every annotated tile; False: restrict to tiles whose `dataset` column is 1.
generate_all_datset_annots = True
# True: the training split keeps every target tile, including the validation ones.
keep_all_in_train = False
# Annotation types exported as categories; all other types are skipped.
tile_categories = ['blood_vessel']
# Output folder and file-name prefix depend on which tile selection is active.
base_data_dir = 'all_dataset_files' if generate_all_datset_annots else 'dataset1_files'
base_data_name = 'all_dataset' if generate_all_datset_annots else 'all_dataset1'

In [3]:
# Number of cross-validation folds written to disk.
num_folds = 2
# Directory the source .tif tiles are read from.
input_imgs_path = f'{base_path}/train'

# Give every fold a fresh, empty directory tree.
for i in range(num_folds):
    output_dir = f'{base_path}/{base_data_dir}/{base_data_name}_mmdet_fold_{i}'
    # Drop leftovers of a previous run so stale images/annotations never mix in.
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    # makedirs (unlike mkdir) also creates missing parents, so this works even
    # when `{base_path}/{base_data_dir}` has not been created yet.
    for sub_dir in ('train_images', 'validation_images', 'annotations'):
        os.makedirs(f'{output_dir}/{sub_dir}')

In [4]:
def get_count_type(tiles_dict):
    """Count the annotations of each type for every tile.

    Args:
        tiles_dict: iterable of tile dicts, each with an 'id' and an
            'annotations' list whose items carry a 'type' key.

    Returns:
        A list with one dict per tile, holding the tile 'id' and the number of
        'blood_vessel', 'glomerulus' and 'unsure' annotations.

    Raises:
        KeyError: if an annotation has a type other than those three.
    """
    def _tally(tile):
        # Start every known type at zero so tiles without annotations still report counts.
        counts = {'id': tile['id'], 'blood_vessel': 0, 'glomerulus': 0, 'unsure': 0}
        for annotation in tile['annotations']:
            counts[annotation['type']] += 1
        return counts

    return [_tally(tile) for tile in tiles_dict]
def calculate_area(coords):
    """Return the area of the shapely Polygon built from `coords`."""
    return Polygon(coords).area
def visualize(**images):
    """Plot images in one row.

    Each keyword argument is one image; its name becomes the subplot title
    with underscores replaced by spaces and title-casing applied.
    """
    total = len(images)
    plt.figure(figsize=(30, 30))
    for position, name in enumerate(images, start=1):
        plt.subplot(1, total, position)
        # Hide the axis ticks; only the picture itself matters here.
        plt.xticks([])
        plt.yticks([])
        plt.title(name.replace('_', ' ').title())
        plt.imshow(images[name])
    plt.show()

In [5]:
def create_category_annots(tile_categories):
    """Build the `categories` list of the annotation file.

    Args:
        tile_categories: ordered sequence of category names.

    Returns:
        A list of {'id', 'name'} dicts where each id is the position of the
        name in `tile_categories` (starting at 0).
    """
    return [
        {'id': position, 'name': category_name}
        for position, category_name in enumerate(tile_categories)
    ]
def create_image_annots(input_imgs_names):
    """Build the `images` entries of the annotation file.

    Every image is read from `{input_imgs_path}/{name}.tif` to obtain its
    height and width. Ids are assigned by position in the sorted name list.

    Args:
        input_imgs_names: iterable of image names without the '.tif' extension.

    Returns:
        A tuple `(image_annots, img_name_to_id)`: the list of
        {'file_name', 'height', 'width', 'id'} dicts and a mapping from image
        name to the id assigned to it.

    Raises:
        FileNotFoundError: if an image is missing or cannot be decoded.
    """
    image_annots = []
    img_name_to_id = dict()
    for idx, img_name in enumerate(sorted(input_imgs_names)):
        img_path = f'{input_imgs_path}/{img_name}.tif'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the offending path rather than on `.shape` below.
            raise FileNotFoundError(f'Could not read image: {img_path}')
        height, width = img.shape[:2]
        image_annots.append({'file_name': f'{img_name}.tif', 'height': height, 'width': width, 'id': idx})
        img_name_to_id[img_name] = idx
    return image_annots, img_name_to_id
def create_seg_annots(tgt_tile_dicts, img_name_to_id):
    """Build the `annotations` entries of the annotation file.

    Only annotations whose type is listed in the module-level
    `tile_categories` are exported; the first coordinate ring of each
    annotation is used as its polygon.

    Args:
        tgt_tile_dicts: iterable of tile dicts with 'id' and 'annotations'.
        img_name_to_id: mapping from tile id (image name) to image id.

    Returns:
        A list of dicts with 'segmentation' (one flat [x0, y0, x1, y1, ...]
        list), 'area', 'iscrowd', 'image_id', 'bbox' ([min_x, min_y, width,
        height]), 'category_id' (index into `tile_categories`) and a running
        'id' starting at 0.
    """
    annotations = []
    for tile in tgt_tile_dicts:
        tile_name = tile['id']
        for annot in tile['annotations']:
            label = annot['type']
            if label not in tile_categories:
                continue
            ring = annot['coordinates'][0]
            # Flatten [[x, y], ...] into a single [x, y, x, y, ...] list.
            flat_ring = [value for point in ring for value in point]
            area = calculate_area(ring)
            image_id = img_name_to_id[tile_name]
            xs = [point[0] for point in ring]
            left, right = min(xs), max(xs)
            ys = [point[1] for point in ring]
            top, bottom = min(ys), max(ys)
            annotations.append({
                'segmentation': [flat_ring],
                'area': area,
                'iscrowd': 0,
                'image_id': image_id,
                'bbox': [left, top, right - left, bottom - top],
                'category_id': tile_categories.index(label),
                # Ids run consecutively, so the next id equals the current count.
                'id': len(annotations),
            })
    return annotations

In [6]:
# Read the annotation file; every line is parsed as its own JSON document below.
with open(f'{base_path}/cleaned_polygons.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

# One decoded dict per line of the file.
tiles_dicts = [json.loads(json_str) for json_str in json_list]

In [7]:
# Tile metadata table.
tile_df = pd.read_csv(f'{base_path}/tile_meta.csv')
# Per-tile annotation counts by type, computed from the loaded polygons.
annotated_ids_with_info = get_count_type(tiles_dicts)
annotated_ids_with_info_df = pd.DataFrame(annotated_ids_with_info)
# The inner join keeps only tiles that appear in both the metadata and the annotations.
tile_df_annotated = tile_df.merge(annotated_ids_with_info_df, on='id', how='inner')
tile_df_annotated.head()

Unnamed: 0,id,source_wsi,dataset,i,j,blood_vessel,glomerulus,unsure
0,0006ff2aa7cd,2,2,16896,16420,8,1,0
1,00168d1b7522,2,2,14848,14884,1,1,0
2,0033bbc76b6b,1,1,10240,43008,3,0,1
3,003504460b3a,3,2,8192,11776,7,0,0
4,004daf1cbe75,3,2,6144,11264,10,1,0


In [8]:
# Sorted ids of the annotated tiles: dataset 1 only, and all datasets.
dataset_1_tile_ids = sorted(tile_df_annotated.loc[tile_df_annotated['dataset'] == 1, 'id'].values)
all_dataset_tile_ids = sorted(tile_df_annotated['id'].values)
# Pick the tile dicts that annotations are generated from.
if generate_all_datset_annots:
    tgt_tile_dicts = tiles_dicts
else:
    tgt_tile_dicts = [x for x in tiles_dicts if x['id'] in dataset_1_tile_ids]
print(len(tgt_tile_dicts))

1633


In [9]:
import random

random.seed(42)
# The shuffles below work in place, so re-running this cell would otherwise
# start from already-shuffled lists and produce different folds despite the
# fixed seed. Restoring the sorted order first makes the cell idempotent
# (the lists are already sorted on a fresh run, so that result is unchanged).
dataset_1_tile_ids.sort()
all_dataset_tile_ids.sort()
# NOTE: this is an alias, not a copy - shuffling it shuffles the underlying list.
tgt_dataset_tile_ids = dataset_1_tile_ids if not generate_all_datset_annots else all_dataset_tile_ids
random.shuffle(tgt_dataset_tile_ids)
random.shuffle(dataset_1_tile_ids)

In [10]:
import math

# Validation folds are always cut from the dataset-1 ids, even when the
# training pool (`tgt_dataset_tile_ids`) spans all datasets.
fold_size = int(math.ceil(len(dataset_1_tile_ids) / num_folds))
print(f'Fold size is {fold_size}')
categories = create_category_annots(tile_categories)
for i in range(num_folds):
    output_dir = f'{base_path}/{base_data_dir}/{base_data_name}_mmdet_fold_{i}'

    # Slicing clamps at the end of the list, so the last fold may simply be shorter.
    cur_validation_slice = dataset_1_tile_ids[i * fold_size:(i + 1) * fold_size]
    validation_id_set = set(cur_validation_slice)
    # Training set = every target tile outside the validation fold, unless
    # `keep_all_in_train` asks to keep the validation tiles in training too.
    cur_training_slice = list(set(tgt_dataset_tile_ids) - validation_id_set) if not keep_all_in_train else tgt_dataset_tile_ids
    training_id_set = set(cur_training_slice)

    # Copy the tiles of each split into the fold's image folders.
    for img_name in cur_validation_slice:
        shutil.copy(f'{input_imgs_path}/{img_name}.tif', f'{output_dir}/validation_images/{img_name}.tif')
    for img_name in cur_training_slice:
        shutil.copy(f'{input_imgs_path}/{img_name}.tif', f'{output_dir}/train_images/{img_name}.tif')

    val_images, val_img_name_to_id = create_image_annots(cur_validation_slice)
    train_images, train_img_name_to_id = create_image_annots(cur_training_slice)

    # Set lookups keep the tile filtering linear instead of scanning a list per tile.
    val_segmentation_annots = create_seg_annots([x for x in tgt_tile_dicts if x['id'] in validation_id_set], val_img_name_to_id)
    train_segmentation_annots = create_seg_annots([x for x in tgt_tile_dicts if x['id'] in training_id_set], train_img_name_to_id)

    val_final_annots = {'categories': categories, 'images': val_images, 'annotations': val_segmentation_annots}
    train_final_annots = {'categories': categories, 'images': train_images, 'annotations': train_segmentation_annots}

    with open(f'{output_dir}/annotations/validation_annotations.json', 'w') as f:
        json.dump(val_final_annots, f)
    with open(f'{output_dir}/annotations/train_annotations.json', 'w') as f:
        json.dump(train_final_annots, f)

Fold size is 211


In [None]:
# Sanity check: load the annotation files written for one fold back from disk.
i = 1
with open(f'{base_path}/{base_data_dir}/{base_data_name}_mmdet_fold_{i}/annotations/train_annotations.json') as f:
    train_annots = json.load(f)
with open(f'{base_path}/{base_data_dir}/{base_data_name}_mmdet_fold_{i}/annotations/validation_annotations.json') as f:
    validation_annots = json.load(f)

In [None]:
# Show the top-level sections of the loaded training annotation file.
train_annots.keys()

In [None]:
# Group the training polygons by the image they belong to.
img_to_segs = dict()
for annot in train_annots['annotations']:
    img_to_segs.setdefault(annot['image_id'], []).append(annot['segmentation'][0])

# Compare every pair of polygons within an image; identical coordinate lists
# mean the same annotation was stored twice. Dedicated index names are used so
# the notebook-level fold index `i` is not overwritten by this check.
for image_id, segmentations in img_to_segs.items():
    for first_idx in range(len(segmentations)):
        for second_idx in range(first_idx + 1, len(segmentations)):
            if segmentations[first_idx] == segmentations[second_idx]:
                print(f'Found duplicate annots for image {image_id}!!!')

In [None]:
# Ids of all training images, ids of the images that have at least one
# annotation, and the category ids that occur in the training annotations.
train_img_ids = {image_entry['id'] for image_entry in train_annots['images']}
train_seg_img_ids = {annot_entry['image_id'] for annot_entry in train_annots['annotations']}
categories = {annot_entry['category_id'] for annot_entry in train_annots['annotations']}
print(categories)

In [None]:
# Ids of all validation images, ids of the images that have at least one
# annotation, and the category ids that occur in the validation annotations.
validation_img_ids = {image_entry['id'] for image_entry in validation_annots['images']}
validation_seg_img_ids = {annot_entry['image_id'] for annot_entry in validation_annots['annotations']}
categories = {annot_entry['category_id'] for annot_entry in validation_annots['annotations']}
print(categories)

In [None]:
# Number of distinct image ids in the training and validation annotation files.
len(train_img_ids), len(validation_img_ids)

In [None]:
# List the training images copied for one fold.
i = 0
# Fold folders are created as `{base_data_name}_mmdet_fold_{i}` with the images
# under `train_images`; the path must follow that layout to exist.
train_imgs_dir = f'{base_path}/{base_data_dir}/{base_data_name}_mmdet_fold_{i}/train_images'
train_img_files = os.listdir(train_imgs_dir)
