In [45]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
from PIL import Image
from collections import Counter
import pickle
import shutil
from shapely.geometry import Polygon

In [46]:
# Root folder that holds the dataset folders and the metadata files.
base_path = '..'
# True: annotate every dataset; False: annotate dataset 1 only.
generate_all_datset_annots = False
# Annotation types to export; the list position is used as the category id.
tile_categories = ['blood_vessel', 'glomerulus', 'unsure']

if generate_all_datset_annots:
    base_data_dir, base_data_name = 'all_dataset_files', 'all_dataset'
else:
    base_data_dir, base_data_name = 'dataset1_files', 'all_dataset1'

# Input images and output annotation file share the same path prefix.
_data_prefix = f'{base_path}/{base_data_dir}/{base_data_name}'
input_imgs_path = f'{_data_prefix}_imgs'
output_annotations_path = f'{_data_prefix}_annotations_mmdet/full_annotations.json'

# Drop any annotation file left over from a previous run.
if os.path.exists(output_annotations_path):
    os.remove(output_annotations_path)
# train_modes = ['train', 'validation']

# for i in range(num_folds):
#     for train_mode in train_modes:
#         output_imgs_path = f'{base_path}/{base_data_dir}/{base_data_name}_{train_mode}_imgs_fold_{i}_mmdet'
#         output_annotations_path = f'{base_path}/{base_data_dir}/{base_data_name}_{train_mode}_annotations_fold_{i}_mmdet'
#         if os.path.exists(output_imgs_path):
#             shutil.rmtree(output_imgs_path)
#         if os.path.exists(output_annotations_path):
#             shutil.rmtree(output_annotations_path)
#         os.mkdir(output_imgs_path)
#         os.mkdir(output_annotations_path)

In [47]:
def get_count_type(tiles_dict):
    """Count the annotations of each type for every tile.

    Args:
        tiles_dict: Iterable of tile records, each with an 'id' and a list of
            'annotations' whose items carry a 'type' key.

    Returns:
        One dict per tile, in input order, holding the tile 'id' and the
        number of 'blood_vessel', 'glomerulus' and 'unsure' annotations.

    Raises:
        KeyError: If an annotation has a type other than the three above.
    """
    def _tally(tile):
        # Start every counter at zero so tiles without annotations still get a row.
        row = dict(id=tile['id'], blood_vessel=0, glomerulus=0, unsure=0)
        for entry in tile['annotations']:
            label = entry['type']
            row[label] = row[label] + 1
        return row

    return [_tally(tile) for tile in tiles_dict]
def calculate_area(coords):
    """Return the area of the polygon described by `coords`.

    `coords` is handed unchanged to `shapely.geometry.Polygon`, whose `area`
    attribute is returned.
    """
    return Polygon(coords).area
def visualize(**images):
    """Plot the given images side by side in a single row.

    Each keyword argument is one panel: the keyword, with underscores turned
    into spaces and title-cased, becomes the panel title, and the value is
    drawn with `plt.imshow`. Axis ticks are hidden.
    """
    panel_count = len(images)
    plt.figure(figsize=(30, 30))
    for position, (label, picture) in enumerate(images.items(), start=1):
        plt.subplot(1, panel_count, position)
        plt.xticks([])
        plt.yticks([])
        plt.title(label.replace('_', ' ').title())
        plt.imshow(picture)
    plt.show()

In [48]:
def create_category_annots(tile_categories):
    """Build one category record per entry of `tile_categories`.

    Args:
        tile_categories: Sequence of category names.

    Returns:
        A list of `{'id': position, 'name': category}` dicts, where `position`
        is the zero-based index of the name in `tile_categories`.
    """
    return [
        {'id': position, 'name': label}
        for position, label in enumerate(tile_categories)
    ]
def create_image_annots(input_imgs_path):
    """Describe every image file found in `input_imgs_path`.

    Directory entries are visited in sorted filename order and numbered from 0
    in that order; that number is the image id.

    Args:
        input_imgs_path: Directory whose entries are all image files.

    Returns:
        A tuple `(image_annots, img_name_to_id)`. `image_annots` is a list of
        `{'file_name', 'height', 'width', 'id'}` dicts and `img_name_to_id`
        maps the part of each file name before its first '.' to the image id.

    Raises:
        ValueError: If OpenCV cannot read one of the directory entries.
    """
    image_annots = []
    img_name_to_id = dict()
    input_img_files = sorted(os.listdir(input_imgs_path))
    for idx, img_file in enumerate(input_img_files):
        img_path = f'{input_imgs_path}/{img_file}'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports failure by returning None instead of raising,
            # so fail here with the offending path rather than on `.shape`.
            raise ValueError(f'Could not read image file: {img_path}')
        height, width = img.shape[:2]
        image_annots.append({'file_name': img_file, 'height': height, 'width': width, 'id': idx})
        # Key is the text before the first '.', which is what create_seg_annots
        # looks up via each tile's 'id'.
        img_name_to_id[img_file.split('.')[0]] = idx
    return image_annots, img_name_to_id
def create_seg_annots(tgt_tile_dicts, img_name_to_id, categories=None):
    """Convert tile polygon annotations into flat annotation records.

    Args:
        tgt_tile_dicts: Iterable of tile records, each with an 'id' and a list
            of 'annotations'; every annotation has a 'type' and 'coordinates'.
            Only the first entry of 'coordinates' is used as the polygon.
        img_name_to_id: Mapping from tile id to image id.
        categories: Category names to export; an annotation's category id is
            the index of its type in this list. Defaults to the notebook-level
            `tile_categories`.

    Returns:
        A list of dicts with the keys 'segmentation' (flattened x, y values),
        'area', 'iscrowd' (always 0), 'image_id', 'bbox'
        (`[min_x, min_y, width, height]`), 'category_id' and 'id'. Annotations
        whose type is not in `categories` are skipped.

    Raises:
        KeyError: If an exported annotation belongs to a tile whose id is
            missing from `img_name_to_id`.
    """
    if categories is None:
        # Fall back to the category list defined in the configuration cell.
        categories = tile_categories
    annotations = []
    for tile_dict in tgt_tile_dicts:
        cur_img_name = tile_dict['id']
        for annot in tile_dict['annotations']:
            if annot['type'] not in categories:
                continue
            coords = annot['coordinates'][0]
            xs = [pt[0] for pt in coords]
            ys = [pt[1] for pt in coords]
            min_x, min_y = min(xs), min(ys)
            annotations.append({
                # One flat [x0, y0, x1, y1, ...] list wrapped in an outer list.
                'segmentation': [[value for pt in coords for value in pt]],
                'area': calculate_area(coords),
                'iscrowd': 0,
                'image_id': img_name_to_id[cur_img_name],
                'bbox': [min_x, min_y, max(xs) - min_x, max(ys) - min_y],
                'category_id': categories.index(annot['type']),
                # Ids run sequentially across all tiles, starting at 0.
                'id': len(annotations),
            })
    # The original version built this list but never returned it.
    return annotations

In [49]:
# Read the polygon file; every line is parsed as one JSON document.
with open(f'{base_path}/polygons.jsonl', 'r') as json_file:
    json_list = json_file.readlines()

tiles_dicts = [json.loads(json_str) for json_str in json_list]

In [50]:
# Load the per-tile metadata table.
tile_df = pd.read_csv(f'{base_path}/tile_meta.csv')

# Count annotations of each type per tile, then keep only the metadata rows
# whose id also appears in the polygon file (inner join on 'id').
annotated_ids_with_info = get_count_type(tiles_dicts)
annotated_ids_with_info_df = pd.DataFrame(annotated_ids_with_info)
tile_df_annotated = tile_df.merge(annotated_ids_with_info_df, on='id', how='inner')
tile_df_annotated.head()

Unnamed: 0,id,source_wsi,dataset,i,j,blood_vessel,glomerulus,unsure
0,0006ff2aa7cd,2,2,16896,16420,8,1,0
1,00168d1b7522,2,2,14848,14884,1,1,0
2,0033bbc76b6b,1,1,10240,43008,3,0,1
3,003504460b3a,3,2,8192,11776,7,0,0
4,004daf1cbe75,3,2,6144,11264,10,1,0


In [51]:
# Ids of the annotated tiles that belong to dataset 1, in sorted order.
dataset_1_tile_ids = sorted(tile_df_annotated.loc[tile_df_annotated['dataset']==1, 'id'].values)
if generate_all_datset_annots:
    tgt_tile_dicts = tiles_dicts
else:
    # Look ids up in a set so the filter does not rescan the id list for every tile.
    dataset_1_tile_id_set = set(dataset_1_tile_ids)
    tgt_tile_dicts = [x for x in tiles_dicts if x['id'] in dataset_1_tile_id_set]
print(len(tgt_tile_dicts))

422


In [52]:
# Display the first two entries of tgt_tile_dicts to inspect the raw annotation structure.
tgt_tile_dicts[:2]

[{'id': '0033bbc76b6b',
  'annotations': [{'type': 'blood_vessel',
    'coordinates': [[[169, 228],
      [168, 228],
      [167, 228],
      [166, 228],
      [165, 228],
      [164, 228],
      [163, 228],
      [163, 227],
      [162, 227],
      [161, 227],
      [161, 226],
      [160, 226],
      [160, 225],
      [159, 225],
      [159, 224],
      [158, 224],
      [158, 223],
      [158, 222],
      [158, 221],
      [158, 220],
      [157, 220],
      [157, 219],
      [157, 218],
      [157, 217],
      [156, 217],
      [156, 216],
      [156, 215],
      [156, 214],
      [156, 213],
      [155, 213],
      [155, 212],
      [154, 212],
      [154, 211],
      [153, 211],
      [153, 210],
      [153, 209],
      [152, 209],
      [152, 208],
      [152, 207],
      [151, 207],
      [150, 207],
      [150, 206],
      [149, 206],
      [149, 205],
      [148, 205],
      [148, 204],
      [148, 203],
      [147, 203],
      [147, 202],
      [147, 201],
      [147, 200],


In [53]:
# Assemble the three sections of the annotation file.
categories = create_category_annots(tile_categories)
images, img_name_to_id = create_image_annots(input_imgs_path)
segmentation_annots = create_seg_annots(tgt_tile_dicts, img_name_to_id)
final_annots = {'categories': categories, 'images': images, 'annotations': segmentation_annots}

# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists before writing.
output_dir = os.path.dirname(output_annotations_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(output_annotations_path, 'w') as f:
    json.dump(final_annots, f)