In [1]:
!pip install -Uqqq pycocotools

In [2]:
from pycocotools.coco import COCO
import json
import funcy
from sklearn.model_selection import KFold, train_test_split
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import itertools

In [3]:
def save_coco(file, images, annotations, categories):
    """Write a COCO-style dataset to ``file`` as indented, key-sorted JSON.

    ``images``, ``annotations`` and ``categories`` may be any iterables of
    JSON-serialisable records (NumPy object arrays included); each one is
    materialised with ``list(...)`` before being dumped.
    """
    with open(file, 'wt', encoding='UTF-8') as handle:
        payload = {
            'images': list(images),
            'annotations': list(annotations),
            'categories': list(categories),
        }
        json.dump(payload, handle, indent=2, sort_keys=True)

def filter_annotations(annotations, image_ids):
    """Return the annotations whose ``image_id`` is one of ``image_ids``.

    Parameters
    ----------
    annotations : iterable of dict
        Annotation records; each must have an ``'image_id'`` key.
    image_ids : iterable
        Ids to keep. May be a NumPy array, list, set or any other iterable of
        hashable ids.

    Returns
    -------
    list of dict
        The matching annotations, in their original order.
    """
    # Build a hash set once so each membership test is O(1) instead of a full
    # scan of ``image_ids`` for every annotation.
    if isinstance(image_ids, (set, frozenset)):
        wanted = image_ids
    elif isinstance(image_ids, np.ndarray):
        # tolist() converts NumPy scalars to plain Python objects.
        wanted = set(image_ids.ravel().tolist())
    else:
        wanted = set(image_ids)
    return [annotation for annotation in annotations if annotation['image_id'] in wanted]

def add_image_data(images, all_image_data):
    """Look up the full image records for a sequence of image ids.

    Parameters
    ----------
    images : iterable
        Image ids to fetch, in the order the records should be returned.
    all_image_data : sequence of dict
        Image records, each with an ``'id'`` key. A NumPy object array or a
        plain list is accepted.

    Returns
    -------
    numpy.ndarray
        The records of ``all_image_data`` selected by ``images``, in that order.
        If an id occurs more than once in ``all_image_data`` the first
        occurrence is used.

    Raises
    ------
    IndexError
        If an id in ``images`` is not present in ``all_image_data``.
    """
    records = np.asarray(all_image_data)
    # Map id -> first position once, instead of scanning all ids per lookup.
    first_position = {}
    for position, record in enumerate(records):
        first_position.setdefault(record['id'], position)
    try:
        positions = [first_position[image_id] for image_id in images]
    except KeyError as missing:
        # IndexError is what the original positional lookup raised for an unknown id.
        raise IndexError(
            f"image id {missing.args[0]!r} not found in all_image_data"
        ) from None
    # An explicit integer dtype keeps the fancy index valid when nothing is requested.
    return records[np.asarray(positions, dtype=np.intp)]

def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary mask.

    mask_rle: whitespace-separated "start length start length ..." string,
              where starts are 1-based positions in the flattened mask
    shape: (height, width) of the array to return
    Returns a uint8 numpy array of that shape: 1 - mask, 0 - background
    """
    tokens = mask_rle.split()
    # Even positions hold the starts, odd positions the run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_stops = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(run_starts, run_stops):
        flat_mask[begin:stop] = 1
    return flat_mask.reshape(shape)


def binary_mask_to_rle(binary_mask):
    """Run-length encode a mask in column-major order.

    Returns a dict with
      'counts': lengths of consecutive runs of equal values; a leading 0 is
                inserted when the first pixel equals 1, so the first count
                always refers to a run of background
      'size':   the mask shape as a list
    """
    run_lengths = []
    column_major = binary_mask.ravel(order='F')
    for position, (pixel, run) in enumerate(itertools.groupby(column_major)):
        starts_with_foreground = position == 0 and pixel == 1
        if starts_with_foreground:
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))
    return {'counts': run_lengths, 'size': list(binary_mask.shape)}

def coco_structure(train_df):
    """Convert the training table into a COCO-style dictionary.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One row per annotated instance, with the columns read below:
        ``id`` (image id), ``annotation`` (run-length string understood by
        ``rle_decode``), ``width``, ``height`` and ``cell_type``.

    Returns
    -------
    dict
        ``{'categories': [...], 'images': [...], 'annotations': [...]}``.
        Category ids are 1-based, in order of first appearance of each
        ``cell_type``. Annotation ids are 1-based, in row order.
    """
    cat_ids = {name: number for number, name in enumerate(train_df.cell_type.unique(), start=1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]
    images = [
        {
            'id': image_id,
            # Plain ints so the record stays JSON-serialisable whatever dtype pandas hands back.
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in train_df.groupby('id').agg('first').iterrows()
    ]
    annotations = []
    # total= lets tqdm show real progress; iterrows() is a generator with no length.
    rows = tqdm(train_df.iterrows(), total=len(train_df))
    # Annotation ids start at 1: pycocotools' COCOeval records matched ground-truth
    # ids in a zero-initialised array, so an annotation with id 0 can never count
    # as matched.
    for annotation_id, (_, row) in enumerate(rows, start=1):
        mk = rle_decode(row.annotation, (row.height, row.width))
        ys, xs = np.where(mk)
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        enc = binary_mask_to_rle(mk)
        seg = {
            'segmentation': enc,
            # [x, y, width, height] of the tight box around the mask pixels.
            'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
            'area': int(np.sum(mk)),
            'image_id': row.id,
            'category_id': cat_ids[row.cell_type],
            'iscrowd': 0,
            'id': annotation_id,
        }
        annotations.append(seg)
    return {'categories': cats, 'images': images, 'annotations': annotations}

In [4]:
# Load the training table. The relative path assumes the competition data is
# available under ../input (presumably a Kaggle kernel — adjust when running elsewhere).
train_df = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')
# Build the in-memory COCO dict (categories, images, annotations) from that table.
coco = coco_structure(train_df)

0it [00:00, ?it/s]

In [5]:
# Wrap each COCO section in a NumPy array so the records can be selected
# with index arrays further down.
images, annotations, categories = (
    np.array(coco[section]) for section in ('images', 'annotations', 'categories')
)

In [6]:
# Sanity check: show the top-level sections of the generated COCO dict.
coco.keys()

dict_keys(['categories', 'images', 'annotations'])

In [7]:
# Display the category records to see which numeric id each cell type received.
categories

array([{'name': 'shsy5y', 'id': 1}, {'name': 'astro', 'id': 2},
       {'name': 'cort', 'id': 3}], dtype=object)

In [8]:
# image id -> category id of that image's annotations.
images_to_categories = {}
# category id -> list of image ids. The buckets are derived from `categories`
# rather than hard-coded, so they always match the ids actually assigned.
categories_to_images = {category['id']: [] for category in categories}

for annotation in annotations:
    # A later annotation overwrites an earlier one, so each image ends up with
    # the category of its last annotation.
    images_to_categories[annotation['image_id']] = annotation['category_id']
for image, category_id in images_to_categories.items():
    categories_to_images[category_id].append(image)

# Category id of every annotation, in annotation order.
cat = np.array([annotation['category_id'] for annotation in annotations])

In [9]:
# Image ids per cell type, as arrays so they can be indexed with split indices.
# The numeric keys are the category ids assigned by coco_structure (order of
# first appearance in the CSV). Per the `categories` output displayed in this
# notebook that is 1 = shsy5y, 2 = astro, 3 = cort — re-check if the input changes.
images_shsy5y = np.array(categories_to_images[1])
images_astro = np.array(categories_to_images[2])
images_cort = np.array(categories_to_images[3])

Generate per-cell-type datasets (train/validation splits) for training segmentation models

In [10]:
# Fixed seed so re-running the notebook reproduces the same hold-out split
# (same value as the KFold seed used for the fold-wise splits).
SPLIT_SEED = 17

# 80/20 train/validation split of the image ids, separately for each cell type.
shsy5y_train_images, shsy5y_val_images = train_test_split(
    images_shsy5y, train_size=0.8, random_state=SPLIT_SEED)
astro_train_images, astro_val_images = train_test_split(
    images_astro, train_size=0.8, random_state=SPLIT_SEED)
cort_train_images, cort_val_images = train_test_split(
    images_cort, train_size=0.8, random_state=SPLIT_SEED)

# (file prefix, train ids, validation ids, category record) for each cell type.
# The category is taken by position, matching the id-based image lists above.
per_cell_type_splits = [
    ("shsy5y", shsy5y_train_images, shsy5y_val_images, categories[0]),
    ("astro", astro_train_images, astro_val_images, categories[1]),
    ("cort", cort_train_images, cort_val_images, categories[2]),
]

# Writes <cell type>_train.json and <cell type>_val.json, each holding only
# that cell type's images, annotations and single category.
for cell_type_name, train_ids, val_ids, category in per_cell_type_splits:
    for split_name, split_ids in (("train", train_ids), ("val", val_ids)):
        save_coco(
            f"{cell_type_name}_{split_name}.json",
            add_image_data(split_ids, images),
            filter_annotations(annotations, split_ids),
            [category],
        )

Generate and save 5-fold train/validation splits for training fold-wise segmentation models

In [11]:
# K-fold split of the image ids, done per cell type and then merged, so every
# fold contains images of all three types.
folds = 5
kfold = KFold(n_splits=folds, shuffle=True, random_state=17)

shsy5y_split = kfold.split(X=images_shsy5y)
astro_split = kfold.split(X=images_astro)
cort_split = kfold.split(X=images_cort)

# Walk the three per-type splitters in lockstep: one (train, val) index pair
# per cell type for each fold.
per_type_folds = zip(shsy5y_split, astro_split, cort_split)
for fold, (shsy5y_part, astro_part, cort_part) in enumerate(per_type_folds):
    print(f"fold {fold+1}")
    shsy5y_train_idx, shsy5y_val_idx = shsy5y_part
    astro_train_idx, astro_val_idx = astro_part
    cort_train_idx, cort_val_idx = cort_part

    train_images = np.concatenate([
        images_shsy5y[shsy5y_train_idx],
        images_astro[astro_train_idx],
        images_cort[cort_train_idx],
    ])
    val_images = np.concatenate([
        images_shsy5y[shsy5y_val_idx],
        images_astro[astro_val_idx],
        images_cort[cort_val_idx],
    ])
    print(train_images.shape, val_images.shape)

    # One train and one validation COCO file per fold, with all categories.
    save_coco(
        f"train_fold_{fold+1}.json",
        add_image_data(train_images, images),
        filter_annotations(annotations, train_images),
        categories,
    )
    save_coco(
        f"val_fold_{fold+1}.json",
        add_image_data(val_images, images),
        filter_annotations(annotations, val_images),
        categories,
    )

fold 1
(484,) (122,)
fold 2
(485,) (121,)
fold 3
(485,) (121,)
fold 4
(485,) (121,)
fold 5
(485,) (121,)
