![logo](https://cocodataset.org/images/coco-logo.png)

# Intro
There are many semantic segmentation tools available, and they all require image annotations in one of several specific formats. In this notebook we will create COCO annotations for the Sartorius dataset. Many conversion tools can also convert from COCO to a different target format, so COCO is quite versatile.

Update: Astro masks are cleaned with the [Clean Astro Mask dataset](https://www.kaggle.com/hengck23/clean-astro-mask). Also take a look at [this discussion](https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/291371).

In [None]:
!pip install pycocotools
from pycocotools.coco import COCO
from pycocotools.mask import encode, area, toBbox
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import StratifiedKFold 
import glob
from PIL import Image
from skimage import measure   
import skimage.io as io
from shapely.geometry import Polygon, MultiPolygon
from tqdm import tqdm
import gc
from os.path import exists

# Read and prepare dataset
We will create stratified K-fold train and test (validation) COCO sets.  
References:
 * [create-coco-annotations-from-scratch](https://www.immersivelimit.com/create-coco-annotations-from-scratch)
 * [pycocotools](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI)

In [None]:
# Load the competition annotation table. The cells below read its `id`,
# `cell_type`, `height`, `width` and `annotation` columns, and treat every
# row that shares an `id` as one annotated object of that image.
df = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')

Create a few lists with files, ids and cell type for later use:

In [None]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the seeded stratified split further down assigns the same images to the same
# folds on every run.
FILE_NAMES = sorted(glob.glob('/kaggle/input/sartorius-cell-instance-segmentation/train/*.png'))
# Image id = file name without directory and extension.
fids = [path.split('/')[-1].split('.')[0] for path in FILE_NAMES]
# Cell type of each image, taken from the first annotation row with that id;
# used as the stratification label.
cell_type = [df[df.id == fid].cell_type.iloc[0] for fid in fids]

# Create COCO files
Conversion is pretty slow, going from binary masks to polygons.

In [None]:
# Cell type name (as found in the `cell_type` column) -> COCO category id.
CATEGORIES = {
    "shsy5y": 1,
    "astro": 2,
    "cort": 3,
}

# ref: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, mask, color=1):
    '''
    Paint a run-length encoded mask onto an existing array.

    mask_rle: string of whitespace-separated "start length" pairs; each start
              is a 1-based offset into the flattened array
    mask: numpy array indexed as (height, width) that the runs are drawn on;
          numpy hands back a view from reshape when it can, in which case
          this array itself is modified
    color: value written into every pixel covered by a run
    Returns a numpy array with the same shape as mask

    '''
    tokens = mask_rle.split()

    # Even positions hold the 1-based starts, odd positions the run lengths.
    begins = [int(tok) - 1 for tok in tokens[0::2]]
    runs = [int(tok) for tok in tokens[1::2]]

    flat = mask.reshape((mask.shape[0] * mask.shape[1]))

    for begin, run in zip(begins, runs):
        flat[begin : begin + run] = color

    return flat.reshape(mask.shape)

def create_segmentation(sub_mask):
    '''
    Trace a binary mask into COCO-style polygon coordinate lists.

    sub_mask: 2-D binary array of a single object (non-zero = object)
    Returns a list of polygons, each a flat [x0, y0, x1, y1, ...] list of
    floats; the list is empty when no usable contour is found

    '''
    # Add a one-pixel zero border before tracing. Without it the "- 1" below
    # shifts every polygon by one pixel, and objects touching the image edge
    # produce open contours instead of closed ones.
    padded = np.pad(sub_mask, 1, mode='constant', constant_values=0)
    contours = measure.find_contours(padded, 0.5, positive_orientation='low')
    segmentations = []
    for contour in contours:
        # A polygon needs at least three points.
        if len(contour) <= 2:
            continue

        # Flip from (row, col) representation to (x, y)
        # and subtract the padding pixel
        xy = contour[:, ::-1] - 1

        # Make a polygon and simplify it
        poly = Polygon(xy).simplify(1.0, preserve_topology=False)

        # Simplifying without preserving topology can yield an empty geometry
        # or something other than a single Polygon (which has no exterior
        # ring); such contours are skipped.
        if poly.is_empty or not isinstance(poly, Polygon):
            continue

        coords = np.array(poly.exterior.coords)
        # Keep coordinates non-negative (the contour can sit half a pixel
        # outside the image) and flatten to [x0, y0, x1, y1, ...].
        segmentations.append(np.clip(coords, 0, 1e6).ravel().tolist())

    return segmentations

# https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/291371
def fill_hole(m):
    '''
    Return a copy of binary mask m with enclosed background regions set to 1.

    The mask is zero-padded by 4 pixels and its background (values < 0.5) is
    split into 4-connected regions. Mask pixels get label 0; every region
    after the first two label values is treated as a hole and filled.
    NOTE(review): this presumes label 1 is the background connected to the
    padded border -- confirm against skimage's labelling order.

    '''
    margin = 4
    background = np.pad(m, margin) < 0.5
    labels = measure.label(background, background=0, connectivity=1)
    label_ids = np.unique(labels)

    result = m.copy()
    # Two or fewer label values: there is nothing to fill.
    if len(label_ids) <= 2:
        return result

    # Drop the padding again so the labels line up with the input mask.
    inner = labels[margin:-margin, margin:-margin]
    for hole_id in label_ids[2:]:
        result[inner == hole_id] = 1

    return result

# Directory of the cleaned astro masks; a corrected mask for an image, when
# one exists, is looked up here as `<image id>.png`.
CLEAN_M = '/kaggle/input/clean-astro-mask/'

def create_single_mask(annotation, img_size, r=None):
    '''
    Build the binary mask of one annotated object.

    annotation: run-length encoded mask string
    img_size: (height, width) of the image
    r: optional array AND-ed with the mask (the cleaned-mask channel);
       None leaves the mask as decoded
    Returns the decoded, hole-filled mask, restricted to r when given

    '''
    canvas = np.zeros(img_size, dtype=np.uint8)
    filled = fill_hole(rle_decode(annotation, canvas))
    if r is None:
        return filled
    return filled & r

def add_image(df, fid, fpath, tset, aid):
    '''
    Append one image and all of its object annotations to a COCO dict.

    df: annotation table with id, height, width, cell_type and annotation
        columns
    fid: image id whose rows are selected from df
    fpath: path stored as the image's file_name (backslashes become '/')
    tset: COCO dict with "images" and "annotations" lists; extended in place
    aid: id given to the next annotation that is added
    Returns (tset, next unused annotation id)

    '''
    # Filter the table once; the original re-filtered the whole frame twice
    # per annotation inside the loop.
    adf = df[df.id == fid]

    # Image ids are assigned sequentially, starting at 1.
    idx = len(tset["images"]) + 1
    h = adf.height.iloc[0]
    w = adf.width.iloc[0]
    tset['images'].append({"height": int(h),
                           "width": int(w),
                           "id": int(idx),
                           "file_name": fpath.replace('\\', '/')})

    # check for cleaned mask
    ipath = CLEAN_M + fid + '.png'
    if exists(ipath):
        corr = plt.imread(ipath)
        # extract red channel; cast to uint8 so it can be AND-ed with the mask
        r = corr[:, :, 0].astype(np.uint8)
    else:
        r = None

    # add each object as segment
    for cell_name, annotation in zip(adf.cell_type, adf.annotation):
        cat = CATEGORIES[cell_name]
        # create mask
        m = create_single_mask(annotation, [h, w], r)
        # encode as RLE
        me = encode(np.asfortranarray(m))
        # calc stats
        bbox = toBbox(me).astype(np.int32).tolist()
        a = area(me)
        # Polygons
        poly = create_segmentation(m)
        # Objects without any polygon are skipped and do not use up an id.
        if len(poly) > 0:
            tset["annotations"].append({"iscrowd": 0,
                                        "image_id": int(idx),
                                        "bbox": bbox,
                                        "segmentation": poly,
                                        "category_id": int(cat),
                                        "id": int(aid),
                                        "area": int(a)})
            aid += 1
    return tset, aid

Stratify on cell type and, for each fold, create one COCO .json file for the train split and one for the test split.

In [None]:
def create_coco(files, fids):
    '''
    Build a COCO dataset dict for the given images.

    files: image paths
    fids: image ids, in the same order as files
    Annotations are read from the module-level df.
    Returns a dict with "images", "categories" and "annotations" lists

    '''
    # The three cell classes, with category ids 1..3.
    categories = [
        {"supercategory": "cells", "id": cat_id, "name": cat_name}
        for cat_id, cat_name in enumerate(("shsy5y", "astro", "cort"), start=1)
    ]
    coco_set = {"images": [], "categories": categories, "annotations": []}

    # Annotation ids start at 1 and keep counting across images.
    next_anno_id = 1
    for pos in tqdm(range(len(files))):
        coco_set, next_anno_id = add_image(
            df, fids[pos], files[pos], coco_set, next_anno_id
        )
    return coco_set

In [None]:
K_FOLDS = 5

# Shuffled, seeded split stratified on the per-image cell type.
kf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=777)
for fold, (train_index, test_index) in enumerate(kf.split(FILE_NAMES, cell_type)):
    # Pick the paths and ids belonging to each side of this fold.
    train_ds = [FILE_NAMES[k] for k in train_index]
    train_fids = [fids[k] for k in train_index]
    valid_ds = [FILE_NAMES[k] for k in test_index]
    valid_fids = [fids[k] for k in test_index]

    # train set -> train_fold_<fold>.json
    tset = create_coco(train_ds, train_fids)
    with open(f'train_fold_{fold}.json', 'w') as out_file:
        json.dump(tset, out_file, indent=4)

    # test set -> test_fold_<fold>.json
    vset = create_coco(valid_ds, valid_fids)
    with open(f'test_fold_{fold}.json', 'w') as out_file:
        json.dump(vset, out_file, indent=4)

    # Drop both dicts before the next fold is built.
    del tset, vset
    gc.collect()

# Verify COCO annotations
Finally, we will check the segmentation masks with pycocotools.

In [None]:
# Load the fold-0 training annotations with pycocotools.
coco=COCO('train_fold_0.json')

# Number of images to display; each gets one row of two panels.
CNT = 5
fig = plt.figure(figsize=(16,CNT*6))
for i in range(CNT):
    # Left panel: the raw image, ticks hidden.
    axes = fig.add_subplot(CNT, 2, 2*i+1)
    plt.setp(axes, xticks=[], yticks=[])
    # Image ids were assigned sequentially starting at 1, hence i+1.
    img = coco.loadImgs([i+1])[0]
    I = io.imread(img['file_name'])
    plt.imshow(I, cmap='gray')
    # Right panel: the same image with its annotations drawn on top.
    axes = fig.add_subplot(CNT, 2, 2*i+2)
    plt.setp(axes, xticks=[], yticks=[])
    plt.imshow(I, cmap='gray')
    annIds = coco.getAnnIds(imgIds=img['id'], iscrowd=None)
    anns = coco.loadAnns(annIds)
    # NOTE(review): showAnns presumably draws on pyplot's current axes (the
    # right panel just created) -- keep the statement order as is.
    coco.showAnns(anns)
    plt.tight_layout()

In [None]:
# Load the fold-0 test annotations with pycocotools.
coco=COCO('test_fold_0.json')

# Number of images to display; each gets one row of two panels.
CNT = 5
fig = plt.figure(figsize=(16,CNT*6))
for i in range(CNT):
    # Left panel: the raw image, ticks hidden.
    axes = fig.add_subplot(CNT, 2, 2*i+1)
    plt.setp(axes, xticks=[], yticks=[])
    # Image ids were assigned sequentially starting at 1, hence i+1.
    img = coco.loadImgs([i+1])[0]
    I = io.imread(img['file_name'])
    plt.imshow(I, cmap='gray')
    # Right panel: the same image with its annotations drawn on top.
    axes = fig.add_subplot(CNT, 2, 2*i+2)
    plt.setp(axes, xticks=[], yticks=[])
    plt.imshow(I, cmap='gray')
    annIds = coco.getAnnIds(imgIds=img['id'], iscrowd=None)
    anns = coco.loadAnns(annIds)
    # NOTE(review): showAnns presumably draws on pyplot's current axes (the
    # right panel just created) -- keep the statement order as is.
    coco.showAnns(anns)
    plt.tight_layout()

OK, looks good!