This notebook was inspired by [this](https://www.kaggle.com/slawekbiel/positive-score-with-detectron-1-3-input-data?scriptVersionId=77658860) great notebook. I made a few improvements to the `rle2mask` code to make it more efficient and used the functions provided by `pycocotools` to generate the json file. This results in a massive reduction of compute time and dataset size.

What seemed at first to be a trivial task was a bit difficult as the RLE encoding used by COCO is very different from the encoding used in this comp.  

The comp encoding is row-wise and every `odd` index represents the absolute beginning of a mask run. On the other hand, the COCO format expects the mask to be encoded by columns, and the `odd` indexes are relative to the end of the previous run.

I couldn't find a trivial way to convert between those two formats without decoding the rle to a mask, so the workflow is as follows:

1. Decode rle (competition) to binary mask
1. Encode the binary mask to rle (coco) using `pycocotools`
1. Save to `.json`

In [None]:
%config Completer.use_jedi = False

In [None]:
!pip install -Uqqq pycocotools

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json,itertools

## Loading the train dataframe

In [None]:
## Every row of this frame is turned into one COCO annotation further down;
## the code below reads its `id`, `annotation`, `width`, `height` and `cell_type` columns.
df = pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv')
df.head()

## Function that decodes rle (for this comp) to a binary mask

In [None]:
## Based on: https://www.kaggle.com/eigrad/convert-rle-to-bounding-box-x0-y0-x1-y1
def rle2mask(rle, img_w, img_h):
    """Decode a competition-style RLE string into a binary mask.

    The string is a whitespace-separated list of ``start length`` pairs, where
    each start is a 1-based position into the row-major flattened image.

    Args:
        rle: RLE string such as ``'3 1 10 2'``. An empty string yields an
            all-zero mask.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        ``np.uint8`` array of shape ``(img_h, img_w)`` stored in Fortran
        (column-major) order, holding 1 on masked pixels and 0 elsewhere.

    Raises:
        ValueError: If the string does not hold an even number of integers.
        IndexError: If a run starts before the first pixel or ends after the
            last one.
    """
    ## transforming the string into separate start / length arrays
    values = np.array([int(token) for token in rle.split()], dtype=np.int64)
    if values.size % 2:
        raise ValueError('rle must contain an even number of integers (start/length pairs)')
    starts = values[0::2] - 1  # competition starts are 1-based
    lengths = values[1::2]

    n_pixels = img_w * img_h
    if starts.size and (starts.min() < 0 or (starts + lengths).max() > n_pixels):
        raise IndexError('rle run falls outside an image of size img_w * img_h')

    ## decompressing the rle encoding (ie, turning '3 1 10 2' into the 0-based pixel indices [2, 9, 10])
    # Vectorised expansion: every run contributes `length` consecutive indices.
    # `run_offsets[k]` is where run k begins inside the expanded index list.
    run_offsets = np.cumsum(lengths) - lengths
    pixel_indices = np.repeat(starts - run_offsets, lengths) + np.arange(lengths.sum())

    ## Building the binary mask
    msk_img = np.zeros(n_pixels, dtype=np.uint8)
    msk_img[pixel_indices] = 1
    msk_img = msk_img.reshape((img_h, img_w))

    ## This is important so pycocotools can handle this object
    return np.asfortranarray(msk_img)

## Minor Sanity Check

In [None]:
## Decode the first annotation using the image size stored in its own row
## (instead of a hard-coded 704x520) and display the resulting mask.
first_row = df.loc[0]
rle = first_row['annotation']
print(rle)
plt.imshow(rle2mask(rle, first_row['width'], first_row['height']));

## Function that builds the .json file

In [None]:
from tqdm.notebook import tqdm
from pycocotools import mask as maskUtils
from joblib import Parallel, delayed

def annotate(idx, row, cat_ids):
    """Build the COCO annotation dict for one row of the training dataframe.

    Args:
        idx: Value stored as the annotation ``id``.
        row: Mapping-like row providing ``annotation``, ``width``, ``height``,
            ``id`` and ``cell_type``.
        cat_ids: Mapping from cell type name to COCO category id.

    Returns:
        Dict with the keys ``segmentation``, ``bbox``, ``area``, ``image_id``,
        ``category_id``, ``iscrowd`` and ``id``.
    """
    # Competition RLE -> binary mask -> COCO RLE
    binary_mask = rle2mask(row['annotation'], row['width'], row['height'])
    coco_rle = maskUtils.encode(binary_mask)

    # The encoder returns `counts` as bytes; keep it as text in the output
    coco_rle['counts'] = coco_rle['counts'].decode('utf-8')

    mask_area = maskUtils.area(coco_rle).item()
    bounding_box = maskUtils.toBbox(coco_rle).astype(int).tolist()

    return {
        'segmentation': coco_rle,
        'bbox': bounding_box,
        'area': mask_area,
        'image_id': row['id'],
        'category_id': cat_ids[row['cell_type']],
        'iscrowd': 0,
        'id': idx,
    }
    
def coco_structure(df, workers = 4, cat_ids = None):
    """Build a COCO-style dataset dict from the annotation dataframe.

    Args:
        df: Dataframe with the columns ``id``, ``annotation``, ``width``,
            ``height`` and ``cell_type``.
        workers: Number of parallel jobs used to build the annotations.
        cat_ids: Optional mapping from cell type name to category id. Pass the
            same mapping for every subset when a subset may lack a cell type.
            When omitted, ids 1..N are assigned in sorted name order.

    Returns:
        Dict with the keys ``categories``, ``images`` and ``annotations``.
    """
    ## Building the header
    if cat_ids is None:
        # Sorted names make the mapping independent of row order; with
        # first-appearance order, two subsets of the same data (e.g. the train
        # and val parts of a fold) could assign different ids to the same type.
        cat_ids = {name: cat_id for cat_id, name in enumerate(sorted(df.cell_type.unique()), 1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    # One entry per image, taken from the first row of each image id.
    # int() guarantees plain Python ints, which json can serialise.
    images = [
        {
            'id': image_id,
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in df.groupby('id').agg('first').iterrows()
    ]

    ## Building the annotations
    annotations = Parallel(n_jobs=workers)(
        delayed(annotate)(idx, row, cat_ids)
        for idx, row in tqdm(df.iterrows(), total = len(df))
    )

    return {'categories':cats, 'images':images, 'annotations':annotations}

In [None]:
!mkdir TenFold

## Running for the whole DF and saving it as a .json file

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=221)

## Folds are assigned per image, not per annotation row. Splitting rows would
## scatter the cells of one image over train and val, so every image would leak
## across the split and appear with incomplete annotations on both sides.
## Assumes every image has a single cell_type (its first row is used) - confirm.
image_types = df.groupby('id')['cell_type'].first()
image_folds = pd.Series(0, index=image_types.index)

## Only the labels matter for stratification, so X is a placeholder
for fold, (_, val_idx) in enumerate(skf.split(X=np.zeros(len(image_types)), y=image_types.to_numpy()), 1):
    image_folds.iloc[val_idx] = fold

## Broadcasting the image-level fold back to every annotation row
df['fold'] = df['id'].map(image_folds).astype(np.uint8)

In [None]:
## For every fold: rows of that fold form the validation set, all remaining
## rows form the training set; each part is written to its own json file.
for fold in range(1, 11):
    train_df = df.loc[df.fold != fold]
    val_df = df.loc[df.fold == fold]

    train_anno = coco_structure(train_df)
    val_anno = coco_structure(val_df)

    with open(f'TenFold/annotations_train_fold_{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_anno, f, ensure_ascii=True, indent=4)

    with open(f'TenFold/annotations_val_fold_{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(val_anno, f, ensure_ascii=True, indent=4)

In [None]:
!zip -r TenFold.zip TenFold

# Sanity check

In [None]:
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

In [None]:
dataDir=Path('../input/sartorius-cell-instance-segmentation')
## The export loop only writes per-fold files under TenFold/, so the check loads
## one of those; './annotations_train.json' is never created by this notebook.
annFile = Path('TenFold/annotations_train_fold_1.json')
coco = COCO(annFile)
imgIds = coco.getImgIds()

In [None]:
## Show the last (up to) three images: the raw image on the left, the same
## image with its annotations and bounding boxes drawn on the right.
imgs = coco.loadImgs(imgIds[-3:])
## squeeze=False keeps `axs` two-dimensional even when only one image is
## loaded, so each `ax` is always a (left, right) pair of axes.
_,axs = plt.subplots(len(imgs),2,figsize=(40,15 * len(imgs)),squeeze=False)
for img, ax in zip(imgs, axs):
    image = Image.open(dataDir/img['file_name'])
    annIds = coco.getAnnIds(imgIds=[img['id']])
    anns = coco.loadAnns(annIds)
    ax[0].imshow(image)
    ax[1].imshow(image)
    plt.sca(ax[1])  # showAnns draws on the current axes
    coco.showAnns(anns, draw_bbox=True)