In [None]:
!pip install -Uqqq pycocotools

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the annotation table. The cells below use its columns
# `id`, `annotation`, `width`, `height` and `cell_type`.
df = pd.read_csv(filepath_or_buffer='../input/k/mhmdsyed/annotation-correction-v2/train.csv')
# Preview the first five rows.
df.head(n=5)

In [None]:
## Based on: https://www.kaggle.com/eigrad/convert-rle-to-bounding-box-x0-y0-x1-y1
def rle2mask(rle, img_w, img_h):
    """Decode a run-length-encoded annotation string into a binary mask.

    ``rle`` is a whitespace-separated sequence of ``start length`` pairs.
    Starts are 1-based positions into a flat buffer of ``img_w * img_h``
    pixels, which is then reshaped to ``(img_h, img_w)``.

    Parameters
    ----------
    rle : str
        The encoded runs, e.g. ``"3 1 10 2"``. An empty / whitespace-only
        string yields an all-zero mask.
    img_w, img_h : int
        Width and height of the image the mask belongs to.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(img_h, img_w)`` in Fortran (column-major)
        memory order, with 1 on the encoded pixels and 0 elsewhere.

    Raises
    ------
    ValueError
        If the string does not hold an even number of values, or a run length
        is negative.
    IndexError
        If a non-empty run starts before pixel 1 or extends past the last pixel.
    """
    tokens = rle.split()
    if len(tokens) % 2:
        raise ValueError(
            f'RLE must hold an even number of values (start/length pairs), got {len(tokens)}'
        )

    ## Building the flat binary mask; with no runs at all it simply stays zero
    msk_img = np.zeros(img_w * img_h, dtype = np.uint8)

    if tokens:
        ## transforming the string into an array of shape (N, 2).
        # Signed integers are used so that `start - 1` cannot wrap around.
        pairs = np.fromiter(map(int, tokens), dtype = np.int64, count = len(tokens)).reshape((-1, 2))
        starts = pairs[:, 0] - 1  # 1-based -> 0-based
        lengths = pairs[:, 1]
        if (lengths < 0).any():
            raise ValueError('RLE run lengths must be non-negative')

        # zero-length runs mark no pixel, so they take no part in the bounds check
        nonempty = lengths > 0
        starts, lengths = starts[nonempty], lengths[nonempty]
        ends = starts + lengths

        if starts.size and (starts.min() < 0 or ends.max() > msk_img.size):
            raise IndexError(
                f'RLE run falls outside an image of {img_w}x{img_h} = {msk_img.size} pixels'
            )

        ## marking every run (ie, [3, 1, 10, 2] sets pixels 3 and 10, 11; 1-based)
        for lo, hi in zip(starts, ends):
            msk_img[lo:hi] = 1

    msk_img = msk_img.reshape((img_h, img_w))
    ## This is important so pycocotools can handle this object
    return np.asfortranarray(msk_img)

In [None]:
from tqdm.notebook import tqdm
from pycocotools import mask as maskUtils
from joblib import Parallel, delayed

def annotate(idx, row, cat_ids):
    """Build one COCO-style annotation dict from a single annotation row.

    Parameters
    ----------
    idx
        Value stored under the annotation's ``'id'`` key.
    row
        Mapping-like record providing ``'annotation'`` (RLE string),
        ``'width'``, ``'height'``, ``'id'`` (stored as ``'image_id'``) and
        ``'cell_type'``.
    cat_ids : dict
        Maps a cell type name to its category id.

    Returns
    -------
    dict
        Keys: ``segmentation``, ``bbox``, ``area``, ``image_id``,
        ``category_id``, ``iscrowd`` (always 0) and ``id``.
    """
    # RLE string -> binary mask -> pycocotools' own compressed RLE
    binary_mask = rle2mask(row['annotation'], row['width'], row['height'])
    coco_rle = maskUtils.encode(binary_mask)

    # `counts` comes back as bytes; keep it as text
    # (presumably so the result can be JSON-serialised later)
    coco_rle['counts'] = coco_rle['counts'].decode('utf-8')

    mask_area = maskUtils.area(coco_rle).item()
    bounding_box = maskUtils.toBbox(coco_rle).astype(int).tolist()

    return {
        'segmentation': coco_rle,
        'bbox': bounding_box,
        'area': mask_area,
        'image_id': row['id'],
        'category_id': cat_ids[row['cell_type']],
        'iscrowd': 0,
        'id': idx,
    }
    
def coco_structure(df, workers = 4, cat_ids = None, image_dir = '../input/sartorius-cell-instance-segmentation/train'):
    """Assemble a COCO-style dataset dict from an annotation table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation, with the columns ``id`` (image id),
        ``width``, ``height``, ``cell_type`` and ``annotation``.
    workers : int, default 4
        Number of joblib jobs used to build the annotations.
    cat_ids : dict, optional
        Mapping ``cell type name -> category id``. When omitted, ids are
        assigned 1, 2, ... in order of first appearance in ``df``. That
        order depends on the rows of ``df``, so when several frames (e.g. a
        train and a validation split) must share category ids, build the
        mapping once and pass it to every call.
    image_dir : str, optional
        Directory prefix written into each image's ``file_name``.

    Returns
    -------
    dict
        ``{'categories': [...], 'images': [...], 'annotations': [...]}``.
    """
    ## Building the header
    if cat_ids is None:
        cat_ids = {name: number + 1 for number, name in enumerate(df.cell_type.unique())}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    # one entry per image; width/height are taken from the image's first row
    first_rows = df.groupby('id').agg('first')
    images = [
        {
            'id': image_id,
            'width': row.width,
            'height': row.height,
            'file_name': f'{image_dir}/{image_id}.png',
        }
        for image_id, row in first_rows.iterrows()
    ]

    ## Building the annotations (one per row, keyed by the row's index label)
    annotations = Parallel(n_jobs=workers)(
        delayed(annotate)(idx, row, cat_ids)
        for idx, row in tqdm(df.iterrows(), total = len(df))
    )

    return {'categories': cats, 'images': images, 'annotations': annotations}

In [None]:
# df.id.unique()

In [None]:
from sklearn.model_selection import train_test_split
df_train,df_val = train_test_split(df.id.unique(),test_size=.03,random_state=42,shuffle=True)

In [None]:
df_train = df[df.id.isin(df_train)]
df_val = df[df.id.isin(df_val)]

In [None]:
import json,itertools
root_train = coco_structure(df_train)
root_val = coco_structure(df_val)

# Each call above numbers the cell types by their order of first appearance in the
# frame it receives, so the same cell type can get a different category id in the
# two splits. Re-express the validation split with the training ids so that both
# annotation files share one category table.
train_cat_ids = {cat['name']: cat['id'] for cat in root_train['categories']}
unknown_cats = [cat['name'] for cat in root_val['categories'] if cat['name'] not in train_cat_ids]
if unknown_cats:
    raise ValueError(f'cell types present only in the validation split: {unknown_cats}')
val_id_remap = {cat['id']: train_cat_ids[cat['name']] for cat in root_val['categories']}
for ann in root_val['annotations']:
    ann['category_id'] = val_id_remap[ann['category_id']]
root_val['categories'] = list(root_train['categories'])

In [None]:


# Write each split's COCO dict to its own pretty-printed JSON file.
for out_name, coco_root in (
    ('annotations_train.json', root_train),
    ('annotations_val.json', root_val),
):
    with open(out_name, 'w', encoding='utf-8') as f:
        json.dump(coco_root, f, ensure_ascii=True, indent=4)



In [None]:


# from pycocotools.coco import COCO
# import matplotlib.pyplot as plt
# from pathlib import Path
# from PIL import Image



In [None]:
# dataDir=Path('../input/image-filters/data')
# annFile = Path('./annotations_train.json')
# coco = COCO(annFile)
# imgIds = coco.getImgIds()