In [None]:
!pip install pycocotools
!pip install --upgrade scikit-learn

In [None]:
from tqdm.notebook import tqdm
from pycocotools import mask as maskUtils
from joblib import Parallel, delayed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json,itertools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,GroupKFold,StratifiedGroupKFold

In [None]:
# Full annotation table: one row per RLE-encoded annotation.
df = pd.read_csv('/kaggle/input/sartorius-cell-instance-segmentation/train.csv')

# Single 5-fold splitter reused by every fold-export cell.
gkf = StratifiedGroupKFold(n_splits=5)

# Independent per-cell-type copies of the table, each re-indexed from 0.
df_shsy5y, df_astro, df_cort = (
    df[df['cell_type'] == cell_type].copy().reset_index(drop=True)
    for cell_type in ('shsy5y', 'astro', 'cort')
)

In [None]:
## Based on: https://www.kaggle.com/eigrad/convert-rle-to-bounding-box-x0-y0-x1-y1
def rle2mask(rle, img_w, img_h):
    """Decode a run-length string into a binary mask.

    Args:
        rle: Whitespace-separated ``start length start length ...`` pairs.
            Starts are 1-based positions into the image flattened row by row.
            An empty (or all-whitespace) string yields an all-zero mask.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        ``np.uint8`` array of shape ``(img_h, img_w)`` in Fortran order, with
        1 on every encoded pixel and 0 elsewhere.

    Raises:
        ValueError: If a token is not an integer or the token count is odd.
        IndexError: If any run reaches outside the ``img_w * img_h`` image.
    """
    n_pixels = img_w * img_h
    flat_mask = np.zeros(n_pixels, dtype=np.uint8)

    tokens = rle.split()
    if tokens:
        ## transforming the string into an array of shape (N, 2): one (start, length) row per run
        runs = np.fromiter((int(tok) for tok in tokens), dtype=np.int64, count=len(tokens))
        runs = runs.reshape((-1, 2))
        starts = runs[:, 0] - 1  # 1-based -> 0-based
        lengths = runs[:, 1]

        ## decompressing the rle encoding (ie, turning [3, 1, 10, 2] into [2, 9, 10])
        pixel_idx = np.concatenate([np.arange(s, s + l) for s, l in zip(starts, lengths)])

        ## Signed indices would silently wrap around, so reject anything outside the image
        if pixel_idx.size and (pixel_idx.min() < 0 or pixel_idx.max() >= n_pixels):
            raise IndexError(
                f'RLE run outside of a {img_w}x{img_h} image '
                f'(pixel range {pixel_idx.min()}..{pixel_idx.max()})')

        flat_mask[pixel_idx] = 1

    ## Fortran order is important so pycocotools can handle this object
    return np.asfortranarray(flat_mask.reshape((img_h, img_w)))

def annotate(idx, row, cat_ids):
    """Build the COCO annotation record for one row of the annotation table.

    Args:
        idx: Value stored as the annotation's ``id``.
        row: Mapping-like row providing ``annotation`` (RLE string), ``width``,
            ``height``, ``id`` (used as ``image_id``) and ``cell_type``.
        cat_ids: Mapping from cell-type name to category id.

    Returns:
        dict with the keys ``segmentation``, ``bbox``, ``area``, ``image_id``,
        ``category_id``, ``iscrowd`` and ``id``.
    """
    # Decode the competition RLE to a binary mask, then re-encode it in COCO's own RLE format.
    binary_mask = rle2mask(row['annotation'], row['width'], row['height'])
    coco_rle = maskUtils.encode(binary_mask)
    # 'counts' comes back as bytes; store it as text so the record can be written to JSON.
    coco_rle['counts'] = coco_rle['counts'].decode('utf-8')

    return {
        'segmentation': coco_rle,
        'bbox': maskUtils.toBbox(coco_rle).astype(int).tolist(),
        'area': maskUtils.area(coco_rle).item(),
        'image_id': row['id'],
        'category_id': cat_ids[row['cell_type']],
        'iscrowd': 0,
        'id': idx,
    }

def coco_structure(df, workers=4, categories=None):
    """Convert an annotation table into a COCO-style dictionary.

    Args:
        df: DataFrame with one row per annotation and at least the columns
            ``id``, ``width``, ``height``, ``cell_type`` and ``annotation``.
        workers: Number of joblib workers used to build the annotations.
        categories: Optional ordered list of category names. Pass the same
            list for every split so the name -> id mapping is guaranteed to
            be identical. Defaults to the sorted cell types present in ``df``.

    Returns:
        dict with the keys ``categories``, ``images`` and ``annotations``.
    """
    ## Building the header
    if categories is None:
        # Sorted, so ids do not depend on which cell type appears first in df
        # (train and val splits would otherwise disagree on the numbering).
        categories = sorted(df.cell_type.unique())
    cat_ids = {name: cat_id for cat_id, name in enumerate(categories, start=1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    # One record per image; int() keeps the sizes JSON-serialisable.
    image_sizes = df.groupby('id')[['width', 'height']].first()
    images = [
        {'id': image_id, 'width': int(width), 'height': int(height), 'file_name': f'train/{image_id}.png'}
        for image_id, width, height in image_sizes.itertuples(name=None)
    ]

    ## Building the annotations
    # Ids are sequential and start at 1 instead of reusing the frame index:
    # pycocotools' COCOeval records matches by annotation id with 0 meaning
    # "unmatched", so an annotation with id 0 is never credited as a match.
    rows = tqdm(df.iterrows(), total=len(df))
    annotations = Parallel(n_jobs=workers)(
        delayed(annotate)(ann_id, row, cat_ids) for ann_id, (_, row) in enumerate(rows, start=1))

    return {'categories': cats, 'images': images, 'annotations': annotations}

In [None]:
'''all 3 classes'''
# Write one train/val COCO annotation file pair per fold for the whole table.
# Folds are stratified on cell type and grouped on image id.
for fold, (train_idx, val_idx) in enumerate(
        gkf.split(df, y=df['cell_type'].to_numpy(), groups=df['id'].to_numpy())):
    # split() yields positional row numbers, hence .iloc rather than label-based .loc
    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)
    train_root = coco_structure(train_df)
    val_root = coco_structure(val_df)
    with open(f'annotations_train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_root, f, ensure_ascii=True, indent=4)
    with open(f'annotations_val_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(val_root, f, ensure_ascii=True, indent=4)

In [None]:
'''only shsy5y'''
# Write one train/val COCO annotation file pair per fold for the shsy5y subset.
# Folds are grouped on image id.
for fold, (train_idx, val_idx) in enumerate(
        gkf.split(df_shsy5y, y=df_shsy5y['cell_type'].to_numpy(), groups=df_shsy5y['id'].to_numpy())):
    # split() yields positional row numbers, hence .iloc rather than label-based .loc
    train_df = df_shsy5y.iloc[train_idx].reset_index(drop=True)
    val_df = df_shsy5y.iloc[val_idx].reset_index(drop=True)
    train_root = coco_structure(train_df)
    val_root = coco_structure(val_df)
    with open(f'annotations_shsy5y_train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_root, f, ensure_ascii=True, indent=4)
    with open(f'annotations_shsy5y_val_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(val_root, f, ensure_ascii=True, indent=4)

In [None]:
'''only astro'''
# Write one train/val COCO annotation file pair per fold for the astro subset.
# Folds are grouped on image id.
for fold, (train_idx, val_idx) in enumerate(
        gkf.split(df_astro, y=df_astro['cell_type'].to_numpy(), groups=df_astro['id'].to_numpy())):
    # split() yields positional row numbers, hence .iloc rather than label-based .loc
    train_df = df_astro.iloc[train_idx].reset_index(drop=True)
    val_df = df_astro.iloc[val_idx].reset_index(drop=True)
    train_root = coco_structure(train_df)
    val_root = coco_structure(val_df)
    with open(f'annotations_astro_train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_root, f, ensure_ascii=True, indent=4)
    with open(f'annotations_astro_val_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(val_root, f, ensure_ascii=True, indent=4)

In [None]:
'''only cort'''
# Write one train/val COCO annotation file pair per fold for the cort subset.
# Folds are grouped on image id.
for fold, (train_idx, val_idx) in enumerate(
        gkf.split(df_cort, y=df_cort['cell_type'].to_numpy(), groups=df_cort['id'].to_numpy())):
    # split() yields positional row numbers, hence .iloc rather than label-based .loc
    train_df = df_cort.iloc[train_idx].reset_index(drop=True)
    val_df = df_cort.iloc[val_idx].reset_index(drop=True)
    train_root = coco_structure(train_df)
    val_root = coco_structure(val_df)
    with open(f'annotations_cort_train_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(train_root, f, ensure_ascii=True, indent=4)
    with open(f'annotations_cort_val_fold{fold}.json', 'w', encoding='utf-8') as f:
        json.dump(val_root, f, ensure_ascii=True, indent=4)