In [None]:
!pip install -Uqqq pycocotools

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image, ImageEnhance
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold

import glob
import sys
import cv2
import imageio
import joblib
import math
import warnings
import os

# Register tqdm's progress-bar integration with pandas (the `progress_*` methods).
# NOTE(review): no `progress_*` call appears in the cells visible here.
tqdm.pandas()

In [None]:
# Image size constants; not referenced by the cells visible here
# (presumably the fixed pixel size of the competition images -- TODO confirm).
HEIGHT, WIDTH = 520, 704

# Annotation table of the competition, read from the Kaggle input directory.
train = pd.read_csv(
    '/kaggle/input/sartorius-cell-instance-segmentation/train.csv'
)

In [None]:
# Add image file path
def get_file_path(image_id):
    """Return the path of the training PNG that belongs to `image_id`.

    The path is only formatted from the id; the file's existence is not checked.
    """
    file_path = f'/kaggle/input/sartorius-cell-instance-segmentation/train/{image_id}.png'
    return file_path

# Element-wise: every row gets the image path derived from its `id`.
train['file_path'] = train['id'].map(get_file_path)

In [None]:
# One (height, width) tuple per row.
# The column must be read as train['shape']: attribute access `train.shape`
# still returns the DataFrame's own (n_rows, n_cols).
train['shape'] = list(zip(train['height'], train['width']))

In [None]:
# Preview the first rows, including the derived 'file_path' and 'shape' columns.
display(train.head())

In [None]:
# Share of each cell type, counted over the rows of `train`.
fig, ax = plt.subplots(figsize=(8, 8))
cell_type_counts = train['cell_type'].value_counts()
cell_type_counts.plot(kind='pie', autopct='%1.1f%%', title='Cell Type Distribution', ax=ax)
plt.show()

## Kfold

The code below is copied from [this discussion](https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/285546) by [Gunes Evitan](https://www.kaggle.com/gunesevitan).

In [None]:
# Collapse the table to one row per image id, keeping `id` as a regular column.
# `first()` takes, per column, the first non-null value within each id group.
df_images = train.groupby('id', as_index=False).first()

In [None]:
# Display the per-image table (one row per `id`).
df_images

In [None]:
# Stratify on cell type; folds are numbered 1..5 and stored as uint8.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_of_image = np.zeros(len(df_images), dtype=np.uint8)
for fold, (_, val_idx) in enumerate(skf.split(X=df_images, y=df_images['cell_type']), start=1):
    # `val_idx` holds row positions; df_images carries a default 0..n-1 index,
    # so positions and index labels coincide.
    fold_of_image[val_idx] = fold
df_images['fold'] = fold_of_image

In [None]:
# Output directory for the fold assignment CSV.
DATA_PATH = './fold'
# `exist_ok=True` keeps this cell re-runnable: a bare `mkdir` reports an error
# as soon as the directory already exists (e.g. on a second run).
os.makedirs(DATA_PATH, exist_ok=True)

In [None]:
# Persist only the image id -> fold mapping, without the DataFrame index.
df_images.to_csv(f'{DATA_PATH}/train_folds.csv', columns=['id', 'fold'], index=False)

In [None]:
# Reload the annotation table and attach the fold number of its image to every row.
# A left join keeps all annotation rows; `fold` is the only column added.
df_train_folds = pd.read_csv(f'{DATA_PATH}/train_folds.csv')
df_train = pd.merge(
    pd.read_csv('../input/sartorius-cell-instance-segmentation/train.csv'),
    df_train_folds,
    how='left',
    on='id',
)

In [None]:
# Number of images assigned to each fold (fold numbers run from 1 to 5).
for fold_no in range(1, 6):
    in_fold = df_images['fold'] == fold_no
    print(len(df_images[in_fold]))

In [None]:
# For fold k: validation = annotation rows of fold k, training = rows of all other folds.
# List position k-1 corresponds to fold k (k = 1..5).
fold_numbers = range(1, 6)
train_folds = [df_train[df_train['fold'] != k] for k in fold_numbers]
val_folds = [df_train[df_train['fold'] == k] for k in fold_numbers]

In [None]:
# Sanity check: number of training splits that were built.
len(train_folds)

## Coco Generator

The code below is copied from https://www.kaggle.com/coldfir3/efficient-coco-dataset-generator by [Adriano Passos](https://www.kaggle.com/coldfir3).

In [None]:
## Based on: https://www.kaggle.com/eigrad/convert-rle-to-bounding-box-x0-y0-x1-y1
def rle2mask(rle, img_w, img_h):
    """Decode a run-length-encoded annotation into a binary mask.

    Parameters
    ----------
    rle : str
        Whitespace-separated "start length start length ..." pairs. Starts are
        1-based positions into the image flattened row by row. An empty string
        yields an all-zero mask.
    img_w, img_h : int
        Image width and height in pixels.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape (img_h, img_w) holding 0/1, stored in Fortran
        (column-major) order so that pycocotools can encode it directly.

    Raises
    ------
    ValueError
        If the string does not contain an even number of values.
    IndexError
        If a run starts before the first pixel or ends after the last one.
    """
    tokens = rle.split()
    if len(tokens) % 2:
        raise ValueError(
            f'RLE must contain start/length pairs, got {len(tokens)} values'
        )

    flat_mask = np.zeros(img_w * img_h, dtype=np.uint8)

    if tokens:
        values = np.array([int(token) for token in tokens], dtype=np.int64)
        starts = values[0::2] - 1  # 1-based -> 0-based
        lengths = values[1::2]
        ends = starts + lengths

        # Slice assignment silently clips, so reject out-of-range runs explicitly.
        if starts.min() < 0 or ends.max() > flat_mask.size:
            raise IndexError(
                f'RLE run outside of image with {flat_mask.size} pixels'
            )

        for start, end in zip(starts, ends):
            flat_mask[start:end] = 1

    # Fortran order is required by pycocotools' mask encoder.
    return np.asfortranarray(flat_mask.reshape((img_h, img_w)))

In [None]:
from tqdm.notebook import tqdm
from pycocotools import mask as maskUtils
from joblib import Parallel, delayed

def annotate(idx, row, cat_ids):
    """Build one COCO annotation record from a single annotation row.

    Parameters
    ----------
    idx
        Value stored as the annotation's 'id'.
    row
        Mapping-like row providing 'annotation' (RLE string), 'width',
        'height', 'id' (image id) and 'cell_type'.
    cat_ids : dict
        Maps a cell-type name to its COCO category id.

    Returns
    -------
    dict
        Annotation with a compressed-RLE segmentation, bbox, area, image id,
        category id, iscrowd flag and annotation id.
    """
    binary_mask = rle2mask(row['annotation'], row['width'], row['height'])

    # Re-encode in COCO's RLE format; 'counts' comes back as bytes and is
    # decoded to text before it is stored in the record.
    coco_rle = maskUtils.encode(binary_mask)
    coco_rle['counts'] = coco_rle['counts'].decode('utf-8')

    record = dict(
        segmentation=coco_rle,
        bbox=maskUtils.toBbox(coco_rle).astype(int).tolist(),
        area=maskUtils.area(coco_rle).item(),
        image_id=row['id'],
        category_id=cat_ids[row['cell_type']],
        iscrowd=0,
        id=idx,
    )
    return record
    
def coco_structure(df, workers = 4, cat_ids = None):
    """Convert an annotation DataFrame into a COCO-style dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per annotation with the columns 'id' (image id), 'width',
        'height', 'cell_type' and 'annotation' (RLE string).
    workers : int
        Number of joblib workers used to build the annotations.
    cat_ids : dict, optional
        Mapping cell-type name -> category id. When omitted, ids 1..K are
        assigned in alphabetical order of the cell-type names found in `df`.

    Returns
    -------
    dict
        {'categories': [...], 'images': [...], 'annotations': [...]}.
    """
    if cat_ids is None:
        # Sorting makes the mapping depend only on the *set* of cell types,
        # not on the order in which they appear in `df`. Without it, two
        # splits of the same data (e.g. train and validation of one fold)
        # can assign different ids to the same cell type.
        cell_types = sorted(df.cell_type.unique())
        cat_ids = {name: number for number, name in enumerate(cell_types, start=1)}

    cats = [{'name': name, 'id': number} for name, number in cat_ids.items()]

    images = [
        {
            'id': image_id,
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in df.groupby('id').agg('first').iterrows()
    ]

    # Annotation ids are numbered from 1. The DataFrame index is not used
    # because it may contain 0, and pycocotools' evaluator stores matched
    # ground-truth ids in an array where 0 means "no match".
    rows = tqdm(df.iterrows(), total = len(df))
    annotations = Parallel(n_jobs=workers)(
        delayed(annotate)(ann_id, row, cat_ids)
        for ann_id, (_, row) in enumerate(rows, start=1)
    )

    return {'categories':cats, 'images':images, 'annotations':annotations}

## To COCO json

In [None]:
import json,itertools

# One COCO dictionary per split; list position k-1 corresponds to fold k.
train_fold_json = list(map(coco_structure, train_folds))
val_fold_json = list(map(coco_structure, val_folds))

In [None]:
# Output directory for the per-fold COCO annotation files.
coco_json = './fold_json'
# `exist_ok=True` keeps this cell re-runnable: a bare `mkdir` reports an error
# as soon as the directory already exists (e.g. on a second run).
os.makedirs(coco_json, exist_ok=True)

In [None]:
# Write fold_<k>_train.json and fold_<k>_val.json for every fold k = 1, 2, ...
for fold_no, (train_fold, val_fold) in enumerate(zip(train_fold_json, val_fold_json), start=1):
    outputs = (
        (f'{coco_json}/fold_{fold_no}_train.json', train_fold),
        (f'{coco_json}/fold_{fold_no}_val.json', val_fold),
    )
    for json_path, payload in outputs:
        with open(json_path, 'w+', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=True, indent=4)

In [None]:
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

In [None]:
# Load the fold-1 training annotations back through pycocotools as a sanity
# check of the exported JSON.
# `dataDir` is the directory the plotting cell joins each image's 'file_name' onto.
dataDir=Path('../input/sartorius-cell-instance-segmentation')
annFile = Path(f'{coco_json}/fold_1_train.json')
coco = COCO(annFile)
# Ids of all images contained in the loaded annotation file.
imgIds = coco.getImgIds()

In [None]:
# List the files in the COCO output directory.
!ls ./fold_json

In [None]:
# Visual check on the last (up to) three images of the loaded file:
# left column = raw image, right column = same image with masks and boxes.
imgs = coco.loadImgs(imgIds[-3:])
# squeeze=False keeps `axs` two-dimensional, so iterating over rows and
# indexing ax[0] / ax[1] also works when only a single image is returned.
_, axs = plt.subplots(len(imgs), 2, figsize=(40, 15 * len(imgs)), squeeze=False)
for img, ax in zip(imgs, axs):
    annIds = coco.getAnnIds(imgIds=[img['id']])
    anns = coco.loadAnns(annIds)
    # 'file_name' is relative to the competition input directory.
    with Image.open(dataDir/img['file_name']) as picture:
        ax[0].imshow(picture)
        ax[1].imshow(picture)
    # showAnns draws on the current axes, so select the right-hand panel first.
    plt.sca(ax[1])
    coco.showAnns(anns, draw_bbox=True)