### Reference Notebooks
- [Positive score with Detectron 1/3 - input data](https://www.kaggle.com/code/slawekbiel/positive-score-with-detectron-1-3-input-data)
- [Efficient COCO Dataset Generator](https://www.kaggle.com/code/coldfir3/efficient-coco-dataset-generator)
- [UW-Madison GI Tract Image Segmentation - EDA](https://www.kaggle.com/code/gunesevitan/uw-madison-gi-tract-image-segmentation-eda)

In [None]:
!pip install pycocotools

In [None]:
import os
import json
from pathlib import Path
from PIL import Image

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupKFold

from glob import glob
from tqdm.notebook import tqdm

from joblib import Parallel, delayed

from pycocotools.coco import COCO

In [None]:
# Load the competition's training table into the working dataframe and preview it.
train_csv_path = '../input/uw-madison-gi-tract-image-segmentation/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Split each id on '_' into four pieces; piece 2 is dropped and the remaining
# pieces become the 'case', 'day' and 'slice_number' columns appended to df.
id_parts = df['id'].str.split('_', expand=True)
id_parts = id_parts.drop(columns=[2])
id_parts = id_parts.rename(columns={0: 'case', 1: 'day', 3: 'slice_number'})

df = pd.concat((df, id_parts), axis=1)
df.head()

In [None]:
slice_filenames = glob('../input/uw-madison-gi-tract-image-segmentation/train/*/*/scans/*.png')

# Columns that identify a slice, and the metadata columns parsed from its file path.
slice_key_columns = ['case', 'day', 'slice_number']
slice_meta_columns = [
    'slice_height',
    'slice_width',
    'slice_vertical_pixel_spacing',
    'slice_horizontal_pixel_spacing',
    'filename',
]

# Parse every scan path exactly once. The previous version built three boolean
# masks over the whole dataframe for every file (files x rows work); collecting
# the records and merging them in one go gives the same columns far faster.
slice_records = []
for filename in tqdm(slice_filenames):
    # Path layout: .../train/<case>/<case>_<day>/scans/<slice file>.png
    # Indexing from the end keeps this independent of the depth of the input root.
    path_parts = filename.split('/')
    case, day = path_parts[-3].split('_')
    name_fields = path_parts[-1].split('_')

    # NOTE(review): fields 2 and 3 are stored as height and width respectively,
    # exactly as before. The competition's data description reportedly lists the
    # order as width then height - confirm against a non-square slice.
    slice_records.append({
        'case': case,
        'day': day,
        'slice_number': name_fields[1],
        'slice_height': int(name_fields[2]),
        'slice_width': int(name_fields[3]),
        'slice_vertical_pixel_spacing': float(name_fields[4]),
        'slice_horizontal_pixel_spacing': float(name_fields[5].replace('.png', '')),
        # Path relative to the train directory: <case>/<case>_<day>/scans/<slice file>.png
        'filename': os.path.join(*path_parts[-4:]),
    })

slice_meta = pd.DataFrame(slice_records, columns=slice_key_columns + slice_meta_columns)
# If two files ever map to the same slice, keep the last one (matches the old overwrite order).
slice_meta = slice_meta.drop_duplicates(subset=slice_key_columns, keep='last')

# Drop any metadata columns left from a previous run of this cell so that
# re-running it stays idempotent, then attach the metadata; a left merge keeps
# every annotation row and its original order.
df = df.drop(columns=slice_meta_columns, errors='ignore')
df = df.merge(slice_meta, on=slice_key_columns, how='left')

In [None]:
# Store the slice dimensions as integers.
for size_column in ('slice_width', 'slice_height'):
    df[size_column] = df[size_column].astype(int)

In [None]:
def decode_rle_mask(rle_mask, height, width, fill=1):
    """
    Turn a run-length encoded mask string into a 2d array

    The string is a whitespace separated sequence of "start length" pairs, where
    starts are 1-based positions in the flattened (row by row) mask.

    Parameters
    ----------
    rle_mask (str): Run-length encoded segmentation mask string
    height (int): Height of the mask
    width (int): Width of the mask
    fill (int): Value written to the pixels covered by a run

    Returns
    -------
    mask [numpy.ndarray of shape (height, width)]: Decoded 2d segmentation mask (uint8)
    """

    tokens = rle_mask.split()

    # Even positions hold the 1-based starts, odd positions the run lengths
    run_starts = np.asarray(tokens[::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(height * width, dtype=np.uint8)
    for begin, stop in zip(run_starts, run_ends):
        flat_mask[begin:stop] = fill

    return flat_mask.reshape(height, width)

In [None]:
def annotate(idx, row, cat_ids):
    """
    Build one COCO annotation dictionary from a dataframe row

    The row's segmentation string is decoded to a binary mask and re-encoded as a
    COCO uncompressed RLE: run lengths over the column-major flattened mask,
    alternating background/foreground and always starting with background, with
    'size' given as [height, width]. The raw competition string is a row-major
    list of start/length pairs and cannot be used as COCO 'counts' directly.

    Parameters
    ----------
    idx (int): Identifier stored as the annotation 'id'
    row (pandas.Series or dict-like): Must provide 'id', 'class', 'segmentation', 'slice_height' and 'slice_width'
    cat_ids (dict): Mapping from class name to category id

    Returns
    -------
    annotation (dict): Dictionary with 'segmentation', 'bbox', 'area', 'image_id', 'category_id', 'iscrowd' and 'id'
    """

    height = int(row['slice_height'])
    width = int(row['slice_width'])
    mask = decode_rle_mask(row['segmentation'], height, width)

    # COCO RLE runs over the mask flattened column by column (Fortran order)
    column_major = mask.flatten(order='F') != 0
    run_edges = np.flatnonzero(column_major[1:] != column_major[:-1]) + 1
    run_bounds = np.concatenate(([0], run_edges, [column_major.size]))
    counts = np.diff(run_bounds).tolist()
    # The first count is always a background run, so prepend 0 when the mask starts with foreground
    if column_major.size and column_major[0]:
        counts.insert(0, 0)

    c_rle = {'counts': counts, 'size': [height, width]}

    area = int(np.count_nonzero(mask))
    ys, xs = np.nonzero(mask)
    if xs.size:
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        # COCO bounding boxes are [x, y, width, height]
        bbox = [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)]
    else:
        # An empty mask has no extent; avoid min()/max() failing on empty arrays
        bbox = [0, 0, 0, 0]

    annotation = {
        'segmentation': c_rle,
        'bbox': bbox,
        'area': area,
        'image_id': row['id'],
        'category_id': cat_ids[row['class']],
        'iscrowd': 1,
        'id': idx
    }
    return annotation
    
def coco_structure(df, workers = 4):
    """
    Assemble a COCO-style dictionary from an annotation dataframe

    Parameters
    ----------
    df (pandas.DataFrame): Rows with 'id', 'class', 'slice_width', 'slice_height' and 'filename' columns, plus whatever `annotate` reads from a row
    workers (int): Number of joblib workers used to build the annotations

    Returns
    -------
    coco (dict): Dictionary with 'categories', 'images' and 'annotations' lists
    """

    # Categories are numbered from 1 in order of first appearance in the dataframe
    category_ids = {}
    for position, class_name in enumerate(df['class'].unique()):
        category_ids[class_name] = position + 1
    categories = [
        {'name': class_name, 'id': category_id}
        for class_name, category_id in category_ids.items()
    ]

    # One image entry per unique id, described by the first row of that id
    first_rows = df.groupby('id').agg('first')
    images = []
    for image_id, image_row in first_rows.iterrows():
        images.append({
            'id': image_id,
            'width': image_row.slice_width,
            'height': image_row.slice_height,
            'file_name': f'train/{image_row.filename}',
        })

    # One annotation per dataframe row, built in parallel; the row index is passed on as idx
    jobs = (
        delayed(annotate)(row_index, row, category_ids)
        for row_index, row in tqdm(df.iterrows(), total = len(df))
    )
    annotations = Parallel(n_jobs=workers)(jobs)

    return {'categories': categories, 'images': images, 'annotations': annotations}

In [None]:
# Keep only rows that carry a segmentation and renumber them 0..n-1, so the
# positional indices produced by the splitter are valid labels for df.loc.
df = df[df.segmentation.notna()].reset_index(drop=True)
df['fold'] = -1

# Grouping on 'id' keeps all rows that share an id inside the same fold.
gkf = GroupKFold(n_splits=5)
fold_splits = gkf.split(X=df, y=df['class'], groups=df['id'])
for fold, (_, val_idx) in enumerate(fold_splits):
    df.loc[val_idx, 'fold'] = fold

In [None]:
FOLD = 0

# Rows tagged with FOLD form the validation split; every other row is used for training.
train_df, valid_df = df.query("fold!=@FOLD"), df.query("fold==@FOLD")

# Build one COCO-style dictionary per split.
train_ann, valid_ann = coco_structure(train_df), coco_structure(valid_df)

In [None]:
# Display the first validation annotation as a quick check of the generated structure.
valid_ann['annotations'][0]

In [None]:
# Code taken from: https://stackoverflow.com/a/65151218/12890869
def np_encoder(object):
    """
    Fallback serializer for `json.dump(..., default=np_encoder)` that converts NumPy values

    Parameters
    ----------
    object: Value the JSON encoder could not serialize on its own

    Returns
    -------
    Native Python value: `.item()` of a NumPy scalar or `.tolist()` of a NumPy array

    Raises
    ------
    TypeError: If `object` is neither a NumPy scalar nor a NumPy array
    """

    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    # Falling through would return None, which json would silently write as null
    raise TypeError(f'Object of type {type(object).__name__} is not JSON serializable')

In [None]:
# Write each split's annotation dictionary to its own JSON file for the selected fold.
annotation_files = {
    f'train_annotations_{FOLD}.json': train_ann,
    f'valid_annotations_{FOLD}.json': valid_ann,
}

for annotation_path, annotation_data in annotation_files.items():
    with open(annotation_path, 'w', encoding='utf-8') as f:
        # np_encoder converts the NumPy values that json cannot serialize natively
        json.dump(annotation_data, f, ensure_ascii=True, indent=4, default=np_encoder)