# This notebook creates a dataset in COCO format and visualizes it with both the COCO and Detectron2 APIs

Reference:
https://www.kaggle.com/code/ammarnassanalhajali/k-fold-crossvalidation-coco-dataset-generator/notebook

In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the COCO API (pycocotools) from PyPI and Detectron2 from the head of its GitHub repository.
# NOTE(review): neither install is version-pinned, so the environment can differ between runs — consider pinning.
! pip install pycocotools
! pip install 'git+https://github.com/facebookresearch/detectron2.git'

In [None]:
import json
import numpy as np
import pycocotools.mask as mask_util
from skimage import measure
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
import cv2
import random
from itertools import groupby
import itertools
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [None]:
def rle_decode(mask_rle, shape):
    """Decode a run-length string into a binary mask.

    Args:
        mask_rle: whitespace-separated string of alternating
            ``start length`` tokens; starts are 1-based offsets into the
            flattened mask.
        shape: ``(height, width)`` of the mask to return.

    Returns:
        ``np.uint8`` array of the given shape with 1 inside the runs and
        0 elsewhere.
    """
    tokens = mask_rle.split()
    # Even tokens are run starts (converted to 0-based), odd tokens are run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1

    # The flat buffer is folded back row by row (NumPy's default C order).
    return flat_mask.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    """Encode a 2-D mask as an uncompressed COCO run-length dictionary.

    The mask is traversed in column-major (Fortran) order and the lengths
    of the alternating background/foreground runs are recorded. The
    first count always refers to background, so a mask whose first pixel
    is foreground starts with a zero-length run.

    Any non-zero pixel is treated as foreground, so 0/1, boolean and
    0/255 masks all encode identically.

    Args:
        binary_mask: 2-D array-like mask.

    Returns:
        ``{'counts': [int, ...], 'size': [rows, cols]}`` with plain Python
        integers, ready for JSON serialisation.
    """
    mask = np.asarray(binary_mask)
    size = list(mask.shape)
    flat = mask.ravel(order='F') != 0

    if flat.size == 0:
        return {'counts': [], 'size': size}

    # Positions where the value flips mark the start of a new run; the
    # differences between consecutive run boundaries are the run lengths.
    flips = np.flatnonzero(flat[1:] != flat[:-1]) + 1
    boundaries = np.concatenate(([0], flips, [flat.size]))
    counts = np.diff(boundaries).tolist()

    # Counts must begin with a background run, even an empty one.
    if flat[0]:
        counts.insert(0, 0)

    return {'counts': counts, 'size': size}

def create_coco_format_json(data_frame, classes, filepaths):
    """Build a COCO-style dataset dictionary from RLE annotations.

    For every image path whose derived id occurs in ``data_frame``, one
    image record is emitted, and each annotated mask is split into its
    connected regions (one COCO annotation per outer contour).

    Path conventions relied on by the parsing below:
      * the image id is ``<third-from-last directory>_<file name>`` with
        the last four ``_``-separated fields removed;
      * the file name's ``_``-separated field 2 is the width and field 3
        is the height;
      * ``file_name`` is the path with its first four ``/``-separated
        components stripped (NOTE(review): this matches a dataset root
        that is three directories deep, e.g. ``/kaggle/input/<name>`` —
        confirm if the data lives elsewhere).

    Args:
        data_frame: DataFrame with ``id``, ``class`` and ``segmentation``
            (RLE string) columns. Any index is accepted.
        classes: list of class names; the position in the list is used
            as the COCO ``category_id``.
        filepaths: iterable of ``/``-separated image paths.

    Returns:
        dict with ``categories``, ``images`` and ``annotations`` lists.
        Each annotation's mask is the filled outer contour of one
        region, so holes inside a region are filled.

    Raises:
        ValueError: if a row's class is not in ``classes`` or a file
            name's size fields are not integers.
    """
    categories = [
        {
            "id": idx,
            "name": class_
        }
        for idx, class_ in enumerate(classes)
    ]

    # Index the annotation rows by image id once, instead of scanning the
    # whole frame for every file. Iterating the columns directly is purely
    # positional, so the result does not depend on the frame's index.
    rows_by_id = {}
    for image_id, class_name, rle in zip(
        data_frame['id'], data_frame['class'], data_frame['segmentation']
    ):
        rows_by_id.setdefault(image_id, []).append((class_name, rle))

    images = []
    annotations = []
    count = 0  # running annotation id, unique across the whole dataset

    for filepath in tqdm(filepaths):
        path_parts = filepath.split("/")
        slice_name = path_parts[-1]
        name_fields = slice_name.split("_")

        file_id = '_'.join((path_parts[-3] + "_" + slice_name).split("_")[:-4])
        height_slice = int(name_fields[3])
        width_slice = int(name_fields[2])

        rows = rows_by_id.get(file_id)
        if not rows:
            # Only images that have annotations are listed.
            continue

        images.append(
            {
                "id": file_id,
                "width": width_slice,
                "height": height_slice,
                "file_name": '/'.join(path_parts[4:])
            }
        )

        for class_name, rle in rows:
            category_id = classes.index(class_name)
            mk = rle_decode(rle, (height_slice, width_slice))
            contours, hierarchy = cv2.findContours(mk, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
            if hierarchy is None:
                # Empty mask: nothing to annotate.
                continue

            # Each hierarchy row is [next, previous, first_child, parent].
            for contour, (_, _, _, parent) in zip(contours, hierarchy[0]):
                # RETR_CCOMP also reports hole boundaries (contours that
                # have a parent). Filling those would label background
                # as organ, so only outer boundaries become annotations.
                if parent != -1:
                    continue

                region = np.zeros(mk.shape, np.uint8)
                cv2.drawContours(region, [contour], -1, 1, thickness=cv2.FILLED)

                ys, xs = np.nonzero(region)
                x1, x2 = xs.min(), xs.max()
                y1, y2 = ys.min(), ys.max()

                annotations.append(
                    {
                        'segmentation': binary_mask_to_rle(region),
                        # COCO bbox is [x, y, width, height]
                        'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
                        'area': int(np.sum(region)),
                        'image_id': file_id,
                        'category_id': category_id,
                        'iscrowd': 0,
                        'id': count
                    }
                )
                count += 1

    # creating the dataset
    dataset_coco_format = {
        "categories": categories,
        "images": images,
        "annotations": annotations,
    }

    return dataset_coco_format

In [None]:
# Setting the paths
dataset_path = os.path.abspath("/kaggle/input/uw-madison-gi-tract-image-segmentation/")
output_path = os.path.abspath("/kaggle/working/")
csv_path = os.path.abspath("/kaggle/input/uw-madison-gi-tract-image-segmentation/train.csv")

# Fixed seed so the train/test split is the same on every run
SPLIT_SEED = 42

# creating a dataframe
df = pd.read_csv(csv_path)
# Removing the rows which do not have any segmentation
df_with_mask = df[df['segmentation'].notnull()].reset_index(drop=True)

# Creation of train test split.
# The CSV has one row per (image id, class), so splitting rows directly can
# put the same image into both sets, each with only part of its annotations.
# Split the unique image ids instead and keep all rows of an image together.
image_ids = df_with_mask['id'].unique()
train_ids, test_ids = train_test_split(image_ids, test_size=0.2, random_state=SPLIT_SEED)
train_df = df_with_mask[df_with_mask['id'].isin(train_ids)].reset_index(drop=True)
test_df = df_with_mask[df_with_mask['id'].isin(test_ids)].reset_index(drop=True)

print(train_df.head())
# Report distinct images (not annotation rows) so the labels below are accurate
print("\n \nNumber of Train Images:{}".format(train_df['id'].nunique()))
print("Number of Test Images:{}".format(test_df['id'].nunique()))

In [None]:
classes = ['small_bowel', 'large_bowel', 'stomach']

# Collect every PNG under the dataset root. os.walk yields directories and
# files in filesystem-dependent order, so the paths are sorted to keep the
# image order and the running annotation ids identical between runs.
filepaths = sorted(
    os.path.join(dirpath, file)
    for (dirpath, dirnames, filenames) in os.walk(dataset_path)
    for file in filenames
    if file.endswith(".png")
)

train_json = create_coco_format_json(train_df, classes, filepaths)
test_json = create_coco_format_json(test_df, classes, filepaths)

# Saving the train and test json

In [None]:
# Code taken from: https://stackoverflow.com/a/65151218/12890869
def np_encoder(object):
    """``default`` hook for ``json.dump`` that converts NumPy values.

    Args:
        object: the value the JSON encoder could not serialise itself.

    Returns:
        The equivalent built-in Python value: a scalar for NumPy scalars,
        a (nested) list for NumPy arrays.

    Raises:
        TypeError: for any other type. Returning ``None`` here would make
            the encoder silently write ``null`` instead of reporting the
            unserialisable value.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        "Object of type {} is not JSON serializable".format(type(object).__name__)
    )
    
# Write both splits under output_path, which is where the Detectron2
# registration looks for them, rather than relying on the current
# working directory happening to be the same folder.
for split_file, split_json in (("train_json.json", train_json), ("test_json.json", test_json)):
    with open(os.path.join(output_path, split_file), 'w', encoding='utf-8') as f:
        json.dump(split_json, f, ensure_ascii=True, indent=4, default=np_encoder)

# Visualization with Detectron2

In [None]:
from pathlib import Path
from detectron2.data.datasets import register_coco_instances
from detectron2.data import DatasetCatalog, MetadataCatalog
import matplotlib.pyplot as plt
from detectron2.utils.visualizer import Visualizer
from detectron2.utils.visualizer import ColorMode

Data_Resister_training = "train"
Data_Resister_testing = "test"

# Make the cell safe to re-run: drop any earlier registration first.
# The metadata entry is cleared together with the dataset entry, because
# Detectron2 refuses to overwrite an existing metadata attribute with a
# different value (e.g. after the class list or JSON path changed).
for registered_name in (Data_Resister_training, Data_Resister_testing):
    if registered_name in DatasetCatalog.list():
        DatasetCatalog.remove(registered_name)
    if registered_name in MetadataCatalog.list():
        MetadataCatalog.remove(registered_name)

# Image paths inside the JSON files are relative to dataset_path
register_coco_instances(
    Data_Resister_training,
    {},
    os.path.join(output_path, "train_json.json"),
    dataset_path)

register_coco_instances(
    Data_Resister_testing,
    {},
    os.path.join(output_path, "test_json.json"),
    dataset_path)

# Keep one handle per split; `metadata` refers to the training split,
# which is the split the visualisation uses it with.
metadata_train = MetadataCatalog.get(Data_Resister_training)
metadata_test = MetadataCatalog.get(Data_Resister_testing)
metadata = metadata_train
dataset_train = DatasetCatalog.get(Data_Resister_training)
dataset_test = DatasetCatalog.get(Data_Resister_testing)

In [None]:
# Draw four randomly chosen training slices with their annotations in a 2x2 grid
plt.figure(figsize=(15,15))
sampled_indices = random.sample(range(0, len(dataset_train)), 4)
for panel, record_idx in enumerate(sampled_indices, start=1):
    record = dataset_train[record_idx]

    # Read at the file's native bit depth, rescale by the maximum
    # intensity, and replicate the single channel three times for display.
    scan = cv2.imread(record["file_name"], cv2.IMREAD_ANYDEPTH)
    scan = scan / scan.max()
    scan_rgb = np.repeat(scan[..., np.newaxis], 3, -1) * 255.0

    visualizer = Visualizer(
        scan_rgb,
        metadata=metadata,
        scale=2,
        instance_mode=ColorMode.IMAGE_BW,
    )
    drawn = visualizer.draw_dataset_dict(record)

    plt.subplot(2, 2, panel)
    plt.imshow(drawn.get_image())
    plt.axis('off')
plt.show()