In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from tqdm.notebook import tqdm
from datetime import datetime
import json,itertools
from typing import Optional
from glob import glob
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold


In [None]:
# Location of the per-annotation training table shipped with the competition data.
TRAIN_CSV = "../input/uw-madison-gi-tract-image-segmentation/train.csv"

df = pd.read_csv(TRAIN_CSV)
df.head(4)

In [None]:
# 'class' is a reserved word in Python, so expose the column as 'class_name'.
df.rename(columns={'class': 'class_name'}, inplace=True)

##################################
# 1. Break each id ("case<N>_day<M>_slice_<K>") into its components.
#    The id is split once and the individual tokens are reused.
id_tokens = df["id"].map(lambda sample_id: sample_id.split("_"))
df["case"] = id_tokens.map(lambda tokens: int(tokens[0].replace("case", "")))
df["day"] = id_tokens.map(lambda tokens: int(tokens[1].replace("day", "")))
df["slice"] = id_tokens.map(lambda tokens: tokens[3])

#################################
# 2. Collect every training image on disk.
TRAIN_DIR = "../input/uw-madison-gi-tract-image-segmentation/train"
all_train_images = glob(os.path.join(TRAIN_DIR, "**", "*.png"), recursive=True)
# Strip the last four path components of the first hit to recover the root,
# e.g. ../input/uw-madison-gi-tract-image-segmentation/train
x = all_train_images[0].rsplit("/", 4)[0]

# 3. Path prefix (everything up to and including "slice_<K>") expected for each row.
path_partial_list = [
    os.path.join(x, f"case{case}", f"case{case}_day{day}", "scans", f"slice_{slice_id}")
    for case, day, slice_id in zip(df["case"].values, df["day"].values, df["slice"].values)
]
df["path_partial"] = path_partial_list

#################################
# 4. Same prefix computed from the real file names: dropping the last four
#    "_"-separated fields leaves ".../scans/slice_<K>".
path_partial_list = [str(image_path.rsplit("_", 4)[0]) for image_path in all_train_images]
tmp_df = pd.DataFrame({'path_partial': path_partial_list, 'path': all_train_images})

#################################
# Attach the full image path to every row; the join key is not needed afterwards.
df = df.merge(tmp_df, on="path_partial").drop(columns=["path_partial"])

#################################
# Drop the 4-character extension and split off the last four "_"-separated
# fields of the file name; index 0 holds the remaining prefix.
name_fields = df["path"].map(lambda image_path: image_path[:-4].rsplit("_", 4))

# 5. Get slice dimensions from filepath (int in pixels)
df["width"] = name_fields.map(lambda fields: int(fields[1]))
df["height"] = name_fields.map(lambda fields: int(fields[2]))

# 6. Pixel spacing from filepath (float in mm)
df["spacing_hight"] = name_fields.map(lambda fields: float(fields[3]))
df["spacing_width"] = name_fields.map(lambda fields: float(fields[4]))
#################################
df.head(3)

In [None]:
# The merge scaffolding from the previous cell is no longer needed.
del path_partial_list, tmp_df

# One row per image id. GroupBy.first() keeps, for every column, the first
# non-null value found within the group, and the result is ordered by id.
df_meta = (
    df.groupby('id')
      .first()
      .reset_index()
      .reset_index(drop=True)
)
df_meta.head(3)

In [None]:
def rle_decode(mask_rle, shape):
    '''
    Decode a run-length-encoded mask into a binary image.

    mask_rle: run-length string formatted as "start length start length ...",
              where each start is a 1-based index into the row-major
              flattened image. None, float NaN (how pandas stores a missing
              annotation) or a blank string decode to an all-background mask.
    shape: (height,width) of array to return
    Returns numpy uint8 array of the given shape, 1 - mask, 0 - background
    Raises ValueError if the string does not hold complete (start, length) pairs.
    '''
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)

    # A missing annotation has no .split(); treat it as "nothing segmented".
    if mask_rle is None or (isinstance(mask_rle, float) and np.isnan(mask_rle)):
        return img.reshape(shape)

    s = mask_rle.split()
    # Without this check an input such as "1 3 5" would silently decode,
    # because a single length broadcasts against two starts.
    if len(s) % 2:
        raise ValueError(
            f"RLE string must contain (start, length) pairs, got {len(s)} values"
        )

    starts = np.asarray(s[0::2], dtype=int) - 1  # RLE starts are 1-based
    lengths = np.asarray(s[1::2], dtype=int)
    for lo, hi in zip(starts, starts + lengths):
        img[lo:hi] = 1
    # Runs index the row-major flattened image, so a plain reshape restores it.
    return img.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    """Encode a binary mask as a dict with run-length 'counts' and 'size'.

    The mask is flattened in column-major (Fortran) order and the lengths of
    consecutive runs of equal values are recorded. The first count always
    refers to a run of zeros, so a mask whose first pixel equals 1 gets a
    leading 0.

    binary_mask: 2-D numpy array of 0/1 values.
    Returns {'counts': [run lengths as ints], 'size': list(binary_mask.shape)}.
    """
    run_lengths = []
    column_major = binary_mask.ravel(order='F')
    for run_index, (pixel, run) in enumerate(itertools.groupby(column_major)):
        starts_with_foreground = run_index == 0 and pixel == 1
        if starts_with_foreground:
            # Keep the "zeros first" convention with an empty zero-run.
            run_lengths.append(0)
        run_lengths.append(sum(1 for _ in run))
    return {'counts': run_lengths, 'size': list(binary_mask.shape)}

In [None]:
# Category ids are 1-based and follow the order in which class names first
# appear in df. `cats` is the same mapping in COCO 'categories' list form.
cat_ids = {
    class_label: category_number
    for category_number, class_label in enumerate(df.class_name.unique(), start=1)
}
cats = [
    {'name': class_label, 'id': category_number}
    for class_label, category_number in cat_ids.items()
]
cat_ids

In [None]:
def coco_structure(train_df):
    """Build a COCO-style dataset dict from a per-annotation DataFrame.

    train_df: DataFrame with columns id, class_name, segmentation (RLE string),
              width, height and path; one row per (image, class) annotation.

    Returns {'categories': cats, 'images': [...], 'annotations': [...]} where
    - 'categories' is the module-level `cats` list,
    - 'images' has one entry per distinct id (taken from the first row of
      each id group), with file_name set to the row's path,
    - 'annotations' has one entry per row whose mask contains at least one
      foreground pixel. Rows with a missing/blank segmentation or an empty
      decoded mask are skipped instead of raising.

    Raises KeyError if a class_name is not present in the module-level
    `cat_ids` mapping.
    """
    images = [
        {
            'id': image_id,
            'width': int(row.width),    # plain ints keep the dict JSON-serialisable
            'height': int(row.height),
            'file_name': row.path,
        }
        for image_id, row in train_df.groupby('id').agg('first').iterrows()
    ]

    annotations = []
    # iterrows() is a generator, so pass the length explicitly to get a
    # progress bar with percentage/ETA.
    for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
        rle = row.segmentation
        # NaN (float) or blank strings mean "no mask for this class".
        if not isinstance(rle, str) or not rle.strip():
            continue

        mk = rle_decode(rle, (row.height, row.width))
        ys, xs = np.where(mk)
        if xs.size == 0:
            # Nothing to box; min()/max() would raise on an empty array.
            continue

        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        annotations.append({
            'segmentation': binary_mask_to_rle(mk),
            # COCO bbox layout: [x, y, width, height], inclusive pixel extent.
            'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
            'area': int(np.sum(mk)),
            'image_id': row.id,
            'category_id': cat_ids[row.class_name],
            'iscrowd': 0,
            # NOTE(review): ids are the DataFrame index labels, so they are only
            # unique if the index is, and an id of 0 is possible - confirm the
            # downstream COCO tooling accepts that.
            'id': idx,
        })
    return {'categories': cats, 'images': images, 'annotations': annotations}

In [None]:
n_splits = 5
fold_selected = 2  # 1..5

# Each image receives the number (1..n_splits) of the fold in which it is a
# validation sample. Images of one case always share a fold because `case`
# is used as the group key.
# NOTE(review): `case` is also passed as the stratification target, so y and
# groups are identical - confirm this is intended.
skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_of_row = np.zeros(len(df_meta), dtype=np.uint8)
splits = skf.split(X=df_meta, y=df_meta['case'], groups=df_meta['case'])
for fold, (_, val_idx) in enumerate(splits, start=1):
    # val_idx holds row positions, which is what the array is indexed by.
    fold_of_row[val_idx] = fold
df_meta['fold'] = fold_of_row

df_meta.groupby('fold').size()

In [None]:
# Image ids on each side of the selected fold.
in_valid_fold = df_meta["fold"] == fold_selected
valid_ids = df_meta.loc[in_valid_fold, "id"]
train_ids = df_meta.loc[~in_valid_fold, "id"]

# Keep only the rows that actually carry a mask for their class.
has_mask = df.segmentation.notna()
_train = df[df.id.isin(train_ids) & has_mask]
_valid = df[df.id.isin(valid_ids) & has_mask]

# Convert both subsets to COCO dictionaries.
train_json, valid_json = coco_structure(_train), coco_structure(_valid)


In [None]:
# Write one pretty-printed JSON file per split into the working directory.
coco_outputs = {
    f'coco_train_fold{fold_selected}.json': train_json,
    f'coco_valid_fold{fold_selected}.json': valid_json,
}
for out_path, coco_dict in coco_outputs.items():
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(coco_dict, f, ensure_ascii=True, indent=4)