ref: https://www.kaggle.com/ammarnassanalhajali/barrier-reef-yolox-training

In [None]:
import warnings
warnings.filterwarnings("ignore")
import ast
import os
import json
import pandas as pd
import torch
import importlib
import cv2 

import shutil
from shutil import copyfile
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import GroupKFold
from PIL import Image
from string import Template
from IPython.display import display
TRAIN_PATH = '/kaggle/input/tensorflow-great-barrier-reef'

In [None]:
# Read the training CSV that ships with the competition data.
df = pd.read_csv(f"{TRAIN_PATH}/train.csv")
df.head(5)
# Count occurrences of the character 'x' in the raw annotation string and use
# that as the number of boxes (assumes 'x' only appears as the dict key, once
# per box -- TODO confirm against the CSV).
df["NumBBox"] = df['annotations'].apply(lambda x: x.count('x'))
# Keep only frames that have at least one box. The explicit .copy() makes
# df_train an independent frame, so adding columns to it later is not a
# write into a selection of df.
df_train = df[df["NumBBox"] > 0].copy()

In [None]:
def get_bbox(annots):
    """Convert annotation dicts into ``[x, y, width, height]`` lists.

    Args:
        annots: Iterable of dicts, each with the keys ``'x'``, ``'y'``,
            ``'width'`` and ``'height'``.

    Returns:
        A list with one ``[x, y, width, height]`` list per annotation.

    Raises:
        KeyError: If an annotation is missing one of the four keys.
    """
    # Pick the keys explicitly instead of relying on dict insertion order, so
    # the positional layout downstream code indexes into is guaranteed.
    return [
        [annot['x'], annot['y'], annot['width'], annot['height']]
        for annot in annots
    ]
def get_path(row):
    """Store the frame's image file path on the row and return the row.

    The path is built from the module-level TRAIN_PATH plus the row's
    ``video_id`` and ``video_frame`` values and is written to
    ``row['image_path']``.
    """
    video_dir = f'video_{row.video_id}'
    frame_file = f'{row.video_frame}.jpg'
    row['image_path'] = f'{TRAIN_PATH}/train_images/{video_dir}/{frame_file}'
    return row
def load_image(image_path):
    """Read an image file and return it with channels in RGB order.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The image returned by ``cv2.imread`` after a BGR -> RGB conversion.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read the file.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None
        # rather than raising; fail here with the offending path instead of
        # letting cvtColor raise an opaque assertion error.
        raise FileNotFoundError(f'Could not read image: {image_path}')
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [None]:
# Turn the annotation strings into Python objects, then derive the box lists.
df_train['annotations'] = df_train['annotations'].progress_apply(ast.literal_eval)
df_train['bboxes'] = df_train['annotations'].progress_apply(get_bbox)

# Every frame gets the same fixed image size.
df_train["width"] = 1280
df_train["height"] = 720

In [None]:
# Apply get_path to every row (axis=1) so each row gains an 'image_path'
# entry; progress_apply shows a tqdm progress bar while doing so.
df_train = df_train.progress_apply(get_path, axis=1)

In [None]:
# Number of folds and the fold that is held out for validation.
n_spt = 5
Selected_Fold = 0  # 0..4

from sklearn.model_selection import StratifiedKFold

# Stratify on video_id and record, for every row, the fold in which that row
# is part of the validation split.
kf = StratifiedKFold(n_splits=n_spt)
df_train = df_train.reset_index(drop=True)  # positional split indices == labels
df_train['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(
    kf.split(df_train, y=df_train['video_id'].tolist())
):
    df_train.loc[val_idx, 'fold'] = fold
display(df_train['fold'].value_counts())

In [None]:
Work_Dir = '/kaggle/working/'
DataSet_Path = 'dataset/images'

# Create the train / validation image folders and the annotations folder.
for subdir in ('train2017', 'val2017', 'annotations'):
    os.makedirs(f'{Work_Dir}{DataSet_Path}/{subdir}', exist_ok=True)

In [None]:
# Copy every frame into its split folder: rows belonging to Selected_Fold go
# to val2017, all other rows to train2017. Iterating with itertuples avoids a
# label lookup per row and does not depend on the index being 0..n-1.
for row in tqdm(df_train.itertuples(index=False), total=len(df_train)):
    split_dir = 'val2017' if row.fold == Selected_Fold else 'train2017'
    copyfile(
        f'{row.image_path}',
        f'{Work_Dir}{DataSet_Path}/{split_dir}/{row.image_id}.jpg',
    )

In [None]:
# Report how many files ended up in each split folder.
train_files = os.listdir(f"{Work_Dir}{DataSet_Path}/train2017/")
print(f'Number of training files: {len(train_files)}')
val_files = os.listdir(f"{Work_Dir}{DataSet_Path}/val2017/")
print(f'Number of validation files: {len(val_files)}')

In [None]:
# Running annotation id. dataset2coco() declares this name `global` and
# increments it once per box, so ids keep increasing across successive calls.
annotion_id = 0

def save_annot_json(json_annotation, filename):
    """Serialize ``json_annotation`` to JSON and write it to ``filename``.

    Args:
        json_annotation: JSON-serializable object to store.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``json_annotation`` contains a value that ``json``
            cannot serialize. The target file is not touched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization error must surface before any existing content is lost.
    output_json = json.dumps(json_annotation)
    with open(filename, 'w') as f:
        f.write(output_json)

In [None]:
def dataset2coco(df, dest_path):
    """Build a COCO-style annotation dictionary from a COTS DataFrame.

    Args:
        df: DataFrame with the columns ``image_id``, ``height``, ``width``
            and ``bboxes``. Each ``bboxes`` value is a list of
            ``[x, y, width, height]`` boxes. The row's index value is used
            as the image id.
        dest_path: Not used by this function; kept so existing calls that
            pass it keep working.

    Returns:
        A dict with the keys ``"info"``, ``"licenses"``, ``"categories"``,
        ``"images"`` and ``"annotations"``.

    Side effects:
        Advances the module-level ``annotion_id`` counter by one per box, so
        annotation ids stay unique across successive calls, and prints a
        completion message.
    """
    global annotion_id

    # NOTE(review): "info" is emitted as a one-element list to preserve the
    # existing output shape -- confirm consumers do not expect a plain object.
    annotations_json = {
        "info": [
            {
                "year": "2021",
                "version": "1",
                "description": "COTS dataset - COCO format",
                "contributor": "",
                "url": "https://kaggle.com",
                "date_created": "2021-11-30T15:01:26+00:00",
            }
        ],
        "licenses": [{"id": 1, "url": "", "name": "Unknown"}],
        "categories": [{"id": 0, "name": "starfish", "supercategory": "none"}],
        "images": [],
        "annotations": [],
    }

    for ann_row in df.itertuples():
        # Position 0 of the tuple is the DataFrame index value.
        image_id = ann_row[0]

        annotations_json["images"].append(
            {
                "id": image_id,
                "license": 1,
                "file_name": ann_row.image_id + '.jpg',
                "height": ann_row.height,
                "width": ann_row.width,
                "date_captured": "2021-11-30T15:01:26+00:00",
            }
        )

        for bbox in ann_row.bboxes:
            # Some boxes in COTS extend past the right/bottom image edge;
            # shrink them so they end at the edge of this row's image.
            b_width = min(bbox[2], ann_row.width - bbox[0])
            b_height = min(bbox[3], ann_row.height - bbox[1])

            annotations_json["annotations"].append(
                {
                    "id": annotion_id,
                    "image_id": image_id,
                    "category_id": 0,
                    "bbox": [bbox[0], bbox[1], b_width, b_height],
                    # Area of the stored (clipped) box, so it agrees with "bbox".
                    "area": b_width * b_height,
                    "segmentation": [],
                    "iscrowd": 0,
                }
            )
            annotion_id += 1

    print(f"Dataset COTS annotation to COCO json format completed! Files: {len(df)}")
    return annotations_json

In [None]:
# Build the COCO dictionaries: training rows first, then the held-out fold
# (the order matters because annotation ids continue from one call to the next).
train_annot_json = dataset2coco(
    df_train[df_train['fold'] != Selected_Fold],
    f"{Work_Dir}{DataSet_Path}/train2017/",
)
val_annot_json = dataset2coco(
    df_train[df_train['fold'] == Selected_Fold],
    f"{Work_Dir}{DataSet_Path}/val2017/",
)

# Write both annotation sets into the annotations folder.
save_annot_json(
    train_annot_json,
    f"{Work_Dir}{DataSet_Path}/annotations/train.json",
)
save_annot_json(
    val_annot_json,
    f"{Work_Dir}{DataSet_Path}/annotations/valid.json",
)