# Ref: https://www.kaggle.com/remekkinas/yolox-training-pipeline-cots-dataset-lb-0-507

# Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import ast
import os
import json
import pandas as pd
import torch
import importlib
import cv2 

from shutil import copyfile
from tqdm.notebook import tqdm
tqdm.pandas()
from sklearn.model_selection import GroupKFold

TRAIN_PATH = '/kaggle/input/tensorflow-great-barrier-reef'

In [None]:
# Load the competition's training metadata. The path is derived from
# TRAIN_PATH so the dataset location is configured in exactly one place.
df = pd.read_csv(os.path.join(TRAIN_PATH, "train.csv"))
df.head(5)

# Add the number of bboxes, bbox values, image resolution and image path to a new dataframe

In [None]:
def get_bbox(annots):
    """Turn a sequence of annotation dicts into a list of value lists.

    Args:
        annots: iterable of annotation dicts, one dict per bounding box.

    Returns:
        A list with one inner list per annotation, holding that dict's
        values in the dict's own key order.
    """
    boxes = []
    for annotation in annots:
        # dict.values() is a view object; materialise it as a plain list
        boxes.append(list(annotation.values()))
    return boxes

def get_img_path(row):
    """Add an 'image_path' entry to a dataframe row and return the row.

    The path is built from the row's ``video_id`` and ``video_frame``, e.g.
    /kaggle/input/tensorflow-great-barrier-reef/train_images/video_2/16.jpg

    Args:
        row: a row exposing ``video_id`` and ``video_frame`` attributes and
            supporting item assignment (it is modified in place).

    Returns:
        The same row, now carrying 'image_path'.
    """
    video_dir = f'video_{row.video_id}'
    file_name = f'{row.video_frame}.jpg'
    row['image_path'] = f'{TRAIN_PATH}/train_images/{video_dir}/{file_name}'
    return row

In [None]:
# Count the boxes per frame straight from the raw annotation string by counting
# the letter 'x'.
# NOTE(review): this assumes every annotation dict contributes exactly one 'x'
# character (keys like 'x', 'y', 'width', 'height') — confirm against the CSV.
df["num_bbox"] = df['annotations'].apply(lambda x: str.count(x, 'x'))

# Keep only the annotated frames (num_bbox > 0). Take an explicit copy so the
# column assignments below modify an independent frame instead of a slice of
# df (the chained-assignment warning is otherwise hidden by the warnings filter).
df_train = df[df["num_bbox"] > 0].copy()

# Parse the annotation strings into Python objects, then extract the bbox values
df_train['annotations'] = df_train['annotations'].progress_apply(ast.literal_eval)
df_train['bboxes'] = df_train.annotations.progress_apply(get_bbox)

# Add image resolution columns (the same constant for every row)
df_train["width"] = 1280
df_train["height"] = 720

# Add the image path column, row by row
df_train = df_train.progress_apply(get_img_path, axis=1)

# GroupKFold split (5 folds)

In [None]:
# A clean 0..n-1 index is needed because the splitter yields positional
# indices that are used as labels with .loc below.
df_train = df_train.reset_index(drop=True)
df_train['fold'] = -1  # placeholder until a fold id is assigned

# Group by sequence so that all rows of one sequence share a fold.
kf = GroupKFold(n_splits=5)
fold_splits = kf.split(
    df_train,
    y=df_train.video_id.tolist(),
    groups=df_train.sequence,
)
for fold, (_, val_idx) in enumerate(fold_splits):
    # Each row is tagged with the fold in which it is part of the validation set
    df_train.loc[val_idx, 'fold'] = fold

df_train.head(5)

In [None]:
HOME_DIR = '/kaggle/working/'
DATASET_PATH = 'dataset/COTS-COCO'

# Create the dataset directory layout (train/val image folders plus an
# annotations folder). os.makedirs creates missing parent directories, and
# exist_ok=True makes re-running this cell a no-op instead of an error.
for sub_dir in ('images/train2017', 'images/val2017', 'annotations'):
    os.makedirs(f'{HOME_DIR}{DATASET_PATH}/{sub_dir}', exist_ok=True)

# Split train and val depending on fold id

In [None]:
SELECTED_FOLD = 4
# Rows whose fold id equals SELECTED_FOLD form the validation set;
# rows from every other fold form the training set.

images_root = f'{HOME_DIR}{DATASET_PATH}/images'

for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
    # Pick the destination folder from the row's fold id
    subset = 'val2017' if row.fold == SELECTED_FOLD else 'train2017'
    copyfile(row.image_path, f'{images_root}/{subset}/{row.image_id}.jpg')

In [None]:
# Sanity check: report how many images ended up in each split folder
train_img_dir = f"{HOME_DIR}{DATASET_PATH}/images/train2017/"
n_train_files = len(os.listdir(train_img_dir))
print(f'Number of training files: {n_train_files}')

val_img_dir = f"{HOME_DIR}{DATASET_PATH}/images/val2017/"
n_val_files = len(os.listdir(val_img_dir))
print(f'Number of validation files: {n_val_files}')

# Convert to COCO Json for annotations

In [None]:
def save_annot_json(annotation_dict, fname):
    """Serialise an annotation dictionary to a JSON file.

    Args:
        annotation_dict: JSON-serialisable object to store.
        fname: path of the file to write; existing content is overwritten.
    """
    with open(fname, 'w') as json_file:
        json_file.write(json.dumps(annotation_dict))

In [None]:
# Running annotation id shared by all dataset_to_coco() calls, so that ids stay
# unique across the files generated in one session.
annotation_id = 0
from datetime import datetime


def _clip_extent(start, extent, limit):
    """Shorten `extent` so that `start + extent` does not exceed `limit`."""
    return min(extent, limit - start)


def dataset_to_coco(df):
    """Convert a dataframe of annotated images into a COCO-style dictionary.

    Args:
        df: dataframe with the columns ``image_id``, ``height``, ``width`` and
            ``bboxes``. Each ``bboxes`` entry is a list of
            ``[x, y, width, height]`` boxes. The dataframe index is used as
            the numeric COCO image id.

    Returns:
        A dict with the keys "info", "licenses", "categories", "images" and
        "annotations".

    Side effects:
        Advances the module-level ``annotation_id`` counter by one per box and
        prints a completion message.
    """
    global annotation_id

    # NOTE(review): the COCO format describes "info" as a single object; it is
    # kept as a one-element list here so the output layout stays unchanged.
    annotations_json = {
        "info": [],
        "licenses": [],
        "categories": [],
        "images": [],
        "annotations": []
    }

    info = {
        "year": "2021",
        "version": "1",
        "description": "COTS dataset - COCO format",
        "contributor": "",
        "url": "https://www.kaggle.com/c/tensorflow-great-barrier-reef/overview",
        "date_created": str(datetime.now())
    }
    annotations_json["info"].append(info)

    lic = {
        "id": 1,
        "url": "https://www.kaggle.com/c/tensorflow-great-barrier-reef/rules",
        "name": "Unknown"
    }
    annotations_json["licenses"].append(lic)

    # Single-class dataset
    classes = {"id": 0, "name": "starfish", "supercategory": "none"}
    annotations_json["categories"].append(classes)

    for ann_row in df.itertuples():
        # ann_row[0] is the dataframe index, used as the image id
        images = {
            "id": ann_row[0],
            "license": 1,
            "file_name": ann_row.image_id + '.jpg',
            "height": ann_row.height,
            "width": ann_row.width,
            "date_captured": str(datetime.now())
        }
        annotations_json["images"].append(images)

        for bbox in ann_row.bboxes:
            x, y = bbox[0], bbox[1]

            # Some boxes extend past the right/bottom image border: trim them
            # so that x + width <= image width and y + height <= image height.
            b_width = _clip_extent(x, bbox[2], ann_row.width)
            b_height = _clip_extent(y, bbox[3], ann_row.height)

            image_annotations = {
                "id": annotation_id,
                "image_id": ann_row[0],
                "category_id": 0,
                "bbox": [x, y, b_width, b_height],
                # area must describe the emitted (clipped) box
                "area": b_width * b_height,
                "segmentation": [],
                "iscrowd": 0
            }

            annotation_id += 1
            annotations_json["annotations"].append(image_annotations)

    print(f"Dataset COTS annotation to COCO json format completed! Files: {len(df)}")
    return annotations_json

# Save coco json annotation file

In [None]:
annotations_dir = f'{HOME_DIR}{DATASET_PATH}/annotations'

# Training split: every row whose fold id differs from SELECTED_FOLD.
# It is converted first, so its annotation ids precede the validation ones.
train_json_path = f'{annotations_dir}/train.json'
train_df = df_train.loc[df_train.fold != SELECTED_FOLD]
train_coco_dict = dataset_to_coco(df=train_df)
save_annot_json(annotation_dict=train_coco_dict, fname=train_json_path)

# Validation split: the rows whose fold id equals SELECTED_FOLD.
val_json_path = f'{annotations_dir}/val.json'
val_df = df_train.loc[df_train.fold == SELECTED_FOLD]
val_coco_dict = dataset_to_coco(df=val_df)
save_annot_json(annotation_dict=val_coco_dict, fname=val_json_path)