# Data Prep Only

* This notebook prepares the COTS dataset using a 5-fold group split (grouped by `sequence`), then uses fold 4 as the validation set and the remaining folds as the training set
* Outputs YOLO format (suitable for YOLOv4, YOLOR and ScaledYOLOv4)

### Importing required modules

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()
import ast
import os
import numpy as np
import shutil
import sys
sys.path.append('../input/tensorflow-great-barrier-reef')

import cv2
import matplotlib.pyplot as plt
from joblib import Parallel, delayed

from sklearn.model_selection import GroupKFold

import torch
from PIL import Image

### Utility Functions

In [None]:
# hide

def coco2yolo(image_height, image_width, bboxes):
    """
    Convert COCO-style boxes to normalized YOLO-style boxes.

    coco => [xmin, ymin, w, h] (pixels)
    yolo => [xmid, ymid, w, h] (normalized by image width / height)

    A box whose right (or bottom) edge reaches or crosses the image border
    is shrunk so that ``xmin + w`` (or ``ymin + h``) equals the image size
    minus one.

    Args:
        image_height: image height in pixels.
        image_width: image width in pixels.
        bboxes: array-like whose last axis holds ``[xmin, ymin, w, h]``;
            an ndarray, a list of boxes, or a single box are all accepted.

    Returns:
        A new float ndarray with the same shape as the input (an empty
        input yields shape ``(0, 4)``). The input is never modified.
    """
    # Work on a float copy: integer input would otherwise truncate the
    # normalized values to 0.
    boxes = np.array(bboxes, dtype=float)
    if boxes.size == 0:
        return boxes.reshape(0, 4)

    # Shrink boxes that spill over the right / bottom border.
    x_overflow = boxes[..., 0] + boxes[..., 2] >= image_width
    boxes[..., 2] = np.where(x_overflow, image_width - boxes[..., 0] - 1, boxes[..., 2])
    y_overflow = boxes[..., 1] + boxes[..., 3] >= image_height
    boxes[..., 3] = np.where(y_overflow, image_height - boxes[..., 1] - 1, boxes[..., 3])

    # Normalize x / w by the width and y / h by the height.
    boxes[..., [0, 2]] = boxes[..., [0, 2]] / image_width
    boxes[..., [1, 3]] = boxes[..., [1, 3]] / image_height

    # Conversion (xmin, ymin) => (xmid, ymid).
    boxes[..., [0, 1]] = boxes[..., [0, 1]] + boxes[..., [2, 3]] / 2

    return boxes

In [None]:
# Root directory of the competition data; get_path builds image paths under it.
TRAIN_PATH = '/kaggle/input/tensorflow-great-barrier-reef'

def get_bbox(annots):
    """
    Turn a list of annotation dicts into a list of value lists.

    Each returned inner list holds the values of one annotation dict, in
    that dict's own key order.
    """
    boxes = []
    for annotation in annots:
        boxes.append([*annotation.values()])
    return boxes

def get_path(row):
    """
    Store the frame's image path on ``row`` under 'image_path'.

    Args:
        row: object exposing ``video_id`` and ``video_frame`` attributes and
            supporting item assignment (e.g. a row passed by
            ``DataFrame.apply(axis=1)``).

    Returns:
        The same ``row`` object, with 'image_path' set.
    """
    image_path = f'{TRAIN_PATH}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = image_path
    return row

## Processing Training Data

In [None]:
df = pd.read_csv(f"{TRAIN_PATH}/train.csv")

# Keep only annotated frames. The number of boxes is estimated by counting
# the letter 'x' in the raw annotation string.
# NOTE(review): assumes 'x' occurs exactly once per serialized box (its 'x'
# key) -- confirm against the CSV format.
df["num_bbox"] = df["annotations"].apply(lambda raw: raw.count("x"))
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of df (avoids chained-assignment warnings).
df_train = df[df["num_bbox"] > 0].copy()

# Parse the annotation strings into Python objects, then extract the boxes.
df_train["annotations"] = df_train["annotations"].progress_apply(ast.literal_eval)
df_train["bboxes"] = df_train["annotations"].progress_apply(get_bbox)

# Image resolution, set to the same fixed value for every row.
df_train["width"] = 1280
df_train["height"] = 720

# Path of the image belonging to each row.
df_train = df_train.progress_apply(get_path, axis=1)

In [None]:
# Positional indices from the splitter are used as labels with .loc below,
# so the index is first reset to 0..n-1.
df_train = df_train.reset_index(drop=True)
df_train['fold'] = -1

# 5-fold split grouped by 'sequence'; each row is tagged with the number of
# the fold in which it is a validation sample.
kf = GroupKFold(n_splits=5)
fold_splits = kf.split(df_train, y=df_train.video_id.tolist(), groups=df_train.sequence)
for fold, (_, val_idx) in enumerate(fold_splits):
    df_train.loc[val_idx, 'fold'] = fold

df_train.head(5)

## Creating Output Dirs

In [None]:
# Output roots; 'train' and 'valid' sub-directories are created beneath each.
IMAGE_DIR = "/kaggle/working/images"
LABEL_DIR = "/kaggle/working/labels"

In [None]:
# Create <root>/train and <root>/valid for both images and labels.
# os.makedirs creates the missing parent directories as well, and
# exist_ok=True makes the cell safe to re-run.
for base_dir in (IMAGE_DIR, LABEL_DIR):
    for split in ('train', 'valid'):
        os.makedirs(os.path.join(base_dir, split), exist_ok=True)

## Copying Image Files

Images are copied (not moved) in the same cell that writes the label files, below.

## Outputting label files

In [None]:
def writeLabels(bboxes, destfile, image_width = 1280, image_height = 720):
    """
    Write one YOLO-format label file for a single image.

    Each box becomes one line ``<class> <xmid> <ymid> <w> <h>`` with the
    coordinates normalized by ``coco2yolo``. Lines are separated by a newline
    with no trailing whitespace and no newline after the last line. When
    there are no boxes an empty file is written.

    Args:
        bboxes: sequence of COCO-style boxes ``[xmin, ymin, w, h]`` in pixels.
        destfile: path of the label file to (over)write.
        image_width: image width in pixels used for normalization.
        image_height: image height in pixels used for normalization.

    Returns:
        An empty string (kept for backward compatibility).
    """
    class_id = 0  # every box gets the same class index
    bboxes_coco = np.array(bboxes).astype(np.float32)

    lines = []
    if len(bboxes_coco) > 0:
        bboxes_yolo = coco2yolo(image_height, image_width, bboxes_coco)
        lines = [' '.join([str(class_id), *bbox.astype(str)]) for bbox in bboxes_yolo]

    with open(destfile, 'w') as f:
        f.write('\n'.join(lines))
    return ''

# Fold held out for validation; every other fold goes to training.
VAL_FOLD = 4

df1 = df_train[df_train.fold != VAL_FOLD]  # training rows
df2 = df_train[df_train.fold == VAL_FOLD]  # validation rows

# For each split, copy every image and write its label file next to it
# under the matching 'train' / 'valid' sub-directory.
for split_name, split_df in (('train', df1), ('valid', df2)):
    for row in tqdm(split_df.itertuples(index=False), total=len(split_df)):
        shutil.copyfile(row.image_path, f'{IMAGE_DIR}/{split_name}/{row.image_id}.jpg')
        writeLabels(row.bboxes, f'{LABEL_DIR}/{split_name}/{row.image_id}.txt')


In [None]:
# Count the entries in the training image directory.
!ls {IMAGE_DIR + '/train/'} | wc -l

In [None]:
# Count the entries in the training label directory.
!ls {LABEL_DIR + '/train/'} | wc -l

In [None]:
# Count the entries in the validation image directory.
!ls {IMAGE_DIR + '/valid/'} | wc -l

In [None]:
# Count the entries in the validation label directory.
!ls {LABEL_DIR + '/valid/'} | wc -l

#### Prepping YAML file for dataset

In [None]:
# Dataset description file. It is written from Python rather than with
# `echo -e`, because the nested single quotes around 'cots' would end the
# shell string early and `-e` handling differs between shells.
yaml_text = "train: ../images/train\nval: ../images/valid\n\nnc: 1\nnames: ['cots']\n"
with open('cots.yaml', 'w') as f:
    f.write(yaml_text)

# Show the file exactly as it was written.
with open('cots.yaml') as f:
    print(f.read(), end='')

### Zipping output files

In [None]:
# Zip the contents of each output directory into <dir>.zip next to it.
# root_dir is given as the absolute directory so the result does not depend
# on the current working directory.
shutil.make_archive(IMAGE_DIR, 'zip', IMAGE_DIR)
shutil.make_archive(LABEL_DIR, 'zip', LABEL_DIR)

### Cleanup

In [None]:
# Recursively remove the uncompressed image and label directories.
!rm -r {IMAGE_DIR}
!rm -r {LABEL_DIR}