
Some parts of this notebook are copied from [this notebook](https://www.kaggle.com/andradaolteanu/greatbarrierreef-yolo-full-guide-train-infer).

# **Data Preparation**

In [None]:
# Import necessary libraries
import ast
import os
import shutil
import sys

import cv2 as cv
import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm

In [None]:
# Read the train.csv file, which holds the metadata and annotations of every video frame.
train = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv")

# Keep only the rows that have at least one annotation. The .copy() makes the
# result an independent frame, so the column assignments below do not act on a
# slice of the original (which triggers SettingWithCopyWarning).
train = train.loc[train["annotations"].astype(str) != "[]"].copy()

# Parse the stringified annotation lists. literal_eval only accepts Python
# literals, so unlike eval it cannot execute code embedded in the CSV.
train['annotations'] = train['annotations'].apply(ast.literal_eval)

# Adding the column with full image path
train['image_path'] = (
    "/kaggle/input/tensorflow-great-barrier-reef/train_images/video_"
    + train['video_id'].astype(str)
    + "/"
    + train['video_frame'].astype(str)
    + ".jpg"
)

# Single annotation per row; the old index is kept as an 'index' column and a
# fresh 0..n-1 index is created so later positional joins line up.
train = train.explode('annotations').reset_index()
train.head()

In [None]:
# Flatten the annotation dicts into x / y / width / height columns, attach the
# frame metadata (aligned on the index), tag every box with its class name and
# keep only the columns used further down.
df = (
    pd.DataFrame(pd.json_normalize(train['annotations']), columns=['x', 'y', 'width', 'height'])
    .join(train)
    .assign(**{'class': 'Fish'})
    [['image_path', 'x', 'y', 'width', 'height', 'class', 'video_id', 'video_frame']]
)
df.head(10)

In [None]:
# Destination paths in the two new folders (images and labels): every frame
# becomes video_<video_id>_<video_frame>.jpg with a matching .txt label file.
df["path_images"] = (
    "/kaggle/images/video_"
    + df["video_id"].astype(str)
    + "_"
    + df["video_frame"].astype(str)
    + ".jpg"
)
df["path_labels"] = (
    "/kaggle/labels/video_"
    + df["video_id"].astype(str)
    + "_"
    + df["video_frame"].astype(str)
    + ".txt"
)

In [None]:
df.head()

In [None]:
# Gather the four box columns into one 2-D array (one row per annotation)
labels = df[['x', 'y', 'width', 'height']].to_numpy()

# Store each row of that array as the 'bboxes' entry of the matching dataframe row
df['bboxes'] = [box for box in labels]
df.head()

In [None]:
# One row per image: collect all of its boxes into a single list
data1 = df.groupby('image_path')['bboxes'].apply(list).reset_index(name='bboxes')

# One row per image with its destination paths. The columns are selected with
# a list ([[...]]): selecting several GroupBy columns with a bare tuple is
# rejected by recent pandas releases.
# NOTE: the output names are misleading - 'width' holds the destination image
# path and 'height' holds the label-file path. They are kept unchanged because
# later cells read these two columns by name.
data2 = df.groupby('image_path')[['path_images', 'path_labels']].agg(
    width=('path_images', 'max'), height=('path_labels', 'max'))

train_df = pd.merge(data1, data2, on='image_path')


In [None]:
train_df

In [None]:
# Attach the frame size (720 high, 1280 wide) to every row
train_df = train_df.assign(image_height=720, image_width=1280)

In [None]:
train_df.head()

In [None]:
# Create the output folders one level above the current directory.
# exist_ok=True keeps the cell re-runnable: a plain `mkdir` errors out when the
# folder is already there.
os.makedirs("../images", exist_ok=True)
os.makedirs("../labels", exist_ok=True)

In [None]:
# Copy every annotated frame from the competition data into the flat
# ../images folder, naming it <video folder>_<frame file>.
for src_path in tqdm(train_df["image_path"].tolist()):
    # The last two path components are the video folder and the frame file name
    video_dir, frame_file = src_path.split("/")[-2:]
    shutil.copy(src=src_path, dst=f"../images/{video_dir}_{frame_file}")

In [None]:
def coco2yolo(image_height, image_width, bboxes):
    """
    Converts coco annotations [xmin, ymin, w, h] (in pixels) to the
    corresponding yolo format [xmid, ymid, w, h], normalised to [0, 1].

    Boxes that reach past the image border are cropped to the image before
    the conversion, so every returned box lies fully inside the image.
    (Clipping the centre and the size independently after the conversion
    would shift such a box and leave it overhanging the border.)

    image_height: height of the original image, in pixels
    image_width: width of the original image, in pixels
    bboxes: sequence of coco boxes [xmin, ymin, w, h]; may be empty
    return :: float array of shape (n_boxes, 4) with rows [xmid, ymid, w, h]

    inspo: https://www.kaggle.com/awsaf49/great-barrier-reef-yolov5-train
    """

    # reshape(-1, 4) turns an empty input into a (0, 4) array instead of
    # failing on the column indexing below
    boxes = np.array(bboxes, dtype=float).reshape(-1, 4)

    # Box corners in pixels, cropped to the image area
    x_min = np.clip(boxes[:, 0], 0, image_width)
    y_min = np.clip(boxes[:, 1], 0, image_height)
    x_max = np.clip(boxes[:, 0] + boxes[:, 2], 0, image_width)
    y_max = np.clip(boxes[:, 1] + boxes[:, 3], 0, image_height)

    # Conversion (corners) => (xmid, ymid, w, h), normalised by the image size
    return np.stack(
        [
            (x_min + x_max) / 2 / image_width,
            (y_min + y_max) / 2 / image_height,
            (x_max - x_min) / image_width,
            (y_max - y_min) / image_height,
        ],
        axis=1,
    )

In [None]:
# Convert the boxes of every image to yolo format, one array per dataframe row
yolo_bbox = [
    coco2yolo(img_height, img_width, img_boxes)
    for img_height, img_width, img_boxes in zip(
        train_df['image_height'], train_df['image_width'], train_df['bboxes']
    )
]

In [None]:
train_df['yolo_bbox'] = yolo_bbox

In [None]:
train_df.head()

In [None]:
# Number of boxes annotated on each image
train_df['num_bbox'] = train_df['bboxes'].apply(len)

In [None]:
train_df.head()

In [None]:
# Write one label file per image, one line per box:
# "<class> <x_center> <y_center> <width> <height>"
for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    # NOTE: the 'height' column of train_df holds the label-file path
    # (it is aggregated from 'path_labels'), not a pixel height.
    with open(row['height'], 'w') as file:
        for bbox in row['yolo_bbox']:
            # Class id "0" (the dataset has a single class) followed by the box values
            annot = ["0"] + np.asarray(bbox).astype(str).tolist()
            # End every annotation with a newline so each box gets its own
            # line; without it the boxes of a multi-box image run together
            # into one unreadable line.
            file.write(" ".join(annot).strip() + "\n")

In [None]:
# Print a few of the generated label files as a sanity check. Each file is
# opened in a `with` block so its handle is closed again after reading.
for label_file in ('../labels/video_1_4238.txt',
                   '../labels/video_1_5315.txt',
                   '../labels/video_0_1006.txt'):
    with open(label_file, 'r') as f:
        print(f.read())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the annotated images for validation; the fixed seed makes
# the shuffled split reproducible.
df_train, df_valid = train_test_split(
    train_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# In train_df the 'width' column holds the image paths and the 'height' column
# holds the label paths; pull them out as plain lists for each split.
df_train_images = df_train['width'].tolist()
df_train_labels = df_train['height'].tolist()

df_test_images = df_valid['width'].tolist()
df_test_labels = df_valid['height'].tolist()

In [None]:
print("./working BEFORE:", os.listdir("../working"))

# Write the image-list files referenced by the dataset config: one image path per line
for list_file, image_paths in (
    ("../working/train_images.txt", df_train_images),
    ("../working/test_images.txt", df_test_images),
):
    with open(list_file, "w") as file:
        file.writelines(image_path + "\n" for image_path in image_paths)

# Dataset configuration: root folder, the two list files, and the single class
config = dict(
    path='/kaggle/working',
    train='/kaggle/working/train_images.txt',
    val='/kaggle/working/test_images.txt',
    nc=1,
    names=['cots'],
)

with open("../working/cots.yaml", "w") as file:
    file.write(yaml.dump(config, default_flow_style=False))

print("../working AFTER:", os.listdir("../working"))

In [None]:
# Fetch the YOLOv5 code and install its requirements into the kernel environment.
# The statement order matters: each %cd changes the directory the next line runs in.
%cd /kaggle/working
# NOTE(review): re-running this cell clones into a folder that already exists - confirm this is acceptable
!git clone https://github.com/ultralytics/yolov5.git   
%cd yolov5     
%pip install -qr requirements.txt   

# NOTE(review): this import runs from inside the cloned folder; it presumably
# relies on /kaggle/working being on sys.path - confirm on a fresh kernel.
from yolov5 import utils
# NOTE(review): binding the result to `display` hides IPython's own `display` helper for later cells
display = utils.notebook_init()

In [None]:
# Training settings, passed to train.py in the training cell.
# NOTE(review): YOLOv5 presumably expects the image size to be a multiple of the
# model stride (32) and adjusts other values - confirm 500 is intended, since
# RUN_NAME records the value written here.
SIZE = 500
BATCH_SIZE = 4
# A single epoch: enough for a smoke test of the pipeline
EPOCHS = 1
# Name of the pretrained checkpoint; ".pt" is appended in the training command
MODEL = "yolov5s"
WORKERS = 0
PROJECT = "GreatBarrierReef"
# Run name encodes the main settings so different runs are told apart
RUN_NAME = f"{MODEL}_size{SIZE}_epochs{EPOCHS}_batch{BATCH_SIZE}_simple"

In [None]:
# Training: run train.py (from the current directory) on the dataset described
# by cots.yaml. The {NAME} placeholders are filled in with the values of the
# Python variables of the same name before the shell command runs.
!python train.py --img {SIZE}\
                --batch {BATCH_SIZE}\
                --epochs {EPOCHS}\
                --data /kaggle/working/cots.yaml\
                --weights {MODEL}.pt\
                --workers {WORKERS}\
                --project {PROJECT}\
                --name {RUN_NAME}\
                --exist-ok

# **Work in Progress**