In [None]:
!pip install -qU wandb
!pip install -qU bbox-utility # https://github.com/awsaf49/bbox
!pip install -q imagesize

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import os
import cv2
import random
import glob
import wandb

import imagesize
import shutil
import yaml

from matplotlib.colors import Normalize
from tqdm.notebook import tqdm
tqdm.pandas()
from joblib import Parallel, delayed
from kaggle_secrets import UserSecretsClient
from bbox.utils import coco2yolo, coco2voc, voc2yolo, yolo2voc
from bbox.utils import draw_bboxes, load_image
from bbox.utils import clip_bbox, str2annot, annot2str
from sklearn.model_selection import KFold
from scipy.stats import gaussian_kde

%matplotlib inline
# Competition image folders.
train_jpg_path = "../input/happy-whale-and-dolphin/train_images"
test_jpg_path = "../input/happy-whale-and-dolphin/test_images"
# Backward-compatible alias: the original (misspelled) name is still referenced by other cells.
test_jpg_peth = test_jpg_path

# ROOT_DIR holds the train.csv / train images used for the detector;
# IMAGE_DIR and LABEL_DIR receive the copied images and the generated label files.
ROOT_DIR = "../input/whale-categorization-playground"
IMAGE_DIR = "/kaggle/data1/images"
LABEL_DIR = "/kaggle/data1/labels"

# Working directory (train.txt / val.txt / yaml configs) and per-split prediction output folders.
cwd = "/kaggle/working"
train_output = "/kaggle/working/output/train"
test_output = "/kaggle/working/output/test"

#sample_submission = pd.read_csv("../input/happy-whale-and-dolphin/sample_submission.csv")

# 1. Data Loading and EDA

In [None]:
df_train = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")
# Merge misspelled / abbreviated species labels into their canonical names.
# The result is assigned back to the column: calling `.replace(..., inplace=True)` on
# `df_train.species` is a chained in-place update, which does not modify the frame under
# pandas Copy-on-Write.
df_train["species"] = df_train["species"].replace({"globis": "short_finned_pilot_whale",
                                                   "beluga": "beluga_whale",
                                                   "pilot_whale": "short_finned_pilot_whale",
                                                   "kiler_whale": "killer_whale",
                                                   "bottlenose_dolpin": "bottlenose_dolphin"})
df_train.head()

In [None]:
# Summary of the training metadata frame (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Report how many files each competition image folder contains.
n_train_images = len(os.listdir(train_jpg_path))
print(f"Number of images in the train folder: {n_train_images}")
n_test_images = len(os.listdir(test_jpg_peth))
print(f"Number of images in the test folder: {n_test_images}")

In [None]:
cnt_examples = 5
#random.seed(161)

train_pic_list = os.listdir(train_jpg_path)
fig = plt.figure(figsize=(25,25))

# One subplot row per randomly drawn training image, titled with its species label.
# Drawing goes through the explicit Axes object rather than pyplot's implicit "current axes".
for i in range(cnt_examples):
    ax = fig.add_subplot(cnt_examples, 1, i+1)
    example_pic = random.choice(train_pic_list)
    ax.imshow(plt.imread(f'{train_jpg_path}/{example_pic}'))
    ax.axis('off')
    ax.set_title(df_train[df_train.image == example_pic].species.values[0])

In [None]:
# Number of training images per species, most frequent first.
animal_cnt = df_train["species"].value_counts()
print("Occurences of different species:", animal_cnt, sep="\n")
print(f"Total number of species: {animal_cnt.size}")

In [None]:
specs = list(animal_cnt.keys())
values = list(animal_cnt.values)

# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `pyplot.get_cmap` is the supported accessor and returns the same colormap.
cmap = plt.get_cmap('jet')
# One colour per species, spread evenly over the colormap.
norm = Normalize(vmin=0,vmax=len(specs))
cols = np.arange(0,len(specs))

fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(1,1,1)
ax.set_axisbelow(True)  # keep the grid lines behind the bars
plt.grid(visible=True)
plt.bar(specs, values, color=cmap(norm(cols)))
plt.xticks(rotation='vertical')
plt.title('Occurences Of Different Species In The Dataset', fontsize=16, fontname="Times New Roman Bold")
plt.show()

In [None]:
# A species is counted as a dolphin when the last "_"-separated token of its name is
# "dolphin"; every other species is counted as a whale.
is_dolphin = [spec.split('_')[-1] == 'dolphin' for spec in animal_cnt.keys()]
cnt_dolphins = sum(n for n, dolphin in zip(animal_cnt.values, is_dolphin) if dolphin)
cnt_whales = sum(n for n, dolphin in zip(animal_cnt.values, is_dolphin) if not dolphin)

print(f"Number of dolphins in the set: {cnt_dolphins}")
print(f"Number of whales in the set: {cnt_whales}")

# 2. Creating Bounding Boxes

This section is based on this [notebook](https://www.kaggle.com/awsaf49/happywhale-boundingbox-yolov5), which shows how to create bounding boxes with YOLOv5 using the [Whale Fluke dataset](https://www.kaggle.com/martinpiotte/humpback-whale-identification-fluke-location) and the data from the [Humpback Whale Identification Challenge](https://www.kaggle.com/c/whale-categorization-playground). Highly recommended!

## WandB

In [None]:
# Log in to Weights & Biases with the Kaggle secret named "WANDB"; fall back to an
# anonymous session when the secret is missing or the login fails.
try:
    user_secrets = UserSecretsClient()
    api_key = user_secrets.get_secret("WANDB")
    wandb.login(key=api_key)
    anonymous = None
except Exception:
    # `except Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit still propagate.
    # `anonymous` is set here as well, so the name exists on both paths.
    anonymous = "must"
    wandb.login(anonymous=anonymous)
    print("To use ur W&B account,\nGo to Add-ons -> Secrets and provide your W&B access token. Use the Label name as WANDB. \nGet your W&B access token from here: https://wandb.ai/authorize")

## Meta Data

In [None]:
# Fold held out for validation (rows with `fold == FOLD` form the validation split below).
FOLD = 0
# Image size passed to train.py / detect.py via `--img`.
DIM = 640
# Model name; `{MODEL}.pt` is passed to train.py as the initial weights.
MODEL = "yolov5x"
# Values passed to train.py via `--batch`, `--epochs` and `--optimizer`.
BATCH = 16
EPOCHS = 18
OPTIMIZER = "Adam"

# Passed to train.py as `--project` / `--name`; together they form the run's output folder.
PROJECT = "happywhale-det-public"
NAME = f"{MODEL}-dim{DIM}-fold{FOLD}"

## Create Directories

In [None]:
# Create the image / label folders (no error if they already exist). Pure Python instead of
# a shell call, so the paths are not subject to shell word-splitting and a failure raises
# instead of only printing a message.
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(LABEL_DIR, exist_ok=True)

## Get Paths

In [None]:
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df["image_id"] = df["Image"]
# Source location of each image, and where its working copy / label file will live.
df["old_image_path"] = f"{ROOT_DIR}/train/" + df.image_id.astype(str)
df["image_path"] = f"{IMAGE_DIR}/" + df.image_id
# Swap only the trailing ".jpg" extension; an unanchored "jpg" -> "txt" replace would also
# rewrite a "jpg" that happens to occur inside the file name itself.
df["label_path"] = f"{LABEL_DIR}/" + df.image_id.str.replace(r"\.jpg$", ".txt", regex=True)
df.head(2)

## Write Copies

In [None]:
def make_copy(row):
    """Copy one image file from `row.old_image_path` to `row.image_path`.

    `row` only needs to expose those two attributes. Returns None.
    """
    source, destination = row.old_image_path, row.image_path
    shutil.copyfile(source, destination)

In [None]:
# NOTE(review): `image_paths` is not read by any of the visible cells.
image_paths = df.old_image_path.tolist()
# Copy every image to its `image_path` with joblib's threading backend (file copies are
# I/O-bound); the list of None results is discarded.
_ = Parallel(n_jobs=-1, backend='threading')(delayed(make_copy)(row) for _, row in tqdm(df.iterrows(), total=len(df)))

## Create BBox

In [None]:
def get_bbox(annots):
    """Turn a sequence of annotation mappings into a list of value lists.

    Each mapping contributes one inner list holding its values in the mapping's own order.
    """
    bboxes = []
    for annot in annots:
        bboxes.append(list(annot.values()))
    return bboxes

def get_imgsize(row):
    """Store the size reported by `imagesize.get` for `row["image_path"]` on the row.

    The two returned values are written to `row["width"]` and `row["height"]` (in that
    order) and the same, now modified, row is returned.
    """
    width, height = imagesize.get(row["image_path"])
    row["width"] = width
    row["height"] = height
    return row

# Fixed seed so the single (r, g, b) drawing colour is the same on every run.
np.random.seed(32)
colors = [tuple(np.random.randint(255) for _channel in range(3)) for _color in range(1)]

In [None]:
def point2bbox(points):
    """Return the axis-aligned box enclosing a flat sequence of x, y coordinates.

    `points` is a flat sequence (x0, y0, x1, y1, ...) of ints or integer strings.
    The result is a one-element list `[[xmin, ymin, xmax, ymax]]`.
    """
    coords = np.array(points).astype('int').reshape(-1, 2)
    xs = coords[:, 0]
    ys = coords[:, 1]
    return [[xs.min(), ys.min(), xs.max(), ys.max()]]

# Read the fluke key points: one line per image, "<image_id>,<x0>,<y0>,<x1>,<y1>,...".
# The file is opened in a `with` block so the handle is closed deterministically.
with open('/kaggle/input/humpback-whale-identification-fluke-location/cropping.txt', 'rt') as cropping_file:
    cropping_lines = cropping_file.read().split('\n')

# image_id -> list of coordinate strings; blank lines (e.g. a trailing newline) are skipped.
id2point = {}
for line in cropping_lines:
    if not line:
        continue
    image_id, *coords = line.split(',')
    id2point[image_id] = coords

df['point'] = df['image_id'].map(id2point)
# Keep only images that have key points; `.copy()` makes the filtered frame independent,
# so adding the `bbox` column does not trigger a SettingWithCopyWarning.
df = df[~df.point.isna()].copy()
df['bbox'] = df.point.map(point2bbox)

## Get Image-Size

In [None]:
# Apply get_imgsize to every row (with a tqdm progress bar) to add the `width` / `height` columns.
df = df.progress_apply(get_imgsize, axis=1)
display(df.head(2))

In [None]:
# Columns, dtypes and non-null counts after adding the box and size columns.
df.info()

## Create Labels

In [None]:
# Write one label file per image and collect the converted boxes for the distribution plots below.
cnt = 0  # images without any box (an empty label file is written for them)
all_bboxes = []  # boxes returned by voc2yolo, accumulated over all images
bboxes_info = []  # one [image_id] entry per box, aligned with all_bboxes
for row_idx in tqdm(range(df.shape[0])):
    row = df.iloc[row_idx]
    image_height = row.height
    image_width = row.width
    bboxes_voc = np.array(row.bbox).astype(np.float32).copy()
    num_bbox = len(bboxes_voc)
    names = ["whale"] * num_bbox  # not used again inside this loop
    labels = np.array([0] * num_bbox)[..., None].astype(str)  # class id 0 per box, as a (num_bbox, 1) string column
    
    with open(row.label_path, "w") as f:
        if num_bbox < 1:
            # No box: leave the label file empty and count the image as missing.
            annot = ""
            f.write(annot)
            cnt += 1
            continue
            
        # NOTE(review): clip_bbox / voc2yolo come from bbox-utility; presumably they clamp the
        # boxes to the image and convert [xmin, ymin, xmax, ymax] to [xmid, ymid, w, h]
        # (the column names used for these values further down) — confirm against the library.
        bboxes_voc = clip_bbox(bboxes_voc, image_height, image_width)
        bboxes_yolo = voc2yolo(bboxes_voc, image_height, image_width)
        all_bboxes.extend(bboxes_yolo.astype(float))
        bboxes_info.extend([[row.image_id]]*len(bboxes_yolo))
        # Prepend the class-id column to the box columns and serialise them with annot2str.
        annots = np.concatenate([labels, bboxes_yolo], axis=1)
        f.write(annot2str(annots))
        
print(f"Missing: {cnt}")

## Create Folds

In [None]:
# Use a clean 0..n-1 index so the positional indices produced by KFold are valid `.loc` labels.
df = df.reset_index(drop=True)
df['fold'] = -1
kf = KFold(n_splits=6, shuffle=True, random_state=161)
# Each row is labelled with the fold in which it belongs to the validation part.
for fold, (_train_idx, val_idx) in enumerate(kf.split(df)):
    df.loc[val_idx, 'fold'] = fold
df['fold'].value_counts()

In [None]:
# Quick look at the frame after adding the `fold` column.
df.head(2)

## BBox Distribution

In [None]:
# One row per box: image id, box values (xmid, ymid, w, h), box area and the image's fold.
box_cols = ['xmid', 'ymid', 'w', 'h']
bbox_table = np.concatenate([bboxes_info, all_bboxes], axis=1)
bbox_df = pd.DataFrame(bbox_table, columns=['image_id'] + box_cols)
# Cast the box columns back to float after concatenating them with the image ids.
bbox_df[box_cols] = bbox_df[box_cols].astype(float)
bbox_df['area'] = bbox_df['w'] * bbox_df['h']
bbox_df = bbox_df.merge(df[['image_id', 'fold']], on='image_id', how='left')
bbox_df.head(2)

In [None]:
# Scatter plot of the box centres, coloured by a Gaussian kernel density estimate.
all_bboxes = np.array(all_bboxes)
x_val, y_val = all_bboxes[..., 0], all_bboxes[..., 1]

xy = np.vstack([x_val, y_val])
density = gaussian_kde(xy)
z = density(xy)  # density evaluated at every centre point

fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x_val, y_val, c=z, s=50, cmap='viridis')
ax.set(xlabel='x_mid', ylabel='y_mid')
plt.show()

## Visualization

In [None]:
# Draw the written labels on a y-by-x grid of randomly sampled images as a visual sanity check.
df2 = df.sample(100)
y = 3  # grid rows
x = 5  # grid columns
plt.figure(figsize=(4 * x, 4 * y))
for idx in range(x*y):
    row = df2.iloc[idx]
    img = load_image(row.image_path)
    img = cv2.resize(img, (512, 512))
    # NOTE(review): image_height / image_width are assigned but not used in this cell.
    image_height = row.height
    image_width = row.width
    # Read the boxes back from the label file written earlier.
    with open(row.label_path) as f:
        annot = str2annot(f.read())
    bboxes_yolo = annot[...,1:]  # every column after the first (the first is used as class id below)
    labels = annot[..., 0].astype(int).tolist()
    names = ['whale']*len(bboxes_yolo)
    plt.subplot(y, x, idx+1)
    plt.imshow(draw_bboxes(img = img,
                          bboxes = bboxes_yolo,
                          classes = names,
                          class_ids = labels,
                          class_name = True,
                          colors = colors,
                          bbox_format='yolo',
                          line_thickness=2))

## Fluke Dataset

In [None]:
# Frame used to build the train / validation split for the detector.
df.head()

In [None]:
# Rows of fold FOLD form the validation split; all other folds are used for training.
train_df_fluke = df.query("fold!=@FOLD")
val_df_fluke = df.query("fold==@FOLD")
train_files = list(train_df_fluke.image_path.unique())
val_files = list(val_df_fluke.image_path.unique())
len(train_files), len(val_files)

## Configuration

In [None]:
# Image lists and dataset description consumed by train.py through `--data`.
train_list_path = os.path.join(cwd, 'train.txt')
val_list_path = os.path.join(cwd, 'val.txt')
config_path = os.path.join(cwd, 'happywhale.yaml')

# One image path per line.
with open(train_list_path, 'w') as f:
    f.writelines(path + "\n" for path in train_df_fluke.image_path.tolist())

with open(val_list_path, 'w') as f:
    f.writelines(path + "\n" for path in val_df_fluke.image_path.tolist())

data = dict(
    path=cwd,
    train=train_list_path,
    val=val_list_path,
    nc=1,             # a single class ...
    names=['whale'],  # ... named "whale"
)

with open(config_path, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Echo the written file; the `with` block closes the handle (it was previously left open).
with open(config_path, 'r') as f:
    print(f"yaml file:\n{f.read()}")

In [None]:
%%writefile /kaggle/working/hyp.yaml
lr0: 0.001
lrf: 0.01
momentum: 0.937
weight_decay: 0.0005
warmup_epochs: 3.0
warmup_momentum: 0.8
warmup_bias_lr: 0.1
box: 0.05
cls: 0.5
cls_pw: 1.0
obj: 1.0
obj_pw: 1.0
iou_t: 0.25
anchor_t: 4.0
fl_gamma: 0.0
hsv_h: 0.015
hsv_s: 0.7
hsv_v: 0.4
degrees: 30.0
translate: 0.10
scale: 0.80
shear: 10.0
perspective: 0.0
flipud: 0.5
fliplr: 0.5
mosaic: 0.75
mixup: 0.0
copy_paste: 0.0

## YOLOv5

In [None]:
%cd /kaggle/working
!rm -r /kaggle/working/yolov5
!git clone https://github.com/ultralytics/yolov5
!cp -r /kaggle/input/yolov5-lib-ds /kaggle/working/yolov5
%cd yolov5
%pip install -qr requirements.txt

In [None]:
# Import the `utils` package of the YOLOv5 checkout and run its notebook initialisation helper.
# NOTE(review): presumably this prints environment / hardware checks — confirm against the
# YOLOv5 version that was cloned.
from yolov5 import utils
_ = utils.notebook_init()

## Training

In [None]:
# Train the detector with the settings from the "Meta Data" cell, the dataset description in
# happywhale.yaml and the hyperparameters in hyp.yaml, starting from the `{MODEL}.pt` weights.
# The run is written to {PROJECT}/{NAME}; --exist-ok is passed so a re-run can use the same folder.
!python train.py --img {DIM}\
--batch {BATCH}\
--epochs {EPOCHS}\
--optimizer {OPTIMIZER}\
--data /kaggle/working/happywhale.yaml\
--hyp /kaggle/working/hyp.yaml\
--weights {MODEL}.pt\
--project {PROJECT} --name {NAME}\
--exist-ok

## Output Files

In [None]:
# Folder of the training run (the --project / --name values passed to train.py), relative to
# the current working directory.
OUTPUT_DIR = f"{PROJECT}/{NAME}"
!ls {OUTPUT_DIR}

In [None]:
# Check that the weights file used for prediction below exists.
!ls {OUTPUT_DIR}/weights/best.pt

## Whale and Dolphin Data

In [None]:
# Happywhale train / test metadata, plus the path of the label file expected for each image
# in the corresponding prediction output folder. Only the trailing ".jpg" extension is
# swapped for ".txt" (an unanchored "jpg" replace would also touch the file name itself).
df2 = pd.read_csv("/kaggle/input/happywhale-data-distribution/train.csv")
df2["image_id"] = df2["image"]
df2["label_path"] = train_output + "/labels/" + df2["image_id"].str.replace(r"\.jpg$", ".txt", regex=True)

test_df2 = pd.read_csv("/kaggle/input/happywhale-data-distribution/test.csv")
test_df2["image_id"] = test_df2["image"]
test_df2["label_path"] = test_output + "/labels/" + test_df2["image_id"].str.replace(r"\.jpg$", ".txt", regex=True)

print("Train Images: {:,} | Test Images: {:,}".format(len(df2), len(test_df2)))

## Prediction on Train

In [None]:
# Recreate the train prediction folder from scratch so files from an earlier run are removed.
!rm -rf {train_output}
!mkdir -p {train_output}

In [None]:
!python detect.py --img {DIM}\
--source /kaggle/input/happy-whale-and-dolphin/train_images\
--weights {OUTPUT_DIR}/weights/best.pt\
--project /kaggle/working/output --name train\
--conf 0.01 --iou 0.4 --max-det 1\
--safe-txt --safe-conf\
--nosafe\
--half\
--exist-ok

## **To be continued...**