**2022-04-21**

Updated to include dorsal crops aka "backfin".

---

This is the TFRecords creation pipeline for [this](https://www.kaggle.com/code/lextoumbourou/happywhale-arcface-baseline-from-0-470-to-0-804?scriptVersionId=93216274) solution.

In [None]:
!pip install imgsize

In [None]:
from multiprocessing import Pool

import os, json, random, cv2
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patches
import tensorflow as tf, re, math
from tqdm import tqdm
from functools import partial

from imgsize import get_size
from PIL import Image
import PIL

In [None]:
def convert_to_rect(img, label, color='b', linewidth=1):
    """Build an unfilled matplotlib rectangle for a normalised bounding box.

    Args:
        img: A ``PIL.Image.Image`` or an array-like whose ``.shape`` starts
            with ``(height, width)``. Only its pixel dimensions are read.
        label: Sequence ``[xmin, ymin, xmax, ymax]`` given as fractions of the
            image width / height.
        color: Edge colour handed to matplotlib.
        linewidth: Edge width handed to matplotlib.

    Returns:
        A ``patches.Rectangle`` in the pixel coordinates of ``img``.
    """
    if isinstance(img, PIL.Image.Image):
        # PIL reports size as (width, height).
        width, height = img.size[0], img.size[1]
    else:
        # An array's .size is a scalar element count and cannot be indexed;
        # the dimensions live in .shape as (rows, cols, ...).
        height, width = img.shape[0], img.shape[1]

    xmin, ymin, xmax, ymax = label[0], label[1], label[2], label[3]
    return patches.Rectangle(
        (xmin * width, ymin * height),
        (xmax - xmin) * width,
        (ymax - ymin) * height,
        linewidth=linewidth, edgecolor=color, facecolor='none',
    )

def show_img_grid(df, dataset):
    """Plot up to the first nine images of ``df`` in a 3x3 grid.

    When ``df`` has an ``xmin`` column, the row's normalised box
    (``xmin, ymin, xmax, ymax``) is drawn on top of the image.

    Args:
        df: DataFrame with an ``image`` column holding file names.
        dataset: Directory prefix, used as ``{dataset}_images``.
    """
    row = 3
    col = 3
    # Never index past the end of a frame with fewer rows than grid cells.
    n_images = min(row * col, len(df))

    plt.figure(figsize=(25, int(25 * row / col)))
    for j in range(n_images):
        df_row = df.iloc[j]
        img = Image.open(f'../input/happy-whale-and-dolphin/{dataset}_images/{df_row.image}')
        plt.subplot(row, col, j + 1)
        plt.axis('off')
        plt.imshow(img)
        ax = plt.gca()
        # Rows without a detection carry NaN coordinates; there is nothing to draw for them.
        if 'xmin' in df and not pd.isna(df_row.xmin):
            ax.add_patch(convert_to_rect(img, [df_row.xmin, df_row.ymin, df_row.xmax, df_row.ymax]))
    plt.show()

Goal of this notebook is to generate TFRecords but with each image's largest side no larger than 512.

I will use this for fast prototyping, particularly on GPUs.

The goal is to get the dataset size < 20GB.

# Config

In [None]:
# NOTE(review): not referenced by any other cell visible in this notebook;
# presumably a leftover resize limit (the intro mentions 512) — confirm.
MAX_IMAGE_SIDE = None

# Load Data and Clean Species

In [None]:
def clean_species(species):
    """Return ``species`` with misspelt / alias species names replaced.

    Only whole-value matches are rewritten; every other value passes through
    unchanged.
    """
    corrections = {
        "globis": "short_finned_pilot_whale",
        "pilot_whale": "short_finned_pilot_whale",
        "kiler_whale": "killer_whale",
        "bottlenose_dolpin": "bottlenose_dolphin",
        "beluga": "beluga_whale",
    }
    return species.replace(corrections)

In [None]:
# Training metadata, with the species spelling fixes applied straight away.
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
train_df['species'] = clean_species(train_df['species'])

# The sample submission doubles as the list of test images.
test_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

# Generate 10 Folds

In [None]:
train_df.image

In [None]:
from sklearn.model_selection import StratifiedKFold

# Every row starts unassigned (-1) and then receives the number of the fold
# in which it is a validation row. Splits are stratified on individual_id.
train_df['fold'] = -1
skf = StratifiedKFold(n_splits=10)
fold_splits = skf.split(train_df.index, train_df.individual_id)
for fold_number, (_, val_rows) in enumerate(fold_splits):
    train_df.loc[val_rows, 'fold'] = fold_number

In [None]:
train_df.fold.value_counts()

In [None]:
# Spread the test rows over 10 shards by row number.
test_df['split'] = test_df.index % 10

# individual_id -> species name, taken from the distinct pairs in train.
train_df_dupes = train_df[['individual_id', 'species']].drop_duplicates()
indid_2_species = dict(zip(train_df_dupes.individual_id, train_df_dupes.species))

# Species becomes categorical: id_2_species maps code -> name and
# species_map maps name -> code.
train_df['species'] = train_df.species.astype("category")
id_2_species = train_df.species.cat.categories
species_map = {name: code for code, name in enumerate(train_df.species.cat.categories)}

In [None]:
id_2_species

In [None]:
species_map

In [None]:
# Write the species name -> integer code mapping to species.json in the working directory.
with open('species.json', 'w') as fh:
    json.dump(species_map, fh)

In [None]:
# Replace species names by their integer category codes (positions in id_2_species).
train_df['species'] = train_df.species.cat.codes

In [None]:
train_df.head()

In [None]:
# Sanity check: decode the integer species codes back to names on a throwaway copy.
copy = train_df.copy()
copy['species'] = [id_2_species[code] for code in copy.species]
copy.head()

In [None]:
# Make individual_id categorical; id_2_individual holds the category labels (code -> id).
train_df.individual_id = train_df.individual_id.astype('category')
id_2_individual = train_df.individual_id.cat.categories

In [None]:
id_2_individual

In [None]:
# Inverse lookup: individual id -> integer category code.
individual_ids_map = {cat: i for i, cat in enumerate(train_df.individual_id.cat.categories)}

In [None]:
# Write the individual id -> integer code mapping to individual_ids.json in the working directory.
with open('individual_ids.json', 'w') as fh:
    json.dump(individual_ids_map, fh)

In [None]:
# Replace individual ids by their integer category codes (same numbering as individual_ids_map).
train_df.individual_id = train_df.individual_id.cat.codes

In [None]:
train_df.head()

# Get Image Stats

In order to be able to easily convert from one bounding box format to another, I'm caching the width and height of each of the images.

In [None]:
def _get_image_size(row, dataset):
    """Return the size of one image as reported by ``imgsize.get_size``.

    Args:
        row: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``;
            the row must expose ``.image`` (the file name).
        dataset: Directory prefix, used as ``{dataset}_images``.

    Returns:
        The object returned by ``get_size`` for that file.
    """
    _, record = row
    path = f'../input/happy-whale-and-dolphin/{dataset}_images/{record.image}'
    # Close the handle deterministically: this runs once per image in each
    # worker, and the original left every file open until garbage collection.
    with open(path, 'rb') as fh:
        return get_size(fh)


def get_img_sizes(df, dataset):
    """Collect the size of every image listed in ``df`` using a pool of 8 workers.

    Results come back in the row order of ``df``; a tqdm bar reports progress.
    """
    size_of_row = partial(_get_image_size, dataset=dataset)
    with Pool(8) as pool:
        results = pool.imap(size_of_row, df.iterrows())
        return list(tqdm(results, total=len(df)))

In [None]:
# Cache the size of every training image: element 0 is stored as width, element 1 as height.
sizes = get_img_sizes(train_df, 'train')
train_df['width'] = [s[0] for s in sizes]
train_df['height'] = [s[1] for s in sizes]

In [None]:
# Same for the test images: element 0 is stored as width, element 1 as height.
sizes = get_img_sizes(test_df, 'test')
test_df['width'] = [s[0] for s in sizes]
test_df['height'] = [s[1] for s in sizes]

In [None]:
train_df.head(), test_df.head()

# TokenCut

These bounding box predictions were generated with TokenCut using ViTBase.

https://www.kaggle.com/lextoumbourou/happywhale-tokencut-generate-vit-base

In [None]:
# TokenCut boxes from three backbones. Each CSV carries a stray saved index
# column ('Unnamed: 0') which is dropped on load.
train_vitbase_df = pd.read_csv('../input/happywhale-tokencut-all-bbs/train_vit_base.csv').drop(columns='Unnamed: 0')
test_vitbase_df = pd.read_csv('../input/happywhale-tokencut-all-bbs/test_vit_base.csv').drop(columns='Unnamed: 0')

train_vit_small_df = pd.read_csv('../input/happywhale-tokencut-all-bbs/train_vit_small.csv').drop(columns='Unnamed: 0')
test_vit_small_df = pd.read_csv('../input/happywhale-tokencut-all-bbs/test_vit_small.csv').drop(columns='Unnamed: 0')

train_moco_vit_base = pd.read_csv('../input/happywhale-tokencut-all-bbs/train_moco_vit_base.csv').drop(columns='Unnamed: 0')
test_moco_vit_base = pd.read_csv('../input/happywhale-tokencut-all-bbs/test_moco_vit_base.csv').drop(columns='Unnamed: 0')

In [None]:
show_img_grid(train_vitbase_df, 'train')

In [None]:
show_img_grid(test_vitbase_df, 'test')

In [None]:
import math

In [None]:
def columns_to_string(df):
    """Serialise each row's box as ``'xmin ymin xmax ymax'``.

    Rows whose ``xmin`` is NaN (no box) become the empty string.

    Returns:
        The result of the row-wise ``df.apply``: one string per row.
    """
    def _row_to_string(row):
        if math.isnan(row.xmin):
            return ''
        return f'{row.xmin} {row.ymin} {row.xmax} {row.ymax}'

    return df.apply(_row_to_string, axis=1)

In [None]:
# Add the serialised box to every TokenCut frame under its model-specific column name.
for boxes_df, column in (
    (train_vitbase_df, 'tc_vitbase'),
    (test_vitbase_df, 'tc_vitbase'),
    (train_vit_small_df, 'tc_vitsmall'),
    (test_vit_small_df, 'tc_vitsmall'),
    (train_moco_vit_base, 'tc_mocovit'),
    (test_moco_vit_base, 'tc_mocovit'),
):
    boxes_df[column] = columns_to_string(boxes_df)

In [None]:
# Left-join each TokenCut column onto train/test by image name.
for train_boxes, test_boxes, column in (
    (train_vitbase_df, test_vitbase_df, 'tc_vitbase'),
    (train_vit_small_df, test_vit_small_df, 'tc_vitsmall'),
    (train_moco_vit_base, test_moco_vit_base, 'tc_mocovit'),
):
    train_df = train_df.merge(train_boxes[['image', column]], how='left', on='image')
    test_df = test_df.merge(test_boxes[['image', column]], how='left', on='image')

In [None]:
train_df.head()

In [None]:
test_df.head()

# My Annotations

I did 490 crops by hand. I picked all the images which didn't have a Detic or Yolo box, plus some where the two disagreed.

This ensures that the train set now has 100% bounding box coverage.

In [None]:
# Hand-made annotations, joined to train_df so each row also carries the image size.
extra_anno = (
    pd.read_csv('../input/happywhaleextraannotations/extra_annotations.csv')
    .rename(columns={'filename': 'image'})
    .merge(train_df, on='image')
)

# Express the corners as fractions of the image size, which makes resizing easier.
for corner, extent in (('xmin', 'width'), ('ymin', 'height'), ('xmax', 'width'), ('ymax', 'height')):
    extra_anno[corner] = extra_anno[corner] / extra_anno[extent]

In [None]:
show_img_grid(extra_anno, 'train')

In [None]:
def _box_to_string(row):
    # Same 'xmin ymin xmax ymax' layout as the other box columns.
    return f'{row.xmin} {row.ymin} {row.xmax} {row.ymax}'


str_rows = extra_anno.apply(_box_to_string, axis=1)
extra_anno['my_box'] = str_rows
train_df = train_df.merge(extra_anno[['image', 'my_box']], how='left', on='image')
# Images without a hand annotation get the empty-string "missing" marker.
train_df['my_box'] = train_df.my_box.fillna('')

# Yolov5

Load the [Yolov5 bounding box predictions](https://www.kaggle.com/awsaf49/happywhale-cropped-dataset-yolov5) created by [awsaf49](https://www.kaggle.com/awsaf49).

In [None]:
yolo5_train_df = pd.read_csv('../input/happywhale-cropped-dataset-yolov5/train.csv')
yolo5_test_df = pd.read_csv('../input/happywhale-cropped-dataset-yolov5/test.csv')
# Trim two characters from each end of the bbox text and drop the commas,
# leaving whitespace-separated numbers.
for yolo_df in (yolo5_train_df, yolo5_test_df):
    yolo_df['bbox'] = yolo_df.bbox.str[2:-2].str.replace(',', '')

In [None]:
tqdm.pandas()

In [None]:
def _norm_bb(row):
    if not row.bbox:
        return pd.Series(dict(
        image=row.image))

    xmin, ymin, xmax, ymax = row.bbox.split()
    return pd.Series(dict(
        image=row.image,
        xmin=float(xmin)/row.width,
        ymin=float(ymin)/row.height,
        xmax=float(xmax)/row.width,
        ymax=float(ymax)/row.height))

# Row-wise normalisation with a progress bar (progress_apply is registered by tqdm.pandas()).
yolo_train_norm_df = yolo5_train_df.progress_apply(_norm_bb, axis=1)
yolo_test_norm_df = yolo5_test_df.progress_apply(_norm_bb, axis=1)

In [None]:
show_img_grid(yolo_train_norm_df, 'train')

In [None]:
show_img_grid(yolo_test_norm_df, 'test')

In [None]:
yolo_train_norm_df.head()

In [None]:
# Serialise the normalised Yolov5 boxes; rows without a box become ''.
yolo_train_norm_df['yolov5'] = columns_to_string(yolo_train_norm_df)
yolo_test_norm_df['yolov5'] = columns_to_string(yolo_test_norm_df)

In [None]:
# Left-join the yolov5 column by image name, so row order in the Yolov5 CSVs does not matter.
train_df = train_df.merge(yolo_train_norm_df[['image', 'yolov5']], how='left', on='image')
test_df = test_df.merge(yolo_test_norm_df[['image', 'yolov5']], how='left', on='image')

In [None]:
train_df.head(2)

In [None]:
test_df.head(2)

# Detic

Load the Detic bounding box predictions created by [phalanx](https://www.kaggle.com/phalanx).

In [None]:
detic_train_df = pd.read_csv('../input/whale2-cropped-dataset/train2.csv')
detic_test_df = pd.read_csv('../input/whale2-cropped-dataset/test2.csv')

# Bring in the cached image sizes needed for normalisation.
detic_train_df = detic_train_df.merge(train_df[['image', 'width', 'height']], how='left')
detic_test_df = detic_test_df.merge(test_df[['image', 'width', 'height']], how='left')

# A missing box becomes '' so the normaliser can test it for truthiness.
detic_train_df['box'] = detic_train_df.box.fillna('')
detic_test_df['box'] = detic_test_df.box.fillna('')

def _norm_bb(row):
    if not row.box:
        return pd.Series(dict(
        image=row.image))

    xmin, ymin, xmax, ymax = row.box.split()
    return pd.Series(dict(
        image=row.image,
        xmin=float(xmin)/row.width,
        ymin=float(ymin)/row.height,
        xmax=float(xmax)/row.width,
        ymax=float(ymax)/row.height))

# Row-wise normalisation with a progress bar; the output keeps the row order of the Detic frames.
detic_train_norm_df = detic_train_df.progress_apply(_norm_bb, axis=1)
detic_test_norm_df = detic_test_df.progress_apply(_norm_bb, axis=1)

In [None]:
show_img_grid(detic_train_norm_df, 'train')

In [None]:
show_img_grid(detic_test_norm_df, 'test')

In [None]:
# Attach by image name (as the yolov5 column is) instead of by row position.
# detic_*_norm_df follows the row order of the Detic CSVs, and nothing in this
# notebook guarantees that order matches train_df / test_df.
detic_train_strings = dict(zip(detic_train_norm_df.image, columns_to_string(detic_train_norm_df)))
detic_test_strings = dict(zip(detic_test_norm_df.image, columns_to_string(detic_test_norm_df)))
# Images absent from the Detic CSVs get '', the marker the backfill cells test for.
train_df['detic'] = train_df.image.map(detic_train_strings).fillna('')
test_df['detic'] = test_df.image.map(detic_test_strings).fillna('')

# Full Body

In [None]:
fullbody_train = pd.read_csv('../input/fullbodywhaleannotations/fullbody_train.csv')
fullbody_test = pd.read_csv('../input/fullbodywhaleannotations/fullbody_test.csv')

# Trim two characters from each end of the bbox text.
for fullbody_df in (fullbody_train, fullbody_test):
    fullbody_df['bbox'] = fullbody_df.bbox.str[2:-2]

In [None]:
len(fullbody_train)

In [None]:
def _norm_bb(row):
    xmin, ymin, xmax, ymax = row.bbox.split()
    return pd.Series(dict(
        image=row.image,
        xmin=float(xmin)/row.width,
        ymin=float(ymin)/row.height,
        xmax=float(xmax)/row.width,
        ymax=float(ymax)/row.height))

In [None]:
# Row-wise normalisation with a progress bar; the output keeps the row order of the full-body CSVs.
fullbody_train_norm = fullbody_train.progress_apply(_norm_bb, axis=1)
fullbody_test_norm = fullbody_test.progress_apply(_norm_bb, axis=1)

In [None]:
show_img_grid(fullbody_train_norm, 'train')

In [None]:
show_img_grid(fullbody_test_norm, 'test')

In [None]:
# Attach by image name (as the yolov5 column is) instead of by row position.
# fullbody_*_norm follows the row order of the full-body CSVs, and nothing in
# this notebook guarantees that order matches train_df / test_df.
fullbody_train_strings = dict(zip(fullbody_train_norm.image, columns_to_string(fullbody_train_norm)))
fullbody_test_strings = dict(zip(fullbody_test_norm.image, columns_to_string(fullbody_test_norm)))
# Images absent from the full-body CSVs get '', the notebook's marker for a missing box.
train_df['fullbody'] = train_df.image.map(fullbody_train_strings).fillna('')
test_df['fullbody'] = test_df.image.map(fullbody_test_strings).fillna('')

In [None]:
train_df.head()

# Dorsal ("Backfin") crops

In [None]:
dorsal_train = pd.read_csv('../input/backfin-detection-with-yolov5/train.csv')
dorsal_test = pd.read_csv('../input/backfin-detection-with-yolov5/test.csv')

# Trim two characters from each end of the bbox text.
for dorsal_df in (dorsal_train, dorsal_test):
    dorsal_df['bbox'] = dorsal_df.bbox.str[2:-2]

In [None]:
len(dorsal_train), len(dorsal_test)

In [None]:
def _norm_bb(row):
    if type(row.bbox) != str:
        xmin, ymin, xmax, ymax = -1, -1, -1, -1
    else:
        xmin, ymin, xmax, ymax = row.bbox.split()
    return pd.Series(dict(
        image=row.image,
        xmin=float(xmin)/row.width,
        ymin=float(ymin)/row.height,
        xmax=float(xmax)/row.width,
        ymax=float(ymax)/row.height))

In [None]:
# Row-wise normalisation with a progress bar; the output keeps the row order of the dorsal CSVs.
dorsal_train_norm = dorsal_train.progress_apply(_norm_bb, axis=1)
dorsal_test_norm = dorsal_test.progress_apply(_norm_bb, axis=1)

In [None]:
show_img_grid(dorsal_train_norm, 'train')

In [None]:
show_img_grid(dorsal_test_norm, 'test')

In [None]:
# Attach by image name (as the yolov5 column is) instead of by row position.
# dorsal_*_norm follows the row order of the dorsal CSVs, and nothing in this
# notebook guarantees that order matches train_df / test_df.
dorsal_train_strings = dict(zip(dorsal_train_norm.image, columns_to_string(dorsal_train_norm)))
dorsal_test_strings = dict(zip(dorsal_test_norm.image, columns_to_string(dorsal_test_norm)))
# Images absent from the dorsal CSVs get '', the notebook's marker for a missing box.
train_df['dorsal'] = train_df.image.map(dorsal_train_strings).fillna('')
test_df['dorsal'] = test_df.image.map(dorsal_test_strings).fillna('')

In [None]:
train_df.head()

# Pseudo Labels

In [None]:
pseudo_labels_df = pd.read_csv('../input/whale-pseudo-labels/pseudo_labels.csv')

In [None]:
pseudo_labels_df.head()

In [None]:
pseudo_labels_df = pseudo_labels_df.rename(columns={'target': 'individual_id'})

In [None]:
# Look up the species of each pseudo-labelled individual; the dict lookup raises
# KeyError for an individual_id that does not occur in train.csv.
pseudo_labels_df['species'] = pseudo_labels_df.individual_id.map(lambda val: indid_2_species[val])

In [None]:
pseudo_labels_df.head()

In [None]:
# Visual check of the pseudo labels: for the first few pseudo-labelled test
# images, show the test image next to training images of the predicted individual.
orig_train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')

row = 1
col = 4

for test_img in range(min(10, len(pseudo_labels_df))):
    plt.figure(figsize=(25,int(25*row/col)))
    df_row = pseudo_labels_df.iloc[test_img]

    # Slot 1: the pseudo-labelled test image.
    plt.subplot(row, col, 1)
    plt.axis('off')

    img = Image.open(f'../input/happy-whale-and-dolphin/test_images/{df_row.image}')
    plt.imshow(img)

    individual_rows = orig_train_df[orig_train_df.individual_id == df_row.individual_id]

    # Slots 2..col: up to col-1 training images of that individual. Starting
    # at slot 2 keeps the test image in slot 1 from being drawn over.
    for i, (idx, _row) in enumerate(individual_rows.head(col - 1).iterrows()):
        plt.subplot(row, col, i + 2)
        plt.axis('off')

        img = Image.open(f'../input/happy-whale-and-dolphin/train_images/{_row.image}')
        plt.imshow(img)

    plt.show()

In [None]:
# Encode the pseudo labels with the same integer codes as the training set.
pseudo_labels_df.individual_id = pseudo_labels_df.individual_id.apply(lambda _id: individual_ids_map[_id])
pseudo_labels_df.species = pseudo_labels_df.species.apply(lambda spec: species_map[spec])

In [None]:
# Pseudo-labelled rows are test images: copy their size and box columns from test_df.
# This runs before the backfill cells, so the boxes are copied as they stand at this point.
pseudo_labels_df = pseudo_labels_df.merge(test_df[['image', 'width', 'height', 'tc_vitbase', 'tc_vitsmall', 'tc_mocovit', 'yolov5', 'detic', 'fullbody', 'dorsal']], on='image', how='left')

# Replace missing examples

## Yolov5

Any Yolo examples that are missing, should be set to my hand annotated dataset.

In [None]:
# Rows whose Yolov5 box is '' take the hand-annotated box (which may itself be '').
yolo_missing = train_df.yolov5 == ''
print('Train Yolo missing before:', len(train_df.loc[yolo_missing]))
train_df.loc[yolo_missing, 'yolov5'] = train_df.my_box
print('Yolo missing after:', len(train_df.loc[train_df.yolov5 == '']))

Then TokenCut.

In [None]:
# Whatever is still '' falls back to the TokenCut ViT-Base box, in each of the three frames.
yolo_missing = train_df.yolov5 == ''
print('Yolo missing before:', len(train_df.loc[yolo_missing]))
train_df.loc[yolo_missing, 'yolov5'] = train_df.tc_vitbase
print('Yolo missing after:', len(train_df.loc[train_df.yolov5 == '']))

yolo_missing = test_df.yolov5 == ''
print('Test Yolo missing before:', len(test_df.loc[yolo_missing]))
test_df.loc[yolo_missing, 'yolov5'] = test_df.tc_vitbase
print('Test Yolo missing after:', len(test_df.loc[test_df.yolov5 == '']))

yolo_missing = pseudo_labels_df.yolov5 == ''
print('Pseudo Yolo missing before:', len(pseudo_labels_df.loc[yolo_missing]))
pseudo_labels_df.loc[yolo_missing, 'yolov5'] = pseudo_labels_df.tc_vitbase
print('Pseudo Yolo missing after:', len(pseudo_labels_df.loc[pseudo_labels_df.yolov5 == '']))

## Detic

In [None]:
# Rows whose Detic box is '' take the hand-annotated box (which may itself be '').
detic_missing = train_df.detic == ''
print('Detic missing before:', len(train_df.loc[detic_missing]))
train_df.loc[detic_missing, 'detic'] = train_df.my_box
print('Detic missing after:', len(train_df.loc[train_df.detic == '']))

Then TokenCut.

In [None]:
# Whatever is still '' falls back to the TokenCut ViT-Base box, in each of the three frames.
detic_missing = train_df.detic == ''
print('Detic missing before:', len(train_df.loc[detic_missing]))
train_df.loc[detic_missing, 'detic'] = train_df.tc_vitbase
print('Detic missing after:', len(train_df.loc[train_df.detic == '']))

detic_missing = test_df.detic == ''
print('Test Detic missing before:', len(test_df.loc[detic_missing]))
test_df.loc[detic_missing, 'detic'] = test_df.tc_vitbase
print('Test Detic missing after:', len(test_df.loc[test_df.detic == '']))

detic_missing = pseudo_labels_df.detic == ''
print('Pseudo Detic missing before:', len(pseudo_labels_df.loc[detic_missing]))
pseudo_labels_df.loc[detic_missing, 'detic'] = pseudo_labels_df.tc_vitbase
print('Pseudo Detic missing after:', len(pseudo_labels_df.loc[pseudo_labels_df.detic == '']))

# Setup Dataset

In [None]:
%%time

### Create the Kaggle dataset if it does not exist yet.
# /tmp/{DATASET_NAME} is the staging directory that the TFRecord cells write into.
DATASET_NAME = f'happywhale-tfrecords-private2'

# Start from an empty staging directory.
!rm -rf /tmp/{DATASET_NAME}

os.makedirs(f'/tmp/{DATASET_NAME}', exist_ok=True)

# Credentials are read from a JSON file in the inputs and exported for the kaggle CLI.
with open('../input/kaggleapi/kaggle.json') as f:
    kaggle_creds = json.load(f)
    
os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

# Created interactively.
!kaggle datasets init -p /tmp/{DATASET_NAME}

# Fill in the generated metadata template with this dataset's id and title.
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)

dataset_meta['id'] = f'lextoumbourou/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)

print(dataset_meta)

# Keep a second copy of the metadata as meta.json and list the staging directory.
!cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
!ls /tmp/{DATASET_NAME}

# Created interactively.
!kaggle datasets create -p /tmp/{DATASET_NAME} 

# Create TFRecords

In [None]:
def get_fold(fold):
    """Return the rows of ``train_df`` assigned to ``fold``, reordered for writing.

    Each row gets a score: its rank among the rows of the same individual
    within the fold, divided by that individual's row count in the fold. Rows
    are returned sorted by that score in descending order, restricted to the
    columns needed for the TFRecords.
    """
    output_columns = [
        'image', 'species', 'individual_id', 'width', 'height',
        'yolov5', 'detic', 'tc_vitbase', 'tc_vitsmall', 'tc_mocovit',
        'fullbody', 'dorsal',
    ]

    fold_df = train_df[train_df.fold == fold].reset_index(drop=True)

    # Position within the fold, then rank of that position inside each individual.
    fold_df['order'] = fold_df.index
    fold_df['order'] = fold_df.groupby('individual_id').order.rank()

    per_individual = fold_df.individual_id.value_counts().to_dict()
    fold_df['total_counts'] = fold_df.individual_id.map(per_individual)
    fold_df['order'] = fold_df['order'] / fold_df['total_counts']

    fold_df = fold_df.sort_values('order', ascending=False).reset_index(drop=True)
    return fold_df[output_columns]

In [None]:
def _bytes_feature(value):
    """Wrap one string / bytes value in a ``tf.train.Feature`` (bytes_list)."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor; take the raw value.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap one float / double value in a ``tf.train.Feature`` (float_list)."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap one bool / enum / int value in a ``tf.train.Feature`` (int64_list)."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def _bb_feature(bb):
    """Wrap a sequence of integers (a bounding box) in a ``tf.train.Feature`` (int64_list).

    Unlike ``_int64_feature``, ``bb`` is passed through as the whole value
    list rather than being wrapped as a single element.
    """
    payload = tf.train.Int64List(value=bb)
    return tf.train.Feature(int64_list=payload)

def serialize_example(image,image_name,target,species,yolov5_bb,detic_bb,tc_vitbase,tc_vitsmall,tc_mocovit,fullbody,dorsal):
    """Serialise one sample to a ``tf.train.Example`` byte string.

    ``image`` and ``image_name`` are stored as bytes, ``target`` and
    ``species`` as single int64 values, and each of the seven boxes as an
    int64 list.
    """
    boxes = {
        'yolov5_box': yolov5_bb,
        'detic_box': detic_bb,
        'tc_vitbase': tc_vitbase,
        'tc_vitsmall': tc_vitsmall,
        'tc_mocovit': tc_mocovit,
        'fullbody': fullbody,
        'dorsal': dorsal,
    }
    feature = {
        'image': _bytes_feature(image),
        'image_name': _bytes_feature(image_name),
        'target': _int64_feature(target),
        'species': _int64_feature(species),
    }
    for key, box in boxes.items():
        feature[key] = _bb_feature(box)

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()

In [None]:
def read_bbox(bbox, width, height):
    """Convert a normalised ``'xmin ymin xmax ymax'`` string to integer pixels.

    Each x value is multiplied by ``width`` and each y value by ``height``,
    then rounded to the nearest integer.

    Returns:
        ``np.ndarray`` ``[xmin, ymin, xmax, ymax]`` of integers.
    """
    xmin, ymin, xmax, ymax = map(float, bbox.split())
    scaled = (xmin * width, ymin * height, xmax * width, ymax * height)
    return np.array([int(round(value)) for value in scaled])


def create_tf_records_from_df(df, label, dataset, fold, progress=False):
    """Write every row of ``df`` to one TFRecord file in the staging directory.

    The file is named ``happywhale-2022-{label}-{fold}-{row count}.tfrec``.
    Each record stores the encoded image file as read from disk, the image
    name, the individual and species codes, and the seven boxes as pixels.

    Args:
        df: Rows to write; needs ``image``, ``individual_id``, ``species``,
            ``width``, ``height`` and the seven box-string columns.
        label: Tag used in the output file name.
        dataset: Directory prefix, used as ``{dataset}_images``.
        fold: Value used in the output file name.
        progress: When true, wrap the row iterator in a tqdm bar.
    """
    box_columns = ('yolov5', 'detic', 'tc_vitbase', 'tc_vitsmall', 'tc_mocovit', 'fullbody', 'dorsal')
    tfr_filename = f'/tmp/{DATASET_NAME}/happywhale-2022-{label}-{fold}-{df.shape[0]}.tfrec'
    with tf.io.TFRecordWriter(tfr_filename) as writer:
        rows = df.iterrows()
        if progress:
            rows = tqdm(rows, total=len(df))
        for _, record in rows:
            image_id = record.image
            image_path = f"../input/happy-whale-and-dolphin/{dataset}_images/{image_id}"
            image_encoded = tf.io.read_file(image_path)
            # Boxes are stored as normalised strings; records hold integer pixels.
            pixel_boxes = [
                list(read_bbox(record[column], record.width, record.height))
                for column in box_columns
            ]
            writer.write(serialize_example(
                image_encoded,
                str.encode(image_id),
                record.individual_id,
                record.species,
                *pixel_boxes,
            ))
            

def create_tf_records(label, dataset, fold=0, progress=False):
    """Write the TFRecord file for one training fold (rows come from ``get_fold``)."""
    return create_tf_records_from_df(get_fold(fold), label, dataset, fold, progress)

In [None]:
# create_tf_records(label='train', dataset='train', fold=0, progress=True)

In [None]:
import joblib
# Write the 10 training folds, one TFRecord file per fold, across 8 joblib workers.
_ = joblib.Parallel(n_jobs=8)(
        joblib.delayed(create_tf_records)(label='train', dataset='train', fold=fold) for fold in tqdm(range(10), total=10)
    )

# Pseudo labels

In [None]:
# All pseudo-labelled rows go into a single file. The images live under test_images,
# and the empty fold leaves two consecutive dashes in the file name.
create_tf_records_from_df(pseudo_labels_df, label='pseudo', dataset='test', fold='')

In [None]:
def create_test_tf_records(fold=0, progress=False):
    """Write the test rows whose ``split`` equals ``fold`` to one TFRecord file.

    The file is named ``happywhale-2022-test-{fold}-{row count}.tfrec``. Test
    images have no labels, so ``target`` and ``species`` are stored as -1.

    Args:
        fold: Shard number matched against ``test_df.split``.
        progress: When true, wrap the row iterator in a tqdm bar.
    """
    box_columns = ('yolov5', 'detic', 'tc_vitbase', 'tc_vitsmall', 'tc_mocovit', 'fullbody', 'dorsal')
    df = test_df[test_df.split==fold]
    tfr_filename = f'/tmp/{DATASET_NAME}/happywhale-2022-test-{fold}-{df.shape[0]}.tfrec'
    with tf.io.TFRecordWriter(tfr_filename) as writer:
        it = df.iterrows()
        if progress:
            it = tqdm(it, total=len(df))
        for i,row in it:
            image_id = row.image
            target = -1
            species = -1
            image_path = f"../input/happy-whale-and-dolphin/test_images/{image_id}"
            # Read each file once; the original read it twice and discarded the first result.
            image_encoded = tf.io.read_file(image_path)
            image_name = str.encode(image_id)
            # Boxes are stored as normalised strings; records hold integer pixels.
            pixel_boxes = [
                list(read_bbox(row[column], row.width, row.height))
                for column in box_columns
            ]
            example = serialize_example(image_encoded, image_name, target, species, *pixel_boxes)
            writer.write(example)

In [None]:
import joblib
# Write the 10 test shards (test_df.split values 0-9), one file per shard, across 8 joblib workers.
_ = joblib.Parallel(n_jobs=8)(
        joblib.delayed(create_test_tf_records)(fold) for fold in tqdm(range(10))
    )

In [None]:
from datetime import datetime
# Timestamp (local time) used as the version message when the dataset is uploaded.
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
print(version_name)

# Verify TFRecords

In this example I load some images from the TFRecords and plot them with all of their bounding boxes.

In [None]:
from functools import partial

In [None]:
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel float32 tensor with values in [0, 1]."""
    decoded = tf.image.decode_jpeg(image_data, channels = 3)
    as_float = tf.cast(decoded, tf.float32)
    return as_float / 255.0

In [None]:
def normalise_bounding_box(box, img):
    """Divide a pixel box ``[xmin, ymin, xmax, ymax]`` by the dimensions of ``img``.

    x values are divided by the width (``tf.shape(img)[1]``) and y values by
    the height (``tf.shape(img)[0]``).
    """
    dims = tf.shape(img)
    img_height = dims[0]
    img_width = dims[1]
    pixels = tf.cast(box, tf.int32)
    divisors = (img_width, img_height, img_width, img_height)
    return tf.convert_to_tensor([pixels[k] / divisors[k] for k in range(4)])

In [None]:
def read_labeled_tfrecord(example):
    """Parse one serialised example into the image, its labels and its normalised boxes.

    Boxes are normalised against the decoded full-size image, and only then is
    the image resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE``.

    Returns:
        ``(image, target, species, detic, yolov5, tc_vitbase, tc_vitsmall,
        tc_mocovit, fullbody, dorsal)``.
    """
    # Listed in the order in which the boxes are returned.
    box_keys = (
        'detic_box', 'yolov5_box', 'tc_vitbase', 'tc_vitsmall',
        'tc_mocovit', 'fullbody', 'dorsal',
    )

    feature_spec = {
        "image": tf.io.FixedLenFeature([], tf.string),  # encoded image bytes
        "image_name": tf.io.FixedLenFeature([], tf.string),  # shape [] means single element
        'target': tf.io.FixedLenFeature([], tf.int64),
        'species': tf.io.FixedLenFeature([], tf.int64),
    }
    for key in box_keys:
        feature_spec[key] = tf.io.FixedLenFeature([4], tf.int64)

    parsed = tf.io.parse_single_example(example, feature_spec)

    full_image = decode_image(parsed['image'])
    boxes = [normalise_bounding_box(parsed[key], full_image) for key in box_keys]

    resized = tf.image.resize(full_image, [IMAGE_SIZE,IMAGE_SIZE])
    return (resized, parsed['target'], parsed['species'], *boxes)

In [None]:
def load_dataset(filenames, crop_method, labeled=True, ordered=False):
    """Build a ``tf.data`` pipeline that parses the given TFRecord files.

    Args:
        filenames: TFRecord paths; read with ``num_parallel_reads=AUTO``.
        crop_method: Accepted so existing calls keep working, but not used.
            ``read_labeled_tfrecord`` takes only the serialised example, so
            forwarding this argument to it raised ``TypeError``.
        labeled: Unused; every element is parsed by ``read_labeled_tfrecord``.
        ordered: When false, non-deterministic element order is allowed
            for speed.

    Returns:
        Dataset yielding the tuples produced by ``read_labeled_tfrecord``.
    """
    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False # disable order, increase speed

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO) # reads several files at once
    dataset = dataset.with_options(ignore_order) # use data as soon as it streams in
    dataset = dataset.map(read_labeled_tfrecord)
    return dataset

def get_training_dataset(crop_method):
    """Return the repeating, shuffled, batched and prefetched dataset over ``TRAINING_FILENAMES``."""
    # NOTE(review): a later cell redefines load_dataset as
    # (filenames, labeled=True, ordered=False). Against that definition this call
    # passes crop_method positionally as `labeled` and then supplies labeled= again.
    # Confirm which definition is intended before calling this function.
    dataset = load_dataset(TRAINING_FILENAMES, crop_method, labeled=True)
    dataset = dataset.repeat() # the training dataset must repeat for several epochs
    dataset = dataset.shuffle(512)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO) # prefetch next batch while training (autotune prefetch buffer size)
    return dataset

def count_data_items(filenames):
    """Sum the item counts encoded in TFRecord file names.

    The count is the run of digits between a dash and the following dot,
    e.g. ``happywhale-2022-train-0-5103.tfrec`` contributes 5103.
    """
    count_pattern = re.compile(r"-([0-9]*)\.")
    counts = []
    for filename in filenames:
        counts.append(int(count_pattern.search(filename).group(1)))
    return np.sum(counts)

In [None]:
def load_dataset(filenames, labeled=True, ordered=False):
    """Build a ``tf.data`` pipeline that parses the given TFRecord files.

    Files are read with ``num_parallel_reads=AUTO``. Unless ``ordered`` is
    true, non-deterministic element order is allowed for speed. ``labeled`` is
    not used: every element is parsed by ``read_labeled_tfrecord``.
    """
    read_options = tf.data.Options()
    if not ordered:
        # Order does not matter for shuffled data; giving it up speeds up reading.
        read_options.experimental_deterministic = False

    records = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
    return records.with_options(read_options).map(read_labeled_tfrecord)

In [None]:
def plot_dataset(dataset):
    """Plot the first batch of ``dataset`` as an image grid with all seven boxes drawn.

    The grid has 4 columns and at most 5 rows (fewer when ``BATCH_SIZE`` is
    small). Each box source has its own colour / line width pair.
    """
    col = 4
    row = min(5, BATCH_SIZE//col)

    # (colour, line width) in the order detic, yolov5, vitbase, vitsmall,
    # mocovit, fullbody, dorsal.
    overlay_styles = (
        ('b', 1),
        ('r', 2),
        ('y', 3),
        ('m', 4),
        ('r', 5),
        ('violet', 5),
        ('mediumblue', 6),
    )

    for (image, label, species, detic, yolov5, vitbase, vitsmall, mocovit, fullbody, dorsal) in dataset:
        pixels = image.numpy()
        box_batches = (detic, yolov5, vitbase, vitsmall, mocovit, fullbody, dorsal)

        plt.figure(figsize=(25,int(25*row/col)))
        for j in range(row*col):
            plt.subplot(row,col,j+1)
            plt.axis('off')

            # PIL needs int between 0 and 255.
            im = Image.fromarray((pixels[j,]*255).astype(np.uint8))
            plt.imshow(im)
            ax = plt.gca()

            for boxes, (color, linewidth) in zip(box_batches, overlay_styles):
                ax.add_patch(convert_to_rect(im, boxes[j,].numpy(), color=color, linewidth=linewidth))

        plt.show()
        # Only the first batch is shown.
        break

In [None]:
# IMAGE_SIZE: side length that read_labeled_tfrecord resizes images to.
# BATCH_SIZE: batch size of the verification datasets; it also caps the plot grid in plot_dataset.
IMAGE_SIZE = 224
BATCH_SIZE = 32

In [None]:
# Verify the pseudo-label records: count them and plot one batch.
IMAGESIZE = [IMAGE_SIZE,IMAGE_SIZE]
AUTO = tf.data.experimental.AUTOTUNE
TRAINING_FILENAMES = tf.io.gfile.glob(f'/tmp/{DATASET_NAME}/happywhale-2022-pseudo-*.tfrec')
print(len(TRAINING_FILENAMES))
dataset = (
    load_dataset(TRAINING_FILENAMES, labeled=True)
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

print(count_data_items(TRAINING_FILENAMES))
plot_dataset(dataset)

In [None]:
# Verify the training records: count them and plot one batch.
# Despite its name, TEST_FILENAMES holds the train-fold files in this cell.
IMAGESIZE = [IMAGE_SIZE,IMAGE_SIZE]
AUTO = tf.data.experimental.AUTOTUNE
TEST_FILENAMES = tf.io.gfile.glob(f'/tmp/{DATASET_NAME}/happywhale-2022-train-*.tfrec')
print(len(TEST_FILENAMES))
dataset = (
    load_dataset(TEST_FILENAMES, labeled=True)
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

print(count_data_items(TEST_FILENAMES))
plot_dataset(dataset)

In [None]:
# Verify the test records: count them and plot one batch.
IMAGESIZE = [IMAGE_SIZE,IMAGE_SIZE]
AUTO = tf.data.experimental.AUTOTUNE
TEST_FILENAMES = tf.io.gfile.glob(f'/tmp/{DATASET_NAME}/happywhale-2022-test-*.tfrec')
print(len(TEST_FILENAMES))
dataset = (
    load_dataset(TEST_FILENAMES, labeled=True)
    .repeat()
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

print(count_data_items(TEST_FILENAMES))
plot_dataset(dataset)

# Upload Dataset

This was uploaded interactively, hence being commented out here.

In [None]:
!kaggle datasets version -m {version_name} -p /tmp/{DATASET_NAME} -r zip -q