In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import re
import math
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import joblib
import json
from datetime import datetime

In [None]:
%%time

# Stage and create the Kaggle dataset that will later receive the TFRecord files.
# NOTE(review): despite the "if not exists" intent, `kaggle datasets create` at the
# end of this cell runs unconditionally; presumably it just reports an error when
# the dataset already exists -- confirm.
DATASET_NAME = f'happywhale-tfrecords-detic-box-768'

# Remove any staging directory left from an earlier run, then recreate it empty.
!rm -rf /tmp/{DATASET_NAME}

os.makedirs(f'/tmp/{DATASET_NAME}', exist_ok=True)

# Load Kaggle API credentials; the JSON file must provide 'username' and 'key'.
with open('../input/mkb-kaggle-api/kaggle.json') as f:
    kaggle_creds = json.load(f)
    
# Expose the credentials through the environment variables read by the kaggle CLI.
os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

# Generate the dataset-metadata.json template inside the staging directory.
!kaggle datasets init -p /tmp/{DATASET_NAME}

with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)

# Fill in the dataset id (owner/slug) and title, then write the metadata back.
dataset_meta['id'] = f'ragnar123/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME
with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', "w") as outfile:
    json.dump(dataset_meta, outfile)
print(dataset_meta)

# Copy the metadata to meta.json.
# NOTE(review): presumably this gives the new dataset at least one data file to
# upload -- confirm.
!cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
!ls /tmp/{DATASET_NAME}
# `-u` creates the dataset as public, `-p` points at the staging directory.
!kaggle datasets create -u -p /tmp/{DATASET_NAME} 

In [None]:
# Configurations
# Number of TFRecord files to create for the train set. The same value is used
# as the number of stratified folds, so every train file holds exactly one fold.
FOLDS = 30
# Number of TFRecord files to create for the test set. create_test_tfrec writes
# up to 1000 consecutive test rows per file, so this must be at least
# ceil(len(test) / 1000) or the trailing test rows are never written.
TEST_FOLDS = 28
# Random seed for the shuffled stratified split
SEED = 42
# Target size passed to cv2.resize for every image (square, so the
# width/height order of the tuple does not matter here)
IMAGE_SIZE = (768, 768)

def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` backed by a ``BytesList``.

    Values whose type matches that of ``tf.constant(0)`` are converted with
    ``.numpy()`` first; anything else is stored as given.
    """
    tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, tensor_type) else value
    wrapped = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Wrap one float / double value in a ``tf.train.Feature`` backed by a ``FloatList``."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint value in a ``tf.train.Feature`` backed by an ``Int64List``."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def read_bbox(bbox):
    """Parse a whitespace-separated bounding-box string into an integer array.

    Args:
        bbox: String of integers separated by whitespace, e.g. ``'10 20 30 40'``.
            A missing value (``None``, NaN or any other non-string) or a
            blank string is treated as "no box".

    Returns:
        ``np.ndarray`` with the parsed integers. For a missing / blank input
        the sentinel ``[-1, -1, -1, -1]`` is returned, which is the same
        "no box" marker the callers already test for via ``box[0] != -1``.

    Raises:
        ValueError: If a token in a non-blank string is not an integer literal.
    """
    # A left merge leaves NaN (a float) for images without a detection, and a
    # blank string would otherwise produce an empty array that breaks ``box[0]``.
    if not isinstance(bbox, str) or not bbox.strip():
        return np.array([-1, -1, -1, -1])
    return np.array([int(token) for token in bbox.split()])

def serialize_example(image_id, image, species, individual_id, train = True):
    """Serialize one record into a ``tf.train.Example`` byte string.

    ``image_id`` and ``image`` are always stored as bytes features. When
    ``train`` is truthy, ``species`` and ``individual_id`` are added as int64
    features; otherwise those two arguments are ignored.
    """
    feature = {
        'image_id': _bytes_feature(image_id),
        'image': _bytes_feature(image),
    }
    if train:
        # Labels only exist for the train set.
        feature['species'] = _int64_feature(species)
        feature['individual_id'] = _int64_feature(individual_id)
    features = tf.train.Features(feature = feature)
    return tf.train.Example(features = features).SerializeToString()


# Read train set
train = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
# Fix misspelled / aliased species labels. The result is assigned back to the
# column rather than calling ``replace(..., inplace=True)`` on ``train.species``:
# an in-place edit through a selected column is chained assignment, which pandas
# deprecates and which leaves ``train`` untouched under Copy-on-Write.
train['species'] = train['species'].replace({"globis": "short_finned_pilot_whale",
                                             "pilot_whale": "short_finned_pilot_whale",
                                             "kiler_whale": "killer_whale",
                                             "bottlenose_dolpin": "bottlenose_dolphin"})
# Use encoders to label encode species and individual_ids
species_encoder = LabelEncoder()
train['species_encode'] = species_encoder.fit_transform(train['species'])
individual_id_encoder = LabelEncoder()
train['individual_id_encode'] = individual_id_encoder.fit_transform(train['individual_id'])
# Start stratifiedkfold strategy
kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
# Add folds to csv file.
# kfold.split yields positional indices; using them with .loc is valid here
# because read_csv gave `train` a default 0..n-1 index.
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train['species'])):
    train.loc[val_ind, 'fold'] = fold
train['fold'] = train['fold'].astype(int)
number_of_species = train['species_encode'].nunique()
number_of_individual_id = train['individual_id_encode'].nunique()
print(f'We have {number_of_species} unique species and {number_of_individual_id} unique individuals')

# Add detic (object detection) bounding boxes
detic_train_df = pd.read_csv('../input/whale2-cropped-dataset/train2.csv')
detic_test_df = pd.read_csv('../input/whale2-cropped-dataset/test2.csv')
detic_train_df.loc[detic_train_df.box.isna(), 'box'] = ''
detic_test_df.loc[detic_test_df.box.isna(), 'box'] = ''
# NOTE(review): a left merge repeats rows if the detic files list an image more
# than once -- confirm the 'image' column is unique in train2.csv / test2.csv.
train = train.merge(detic_train_df[['image', 'box']], how = 'left', on = 'image')
# Read test file
test = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')
test = test.merge(detic_test_df[['image', 'box']], how = 'left', on = 'image')
# Any missing box becomes '-1 -1 -1 -1'. Besides the '' placeholder set above,
# the left merge yields NaN for images that are absent from the detic files;
# those must be covered too, otherwise parsing the box string fails later.
train.loc[train['box'].isna() | (train['box'] == ''), 'box'] = '-1 -1 -1 -1'
test.loc[test['box'].isna() | (test['box'] == ''), 'box'] = '-1 -1 -1 -1'
# Save new csv file with encoded classes for mapping
train.to_csv(f'/tmp/{DATASET_NAME}/train_encoded.csv', index = False)

def _load_cropped_jpeg(image_path, box_str):
    """Read an image, crop it to its box, resize it and return JPEG bytes.

    Args:
        image_path: Path handed to ``cv2.imread``.
        box_str: Value of the ``box`` column. It is parsed with ``read_bbox``
            and used as ``image[box[1]:box[3], box[0]:box[2]]``, i.e. the
            first/third numbers bound the columns and the second/fourth the
            rows. A first value of ``-1`` means "no box, keep the full image".

    Returns:
        The image resized to ``IMAGE_SIZE`` and encoded as JPEG (quality 100).

    Raises:
        OSError: If ``cv2.imread`` cannot read the file.
        RuntimeError: If JPEG encoding fails.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise OSError(f'cv2.imread could not read image: {image_path}')

    # Non-string values (e.g. NaN from a merge) are treated as "no box".
    box = read_bbox(box_str) if isinstance(box_str, str) else ()
    if len(box) == 4 and box[0] != -1:
        # Clamp negatives to 0 so they cannot act as from-the-end indices.
        x_min, y_min, x_max, y_max = (max(int(v), 0) for v in box)
        crop = image[y_min:y_max, x_min:x_max]
        # A degenerate / out-of-image box gives an empty crop, which
        # cv2.resize rejects; fall back to the full image like the -1 case.
        if crop.size > 0:
            image = crop

    image = cv2.resize(image, IMAGE_SIZE)
    ok, encoded = cv2.imencode('.jpg', image, (cv2.IMWRITE_JPEG_QUALITY, 100))
    if not ok:
        raise RuntimeError(f'JPEG encoding failed for image: {image_path}')
    return encoded.tobytes()


def create_train_tfrec(fold):
    """Write all train rows of one fold to a single TFRecord file.

    The file is written to ``/tmp/{DATASET_NAME}/train-{fold}-{rows}.tfrec``
    where ``rows`` is the number of records in it. Each record holds the image
    id, the cropped/resized JPEG bytes and the encoded species / individual id.

    Args:
        fold: Value of the ``fold`` column selecting the rows to write.
    """
    train_ = train[train['fold'] == fold]
    out_path = f'/tmp/{DATASET_NAME}/train-{fold}-{train_.shape[0]}.tfrec'
    columns = zip(train_['image'], train_['box'],
                  train_['species_encode'], train_['individual_id_encode'])
    with tf.io.TFRecordWriter(out_path) as writer:
        for image_id, box, species, individual_id in columns:
            image = _load_cropped_jpeg(
                '../input/happy-whale-and-dolphin/train_images/' + image_id, box)
            example = serialize_example(str.encode(image_id), image,
                                        int(species), int(individual_id), train = True)
            writer.write(example)


def create_test_tfrec(fold):
    """Write one block of up to 1000 consecutive test rows to a TFRecord file.

    The file is written to ``/tmp/{DATASET_NAME}/test-{fold}-{rows}.tfrec``
    where ``rows`` is the number of records in it. Each record holds only the
    image id and the cropped/resized JPEG bytes.

    Args:
        fold: Zero-based block index; rows ``fold * 1000`` up to (but not
            including) ``(fold + 1) * 1000`` of ``test`` are written.
    """
    rows_per_file = 1000
    start = fold * rows_per_file
    test_ = test.iloc[start:start + rows_per_file]
    out_path = f'/tmp/{DATASET_NAME}/test-{fold}-{test_.shape[0]}.tfrec'
    with tf.io.TFRecordWriter(out_path) as writer:
        for image_id, box in zip(test_['image'], test_['box']):
            image = _load_cropped_jpeg(
                '../input/happy-whale-and-dolphin/test_images/' + image_id, box)
            example = serialize_example(str.encode(image_id), image, None, None, train = False)
            writer.write(example)
                
                
# Write the train TFRecord files, one task per fold, with up to 8 workers.
# tqdm wraps the generator of delayed tasks.
# NOTE(review): the bar therefore presumably advances as tasks are handed to
# joblib, not as they finish -- confirm.
_ = joblib.Parallel(n_jobs=8)(joblib.delayed(create_train_tfrec)(fold) for fold in tqdm(range(FOLDS))
)

# Write the test TFRecord files the same way, one task per block of test rows.
_ = joblib.Parallel(n_jobs=8)(joblib.delayed(create_test_tfrec)(fold) for fold in tqdm(range(TEST_FOLDS))
)
# Timestamp (no spaces) used as the version message of the upload.
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
# Upload the staging directory as a new dataset version:
# -m version message, -p folder to upload, -r zip (how directories are handled), -q quiet.
!kaggle datasets version -m {version_name} -p /tmp/{DATASET_NAME} -r zip -q