In [9]:
import os
import sys
from multiprocessing import Pool, cpu_count
from multiprocessing.pool import ThreadPool

from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py as h5

import tensorflow as tf
import tensorflow_hub as hub
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold

# Custom library in dev
import happy as hp

In [10]:
P = {}

P["TRAIN_CSV"] = "input/happy-whale-and-dolphin/train.csv"

P["TRAIN_FOLDER"] = "input/happy-whale-and-dolphin/train_images"

P["SIZE"] = [224, 224]

P["DSTDIR"] = "preprocessed_%d-%d" % tuple(P["SIZE"])

In [11]:
TEST = False

data_df = pd.read_csv(P["TRAIN_CSV"])

if TEST:
    data_df = data_df.iloc[:30]
print(len(data_df))
data_df.head()

51033


Unnamed: 0,image,species,individual_id
0,00021adfb725ed.jpg,melon_headed_whale,cadddb1636b9
1,000562241d384d.jpg,humpback_whale,1a71fbb72250
2,0007c33415ce37.jpg,false_killer_whale,60008f293a2b
3,0007d9bca26a99.jpg,bottlenose_dolphin,4b00fe572063
4,00087baf5cef7a.jpg,humpback_whale,8e5253662392


In [12]:
def jpg_to_array(filepath: str, dtype=tf.uint8):
    x = tf.io.read_file(filepath)
    x = tf.image.decode_jpeg(x)
    x = tf.image.convert_image_dtype(x, tf.float32)
    x = tf.image.resize(x, [224, 224], method="bicubic")
    if x.shape[2] == 1:
        x = tf.image.grayscale_to_rgb(x)
    x = tf.image.convert_image_dtype(x, dtype)
    return x.numpy()


# Estimate size of dataset
x = jpg_to_array(os.path.join(P["TRAIN_FOLDER"], data_df.at[0, "image"]))
size_GB = len(data_df) * sys.getsizeof(x) / 1024**3
print("Size of dset = %.2f MB" % (size_GB*1024))
print("Size of dset = %.2f GB" % size_GB)

Size of dset = 7332.65 MB
Size of dset = 7.16 GB


In [13]:
shape = (len(data_df), *P["SIZE"], 3)

np_data = np.empty(shape=shape, dtype=np.uint8)

for idx in tqdm(data_df.index.values):
    filepath = os.path.join(P["TRAIN_FOLDER"], data_df.at[idx, "image"])
    np_data[idx] = jpg_to_array(filepath)

100%|██████████| 51033/51033 [27:35<00:00, 30.83it/s]


In [15]:
np_data.shape

(51033, 224, 224, 3)

In [16]:
np_data.dtype

dtype('uint8')

In [26]:
h5file = os.path.join(P["DSTDIR"], "data.h5")
os.makedirs(P["DSTDIR"], exist_ok=True)

h5_shape = np_data.shape

with h5.File(h5file, "w") as f:
    f.create_dataset("img", data=np_data,  shape=h5_shape, chunks=(1, *h5_shape[1:]), dtype=np.uint8)

(51033, 224, 224, 3)
