# Converting the image data to HDF5 (with h5py) for faster loading

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import h5py
from tqdm import tqdm
import cv2

In [2]:
# Image metadata tables; the 'Id' column of each is used below to locate the image files.
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/sample_submission.csv')

# Each image is stored as one grayscale PNG per channel; this order defines
# the channel axis of every array built by this notebook.
channels = ['red', 'green', 'blue', 'yellow']

# Destination HDF5 file. Plain string literal: the previous f-string had no placeholders.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
hdf_path = '/media/litemax/A036809A368072D8/Users/JALDI/Data/external-data-for-protein-atlas/hpa_data.hdf5'

In [3]:
def load_image(id, path):
    """Load the four channel PNGs of one sample into a single uint8 array.

    Parameters
    ----------
    id : str
        Image identifier; files are read from ``./data/<path>/<id>_<channel>.png``.
    path : str
        Sub-directory of ``./data`` to read from (this notebook uses 'train' and 'test').

    Returns
    -------
    np.ndarray
        Array of shape (512, 512, 4) and dtype uint8, channels last, in the
        order given by the module-level ``channels`` list.

    Raises
    ------
    FileNotFoundError
        If a channel file is missing or cannot be decoded.
    """
    img = np.zeros((512, 512, 4), dtype=np.uint8)
    for c, ch in enumerate(channels):
        filename = './data/{}/{}_{}.png'.format(path, id, ch)
        plane = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
        # cv2.imread reports failure by returning None rather than raising;
        # without this check the slice assignment below fails with an opaque
        # TypeError that does not mention the offending file.
        if plane is None:
            raise FileNotFoundError('Could not read image file: {}'.format(filename))
        img[:, :, c] = plane
    return img

In [None]:
# Write every train and test image into one HDF5 file.
# mode='w' creates the file and truncates any existing file at hdf_path.
with h5py.File(hdf_path, mode='w') as train_hdf5:
    # One fixed-size uint8 dataset per split, laid out as (image, row, column, channel).
    # Keeping the returned dataset objects avoids a lookup by name on every iteration.
    train_ds = train_hdf5.create_dataset("train", (len(train_df), 512, 512, 4), np.uint8)
    test_ds = train_hdf5.create_dataset("test", (len(test_df), 512, 512, 4), np.uint8)

    # enumerate() has no length, so tqdm needs total= to draw a progress bar with an ETA.
    # The loop variable is named image_id so the builtin id() is not shadowed
    # for the rest of the kernel session.
    for i, image_id in tqdm(enumerate(train_df['Id']), total=len(train_df)):
        train_ds[i, ...] = load_image(image_id, 'train')

    for i, image_id in tqdm(enumerate(test_df['Id']), total=len(test_df)):
        test_ds[i, ...] = load_image(image_id, 'test')

133it [00:02, 41.82it/s]

## Rough Benchmark

In [5]:
# Pick 8 distinct row indices for the benchmark batch.
# h5py fancy indexing expects indices in increasing order and does not support
# duplicates, so sample without replacement (np.random.randint could repeat an
# index) and sort the result.
randind = np.sort(np.random.choice(len(train_df), 8, replace=False))

In [6]:
# Open the HDF5 file read-only and keep the handle in a notebook-level
# variable so later cells can read from it without reopening the file.
train_hdf5 = h5py.File(hdf_path, mode="r")

In [14]:
%%timeit
# Time one fancy-indexed read of the rows selected by `randind` from the
# already-open file handle.
# Opening the file inside the timed cell instead, i.e.
#     with h5py.File(hdf_path, "r") as train_hdf5:
# was observed to cause a ~20% slowdown, so the handle is opened beforehand.
batch = train_hdf5['train'][randind, ...]

3.29 ms ± 130 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [71]:
%%timeit
batch = np.zeros((8, 4, 512, 512), dtype=np.uint8)
for i, ind in enumerate(randind):
    batch[i, ...] = load_image(train_df['Id'][ind])

79.9 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Release the file handle now that the benchmark cells are done with it.
train_hdf5.close()

Suggestions are welcome.

In [7]:
# Re-open the file for this read: the shared handle was closed above, so
# indexing it on a fresh top-to-bottom run would fail. The slice is copied
# into memory, so `batch` stays usable after the file is closed again.
with h5py.File(hdf_path, "r") as hdf5_file:
    batch = hdf5_file['train'][0, ...]

In [21]:
# The datasets are written channels-last, (512, 512, 4), so the first three
# channels (red, green, blue) can be sliced directly. Moving axis 0 to the
# end would shuffle the spatial axes and give (512, 4, 3) instead.
batch[:, :, :3].shape

(512, 512, 3)