In [24]:
import time
import dask
import dask.array as da
import dask.bag as db
import dask.dataframe as dd

import pandas as pd
from PIL import Image
import numpy as np

In [14]:
root = '/home/jovyan/work'

# Load the full metadata table into pandas (read through dask, then materialized).
metadata = dd.read_csv(root + '/data/metadata.csv').compute()
dataset_size = len(metadata)
sample_size = 20000
# Fraction of the dataset covered by the sample; capped at 1.0 and defined as 0.0
# for an empty table so this cell no longer fails on small or empty inputs.
frac = min(1.0, sample_size / dataset_size) if dataset_size else 0.0
# Sample by row count rather than by fraction: requesting frac > 1 without
# replacement raises, so clamp the count to the number of rows available.
sample = metadata.sample(n=min(sample_size, dataset_size), random_state=0)
print(len(sample))

In [34]:
import os

npy_files_path = f"{root}/data/big_earth/npy_files"
# makedirs + exist_ok keeps this cell re-runnable: it creates missing parent
# directories and does not raise when the directory is already there.
os.makedirs(npy_files_path, exist_ok=True)

In [35]:
def image_files_to_npy_file(image_prefix, bands=("02", "03", "04")):
    """Stack the band TIFFs of one image into a single ``.npy`` file.

    For each entry in ``bands`` the file
    ``{root}/data/big_earth/BigEarthNet-v1.0/{image_prefix}/{image_prefix}_B{band}.tif``
    is read as ``uint16``; the per-band arrays are stacked along a new last
    axis and saved to ``{npy_files_path}/{image_prefix}`` with ``np.save``.

    Args:
        image_prefix: Name of the image directory, also used as the file-name prefix.
        bands: Band suffixes to read, in the order they are stacked.

    Returns:
        None. The result is written to disk.
    """
    image_dir = f"{root}/data/big_earth/BigEarthNet-v1.0/{image_prefix}"

    band_arrays = []
    for band in bands:
        # Close each TIFF handle as soon as its pixels have been read, instead of
        # leaving three open files per image for the garbage collector.
        with Image.open(f"{image_dir}/{image_prefix}_B{band}.tif") as band_image:
            band_arrays.append(np.asarray(band_image, dtype=np.uint16))

    stacked_arr = np.stack(band_arrays, axis=-1)
    np.save(f"{npy_files_path}/{image_prefix}", stacked_arr)

In [36]:
# Convert the first 5000 sampled images to .npy files through a dask bag and
# print the elapsed wall-clock time in seconds.
start = time.time()
(
    db.from_sequence(sample.iloc[:5000]['image_prefix'].values)
    .map(image_files_to_npy_file)
    .compute()
)
print(time.time() - start)

134.17633366584778


In [71]:
import random
import time

import numpy as np
from tensorflow.keras.utils import Sequence


class AugmentedImageSequence(Sequence):
    """Batch sequence that loads images and applies augmentations to them.

    ``x`` holds one entry per sample and is passed, a batch at a time, to
    :meth:`batch_loader`, which subclasses must implement. When ``y`` is given,
    batches are ``(augmented_images, labels)`` tuples; when ``y`` is ``None``,
    batches are the loaded images only, without augmentation.

    Args:
        x: Per-sample inputs handed to ``batch_loader`` (array-like).
        y: Per-sample labels aligned with ``x``, or ``None`` for inference.
        batch_size: Number of samples per batch.
        augmentations: Callable invoked as ``augmentations(image=img)["image"]``.
    """

    def __init__(self, x: np.array, y: np.array, batch_size, augmentations):
        # Let the base class initialise itself; assumed to be a no-op when the
        # base defines no __init__ of its own -- TODO confirm for the Keras version in use.
        super().__init__()
        # Coerce to ndarrays so slicing and the list-based reindexing in
        # on_epoch_end are always positional. A pandas Series indexed with a
        # list of ints is looked up by label, which breaks the shuffle.
        self.x = np.asarray(x)
        self.y = None if y is None else np.asarray(y)
        self.base_index = list(range(len(self.x)))
        self.batch_size = batch_size
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of batches; the last batch may be smaller."""
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, batch_num):
        """Load batch ``batch_num``.

        Returns:
            ``(batch_x, batch_y)`` when labels are present, otherwise an array
            of the loaded images.

        Raises:
            IndexError: If ``batch_num`` is outside ``range(len(self))``.
        """
        # An out-of-range slice would be empty and fail later inside np.stack;
        # raise IndexError so plain iteration over the object can terminate.
        if not 0 <= batch_num < len(self):
            raise IndexError(f"batch index {batch_num} out of range")

        if batch_num == 0:
            print('getting batch_num', batch_num)

        lo = batch_num * self.batch_size
        hi = lo + self.batch_size
        batch_x = self.x[lo:hi]

        start = time.time()
        images = self.batch_loader(batch_x)

        # test (inference only)
        if self.y is None:
            return np.array(images)

        # training
        batch_y = self.y[lo:hi]
        batch_x = np.stack([self.augmentations(image=x)["image"] for x in images], axis=0)

        if batch_num == 0:
            print('fetched batch_num', batch_num, 'in', time.time() - start, 'seconds')

        return batch_x, batch_y

    def batch_loader(self, image_paths) -> np.array:
        """Turn one batch of ``x`` entries into images; implemented by subclasses."""
        raise NotImplementedError()

    def on_epoch_end(self):
        """Reshuffle ``x`` (and ``y``, when present) with one shared permutation."""
        shuffled_index = self.base_index.copy()
        random.shuffle(shuffled_index)
        self.x = self.x[shuffled_index]

        if self.y is not None:
            self.y = self.y[shuffled_index]

class AugmentedImageSequenceFromNpy(AugmentedImageSequence):
    """Augmented sequence whose batch entries are loaded with ``np.load``."""

    def __init__(self, x: np.array, y: np.array, batch_size, augmentations):
        # Straight pass-through to the base-class constructor.
        super().__init__(x=x, y=y, batch_size=batch_size, augmentations=augmentations)

    def batch_loader(self, image_paths) -> np.array:
        """Load every entry of ``image_paths`` and return the results as one array."""
        loaded = []
        for npy_path in image_paths:
            loaded.append(np.load(npy_path))
        return np.array(loaded)

In [39]:
from albumentations import (
    Compose, Flip, VerticalFlip, Resize, Rotate, ToFloat
)
import time

# Training pipeline: a Flip and a Rotate (limit range 0-360), each configured with p=0.5.
AUGMENTATIONS_TRAIN = Compose([Flip(p=0.5), Rotate(limit=(0, 360), p=0.5)])

# Test pipeline: an empty Compose, i.e. no transforms are configured.
AUGMENTATIONS_TEST = Compose([])

In [56]:
# Paths of the .npy files for the first 5000 sampled images.
xtrain = f"{npy_files_path}/" + sample['image_prefix'].iloc[:5000] + ".npy"
# Placeholder labels: one random (1, 44) array per path, used only for benchmarking.
ytrain = np.array([np.random.randn(1, 44) for _ in xtrain])

In [75]:
# Sequence over the .npy training files, using the training augmentations.
batch_size = 128
a = AugmentedImageSequenceFromNpy(
    x=xtrain,
    y=ytrain,
    batch_size=batch_size,
    augmentations=AUGMENTATIONS_TRAIN,
)

def benchmark(sequence, step_seconds=0.01):
    """Time one full pass over ``sequence`` and print the elapsed seconds.

    Args:
        sequence: Object supporting ``len()`` and integer indexing; each index
            in ``range(len(sequence))`` is fetched exactly once, in order.
        step_seconds: Pause after each batch, standing in for the time a
            consumer would spend on the batch.

    Returns:
        None. The elapsed time is printed.
    """
    start = time.time()
    # Index explicitly instead of `for batch in sequence`: when a class has no
    # __iter__, Python falls back to calling __getitem__ until IndexError, and a
    # slice-based __getitem__ never raises it.
    for batch_num in range(len(sequence)):
        _ = sequence[batch_num]
        time.sleep(step_seconds)
    print("finished epoch in", time.time() - start, "seconds")
        
# Disabled scratch check: print the shape of every batch.
# for x, y in a:
#     print(x.shape, y.shape)

# Disabled scratch check: trigger the end-of-epoch shuffle by hand.
# a.on_epoch_end()

# Time one full pass over the training sequence.
benchmark(a)

getting batch_num 0
fetched batch_num 0 in 0.5602941513061523 seconds
23.606159448623657
