In [1]:
import os
import shutil
import time

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [2]:
# Root data directory, given as a relative path (resolved against the kernel's working directory).
CLOUD_DIR = '../../data/95cloud'
# Training subdirectory: the train_<channel> TIF folders and the training patch list CSV
# used later in this notebook are read from here.
TRAIN_DIR = f'{CLOUD_DIR}/38-Cloud_training'

### Speed test: different ways of loading images
- Option 1: load all channels independently using PIL, stack the numpy arrays
- Option 2: load images from (pre-processed) .png files, which encode the 4 channels as RGBA
- Option 3: load images from (pre-processed) .npy files

In [3]:
# read image from 4 channel-wise TIFs and stack them
def load_cloud_img_1(patch_name: str):
    """Load one patch by reading its four single-channel TIFs and stacking them.

    For each of the channels nir, red, green and blue (in that order) the file
    ``{TRAIN_DIR}/train_{channel}/{channel}_{patch_name}.TIF`` is opened with
    PIL and converted to a numpy array.

    Args:
        patch_name: patch identifier used in the TIF file names.

    Returns:
        The per-channel arrays stacked along a new axis 2.
    """
    channel_order = ('nir', 'red', 'green', 'blue')
    per_channel = [
        np.array(Image.open(f'{TRAIN_DIR}/train_{channel}/{channel}_{patch_name}.TIF'))
        for channel in channel_order
    ]
    return np.stack(per_channel, axis=2)

# read image from RGBA PNG
def load_cloud_img_2(patch_name: str, patch_dir: str):
    """Open ``{patch_dir}/{patch_name}.png`` with PIL and return it as a numpy array.

    Args:
        patch_name: file name of the patch without the ``.png`` extension.
        patch_dir: directory that contains the PNG file.

    Returns:
        The decoded image as a numpy array.
    """
    png_path = f'{patch_dir}/{patch_name}.png'
    pil_img = Image.open(png_path)
    return np.array(pil_img)

# read image from numpy file
def load_cloud_img_3(patch_name: str, patch_dir: str):
    """Load the array stored in ``{patch_dir}/{patch_name}.npy``.

    Args:
        patch_name: file name of the patch without the ``.npy`` extension.
        patch_dir: directory that contains the ``.npy`` file.

    Returns:
        The array read back by ``np.load``.
    """
    npy_path = f'{patch_dir}/{patch_name}.npy'
    return np.load(npy_path)

# get the list of training patches and pick (up to) 100 random ones to load during the speed test
N_SPEED_TEST_PATCHES = 100
PATCH_SAMPLE_SEED = 0  # fixed seed so a re-run benchmarks the same patches

# a context manager closes the CSV handle; splitlines() copes with '\r\n'
# line endings and with a file that has no trailing newline
with open(f'{TRAIN_DIR}/training_patches_38-Cloud.csv') as patch_list_file:
    patch_lines = patch_list_file.read().splitlines()
# skip the first line (presumably a header row) and ignore blank lines
training_patches = np.array([line for line in patch_lines[1:] if line.strip()])

patch_rng = np.random.default_rng(PATCH_SAMPLE_SEED)
# never ask for more patches than exist (sampling is without replacement)
sample_size = min(N_SPEED_TEST_PATCHES, len(training_patches))
rand_patches = training_patches[patch_rng.choice(len(training_patches), size=sample_size, replace=False)]

# make a temp directory for the preprocessed images
temp_preprocess_dir = '/srv/share/sean/datasets/temp_dir_dl_benchmark'
if os.path.exists(temp_preprocess_dir):
    # best-effort removal, then wait until the directory is really gone;
    # the wait is bounded so a delete that keeps failing raises instead of hanging the kernel
    shutil.rmtree(temp_preprocess_dir, ignore_errors=True)
    removal_deadline = time.monotonic() + 30.0
    while os.path.exists(temp_preprocess_dir):
        if time.monotonic() > removal_deadline:
            raise RuntimeError(f'could not remove stale temp directory {temp_preprocess_dir}')
        time.sleep(0.1)
os.makedirs(temp_preprocess_dir)

# pre-process the images
for patch_name in rand_patches:
    cloud_img = load_cloud_img_1(patch_name)
    # pass the path (not an open handle) so numpy opens and closes the file itself
    np.save(f'{temp_preprocess_dir}/{patch_name}.npy', cloud_img)
    # RGBA holds 8 bits per channel. Passing mode='RGBA' together with uint16
    # data does not convert it: PIL re-reads the raw 16-bit buffer as 8-bit
    # pixels. Reduce to the high byte explicitly and let PIL infer the mode
    # from the (H, W, 4) uint8 array instead.
    if cloud_img.dtype == np.uint16:
        cloud_img_8bit = (cloud_img >> 8).astype(np.uint8)
    else:
        cloud_img_8bit = cloud_img
    cloud_img_rgba = Image.fromarray(cloud_img_8bit)
    cloud_img_rgba.save(f'{temp_preprocess_dir}/{patch_name}.png')

In [4]:
# Run the test!
# Each %timeit statement loads every patch in rand_patches once, so the
# reported times are per batch of patches, not per single image.

# Option 1: read the four single-channel TIFs and stack them
print('1) Load 4 .tif images and stack:')
%timeit for patch_name in rand_patches: load_cloud_img_1(patch_name)

# Option 2: read the pre-processed RGBA PNG from the temp directory
print('\n2) Load 1 .png image (rgba):')
%timeit for patch_name in rand_patches: load_cloud_img_2(patch_name, temp_preprocess_dir)

# Option 3: read the pre-processed .npy file from the temp directory
print('\n3) Load 1 .npy file:')
%timeit for patch_name in rand_patches: load_cloud_img_3(patch_name, temp_preprocess_dir)

1) Load 4 .tif images and stack:
254 ms ± 1.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

2) Load 1 .png image (rgba):
436 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

3) Load 1 .npy file:
78.1 ms ± 223 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Compare the dtypes returned by option 1 (stacked TIFs) and option 2 (RGBA PNG)
# for the same patch. PIL's RGBA mode holds 8 bits per channel, so the PNG route
# cannot carry the TIFs' dtype through, which is a drawback on top of its speed.
# NOTE(review): why the PNG route is the slowest is not measured here;
# presumably PNG decompression cost. Confirm by profiling.
im1, im2 = load_cloud_img_1(rand_patches[0]), load_cloud_img_2(rand_patches[0], temp_preprocess_dir)
print(im1.dtype, im2.dtype)

uint16 uint8


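### Pre-processing and saving .npy files is by far the fastest option: in the run above, loading the batch of patches took ~78 ms from .npy files, versus ~254 ms for stacking the TIFs and ~436 ms for the RGBA PNGs.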

In [6]:
# Clean up: delete the temp directory together with the .npy/.png files written during pre-processing.
# ignore_errors is not set here, so a failed delete raises instead of passing silently.
shutil.rmtree(temp_preprocess_dir)