To speed up the loading of training samples, we can pre-scale the images to a smaller size (here 512×512 pixels) and store them in a format that loads faster, such as jpg.

In [None]:
import os

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import cv2
from PIL import Image

from matplotlib import pyplot as plt

## Experiment
We compare the loading speed of four loading strategies:
+ do nothing: load 4 full-size png images
+ load 1 single scaled-down png image with 4 channels
+ load 4 scaled-down jpg images
+ load a jpg image with the RGB channels and a jpg image with the yellow channel

In [None]:
# Read the full-size blue-channel PNG of one sample without any flag-driven
# conversion (IMREAD_UNCHANGED) and preview it.
sample_blue_png = (
    '../input/hpa-single-cell-image-classification/train/'
    '000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_blue.png'
)
im = cv2.imread(sample_blue_png, cv2.IMREAD_UNCHANGED)
plt.imshow(im)

In [None]:
# Preview the same channel after shrinking it to 512x512 with bilinear interpolation.
im_small = cv2.resize(im, (512, 512), interpolation=cv2.INTER_LINEAR)
plt.imshow(im_small)

### Prepare images

In [None]:
# Build every downscaled file that the timing cells read back:
#   im.png      - one PNG holding all four channels
#   im?.jpg     - one JPEG per channel
#   imgrgb.jpg  - blue/green/red packed into a single 3-channel JPEG
sample_prefix = '../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0'
target_size = (512, 512)

# Read each channel as stored on disk and shrink it with bilinear interpolation.
imb, img, imr, imy = (
    cv2.resize(
        cv2.imread(f'{sample_prefix}_{colour}.png', cv2.IMREAD_UNCHANGED),
        target_size,
        interpolation=cv2.INTER_LINEAR,
    )
    for colour in ('blue', 'green', 'red', 'yellow')
)

# Channels-last layout (height, width, 4) for the single 4-channel PNG.
im = np.array([imb, img, imr, imy]).transpose(1, 2, 0)
cv2.imwrite('im.png', im)

# One single-channel JPEG per colour.
for filename, channel in (('imb.jpg', imb), ('img.jpg', img), ('imr.jpg', imr), ('imy.jpg', imy)):
    cv2.imwrite(filename, channel)

bgr = np.array([imb, img, imr]).transpose(1, 2, 0)
cv2.imwrite('imgrgb.jpg', bgr)  # opencv uses BGR format

In [None]:
# List the working directory with human-readable sizes to compare the files written above.
!ls -lh

do nothing: load 4 full size png images

In [None]:
%%timeit
# Baseline: decode the four per-channel PNGs from the input directory with
# IMREAD_UNCHANGED, i.e. without resizing or flag-driven conversion.
imb = cv2.imread('../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_blue.png', cv2.IMREAD_UNCHANGED)
img = cv2.imread('../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_green.png', cv2.IMREAD_UNCHANGED)
imr = cv2.imread('../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_red.png', cv2.IMREAD_UNCHANGED)
imy = cv2.imread('../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_yellow.png', cv2.IMREAD_UNCHANGED)

load 1 single scaled-down png image with 4 channels

In [None]:
%%timeit
cv2.imread('im.png')

load 4 scaled-down jpg images

In [None]:
%%timeit
cv2.imread('imb.jpg')
cv2.imread('img.jpg')
cv2.imread('imr.jpg')
cv2.imread('imy.jpg')

load a jpg image with the RGB channels and a jpg image with the yellow channel; loading time is reduced from 164 ms to 2.87 ms

In [None]:
%%timeit
cv2.imread('imrgb.jpg')
cv2.imread('imy.jpg')

In [None]:
# Load the packed 3-channel JPEG and reorder its channels from OpenCV's BGR
# to RGB, the order matplotlib expects, before displaying it.
bgr_pixels = cv2.imread('imgrgb.jpg', cv2.IMREAD_UNCHANGED)
imrgb = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
plt.imshow(imrgb)

In [None]:
# Load the yellow-channel JPEG as stored and display it.
yellow_jpg = 'imy.jpg'
imy = cv2.imread(yellow_jpg, cv2.IMREAD_UNCHANGED)
plt.imshow(imy)

## Process the whole dataset

In [None]:
!rm -rf *
!mkdir train

In [None]:
# Load the training metadata; its ID column drives the conversion below.
train_csv = '../input/hpa-single-cell-image-classification/train.csv'
df = pd.read_csv(train_csv)
df

In [None]:
from joblib import Parallel, delayed

In [None]:
path = '../input/hpa-single-cell-image-classification/train'

def save(ID, size=512, out_dir='train'):
    """Downscale the four channel PNGs of one sample and store them as two JPEGs.

    Reads ``{path}/{ID}_blue.png``, ``_green.png``, ``_red.png`` and
    ``_yellow.png``, resizes each to ``size`` x ``size`` with bilinear
    interpolation, then writes ``{out_dir}/{ID}_rgb.jpg`` (blue, green and red
    stacked in that order, i.e. OpenCV's BGR layout) and
    ``{out_dir}/{ID}_yellow.jpg``.

    Args:
        ID: sample identifier, i.e. the file-name prefix shared by the four PNGs.
        size: edge length in pixels of the square output images.
        out_dir: directory that receives the JPEG files; it must already exist.

    Raises:
        FileNotFoundError: if one of the channel PNGs cannot be read.
        OSError: if OpenCV reports that a JPEG could not be written.
    """
    resized = {}
    for colour in ('blue', 'green', 'red', 'yellow'):
        src = f'{path}/{ID}_{colour}.png'
        pixels = cv2.imread(src, cv2.IMREAD_UNCHANGED)
        # cv2.imread reports an unreadable file by returning None rather than
        # raising; fail here with the offending path instead of inside cv2.resize.
        if pixels is None:
            raise FileNotFoundError(f'could not read image: {src}')
        resized[colour] = cv2.resize(pixels, (size, size), interpolation=cv2.INTER_LINEAR)

    # Channels-last stack in blue, green, red order so the JPEG decodes as RGB.
    bgr = np.stack([resized[colour] for colour in ('blue', 'green', 'red')], axis=-1)
    outputs = (
        (f'{out_dir}/{ID}_rgb.jpg', bgr),
        (f'{out_dir}/{ID}_yellow.jpg', resized['yellow']),
    )
    for dst, pixels in outputs:
        # cv2.imwrite returns False on failure; surface it so a batch run does
        # not silently produce an incomplete dataset.
        if not cv2.imwrite(dst, pixels):
            raise OSError(f'could not write image: {dst}')

In [None]:
# Convert every sample listed in the metadata using 4 joblib workers; tqdm shows
# progress as the tasks are handed out.
jobs = (delayed(save)(sample_id) for sample_id in tqdm(df['ID'], total=len(df)))
Parallel(n_jobs=4)(jobs)
'' # suppress output

In [None]:
# Report the total size of the converted images and count the directory entries.
# NOTE(review): `ls -f` also lists `.` and `..`, so the count presumably exceeds
# the number of image files by two - confirm before relying on it.
!du -sh train
!ls -f train | wc -l

In [None]:
# Copy the label file into the working directory, next to the converted images in train/.
!cp ../input/hpa-single-cell-image-classification/train.csv .