In [1]:
import os, sys
import numpy as np
import pandas as pd
import skimage.io
from skimage.transform import resize

import PIL
from PIL import Image
import cv2
import warnings
import h5py
warnings.filterwarnings("ignore")
# Image side length in pixels (presumably; the dataset created below uses 512x512).
# NOTE(review): not referenced by any visible cell — confirm it is still needed.
SIZE = 512

Using TensorFlow backend.


In [2]:
# Load dataset info
# Root directory of the input data; the train/test loaders below build their paths from it.
DIR = '../input/'
# Training metadata table, located via DIR instead of a second hard-coded path.
# NOTE: the name `data` is rebound to the HDF5 file handle in a later cell.
data = pd.read_csv(os.path.join(DIR, 'train.csv'))

In [3]:
def getTrainDataset(base_dir=None, num_classes=28):
    """Build the training sample paths and their multi-hot label vectors.

    Reads ``train.csv`` from the data root and, for every row, produces the
    path prefix ``<root>/train/<Id>`` and a label vector with a 1 at every
    class index listed (space-separated) in the ``Target`` column.

    Parameters
    ----------
    base_dir : str, optional
        Data root containing ``train.csv`` and the ``train`` directory.
        Defaults to the module-level ``DIR`` (looked up at call time).
    num_classes : int, optional
        Length of each label vector. Defaults to 28.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``paths`` with one entry per row and ``labels`` of shape
        ``(n_rows, num_classes)`` (float64), so an empty CSV still yields a
        correctly shaped ``(0, num_classes)`` label matrix.

    Raises
    ------
    IndexError
        If a target index is outside ``[-num_classes, num_classes)``.
    ValueError
        If a target token is not an integer.
    """
    root = DIR if base_dir is None else base_dir
    train_dir = os.path.join(root, 'train')
    # Force string dtype: a CSV containing only single-label rows would
    # otherwise be parsed as integers and break the `.str` accessor.
    frame = pd.read_csv(os.path.join(root, 'train.csv'),
                        dtype={'Id': str, 'Target': str})

    paths = np.array([os.path.join(train_dir, name) for name in frame['Id']])

    # Preallocate the whole label matrix instead of stacking per-row vectors.
    labels = np.zeros((len(frame), num_classes))
    for row, keys in enumerate(frame['Target'].str.split(' ')):
        labels[row, [int(key) for key in keys]] = 1

    return paths, labels

def getTestDataset(base_dir=None, num_classes=28):
    """Build the test sample paths and placeholder label vectors.

    Reads ``sample_submission.csv`` from the data root and, for every ``Id``,
    produces the path prefix ``<root>/test/<Id>``. The labels are all-ones
    placeholders, one vector per row.

    Parameters
    ----------
    base_dir : str, optional
        Data root containing ``sample_submission.csv`` and the ``test``
        directory. Defaults to the module-level ``DIR`` (looked up at call time).
    num_classes : int, optional
        Length of each placeholder label vector. Defaults to 28.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``paths`` with one entry per row and ``labels`` of shape
        ``(n_rows, num_classes)`` filled with ones (float64).
    """
    root = DIR if base_dir is None else base_dir
    test_dir = os.path.join(root, 'test')
    # Ids are used as file-name prefixes, so keep them as strings.
    frame = pd.read_csv(os.path.join(root, 'sample_submission.csv'),
                        dtype={'Id': str})

    paths = np.array([os.path.join(test_dir, name) for name in frame['Id']])
    labels = np.ones((len(frame), num_classes))

    return paths, labels
# Materialise the training set: `paths` are per-sample file-name prefixes (no channel
# suffix or extension), `labels` the matching 28-element multi-hot vectors.
paths, labels = getTrainDataset()

In [4]:
# Sanity check: show the multi-hot label vector of the first training sample.
labels[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [5]:
from tqdm import tqdm
# Destination HDF5 file. Written as a raw string so the Windows backslashes are taken
# literally: '\H', '\i' and '\p' are invalid escape sequences in a normal string literal
# and trigger a DeprecationWarning/SyntaxWarning on current Python versions.
# NOTE: machine-specific absolute path — adjust before running on another machine.
hdf5_path = r'D:\Human-Protein-Atlas-Image-Classification\input\proteins.h5'

In [6]:
# Open the HDF5 container in write mode ("w" creates the file, truncating an existing one).
# NOTE: this rebinds `data`, which earlier held the training DataFrame.
data = h5py.File(hdf5_path, mode = "w")

# "photos" starts with zero rows and may grow without limit along axis 0;
# every row is one 512 x 512 image with 4 channels.
data.create_dataset(
    name = "photos",
    shape = (0, 512, 512, 4),
    maxshape = (None, 512, 512, 4),
    # Chunking/compression options kept disabled:
    # chunks = True, compression = "gzip", compression_opts = 9
)

<HDF5 dataset "photos": shape (0, 512, 512, 4), type "<f4">

In [7]:
# Number of samples data_gen loads and appends to the HDF5 dataset per iteration.
batch_size = 1024
# Image extent in pixels along each spatial axis.
# NOTE(review): neither name is referenced by the visible cells — confirm they are needed.
x_max = y_max = 512

In [8]:
# File-name suffixes of the per-channel PNGs. Their order here is the order of the
# last axis of the array returned by resize_img.
# NOTE(review): the order is red, blue, green, yellow (not R, G, B, Y) — confirm this is
# intended; changing it would change the stored channel layout.
channels = ["red", "blue", "green", "yellow"]
def resize_img(path):
    """Load all channel images of one sample and stack them into a single array.

    Despite the name, no resizing is performed: each ``<path>_<channel>.png``
    is read as-is, the channels are stacked along a new last axis and the
    result is divided by 255 (which maps 8-bit pixel values to [0, 1] —
    assumes 8-bit PNGs).

    Parameters
    ----------
    path : str
        Sample path prefix, without the ``_<channel>.png`` suffix.

    Returns
    -------
    numpy.ndarray
        Floating-point array whose last axis has one entry per name in
        ``channels``.
    """
    planes = []
    for channel in channels:
        # Close each file handle deterministically; this function is run from a
        # thread pool, so relying on garbage collection can pile up open files.
        with Image.open(path + '_' + channel + '.png') as img:
            planes.append(np.array(img))
    return np.stack(planes, -1)/255

In [9]:
import concurrent.futures
def data_gen(paths, max_workers = 16):
    """Load every sample in `paths` and append it to the "photos" HDF5 dataset.

    Samples are decoded in batches of ``batch_size`` by a thread pool and each
    batch is appended at the current end of ``data["photos"]``.

    Parameters
    ----------
    paths : sequence of str
        Sample path prefixes accepted by ``resize_img``.
    max_workers : int, optional
        Number of loader threads. Defaults to 16.

    Notes
    -----
    Relies on the module-level ``data`` (open, writable HDF5 file),
    ``batch_size`` and ``resize_img``.
    """
    photos = data["photos"]
    # One pool for the whole run instead of creating/tearing one down per batch.
    with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
        for i in tqdm(range(0, len(paths), batch_size)):
            xs = list(executor.map(resize_img, paths[i:i+batch_size]))
            # Convert straight to the dataset's dtype: the values are cast to it on
            # write anyway, and this avoids building a second full-precision copy.
            final_images = np.asarray(xs, dtype = photos.dtype)
            del xs
            print(final_images.shape)
            # Append at the dataset's actual end rather than at offset `i`, so the
            # write stays correct even when the dataset was not empty to begin with.
            start = photos.shape[0]
            photos.resize(start + final_images.shape[0], axis = 0)
            photos[start:start + final_images.shape[0], :, :, :] = final_images
# Always close the file, even if loading fails part-way, so the handle is released.
try:
    data_gen(paths)
finally:
    data.close()

  0%|                                                                                           | 0/31 [00:00<?, ?it/s]

(1024, 512, 512, 4)


  3%|██▋                                                                                | 1/31 [00:20<10:29, 20.99s/it]

(1024, 512, 512, 4)


  6%|█████▎                                                                             | 2/31 [00:44<10:50, 22.44s/it]

(1024, 512, 512, 4)


 10%|████████                                                                           | 3/31 [01:07<10:29, 22.47s/it]

(1024, 512, 512, 4)


 13%|██████████▋                                                                        | 4/31 [01:29<10:06, 22.46s/it]

(1024, 512, 512, 4)


 16%|█████████████▍                                                                     | 5/31 [01:52<09:43, 22.44s/it]

(1024, 512, 512, 4)


 19%|████████████████                                                                   | 6/31 [02:15<09:22, 22.51s/it]

(1024, 512, 512, 4)


 23%|██████████████████▋                                                                | 7/31 [02:37<08:59, 22.47s/it]

(1024, 512, 512, 4)


 26%|█████████████████████▍                                                             | 8/31 [03:00<08:37, 22.51s/it]

(1024, 512, 512, 4)


 29%|████████████████████████                                                           | 9/31 [03:23<08:17, 22.61s/it]

(1024, 512, 512, 4)


 32%|██████████████████████████▍                                                       | 10/31 [03:46<07:55, 22.64s/it]

(1024, 512, 512, 4)


 35%|█████████████████████████████                                                     | 11/31 [04:09<07:32, 22.65s/it]

(1024, 512, 512, 4)


 39%|███████████████████████████████▋                                                  | 12/31 [04:32<07:11, 22.73s/it]

(1024, 512, 512, 4)


 42%|██████████████████████████████████▍                                               | 13/31 [04:56<06:51, 22.84s/it]

(1024, 512, 512, 4)


 45%|█████████████████████████████████████                                             | 14/31 [05:20<06:28, 22.87s/it]

(1024, 512, 512, 4)


 48%|███████████████████████████████████████▋                                          | 15/31 [05:43<06:06, 22.90s/it]

(1024, 512, 512, 4)


 52%|██████████████████████████████████████████▎                                       | 16/31 [06:08<05:45, 23.03s/it]

(1024, 512, 512, 4)


 55%|████████████████████████████████████████████▉                                     | 17/31 [06:35<05:25, 23.27s/it]

(1024, 512, 512, 4)


 58%|███████████████████████████████████████████████▌                                  | 18/31 [06:59<05:03, 23.31s/it]

(1024, 512, 512, 4)


 61%|██████████████████████████████████████████████████▎                               | 19/31 [07:23<04:40, 23.36s/it]

(1024, 512, 512, 4)


 65%|████████████████████████████████████████████████████▉                             | 20/31 [07:48<04:17, 23.43s/it]

(1024, 512, 512, 4)


 68%|███████████████████████████████████████████████████████▌                          | 21/31 [08:15<03:55, 23.60s/it]

(1024, 512, 512, 4)


 71%|██████████████████████████████████████████████████████████▏                       | 22/31 [08:41<03:33, 23.69s/it]

(1024, 512, 512, 4)


 74%|████████████████████████████████████████████████████████████▊                     | 23/31 [09:06<03:10, 23.78s/it]

(1024, 512, 512, 4)


 77%|███████████████████████████████████████████████████████████████▍                  | 24/31 [09:34<02:47, 23.96s/it]

(1024, 512, 512, 4)


 81%|██████████████████████████████████████████████████████████████████▏               | 25/31 [10:03<02:24, 24.13s/it]

(1024, 512, 512, 4)


 84%|████████████████████████████████████████████████████████████████████▊             | 26/31 [10:29<02:01, 24.21s/it]

(1024, 512, 512, 4)


 87%|███████████████████████████████████████████████████████████████████████▍          | 27/31 [10:55<01:37, 24.27s/it]

(1024, 512, 512, 4)


 90%|██████████████████████████████████████████████████████████████████████████        | 28/31 [11:21<01:12, 24.32s/it]

(1024, 512, 512, 4)


 94%|████████████████████████████████████████████████████████████████████████████▋     | 29/31 [11:46<00:48, 24.38s/it]

(1024, 512, 512, 4)


 97%|███████████████████████████████████████████████████████████████████████████████▎  | 30/31 [12:12<00:24, 24.43s/it]

(352, 512, 512, 4)


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [12:25<00:00, 24.04s/it]


In [10]:
# Re-open the finished HDF5 file read-only to inspect what was written.
data = h5py.File(hdf5_path, mode = "r")

In [11]:
# Read the first stored sample back from disk as a quick sanity check of the written values.
data["photos"][0]

array([[[0.03137255, 0.        , 0.03529412, 0.        ],
        [0.        , 0.        , 0.02352941, 0.        ],
        [0.        , 0.08235294, 0.10588235, 0.        ],
        ...,
        [0.        , 0.03529412, 0.00392157, 0.01568628],
        [0.        , 0.        , 0.        , 0.02352941],
        [0.        , 0.        , 0.        , 0.        ]],

       [[0.01960784, 0.01960784, 0.01176471, 0.07450981],
        [0.        , 0.02745098, 0.07843138, 0.03921569],
        [0.        , 0.03529412, 0.07843138, 0.        ],
        ...,
        [0.03529412, 0.        , 0.        , 0.00784314],
        [0.0627451 , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.01960784]],

       [[0.        , 0.04705882, 0.20392157, 0.        ],
        [0.        , 0.02745098, 0.05882353, 0.        ],
        [0.        , 0.05490196, 0.14117648, 0.        ],
        ...,
        [0.01176471, 0.        , 0.06666667, 0.01176471],
        [0.        , 0.      

In [12]:
%%time
for i in paths[:1000]:
    resize_img(i)

Wall time: 26.4 s


In [14]:
%%time
# Comparison timing: read the first 1000 samples back from the HDF5 dataset instead.
data["photos"][:1000]

Wall time: 14.2 s


array([[[[0.03137255, 0.        , 0.03529412, 0.        ],
         [0.        , 0.        , 0.02352941, 0.        ],
         [0.        , 0.08235294, 0.10588235, 0.        ],
         ...,
         [0.        , 0.03529412, 0.00392157, 0.01568628],
         [0.        , 0.        , 0.        , 0.02352941],
         [0.        , 0.        , 0.        , 0.        ]],

        [[0.01960784, 0.01960784, 0.01176471, 0.07450981],
         [0.        , 0.02745098, 0.07843138, 0.03921569],
         [0.        , 0.03529412, 0.07843138, 0.        ],
         ...,
         [0.03529412, 0.        , 0.        , 0.00784314],
         [0.0627451 , 0.        , 0.        , 0.        ],
         [0.        , 0.        , 0.        , 0.01960784]],

        [[0.        , 0.04705882, 0.20392157, 0.        ],
         [0.        , 0.02745098, 0.05882353, 0.        ],
         [0.        , 0.05490196, 0.14117648, 0.        ],
         ...,
         [0.01176471, 0.        , 0.06666667, 0.01176471],
         [