# Why shelve?
Reading individual images and resizing them can be slow. Loading the data from disk - especially for very large images like the X-rays here - is one of the things that will slow us down in this competition when we train convolutional neural networks. Would it be faster to save the whole dataset in a single pickle (`.pkl`) file that one can access by some index? Yes, but a `.pkl` file would not allow parallel access to the data, so you could not have multiple workers reading data.

`shelve` is like pickle, but allows parallel reading and access via dictionary keys - a persistent dictionary, really (see [the documentation](https://docs.python.org/3/library/shelve.html)). 

# What does this notebook do?
In this notebook, I resized the images to 704 by 704 (you could of course change that so that, say, the shortest dimension is at least 600 pixels or something) and I, of course, only kept one grayscale channel. The whole file we end up saving is small enough for our purposes - primarily because the image itself is saved as uint8.

I also show what a dataloader can look like. In another competition, I found that [shelve is faster than the alternatives](https://www.kaggle.com/bjoernholzhauer/vinbigdata-chest-x-ray-comparing-dataloader-speed) such as loading pre-sized images - let me know, if I missed obvious alternatives.

# Implementation using shelve

In [None]:
import numpy as np
import pandas as pd
import os
import albumentations as A
import pickle as pkl
import fastcore
from fastcore.parallel import parallel
import shelve
from PIL import Image
from tqdm.auto import tqdm
# Load the competition's training table from the Kaggle input directory.
train = pd.read_csv(
    '../input/ranzcr-clip-catheter-line-classification/train.csv'
)
# The label columns are all columns except the first and the last one.
targets = train.columns[1:-1].tolist()
targets

In [None]:
def read_and_resize(image_id, size=704):
    """Load one training image, resize it to a square and attach its labels.

    Args:
        image_id: Identifier of the image. It is used as the JPEG file name
            (without extension) and to select the matching row(s) of ``train``
            through the ``StudyInstanceUID`` column.
        size: Edge length in pixels of the square output image. Defaults to
            704, the size used throughout this notebook.

    Returns:
        A dict with two entries: ``'image'`` holds the resized pixel array and
        ``'labels'`` holds the ``targets`` columns of the matching row(s) of
        ``train`` as an ``np.int8`` array.
    """
    image_path = f'../input/ranzcr-clip-catheter-line-classification/train/{image_id}.jpg'
    # Use the image as a context manager so that the underlying file handle is
    # released as soon as the pixels have been copied into a numpy array.
    with Image.open(image_path) as img:
        pixels = np.array(img)
    resize = A.Resize(size, size, always_apply=True)
    labels = np.array(train.loc[train['StudyInstanceUID'] == image_id, targets], dtype=np.int8)
    return {'image': resize(image=pixels)['image'], 'labels': labels}

# All image identifiers of the training table, in table order.
list_of_images = train['StudyInstanceUID'].tolist()

# Processing the image files in parallel could be an option, but it runs out of memory.
#tmp_imgs = parallel(read_and_resize, list_of_images, n_workers=4, progress=True)

In [None]:
# Open the shelf once for the whole run: re-opening the database for every
# single image adds an avoidable open/close cycle to each write.
with shelve.open('training_data.db') as myshelf:
    for image_id in tqdm(list_of_images):
        # Each record is written as soon as it has been built, so the resized
        # images are not accumulated in a Python list first.
        myshelf[image_id] = read_and_resize(image_id)

# Accessing the data

In [None]:
# Read the first two stored records back (read-only) as a sanity check.
record_headers = ("------ First training records -------------",
                  "------ Second training records -------------")
with shelve.open('training_data.db', flag='r', writeback=False) as myshelf:
    for position, header in enumerate(record_headers):
        study_uid = train['StudyInstanceUID'].values[position]
        print(header)
        print(study_uid)
        print( myshelf[study_uid] )

# Bad DataLoader

This is a logical way of doing it, but it turns out to be pretty slow, because we open the `shelve` file again and again for each item.

In [None]:
import torch
from fastai.vision.all import *

class ClassificationDataset:
    """Index-based dataset that reads one record per item from the shelve file.

    Every ``__getitem__`` call opens ``training_data.db`` again, which is what
    makes this straightforward approach slow.
    """

    def __init__(self, StudyInstanceUIDs, augmentations=None):
        self.StudyInstanceUIDs = StudyInstanceUIDs
        self.augmentations = augmentations

    def __len__(self):
        return len(self.StudyInstanceUIDs)

    def __getitem__(self, item):
        """Return ``(image, labels)`` tensors for the record at position ``item``."""
        with shelve.open('training_data.db', flag='r', writeback=False) as myshelf:
            record = myshelf[self.StudyInstanceUIDs[item]]
        # A resize option (resize=None by default) could be added here if needed.
        # Albumentations wants a channel axis, so append one to the stored image.
        pixels = np.array(record['image'])[..., np.newaxis]
        if self.augmentations is not None:
            pixels = self.augmentations(image=pixels)['image']
        # Convert to a torch tensor and switch from height-width-channel to
        # channel-height-width.
        image = torch.tensor(pixels[:, :, 0]).unsqueeze(0)
        labels = torch.tensor(record['labels'], dtype=torch.long)
        return image, labels

# Side length (in pixels) of the square crops produced by the pipeline below.
image_size = 512

# Training-time augmentation pipeline; the transforms run in the listed order.
train_aug = A.Compose([
    A.HorizontalFlip(p=0.3),
    # Exactly one of three rotation variants, differing in border_mode / fill value.
    A.OneOf(
        [
            A.Rotate(limit=10, border_mode=4, always_apply=True),
            A.Rotate(limit=10, border_mode=0, value=[0,0,0], always_apply=True),
            A.Rotate(limit=10, border_mode=0, value=[128,128,128], always_apply=True),
        ],
        p=0.75,
    ),
    A.RandomResizedCrop(image_size, image_size, scale=(0.9, 1), always_apply=True),
    A.RandomBrightnessContrast(
        brightness_limit=(-0.05,0.05),
        contrast_limit=(-0.05, 0.05),
        p=0.5,
    ),
    A.CLAHE(clip_limit=(0,2), p=0.5),
    A.GaussNoise(var_limit=[0, 20], p=0.5),
    # Either JPEG compression or down-scaling.
    A.OneOf(
        [
            A.JpegCompression(quality_lower=75, quality_upper=99),
            A.Downscale(scale_min=0.5, scale_max=0.8),
        ],
        p=0.5,
    ),
    A.IAAPiecewiseAffine(scale=(0, 0.025), p=0.2),
    # Hole sizes are given as fractions (5% to 15%) of image_size.
    A.CoarseDropout(
        min_holes=2,
        max_holes=8,
        max_height=int(image_size * 0.15),
        max_width=int(image_size * 0.15),
        min_height=int(image_size * 0.05),
        min_width=int(image_size * 0.05),
        fill_value=0,
        always_apply=False,
        p=0.5,
    ),
    A.Normalize(0.4827506 , 0.22004028, max_pixel_value=255.0, always_apply=True),
])

# Only the first 200 images are used for this (slow) demonstration.
train_dataset = ClassificationDataset(
    StudyInstanceUIDs=list_of_images[0:200],
    augmentations=train_aug,
)

# train_dataset.__getitem__(0) # Can do that to test getting one item

# NOTE(review): fastai's DataLoader appears to name this argument `num_workers`;
# confirm that `n_workers` is actually honoured here.
train_loader = DataLoader(
    dataset=train_dataset,
    bs=32,
    shuffle=True,
    n_workers=2,
)

In [None]:
# Draw a single batch from the loader to check that it works.
next(iter(train_loader))

# Good DataLoader

There are many approaches we could take to DataLoaders, but the previous naive idea turns out to be too slow. For example, it takes almost half a minute for a batch of 32!

The version below, which loads a whole batch in one go (and, importantly, opens the `shelve` file only once per batch), is massively faster.

In [None]:
def getbatch(items, augmentations=None):
    """Load a whole batch from the shelve file, opening the file only once.

    Args:
        items: Sequence of keys (image ids) to read from ``training_data.db``.
        augmentations: Optional callable invoked as ``augmentations(image=...)``
            on each image after a trailing channel axis has been added; it must
            return a mapping with an ``'image'`` entry. ``None`` skips
            augmentation.

    Returns:
        A tuple ``(images, labels)`` of torch tensors. ``images`` is indexed as
        item-channel-height-width with a single channel; ``labels`` is a
        ``torch.long`` tensor built from the first row of each record's
        ``'labels'`` array, in the order given by ``items``.
    """
    with shelve.open('training_data.db', flag='r', writeback=False) as myshelf:
        records = [myshelf[key] for key in items]

    # A resize option (resize=None by default) could be added here if needed.

    # Albumentations wants a channel axis, even though the images have one channel.
    images = [np.expand_dims(record['image'], axis=-1) for record in records]
    # If augmentations are requested, apply them one image at a time.
    if augmentations is not None:
        images = [augmentations(image=image)['image'] for image in images]
    # Stack into one 4-D array, then go to a torch tensor and switch from
    # item-height-width-channel to item-channel-height-width.
    images = torch.tensor(np.array(images)[:, :, :, 0])[:, None, :, :]
    # Stack the label rows in numpy first: torch.tensor on a Python list of
    # ndarrays takes a slow element-by-element path and emits a warning.
    labels = torch.tensor(np.stack([record['labels'][0] for record in records]),
                          dtype=torch.long)

    return images, labels

# # You can test this works as expected via:
# x,y = getbatch(items=list_of_images[:32], augmentations=train_aug)
# x.shape # torch.Size([32, 1, 512, 512])
# y.shape # torch.Size([32, 11])

class BatchedParallelDataLoader:
    """Iterate over the shelve-backed data in whole batches loaded by worker processes.

    Each batch is produced by one ``getbatch`` call, so the shelve file is
    opened once per batch rather than once per item.

    Args:
        image_ids: Sequence of keys (image ids) available in the shelve file.
        augmentations: Augmentation callable forwarded to ``getbatch`` (may be ``None``).
        batch_size: Number of items per batch; must be at least 1.
        shuffle: If true, a new random order is drawn every time iteration starts.
        drop_last: If true, a trailing incomplete batch is dropped.
        n_workers: Number of worker processes handed to ``ProcessPoolExecutor``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, image_ids, augmentations, batch_size: int=32, shuffle: bool=False, drop_last: bool=False, n_workers: int=1):
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        self.dataset_len = len(image_ids)
        self.batch_size, self.shuffle = batch_size, shuffle
        self.drop_last, self.n_workers = drop_last, n_workers
        # Stored as an array so that a batch of ids can be picked by index list.
        self.image_ids, self.augmentations = np.array(image_ids), augmentations

        # Number of batches and number of items that are actually served.
        n_batches, remainder = divmod(self.dataset_len, self.batch_size)
        if remainder > 0 and not drop_last:
            # Keep the trailing, smaller batch.
            n_batches += 1
            self.n_items = self.dataset_len
        else:
            # Only full batches are served. Use the local count here:
            # self.n_batches is not assigned until the line below.
            self.n_items = n_batches * self.batch_size
        self.n_batches = n_batches
        self.batch_list = list(range(self.n_items))

    def __iter__(self):
        """Yield the ``(images, labels)`` batches returned by ``getbatch``, in order."""
        from functools import partial

        if self.shuffle:
            # Plain Python ints keep the numpy index below unambiguous.
            self.batch_list = torch.randperm(self.dataset_len)[:self.n_items].tolist()

        chunks = [
            self.image_ids[self.batch_list[start:start + self.batch_size]].tolist()
            for start in range(0, self.n_items, self.batch_size)
        ]
        # Bind the augmentations up front instead of passing them as a keyword to
        # map(): the standard-library Executor.map() does not forward extra keywords.
        load_batch = partial(getbatch, augmentations=self.augmentations)
        with ProcessPoolExecutor(self.n_workers) as ex:
            yield from ex.map(load_batch, chunks)

    def __len__(self):
        return self.n_batches
    
# Batched loader over all training images with two worker processes.
train_dl = BatchedParallelDataLoader(
    image_ids=list_of_images,
    augmentations=train_aug,
    batch_size=32,
    n_workers=2,
)