In [12]:
import torch
import h5py
import numpy as np

In [13]:
class HDF5Dataset(torch.utils.data.Dataset):
    """Represents an abstract HDF5 dataset.
    
    Input params:
        file_path: Path to the folder containing the dataset (one or multiple HDF5 files).
        transform: PyTorch transform to apply to every data instance (default=None).
    """
    def __init__(self, file_path, transform=None, device=None):
        super().__init__()
        self.data_info = []
        self.data_cache = {}
        self.transform = transform

        self.hdf5_file = h5py.File(file_path, "r")
        
        if device:
            self.device = torch.device(device)
        else:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

            
    def __getitem__(self, index):
        
        x = self.hdf5_file['data'][index]
        x = x + self.hdf5_file['intercept'][index]
        x = torch.from_numpy(x.astype(np.float32)).to(self.device)
        x = x + self.hdf5_file['intercept'][index]
        x = x / self.hdf5_file['intercept'][index]
        
        return x

    def __len__(self):
        return len(self.hdf5_file['intercept'])


In [14]:
!ls /project/davidr/lidc_idri/npy/lanczos_3d/512x512/ | wc -l

851


In [59]:
dataset = HDF5Dataset('/project/davidr/lidc_idri/hdf5/lanczos_3d/16x16.h5',
                     device='cpu')

In [60]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4, pin_memory=False)

In [61]:
iterable = iter(dataloader)

In [62]:
%timeit -r 1 -n 1 for i in range(64): next(iterable)

253 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [63]:
from pgan_pytorch.data import DatasetFolder

In [64]:
dataset = DatasetFolder('/project/davidr/lidc_idri/npy/lanczos_3d/64x64/', 
                       loader=lambda path: np.load(path),
                       extensions=('npy',),
                       transform=lambda x: torch.from_numpy(x.astype(np.float32)))


In [65]:
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4, pin_memory=False)


In [66]:
iterable = iter(dataloader)

In [67]:
%timeit -r 1 -n 1 for i in range(16): next(iterable)

32 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [69]:
dataset = DatasetFolder('/project/davidr/lidc_idri/pt/lanczos_3d/64x64/', 
                       loader=lambda path: torch.load(path),
                       extensions=('pt',),
                       transform=None)


In [70]:
%timeit -r 1 -n 1 for i in range(16): next(iterable)

127 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [44]:
iterable = iter(dataloader)

In [8]:
x = np.random.randint(-2048, 2048, size=(1, 512, 512, 512))

In [9]:
intercept = -2048

In [10]:
x_uint16 = (x - intercept).astype(np.uint16)

In [11]:
np.save('/scratch/test.npy', x_uint16)

In [12]:
torch.save(torch.from_numpy(x.astype(np.int16)), '/scratch/test.pt')

In [17]:
torch.save(torch.from_numpy(x / 1024).float(), '/scratch/test_fl.pt')

In [18]:
def load_uint16(path):
    x = np.load(path).astype(np.float32)
    x = x + intercept
    return x

In [19]:
%timeit torch.from_numpy(load_uint16('/scratch/test.npy'))

1.18 s ± 31.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
device = 'cuda'

In [21]:
%timeit torch.load('/scratch/test.pt').float()

542 ms ± 40 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%timeit torch.load('/scratch/test_fl.pt')

559 ms ± 24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
