# HDF5 vs. Pickle

In [17]:
import pathlib
import time

import numpy as np
import pickle
import h5py

In [2]:
# Dimensions of the benchmark array: 10,000 x 20,000 = 2e8 elements.
xdim = 10000
ydim = 20000
# Dense float64 array of ones (8 bytes per element, about 1.6e9 bytes in memory).
# This is the payload that the pickle and HDF5 dump cells write to disk.
test_data = np.ones(shape=(xdim, ydim), dtype=np.float64)

In [11]:
# Number of bits in one gibibyte: 8 bits per byte times 2**30 bytes.
gb = 8 * 2**30  # bits

In [14]:
# Expected in-memory size of the array: 64 bits per element, converted from
# bits to gibibytes by dividing by `gb` (the number of bits in one GiB).
expected_memory = (xdim * ydim * 64) / gb

In [16]:
expected_memory # in GiB (computed with 2**30 bytes per unit), not decimal GB

1.4901161193847656

#### Dump to pickle

In [59]:
%%timeit -n 1 -r 5

with open('test_pickle.pkl', 'wb') as f:
    pickle.dump(test_data, f)

2.77 s ± 772 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


#### Dump to hdf5

In [60]:
%%timeit -n 1 -r 5

with h5py.File('test_hdf5.hdf5', 'w') as f:
    f.create_dataset('test', data=test_data)

1.12 s ± 156 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


#### Test load times

In [5]:
def load_pickle(filename, dirpath):
    """
    Read a pickled object back from ``<dirpath>/<filename>.pkl``.

    Parameters:
    -----------
    filename : str
        File name without the ``.pkl`` extension (the extension is appended here).
    dirpath : str
        Directory that contains the file.

    Returns:
    --------
    object
        The object stored in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    pickle_path = pathlib.Path(dirpath).joinpath(f'{filename}.pkl')
    with pickle_path.open('rb') as handle:
        return pickle.load(handle)


def load_hdf5(filename, dirpath):
    """
    Open ``<dirpath>/<filename>.hdf5`` in read-only mode.

    Parameters:
    -----------
    filename : str
        File name without the ``.hdf5`` extension (the extension is appended here).
    dirpath : str
        Directory that contains the file.

    Returns:
    --------
    h5py.File
        The opened file object. It is not closed here, so the caller is
        responsible for closing it. No dataset is read by this function.
    """
    hdf5_path = pathlib.Path(dirpath).joinpath(f'{filename}.hdf5')
    return h5py.File(hdf5_path, 'r')


In [22]:
pickle_times = []

# Time seven complete loads of the pickled array from disk.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted mid-measurement.
for i in range(7):
    start = time.perf_counter()
    pickle_loaded = load_pickle('test_pickle', '')
    end = time.perf_counter()
    pickle_times.append(end - start)

In [24]:
hdf5_times = []
hdf5_loaded = None

# Time five opens of the HDF5 file. Only the call to load_hdf5 (opening the
# file) is inside the timed region; no dataset is read here.
for i in range(5):
    # Close the handle from the previous iteration, outside the timed region,
    # so repeated opens do not pile up unclosed file handles.
    if hdf5_loaded is not None:
        hdf5_loaded.close()
    # perf_counter() is the monotonic, high-resolution clock for intervals.
    start = time.perf_counter()
    hdf5_loaded = load_hdf5('test_hdf5', '')
    end = time.perf_counter()
    hdf5_times.append(end - start)

# The handle from the final iteration is intentionally left open: a later cell
# reads hdf5_loaded['test'] from it.

In [43]:
# Turn both timing lists into NumPy arrays so .mean() and .std() are available.
hdf5_times, pickle_times = np.array(hdf5_times), np.array(pickle_times)

In [46]:
# Mean and standard deviation of the pickle load timings (seconds).
mean_pickle_time, std_dev_pickle_time = pickle_times.mean(), pickle_times.std()

In [48]:
# The backslash is escaped so the printed text is still "\pm" without relying
# on the invalid escape sequence `\p`, which newer Python versions warn about.
print(f"{mean_pickle_time} \\pm {std_dev_pickle_time}")

3.1744718074798586 \pm 0.3949027934039339


In [49]:
# Mean and standard deviation of the HDF5 file-open timings (seconds).
mean_hdf5_time, std_dev_hdf5_time = hdf5_times.mean(), hdf5_times.std()

In [50]:
# The backslash is escaped so the printed text is still "\pm" without relying
# on the invalid escape sequence `\p`, which newer Python versions warn about.
print(f"{mean_hdf5_time} \\pm {std_dev_hdf5_time}")

0.001910829544067383 \pm 0.003450182116946118


In [61]:
# Total mean HDF5 load time: opening the file plus reading the dataset.
# NOTE: mean_hdf5_array_time is assigned in a cell that appears further down
# in this notebook, so that cell has to be run before this one.
mean_hdf5_time + mean_hdf5_array_time

1.3626612663269042

In [62]:
# Combined spread, obtained by adding the two standard deviations directly.
# NOTE: std_dev_hdf5_array_time is assigned in a cell that appears further
# down in this notebook, so that cell has to be run before this one.
# NOTE(review): for independent measurements, standard deviations are usually
# combined in quadrature; the plain sum is an upper bound — confirm intent.
std_dev_hdf5_array_time + std_dev_hdf5_time

0.11536497723370183

In [41]:
hdf5_array_load_times = []

# Time five reads of the whole 'test' dataset from the already-open file
# handle. The sliced result is discarded; only the elapsed time is kept.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the wall-clock time.time().
for i in range(5):
    start = time.perf_counter()
    hdf5_loaded['test'][:]
    end = time.perf_counter()
    hdf5_array_load_times.append(end - start)

In [52]:
# Convert the list of dataset-read timings to a NumPy array for the
# mean / standard-deviation calculation.
hdf5_array_load_times = np.array(hdf5_array_load_times)

In [53]:
# Mean and standard deviation of the dataset-read timings (seconds).
mean_hdf5_array_time, std_dev_hdf5_array_time = (
    hdf5_array_load_times.mean(),
    hdf5_array_load_times.std(),
)

In [54]:
# The backslash is escaped so the printed text is still "\pm" without relying
# on the invalid escape sequence `\p`, which newer Python versions warn about.
print(f"{mean_hdf5_array_time} \\pm {std_dev_hdf5_array_time}")

1.360750436782837 \pm 0.11191479511675571
