## h5py is the Python library for working with HDF5. Let's explore some demo datasets here to see how HDF5 works

In [1]:
#importing packages to set up the environment
import numpy as np
import h5py 

In [2]:
# Build a small sample array holding the integers 0 through 9
data = np.arange(0, 10)
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
# Open a file object for "demo.hdf5"
f = h5py.File("demo.hdf5", 'w')  # 'w' = write mode

In [6]:
# The File object offers a dictionary-like interface: keys(), membership
# tests with `in`, and item access via f[name]
f

<HDF5 file "demo.hdf5" (mode r+)>

In [7]:
# Store the NumPy array in the file under the name 'mydata'
f['mydata'] = data

In [23]:
# Look 'mydata' up again by name
f['mydata']

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# Keep a handle to the dataset: dset is a proxy object for the stored data,
# which allows partial I/O
dset = f['mydata']
dset

<HDF5 dataset "mydata": shape (10,), type "<i4">

In [14]:
# Report the dataset's shape and element type, one per line
print(dset.shape, dset.dtype, sep="\n")

(10,)
int32


In [15]:
# Slice with [:] to read the entire dataset back as a NumPy array
dset[:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
# h5py uses HDF5's native subsetting capabilities, so only the selected
# elements are read from the file -- very important for huge datasets
dset[0:6:2]

array([0, 2, 4])

In [21]:
# Index with a list of positions: the in-memory array and the HDF5 dataset
# give the same result
print (data[[1,2,6]])
print (dset[[1,2,6]])

[1 2 6]
[1 2 6]


In [24]:
# Attributes of the HDF5 object: access to the metadata associated with it
dset.attrs

<Attributes of HDF5 object at 1638802879416>

In [36]:
# Attach some metadata to the dataset by assigning attributes by name
dset.attrs['sampling rate'] = 100e6
dset.attrs['pressure']  = 15

In [37]:
# List the attribute names two ways: iterating attrs directly and via keys()
print (list(dset.attrs))
print (list(dset.attrs.keys()))

['pressure', 'sampling rate']
['pressure', 'sampling rate']


In [39]:
# Show the attributes as (name, value) pairs
list(dset.attrs.items())

[('pressure', 15), ('sampling rate', 100000000.0)]

In [40]:
# Close the file; it is reopened in the next cell
f.close()

In [41]:
# Reopen the existing file without truncating it. The mode is given explicitly
# as 'a' (read/write, create if missing): h5py 3.x defaults to read-only 'r'
# when no mode is passed, which would make the writes further down fail.
f = h5py.File("demo.hdf5", 'a')

In [43]:
# List the names stored at the top level of the file
list(f.keys())

['mydata']

In [44]:
# Re-acquire a handle to 'mydata' from the reopened file
dset = f['mydata']

In [45]:
# HDF5 files are organised hierarchically (the "H" in HDF5); every object has
# a full path name that starts with a slash
dset.name

'/mydata'

In [46]:
# '/' addresses the root group of the file
f['/']

<HDF5 group "/" (1 members)>

In [47]:
# The File object works through the root group, so the root group exposes the
# same keys; keep a handle to it
root = f['/']

In [49]:
# Names held directly by the root group
list(root.keys())

['mydata']

In [50]:
# Store the array under a nested path; '/path' becomes a group holding 'dataset'
f['/path/dataset'] = data

In [51]:
# Handle to the nested dataset
dset2 = f['/path/dataset']

In [52]:
# Full path name of the dataset; 'path' is a group here
dset2.name

'/path/dataset'

In [54]:
# Fetch the intermediate '/path' group
grp = f['/path']

In [55]:
# Display the group object
grp

<HDF5 group "/path" (1 members)>

In [57]:
# Names contained in the '/path' group
list(grp.keys())

['dataset']

In [58]:
# Membership testing: `in` checks whether a name exists in the file
'mydata'  in f

True

In [59]:
# A name that was never stored is reported as absent
'mydata2'  in f

False

In [60]:
# Full paths to nested objects can be tested as well
'/path/dataset'  in f

True

### Let's try to create another dataset and see the compression

In [61]:
# Declare a huge 4-D dataset of single-precision floats with gzip compression
# enabled; thanks to compression the file is expected to stay small
dset3 = f.create_dataset(
    'Big',
    shape=(1000,) * 4,
    dtype='f',
    compression='gzip',
)

In [62]:
# Shape of the new dataset, as declared at creation
dset3.shape

(1000, 1000, 1000, 1000)

In [63]:
# Element type of the new dataset (the 'f' code requested at creation)
dset3.dtype

dtype('float32')

In [66]:
# Write a single element, addressed by its 4-D index
dset3[456, 599, 344, 13] = 42