In [1]:
%run talktools.py

Using PyTables and HDF5
-----------------------
UC Berkeley Python class (AY250; 2013-2016)


*"PyTables presents a database-like approach to data storage, providing features like indexing and fast “in-kernel” queries on dataset contents. It also has a custom system to represent data types." -- http://docs.h5py.org/en/latest/faq.html#what-s-the-difference-between-h5py-and-pytables*

First we'll open a new HDF5 file for writing (note: mode "w" means any existing file with that name on disk will be overwritten).

In [2]:
from __future__ import print_function

In [3]:
import numpy as np
from tables import *
# mode="w" creates spam.h5 from scratch, replacing any file already at that path.
h5file = open_file(
    "spam.h5",
    mode="w",
    title="PyTables/HDF5 test file",
)
h5file

Traceback (most recent call last):
  File "/Users/Kamilobu/anaconda/envs/ay250/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-b725882abc68>", line 3, in <module>
    h5file = open_file("spam.h5",mode = "w", title = "PyTables/HDF5 test file")
  File "/Users/Kamilobu/anaconda/envs/ay250/lib/python3.5/site-packages/tables/file.py", line 318, in open_file
    return File(filename, mode, title, root_uep, filters, **kwargs)
  File "/Users/Kamilobu/anaconda/envs/ay250/lib/python3.5/site-packages/tables/file.py", line 784, in __init__
    self._g_new(filename, mode, **params)
  File "tables/hdf5extension.pyx", line 488, in tables.hdf5extension.File._g_new (tables/hdf5extension.c:5593)
tables.exceptions.HDF5ExtError: HDF5 error back trace

  File "H5F.c", line 522, in H5Fcreate
    unable to create file
  File "H5Fint.c", line 1003, in H5F_open
    unable to open file: time = Fr

HDF5ExtError: HDF5 error back trace

  File "H5F.c", line 522, in H5Fcreate
    unable to create file
  File "H5Fint.c", line 1003, in H5F_open
    unable to open file: time = Fri Sep 30 14:27:44 2016
, name = 'spam.h5', tent_flags = 13
  File "H5FD.c", line 993, in H5FD_open
    open failed
  File "H5FDsec2.c", line 339, in H5FD_sec2_open
    unable to open file: name = 'spam.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 602

End of HDF5 error back trace

Unable to open/create file 'spam.h5'

`Filters` set the policy for how all data in the file will be treated (they can be passed to `open_file` through its `filters` argument). `fletcher32=True`, for instance, will enforce checksums (slower, but it lets corrupted data be detected); `complevel` is the compression level, etc.

Now, let's create a 100$\times$100 random image with `create_array` and associate it with a group called "Datasets"

In [3]:
# Create the "Datasets" group under the file root, then store a 100x100 array of
# random floats in it as "dataset1"; the new array node is the cell's output.
datasets = h5file.create_group(
    h5file.root, "Datasets", title="Test data sets"
)
h5file.create_array(
    datasets,
    "dataset1",
    obj=np.random.random((100, 100)),
    title="Test data set #1",
)

  from ipykernel import kernelapp as app


/Datasets/dataset1 (Array(100, 100)) 'Test data set #1'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

Now let's create a complex object, which we'll call a "Particle", that has properties like name, atomic number, mass, etc.

In [4]:
class Particle(IsDescription):
    """Record layout for one row of the particles table."""

    # Columns given an explicit `pos` are placed in that order.
    name = StringCol(16, pos=1)          # 16-character string
    atomic_num = IntCol(pos=2)           # integer
    mass = FloatCol(pos=3)               # double-precision float
    pressure = Float32Col(shape=(2, 3))  # 2x3 block of single-precision floats


# Create an empty table named "particles" inside the Datasets group whose rows
# follow the Particle layout.
table1 = h5file.create_table(datasets, "particles", description=Particle)

In [5]:
# table1.row is the Row accessor used in the following cells to fill in and
# append records; displaying it shows which row it currently points to.
row = table1.row
row

/Datasets/particles.row (Row), pointing to row #0

Let's add some data into the first particle

In [6]:
# Fill in every field of the pending row ...
row["name"] = "oxygen"
row["atomic_num"] = 8
row["mass"] = 15.9994
row["pressure"] = [[1, 2, 3], [-1, 1, 3]]

# ... then append it to the table and flush before reading it back.
row.append()
table1.flush()

# Read the first record back through the file's attribute-style node path.
h5file.root.Datasets.particles[0]

(b'oxygen', 8, 15.9994, [[1.0, 2.0, 3.0], [-1.0, 1.0, 3.0]])

Notice that, unlike numpy arrays, we can append new data. So this seems more like a DB in this respect.

In [7]:
# Grab the row accessor again and describe a second (made-up) particle.
row = table1.row
row["name"] = "bezerkilum"
row["atomic_num"] = 150
row["mass"] = 360.0
row["pressure"] = [[1, 2, 3], [1, 0, 3]]

# Append the record and flush, so the table now holds two rows.
row.append()
table1.flush()

# The new record is the second row (index 1).
h5file.root.Datasets.particles[1]

(b'bezerkilum', 150, 360.0, [[1.0, 2.0, 3.0], [1.0, 0.0, 3.0]])

In [11]:
# Query the table with a condition string over its columns and collect the names
# of the matching rows. Names come back as bytes, hence the decode().
[
    particle["name"].decode()
    for particle in table1.where("(atomic_num > 5) & (mass < 100.0)")
]

['oxygen']

In [10]:
# Print the name of every record in the table. The loop variable is deliberately
# not called `row`: a module-level `for` leaves its variable bound afterwards,
# which would overwrite the `row` append handle obtained from `table1.row`.
for particle in table1:
    print(particle["name"].decode())

oxygen
bezerkilum


In [14]:
# Done with the PyTables example: close the file handle.
h5file.close()

# h5py

Groups work like dictionaries, and datasets work like NumPy arrays

http://docs.h5py.org/en/latest/quick.html


In [39]:
import h5py
import numpy as np
import os

# Start from a clean file. This replaces the shell-only `!rm`, which is not
# portable and prints an error when the file does not exist yet.
try:
    os.remove("spam-h5py.h5")
except FileNotFoundError:
    pass

h5file = h5py.File("spam-h5py.h5", mode="w")
# h5py.File has no `title` parameter (unlike PyTables' open_file): old h5py
# silently ignores the unknown keyword and newer releases raise TypeError.
# Keep the title as an attribute on the root group instead.
h5file.attrs["title"] = "h5py/HDF5 test file"
h5file

<HDF5 file "spam-h5py.h5" (mode r+)>

In h5py, "Datasets are very similar to NumPy arrays. They are homogeneous collections of data elements, with an immutable datatype and (hyper)rectangular shape. Unlike NumPy arrays, they support a variety of transparent storage features such as compression, error-detection, and chunked I/O." -- http://docs.h5py.org/en/latest/high/dataset.html

In [7]:
# Create the "Datasets" group in the h5py file. This rebinds `datasets`, which
# earlier in the notebook referred to the PyTables group.
datasets = h5file.create_group("Datasets")

In [17]:
# Store two 100x100 random arrays; dataset2 is gzip-compressed with
# compression_opts=9.
# NOTE(review): these names are relative to `datasets`, which already is the
# "Datasets" group, so the arrays appear to land at /Datasets/Datasets/dataset1
# and /Datasets/Datasets/dataset2 -- confirm whether the extra nesting is intended.
datasets.create_dataset('Datasets/dataset1', data=np.random.random((100,100)))
datasets.create_dataset('Datasets/dataset2', data=np.random.random((100,100)),
                        compression="gzip", compression_opts=9)

<HDF5 dataset "dataset2": shape (100, 100), type "<f8">

In [19]:
# Look the compressed dataset up by its path relative to the `datasets` group.
data = datasets.get('Datasets/dataset2')

In [20]:
# Read the entire dataset into a NumPy array. `Dataset.value` is deprecated
# (and removed in h5py 3.0); indexing with an empty tuple is the replacement.
data[()]

array([[ 0.97794214,  0.11794942,  0.39368585, ...,  0.7658321 ,
         0.78288034,  0.9464316 ],
       [ 0.76573763,  0.8718638 ,  0.7027056 , ...,  0.01710255,
         0.92814481,  0.80044144],
       [ 0.33383122,  0.53455644,  0.91913964, ...,  0.57837372,
         0.56180651,  0.10364729],
       ..., 
       [ 0.23370288,  0.43693144,  0.73115798, ...,  0.93510635,
         0.67461141,  0.38052106],
       [ 0.30162771,  0.62406416,  0.83488421, ...,  0.14604748,
         0.83666988,  0.26251197],
       [ 0.14676648,  0.43856595,  0.16412232, ...,  0.55433556,
         0.70228834,  0.0908137 ]])

In [22]:
# Datasets accept NumPy-style slicing: rows 2-9, and columns 1, 4 and 7 (step 3).
data[2:10,1:9:3]

array([[ 0.53455644,  0.55951379,  0.27516592],
       [ 0.0754908 ,  0.62502142,  0.9400895 ],
       [ 0.64286464,  0.44354353,  0.2552696 ],
       [ 0.93754221,  0.16648524,  0.9980154 ],
       [ 0.40565599,  0.30440593,  0.13540566],
       [ 0.64642334,  0.89577348,  0.71847996],
       [ 0.81442915,  0.47496836,  0.80107197],
       [ 0.18471405,  0.780301  ,  0.21541965]])

In [23]:
# Compound record layout: a 16-byte string field plus a 4-byte integer field.
dtype = [
    ("name", "S16"),
    ("atomic_num", "i4"),
]

In [27]:
# Allocate a 100x1 dataset of compound records; its type is reported as "|V20"
# because each record is 16 + 4 = 20 bytes.
datasets.create_dataset("Particle", shape=(100,1), dtype=dtype)

<HDF5 dataset "Particle": shape (100, 1), type "|V20">

In [28]:
# Fetch a handle to the "Particle" compound dataset inside the Datasets group.
pdata = datasets.get("Particle")

In [30]:
# Assign one record; with shape (100, 1) this fills the single element of row 0.
pdata[0] = ("oxygen",8)

In [31]:
# Close the h5py file before it is reopened read-only below.
h5file.close()

In [38]:
# Reopen the file read-only inside a context manager, so it is closed again when
# the block exits, and print the first row of the Particle dataset.
with h5py.File("spam-h5py.h5", "r") as f:
    pdata = f.get("Datasets/Particle")
    print(pdata[0])

[(b'oxygen', 8)]
