In [1]:
%run ../00_AdvancedPythonConcepts/talktools.py

Using PyTables and HDF5
-----------------------
UC Berkeley Python class (AY250; 2013-2016)


*"PyTables presents a database-like approach to data storage, providing features like indexing and fast “in-kernel” queries on dataset contents. It also has a custom system to represent data types." -- http://docs.h5py.org/en/latest/faq.html#what-s-the-difference-between-h5py-and-pytables*

First we'll open a new HDF5 for writing (note: the "w" implies we will overwrite the file we have on disk)

In [2]:
from __future__ import print_function

In [3]:
!rm spam.h5

In [4]:
import numpy as np
from tables import *
h5file = open_file("spam.h5",mode = "w", title = "PyTables/HDF5 test file")
h5file

File(filename=spam.h5, title='PyTables/HDF5 test file', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) 'PyTables/HDF5 test file'

Filters sets the protocols for the way all data will be treated in the file. `fletcher32 = True`, for instance will enforce checksums (slower, but more stable data), `complevel` is the compression level, etc.

Now, let's create a 100$\times$100 random image with `create_array` and associate it with a group called "Datasets"

In [5]:
datasets = h5file.create_group(h5file.root, "Datasets", "Test data sets")
h5file.create_array(datasets, 'dataset1', np.random.random((100,100)), "Test data set #1")

/Datasets/dataset1 (Array(100, 100)) 'Test data set #1'
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

Now let's create a complex object which we'll call a "Particle" that has the properties like name, atomic number, mass, etc.

In [7]:
class Particle(IsDescription):
    name        = StringCol(16, pos=1) # 16-character String
    atomic_num  = IntCol(pos=2)        # integer
    mass        = FloatCol(pos=3)      # double (double-precision)
    pressure    = Float32Col(shape=(2,3))

table1 = h5file.create_table(datasets, "particles", Particle)

NodeError: group ``/Datasets`` already has a child node named ``particles``

In [8]:
row = table1.row
row

/Datasets/particles.row (Row), pointing to row #0

Let's add some data into the first particle

In [9]:
row["name"] = "oxygen"
row["atomic_num"] = 8
row["mass"] = 15.9994
row["pressure"] = [[1,2,3],[-1,1,3]]
row.append() ; table1.flush()
h5file.root.Datasets.particles[0]

(b'oxygen', 8, 15.9994, [[1.0, 2.0, 3.0], [-1.0, 1.0, 3.0]])

Notice that, unlike numpy arrays, we can append new data. So this seems more like a DB in this respect.

In [10]:
row = table1.row
row["name"] = "bezerkilum"
row["atomic_num"] = 150
row["mass"] = 360.0
row["pressure"] = [[1,2,3],[1,0,3]]
row.append() ; table1.flush()
h5file.root.Datasets.particles[1]

(b'bezerkilum', 150, 360.0, [[1.0, 2.0, 3.0], [1.0, 0.0, 3.0]])

In [11]:
[row['name'].decode() for row in table1.where('(atomic_num > 5) & (mass < 100.0)')]

['oxygen']

In [12]:
for row in table1:
    print(row["name"].decode())

oxygen
bezerkilum


In [13]:
h5file.close()

# h5py

Groups work like dictionaries, and datasets work like NumPy arrays

http://docs.h5py.org/en/latest/quick.html


In [14]:
import h5py
import numpy as np
!rm spam-h5py.h5
h5file = h5py.File("spam-h5py.h5",mode = "w", title = "h5py/HDF5 test file")
h5file

<HDF5 file "spam-h5py.h5" (mode r+)>

In h5py, "Datasets are very similar to NumPy arrays. They are homogenous collections of data elements, with an immutable datatype and (hyper)rectangular shape. Unlike NumPy arrays, they support a variety of transparent storage features such as compression, error-detection, and chunked  I/O." -- http://docs.h5py.org/en/latest/high/dataset.html

In [15]:
datasets = h5file.create_group("Datasets")

In [16]:
datasets.create_dataset('Datasets/dataset1', data=np.random.random((100,100)))
datasets.create_dataset('Datasets/dataset2', data=np.random.random((100,100)),
                        compression="gzip", compression_opts=9)

<HDF5 dataset "dataset2": shape (100, 100), type "<f8">

In [17]:
data = datasets.get('Datasets/dataset2')

In [18]:
data.value

array([[ 0.94832348,  0.95941689,  0.47971393, ...,  0.09949417,
         0.56878191,  0.24879191],
       [ 0.4688951 ,  0.99796577,  0.81128018, ...,  0.78506729,
         0.65480603,  0.62773839],
       [ 0.77603067,  0.56815522,  0.73769562, ...,  0.51504394,
         0.8257135 ,  0.80026676],
       ..., 
       [ 0.62237319,  0.64651862,  0.6055465 , ...,  0.373361  ,
         0.83652443,  0.71266194],
       [ 0.19737799,  0.31067732,  0.40489756, ...,  0.01835904,
         0.62359976,  0.6044957 ],
       [ 0.75067804,  0.83246947,  0.23249054, ...,  0.17746304,
         0.73778534,  0.12898696]])

In [19]:
data[2:10,1:9:3]

array([[ 0.56815522,  0.94516091,  0.19845791],
       [ 0.36736373,  0.00987598,  0.24216927],
       [ 0.82126304,  0.23425398,  0.81037593],
       [ 0.22377696,  0.75828786,  0.68689409],
       [ 0.54785322,  0.94308149,  0.5715787 ],
       [ 0.70254158,  0.90627767,  0.67104524],
       [ 0.51082025,  0.43818721,  0.44894661],
       [ 0.88413934,  0.92051837,  0.56631306]])

In [20]:
dtype = [("name","S16"),("atomic_num","i4")] 

In [21]:
datasets.create_dataset("Particle", shape=(100,1), dtype=dtype)

<HDF5 dataset "Particle": shape (100, 1), type "|V20">

In [22]:
pdata = datasets.get("Particle")

In [23]:
pdata[0] = ("oxygen",8)

In [24]:
h5file.close()

In [25]:
with h5py.File("spam-h5py.h5",mode = "r") as f:
    pdata = f.get("Datasets/Particle")
    print(pdata[0])

[(b'oxygen', 8)]
