# Load HDF5 data with xarray

A lot of NASA data is stored in HDF5 format. One example is data from the Sentinel-1 radar mission, which is expected to be similar to data from the upcoming NISAR mission.

https://aria.jpl.nasa.gov/node/97

This notebook illustrates loading the data into xarray for analysis.

In [1]:
import xarray as xr
import h5netcdf
import rasterio
import h5py
import gcsfs
import os

In [2]:
# Report the version of every library used below, one per line, in the
# order xarray, h5netcdf, rasterio, h5py, gcsfs.
for module in (xr, h5netcdf, rasterio, h5py, gcsfs):
    print(module.__version__)

0.11.1+64.g612d390f.dirty
0.6.2
1.0.18
2.9.0
0.2.0


In [5]:
# The data was copied from ASF into a Google Storage bucket
fs = gcsfs.GCSFileSystem()
# Listing of everything stored under the bucket prefix
images = fs.ls('pangeo-data/grfn-v2/137/')
# Open one file from the bucket as a file-like object; later cells read from it
granule = 'pangeo-data/grfn-v2/137/S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'
fileObj = fs.open(granule)

In [6]:
# Option 1) Copy the file locally and read it w/ xarray
gsPath = 'pangeo-data/grfn-v2/137/S1-GUNW-A-R-137-tops-20181129_20181123-020010-43220N_41518N-PP-e2c7-v2_0_0.nc'
# The local copy keeps only the file-name component of the bucket path
localPath = os.path.split(gsPath)[1]
# Download is disabled; uncomment the next line to fetch the file
#fs.get(gsPath, localPath)

In [7]:
# Data arrays are stored in the "/science/grids/data" group of the file
#da = xr.open_dataset(localPath, group='/science/grids/data', engine='h5netcdf')
#da 

In [8]:
# h5py >=2.9.0 (2.9.0 is the version used here) seems to accept
# file-like objects in place of a file name:
# https://github.com/h5py/h5py/pull/1105
h5 = h5py.File(fileObj, mode='r')

In [8]:
# Works, but slow from home wifi (likely due to the number of network requests required)
# http://matthewrocklin.com/blog/work/2018/02/06/hdf-in-the-cloud
root_attr_names = h5.attrs.keys()
print(root_attr_names)
# Group that holds the data arrays
ds = h5['science/grids/data']
group_members = ds.items()
print(group_members)
# Chunk layout of the coherence array as stored in the file
coherence_chunks = ds['coherence'].chunks
print(coherence_chunks)

<KeysViewHDF5 ['product_type', 'Conventions', 'title', 'version', 'author', 'institution', 'source', 'references', 'ogr_geometry_field', 'ogr_layer_name', 'ogr_layer_type']>
ItemsViewHDF5(<HDF5 group "/science/grids/data" (7 members)>)
(682, 1386)


In [9]:
# Can xarray open the file-like object directly? Yes, with modifications
# to xarray and h5netcdf.
open_kwargs = {'group': '/science/grids/data', 'engine': 'h5netcdf'}
ds = xr.open_dataset(fileObj, **open_kwargs)
ds

<xarray.Dataset>
Dimensions:              (latitude: 2045, longitude: 4158)
Coordinates:
  * longitude            (longitude) float64 -123.1 -123.1 ... -119.6 -119.6
  * latitude             (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52
Data variables:
    crs                  int32 ...
    unwrappedPhase       (latitude, longitude) float32 ...
    coherence            (latitude, longitude) float32 ...
    connectedComponents  (latitude, longitude) float32 ...
    amplitude            (latitude, longitude) float32 ...

In [11]:
# Open the same group again, this time as Dask arrays.
# The chunk sizes equal the (682, 1386) chunk shape that h5py printed for
# the coherence array.
chunk_sizes = {'latitude': 682, 'longitude': 1386}
ds = xr.open_dataset(
    fileObj,
    group='/science/grids/data',
    engine='h5netcdf',
    chunks=chunk_sizes,
)
ds

<xarray.Dataset>
Dimensions:              (latitude: 2045, longitude: 4158)
Coordinates:
  * longitude            (longitude) float64 -123.1 -123.1 ... -119.6 -119.6
  * latitude             (latitude) float64 43.22 43.22 43.22 ... 41.52 41.52
Data variables:
    crs                  int32 ...
    unwrappedPhase       (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>
    coherence            (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>
    connectedComponents  (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>
    amplitude            (latitude, longitude) float32 dask.array<shape=(2045, 4158), chunksize=(682, 1386)>

In [12]:
# Render the coherence variable as an image
ds.coherence.plot.imshow()

<matplotlib.image.AxesImage at 0x111a44390>