In [1]:
from warnings import warn
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import dask.array as da
import h5py

Not Used

In [None]:
def validate_dimensions(dimensions, dim_type='Position'):
    """
    Normalizes and type-checks a collection of pyUSID.Dimension objects.

    A single Dimension is wrapped in a list, a multi-dimensional numpy array is
    flattened (with a warning), and anything that is not a list / tuple /
    numpy array made up exclusively of Dimension objects is rejected.

    Parameters
    ----------
    dimensions : iterable or pyUSID.Dimension
        One Dimension object or a list / tuple / numpy array of Dimension objects
    dim_type : str, Optional. Default = "Position"
        Kind of dimensions being checked, e.g. "Spectroscopic".
        Only used as the prefix of the warning and error messages

    Returns
    -------
    list, tuple or numpy.ndarray
        The validated container. A lone Dimension comes back as a one-element
        list; a numpy array with more than one axis comes back raveled

    Raises
    ------
    TypeError
        If the container is not a list, tuple or numpy array, or if any of its
        items is not a Dimension object
    """
    # Promote a lone Dimension to a one-element list
    if isinstance(dimensions, Dimension):
        dimensions = [dimensions]

    # Flatten arrays with more than one axis and tell the caller about it
    if isinstance(dimensions, np.ndarray) and dimensions.ndim > 1:
        dimensions = dimensions.ravel()
        warn(dim_type + ' dimensions should be specified by a 1D array-like. Raveled this numpy array for now')

    accepted_containers = (list, np.ndarray, tuple)
    if not isinstance(dimensions, accepted_containers):
        raise TypeError(dim_type + ' dimensions should be array-like of Dimension objects')

    if not all(isinstance(entry, Dimension) for entry in dimensions):
        raise TypeError(dim_type + ' dimensions should be a sequence of Dimension objects')

    return dimensions

Used

In [2]:
# Python 2 compatibility alias so the isinstance checks below can mention `unicode`
unicode = str
class Dimension(object):
    """
    Describes one dimension (axis) of a dataset by its name, physical
    quantity, units and the values over which it was varied.

    Attributes
    ----------
    name, quantity : stored exactly as passed to the constructor
    units : str, stripped of leading / trailing whitespace
    values : 1D numpy.ndarray
    is_position : bool
    """

    def __init__(self, name, quantity, units, values, is_position):
        """
        Simple object that describes a dimension in a dataset by its name, units, and values

        Parameters
        ----------
        name : str or unicode
            Name of the dimension. For example 'X'
        quantity : str or unicode
            Quantity for this dimension. For example: 'Length'
        units : str or unicode
            Units for this dimension. For example: 'um'
        values : array-like or int
            Values over which this dimension was varied. A linearly increasing set of values will be generated if an
            integer is provided instead of an array.
        is_position : bool
            Whether or not this is a position or spectroscopy dimensions

        Raises
        ------
        TypeError
            If units is not a string, values is neither an int nor a list / tuple / numpy array,
            or is_position is not a bool
        ValueError
            If values is an integer smaller than 1 or an array with more than one axis
        """
        # NOTE(review): name and quantity are currently stored without validation; the pyUSID
        # helper validate_single_string_arg is only imported further down in the notebook.
        # name = validate_single_string_arg(name, 'name')
        # quantity = validate_single_string_arg(quantity, 'quantity')

        if not isinstance(units, (str, unicode)):
            raise TypeError('units should be a string')
        units = units.strip()

        # An integer N is shorthand for the values 0, 1, ..., N-1
        if isinstance(values, int):
            if values < 1:
                raise ValueError('values should at least be specified as a positive integer')
            values = np.arange(values)
        if not isinstance(values, (np.ndarray, list, tuple)):
            raise TypeError('values should be array-like')
        values = np.array(values)
        if values.ndim > 1:
            raise ValueError('Values for dimension: {} are not 1-dimensional'.format(name))

        if not isinstance(is_position, bool):
            raise TypeError('is_position should be a bool')

        self.name = name
        self.quantity = quantity
        self.units = units
        self.values = values
        self.is_position = is_position

    def __repr__(self):
        return '{} - {} ({}): {}'.format(self.name, self.quantity, self.units, self.values)

    def __eq__(self, other):
        """
        Two Dimensions are equal when name, units and quantity match and their
        values have the same length and are numerically close (numpy.allclose).
        is_position is not part of the comparison.
        """
        # Defer to Python's default handling for foreign types; previously any
        # non-Dimension object compared equal to every Dimension.
        if not isinstance(other, Dimension):
            return NotImplemented
        if self.name != other.name:
            return False
        if self.units != other.units:
            return False
        if self.quantity != other.quantity:
            return False
        if len(self.values) != len(other.values):
            return False
        return bool(np.allclose(self.values, other.values))

# Input

In [19]:
# Synthetic 4D main dataset; its axes are (Y, X, DC offset, BE Frequency)
main_data_name  = 'nDim_Data'
main_data = np.random.rand(5, 7, 11, 3)
quantity = 'intensity'
units="pixel"


# One Dimension per axis of main_data, keyed by the axis index.
# Y and X are position dimensions. DC offset and BE Frequency are not, so their
# is_position flag is False - consistent with the 'is_position' attribute that is
# written to the 'DC offset' HDF5 dataset further down in the notebook.
dim_dict = {0: Dimension('Y', 'Length', 'um', np.linspace(0, 10, num=5), True),
        1: Dimension('X', 'Length', 'um', np.linspace(0, 6, num=7), True),
        2: Dimension('DC offset', 'Bias', 'V', np.sin(np.linspace(0, 1, num=11) * 2 * np.pi), False),
        3: Dimension('BE Frequency', 'Frequency', 'Hz', np.linspace(0, 10, num=3), False)}

dim_dict


{0: Y - Length (um): [ 0.   2.5  5.   7.5 10. ],
 1: X - Length (um): [0. 1. 2. 3. 4. 5. 6.],
 2: DC offset - Bias (V): [ 0.00000000e+00  5.87785252e-01  9.51056516e-01  9.51056516e-01
   5.87785252e-01  1.22464680e-16 -5.87785252e-01 -9.51056516e-01
  -9.51056516e-01 -5.87785252e-01 -2.44929360e-16],
 3: BE Frequency - Frequency (Hz): [ 0.  5. 10.]}

## Make pyhdf5 file and channel

In [20]:
# Release the handle left over from a previous run of this cell, if there is one
try:
    h5_file.close()
except NameError:
    # Fresh kernel: h5_file has not been created yet, nothing to close
    pass
h5_file =  h5py.File('test.hf5', mode='a')
# Opens the group when it already exists, creates it (and its parent) otherwise
current_channel = h5_file.require_group("Measurement_000/Channel_000")

if 'DC offset' not in current_channel:
    current_channel['DC offset'] = np.sin(np.linspace(0, 1, num=11) * 2 * np.pi)

# Attach the attributes that check_dimension_dataset expects on a dimension dataset
dc_offset_dset = current_channel['DC offset']
dc_offset_dset.attrs['name'] = 'DC offset'
dc_offset_dset.attrs['quantity'] = 'Bias'
dc_offset_dset.attrs['units'] = 'V'
dc_offset_dset.attrs['is_position'] = False

# Axis 2 is now described by the HDF5 dataset instead of a Dimension object
dim_dict[2] = dc_offset_dset
dim_dict


{0: Y - Length (um): [ 0.   2.5  5.   7.5 10. ],
 1: X - Length (um): [0. 1. 2. 3. 4. 5. 6.],
 2: <HDF5 dataset "DC offset": shape (11,), type "<f8">,
 3: BE Frequency - Frequency (Hz): [ 0.  5. 10.]}

## Activate pyUSID helper functions

The imports below show which pyUSID helper functions can be reused unchanged.

In [21]:
import pyUSID as usid
from pyUSID.io.hdf_utils.base import get_attr, write_simple_attrs, is_editable_h5, write_book_keeping_attrs
#from pyUSID.io.hdf_utils.simple import link_as_main, check_if_main, write_ind_val_dsets, validate_dims_against_main, validate_anc_h5_dsets, copy_dataset
from pyUSID.io.dtype_utils import contains_integers, validate_dtype, validate_single_string_arg, validate_string_args, \
    validate_list_of_strings, lazy_load_array

## New Validate Dimension Function 

Note the plural in the function name: `validate_dimensions`.

In [22]:
def check_dimension_dataset(this_dim,dim_shape):
    """
    Checks whether an HDF5 dataset can serve as a dimension of a main dataset.

    Parameters
    ----------
    this_dim : h5py.Dataset
        Candidate dimension. Only its ``shape`` and ``attrs`` are inspected
    dim_shape : int
        Length of the corresponding axis of the main dataset

    Returns
    -------
    str
        Concatenation of all problems that were found.
        An empty string means that the dataset passed every check
    """
    error_message = ''
    shape = this_dim.shape
    # Is it 1D?
    if len(shape) != 1:
        error_message += ' High dimensional datasets are not allowed as dimensions;\n'
    # TODO: reject compound dtypes - only "simple" dtypes should be allowed
    # Is the shape matching with the main dataset?
    # The first axis length is read from shape rather than len(), since h5py refuses
    # len() on a scalar dataset; a scalar can never match and is reported as wrong length
    if len(shape) == 0 or shape[0] != dim_shape:
        error_message += ' Dimension has wrong length;\n'
    # Does it contain the ancillary attributes 'name', 'quantity', 'units', and 'is_position'
    # and are these of types str, str, str, and bool respectively?
    necessary_attributes = ['name', 'quantity', 'units', 'is_position']
    for key in necessary_attributes:
        if key not in this_dim.attrs:
            error_message += f'Missing {key} attribute in dimension;\n '
        elif key == 'is_position':
            # h5py hands booleans back as numpy.bool_, hence both types are accepted.
            # A membership test against [True, False] would also let the integers 0 and 1 through
            if not isinstance(this_dim.attrs[key], (bool, np.bool_)):
                error_message += f'{key} attribute in dimension should be boolean;\n '
        elif not isinstance(this_dim.attrs[key], str):
            error_message += f'{key} attribute in dimension should be string;\n '

    return error_message

def validate_dimensions(main_shape, dim_dict, h5_parent_group ):
    """
    Checks that every axis of the main dataset is described by a usable dimension.

    NOTE: a function of the same name is defined earlier in this notebook;
    this definition replaces it in the notebook namespace.

    Parameters
    ----------
    main_shape : list or tuple of int
        Shape of the main dataset
    dim_dict : dict
        Maps each axis index (0 .. len(main_shape) - 1) to either a Dimension
        object or an h5py.Dataset that already holds the dimension
    h5_parent_group : h5py.Group or h5py.File
        Group into which the main dataset and new dimensions will be written

    Returns
    -------
    list of bool
        One entry per axis of main_shape; False marks a dimension that failed validation

    Raises
    ------
    TypeError
        If a value of dim_dict is neither an h5py.Dataset nor a Dimension
    """
    # Each item could either be a Dimension object or a HDF5 dataset
    # Collect the file within which these ancillary HDF5 objects are present if they are provided
    which_h5_file = {}
    # Also collect the names of the dimensions. We want them to be unique
    dim_names = []

    dimensions_correct = []
    for index, dim_exp_size in enumerate(main_shape):
        this_dim = dim_dict[index]
        if isinstance(this_dim, h5py.Dataset):
            print(f'{index} is a dataset')
            error_message = check_dimension_dataset(this_dim, dim_exp_size)
            # Record the outcome so that a faulty dataset makes the caller's
            # "False in ..." test fail instead of slipping through unnoticed
            dimensions_correct.append(len(error_message) == 0)

            if len(error_message)>0:
                print(f'Dimension {index} has the following error_message:\n', error_message)
            else:
                print("dataset ok")
                # Use the 'name' attribute, not Dataset.name (which is the HDF5 path),
                # so that the name is comparable with Dimension.name below
                dim_names.append(this_dim.attrs['name'])
                # Keep the file name per axis index to find datasets living in another file
                which_h5_file[index]=this_dim.file.filename

        elif isinstance(this_dim, Dimension):
            print('Dimension')
            print(len(this_dim.values))
            # is the shape matching with the main dataset?
            dimensions_correct.append(len(this_dim.values) == dim_exp_size)
            # Is there a HDF5 object with the same name already in the provided group where this dataset will be created?
            if  this_dim.name in h5_parent_group:
                # check if this object with the same name is a dataset and if it satisfies the above tests
                if isinstance(h5_parent_group[this_dim.name], h5py.Dataset):
                    print('needs more checking')
                    # TODO: an existing dataset of the same name is tolerated for the moment;
                    # marking the dimension as incorrect here is deliberately disabled
                else:
                    dimensions_correct[-1] = False
            # Otherwise, just append the dimension name for the uniqueness test
            elif this_dim.name not in dim_names:
                dim_names.append(this_dim.name)
            else:
                dimensions_correct[-1] = False
        else:
            raise TypeError(f'Values of dim_dict should either be h5py.Dataset objects or Dimension. '
                            f'Object at index: {index} was of type: {type(this_dim)}')

    # Report once, after all axes were inspected: datasets that live in another file ...
    for dim, filename in which_h5_file.items():
        if filename != h5_parent_group.file.filename:
            print('need to copy dimension', dim)
    # ... and dimension names that occur more than once
    for i, dim_name in enumerate(dim_names[:-1]):
        if dim_name in  dim_names[i+1:]:
            print(dim_name, ' is not unique')

    return dimensions_correct
    

## Write_main_dataset function

In [25]:


def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity, units, dim_dict,
                       main_dset_attrs=None, h5_pos_inds=None, h5_pos_vals=None, h5_spec_inds=None, h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_', aux_pos_prefix='Position_', verbose=False,
                       slow_to_fast=False, **kwargs):
    """
    Writes an N-dimensional main dataset to HDF5 and attaches one HDF5
    dimension scale per axis.

    Parameters
    ----------
    h5_parent_group : h5py.Group or h5py.File
        Editable group / file in which the datasets are created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        Data to write. A list / tuple of integers >= 1 is taken as the shape of
        an empty dataset; ``dtype`` must then be given as a keyword argument
    main_data_name : str
        Name of the main dataset. Any '-' is replaced by '_'
    quantity : str
        Written to the 'quantity' attribute of the main dataset
    units : str
        Written to the 'units' attribute of the main dataset
    dim_dict : dict
        Maps every axis index 0 .. N-1 to a Dimension object (written as a new
        dataset in h5_parent_group) or to an existing h5py.Dataset carrying
        the 'name', 'quantity', 'units' and 'is_position' attributes
    main_dset_attrs : dict, Optional
        Additional attributes written to the main dataset
    h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals, aux_spec_prefix, aux_pos_prefix, slow_to_fast
        Accepted for signature compatibility; not used by this implementation
    verbose : bool, Optional. Default = False
        Whether to print progress statements
    kwargs
        Passed on to h5py's create_dataset for the main dataset

    Returns
    -------
    h5py.Dataset or None
        The main dataset. None is returned (after printing a message) when the
        dimensions fail validation or a dataset called main_data_name already exists

    Raises
    ------
    TypeError
        If h5_parent_group or main_data is of an unsupported type
    ValueError
        If the file is not editable, the shape specification is invalid, dtype is
        missing for an empty dataset, or the number of dimensions does not match
    KeyError
        If the keys of dim_dict are not exactly the axis indices of the main data
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    #####################
    # Validate Main Data
    #####################
    quantity, units, main_data_name = validate_string_args([quantity, units, main_data_name],
                                                           ['quantity', 'units', 'main_data_name'])

    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        # main_data is the shape of an empty dataset that will be filled in later
        if not contains_integers(main_data, min_val=1):
            raise ValueError('main_data if specified as a shape should be a list / tuple of integers >= 1')
        if len(main_data) < 1:
            raise ValueError('main_data if specified as a shape should contain at least 1 number for the singular dimension')
        if 'dtype' not in kwargs:
            raise ValueError('dtype must be included as a kwarg when creating an empty dataset')
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError('main_data should either be a numpy array or a tuple / list with the shape of the data')

    ######################
    # Validate Dimensions
    ######################
    print( len(dim_dict) , len(main_shape))
    # An N dimensional dataset should have N items in the dimension dictionary
    if len(dim_dict) != len(main_shape):
        raise ValueError('Incorrect number of dimensions: {} provided to support main data, of shape: {}'.format(len(dim_dict), main_shape))
    # ... and those items have to be keyed by the axis indices 0 .. N-1
    if set(range(len(main_shape))) != set(dim_dict.keys()):
        raise KeyError('Keys of dim_dict should be the axis indices 0 to {}. Got: {}'
                       ''.format(len(main_shape) - 1, list(dim_dict.keys())))

    if False in validate_dimensions(main_shape,dim_dict, h5_parent_group):
        print('dimensions incorrect')

        return

    print('dimensions correct')

    # Never overwrite an existing dataset of the same name
    if main_data_name in h5_parent_group:
        print('Oops, dataset exits')
        #del h5_parent_group[main_data_name]
        return

    #####################
    # Write Main Dataset
    ####################
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')
    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data.shape, dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main, main_data))
            print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.format(h5_main.name,
                                                                                          h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset; main_data holds the shape
        h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data, **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    #################
    # Add Dimensions
    #################
    for i, this_dim in dim_dict.items():
        if isinstance(this_dim, h5py.Dataset):
            this_dim_dset = this_dim
        elif isinstance(this_dim, Dimension):
            # Persist the Dimension object as a dataset next to the main dataset
            this_dim_dset = h5_parent_group.create_dataset(this_dim.name,data=this_dim.values)
            this_dim_dset.attrs['units'] = this_dim.units
            this_dim_dset.attrs['name'] = this_dim.name
            this_dim_dset.attrs['quantity'] =  this_dim.quantity
            this_dim_dset.attrs['is_position'] = this_dim.is_position
        else:
            print(i,' not a good dimension')
            # Skip this axis: there is no dataset to attach, and falling through would
            # either fail on an undefined name or re-attach the previous axis' scale
            continue
        # Turn the dataset into a dimension scale, then label the axis and attach the scale
        this_dim_dset.make_scale(this_dim_dset.attrs['name'])
        h5_main.dims[int(i)].label = this_dim_dset.attrs['name']
        h5_main.dims[int(i)].attach_scale(this_dim_dset)

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    return h5_main

## Write Dataset

In [28]:
# Description of the main dataset that is about to be written
data_type = 'STM_spectroscopy'
quantity = 'intensity'
units = "pixel"
main_data_name = 'nDim_Data'
main_data = np.random.rand(5, 7, 11, 3)

# Show the dimensions that go along with the data, then write everything to the channel group
print(dim_dict)
current_dataset = write_main_dataset(h5_parent_group=current_channel,
                                     main_data=main_data,
                                     main_data_name=main_data_name,
                                     quantity=quantity,
                                     units=units,
                                     dim_dict=dim_dict)



{0: Y - Length (um): [ 0.   2.5  5.   7.5 10. ], 1: X - Length (um): [0. 1. 2. 3. 4. 5. 6.], 2: <HDF5 dataset "DC offset": shape (11,), type "<f8">, 3: BE Frequency - Frequency (Hz): [ 0.  5. 10.]}
4 4
Dimension
5
Dimension
7
2 is a dataset
dataset ok
Dimension
3
dimensions correct


## Test 

In [35]:
# Names of the scales attached to the first axis, followed by the label of every axis
first_axis = current_dataset.dims[0]
print(first_axis.keys())
[axis.label for axis in current_dataset.dims]

['Y']


['Y', 'X', 'DC offset', 'BE Frequency']

## Empty h5py Group 

In [27]:
# Remove everything except the 'DC offset' dataset from the channel group.
# The member names are copied into a list first so that links are not deleted
# from the group while it is being iterated over.
for key in list(current_channel.keys()):
    if 'DC' not in key:
        del current_channel[key]
print(current_channel.keys())

h5_file.flush()

<KeysViewHDF5 ['DC offset']>


In [100]:
# Check to make sure the names are all unique

# Check to make sure that all ancillary datasets are in the same HDF5 file using which_h5_file

In [None]:
# NOTE(review): draft cell without an execution count. In the visible notebook `main_shape`
# is only assigned inside write_main_dataset, so this cell presumably cannot run at top
# level - confirm whether it is still needed now that write_main_dataset adds the dimensions.
h5_dims = []
# Collect one HDF5 dataset per axis of the main dataset
for index in range(len(main_shape)):
    this_dim = dim_dict[index]
    if isinstance(this_dim, h5py.Dataset):
        # Already an HDF5 dataset: use it as it is
        h5_dims.append(this_dim)
    else: # We know by now that this is the Dimension object
        # Placeholder: writing the Dimension object to an HDF5 dataset is not implemented here
        h5_anc_dset = None
        # Append this dataset (currently None) to the list
        h5_dims.append(h5_anc_dset)

In [None]:
# NOTE(review): draft cell without an execution count. The same statements appear inside
# write_main_dataset; `h5_parent_group`, `kwargs` and `verbose` are only bound there in the
# visible notebook - confirm whether this cell can be removed.
# At this point, we have all the ancillary datasets
# We write the main dataset now

# Compression is dropped from kwargs when the file was opened with the "mpio" driver
if h5_parent_group.file.driver == 'mpio':
    if kwargs.pop('compression', None) is not None:
        warn('This HDF5 file has been opened with the "mpio" communicator. '
             'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')

if isinstance(main_data, np.ndarray):
    # Case 1 - simple small dataset
    h5_main = h5_parent_group.create_dataset(main_data_name, data=main_data, **kwargs)
    if verbose:
        print('Created main dataset with provided data')
elif isinstance(main_data, da.core.Array):
    # Case 2 - Dask dataset
    # step 0 - get rid of any automated dtype specification:
    _ = kwargs.pop('dtype', None)
    # step 1 - create the empty dataset:
    h5_main = h5_parent_group.create_dataset(main_data_name, shape=main_data.shape, dtype=main_data.dtype,
                                             **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'.format(h5_main, main_data))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.format(h5_main.name,
                                                                                      h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
    # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
else:
    # Case 3 - large empty dataset; main_data is passed as the second positional argument
    h5_main = h5_parent_group.create_dataset(main_data_name, main_data, **kwargs)
    if verbose:
        print('Created empty dataset for Main')

In [None]:
# NOTE(review): draft cell without an execution count; the same statements appear inside
# write_main_dataset. `h5_main`, `verbose` and `main_dset_attrs` are not assigned at top
# level in the visible notebook - confirm whether this cell can be removed.
# Record what the main dataset measures and in which units
write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
if verbose:
    print('Wrote quantity and units attributes to main dataset')

# Optional extra attributes are only written when they were supplied as a dict
if isinstance(main_dset_attrs, dict):
    write_simple_attrs(h5_main, main_dset_attrs)
    if verbose:
        print('Wrote provided attributes to main dataset')

write_book_keeping_attrs(h5_main)

In [None]:
# NOTE(review): draft cell without an execution count; it relies on `h5_dims`, `h5_main` and
# `verbose` from the other draft cells. write_main_dataset performs the same three steps per
# axis - confirm whether this cell can be removed.
# Create and attach dimension scales for each ancillary dataset:

if verbose:
    print('Successfully linked datasets - dataset should be main now')

for index, h5_dim in enumerate(h5_dims):
    # The scale is named after the 'name' attribute of the dimension dataset
    dim_name = get_attr(h5_dim, 'name')
    # First make this HDF5 dataset a dimension scale
    h5_dim.make_scale(dim_name)
    # Attach the name of the dimension to the main dataset also
    h5_main.dims[index].label = dim_name
    # Finally attach the scale itself
    h5_main.dims[index].attach_scale(h5_dim)
    
# Now the dataset should be a main dataset

In [None]:
# Now return this object as a powerful object:    
#from ..nsi_data import NSIDataset
#return NSIDataset(h5_main)