Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert dtype=object arrays if possible #518

Merged
merged 9 commits into from
Nov 18, 2021
30 changes: 30 additions & 0 deletions pyiron_base/generic/hdfio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Classes to map the Python objects to HDF5 data structures
"""

import numbers
import h5py
import os
from collections.abc import MutableMapping
Expand All @@ -16,6 +17,8 @@
import sys
from typing import Union
from pyiron_base.interfaces.has_groups import HasGroups
from pyiron_base.state import state
from pyiron_base.generic.util import deprecate
niklassiemer marked this conversation as resolved.
Show resolved Hide resolved

__author__ = "Joerg Neugebauer, Jan Janssen"
__copyright__ = (
Expand Down Expand Up @@ -144,6 +147,8 @@ def __getitem__(self, item):
if item in self.list_groups():
with self.open(item) as hdf_item:
obj = hdf_item.copy()
if self._is_convertable_dtype_object_array(obj):
obj = self._convert_dtype_obj_array(obj)
return obj
raise ValueError("Unknown item: {} {} {}".format(item, self.file_name, self.h5_path))
else:
Expand Down Expand Up @@ -175,6 +180,31 @@ def __getitem__(self, item):
hdf_object.h5_path = "/".join(item_abs_lst[:-1])
return hdf_object[item_abs_lst[-1]]

# TODO: remove this function upon the 1.0.0 release
@staticmethod
def _is_convertable_dtype_object_array(obj):
    """
    Check whether ``obj`` looks like a numeric array that was stored with ``dtype=object``.

    Only the first and the last element are inspected, so this is a cheap heuristic and not a
    guarantee that every element is a number.

    Args:
        obj: any object; anything that is not a ``numpy.ndarray`` of ``dtype=object`` is rejected.

    Returns:
        bool: True if ``obj`` is a non-empty object array whose first and last elements are
            ``numbers.Number`` instances and for which ``_is_ragged_array`` returns a falsy value.
    """
    if not isinstance(obj, np.ndarray) or obj.dtype != np.dtype(object):
        return False
    if obj.size == 0:
        # Indexing element 0 / -1 of an empty array raises IndexError; there is nothing to convert.
        return False
    # One index per dimension; for a 0-d array this is ``obj[()]``, i.e. the single element.
    first_element = obj[(0,) * obj.ndim]
    last_element = obj[(-1,) * obj.ndim]
    return (
        isinstance(first_element, numbers.Number)
        and isinstance(last_element, numbers.Number)
        and not _is_ragged_array(obj)
    )

# TODO: remove this function upon the 1.0.0 release
@staticmethod
def _convert_dtype_obj_array(obj: np.ndarray):
    """
    Try to convert an array of ``dtype=object`` into an array with a native numpy dtype.

    The conversion is attempted via ``np.array(obj.tolist())``. If numpy infers a non-object dtype
    for the result, a deprecation warning is logged and the converted array is returned; otherwise
    the input is returned unchanged.

    Args:
        obj (numpy.ndarray): array to convert.

    Returns:
        numpy.ndarray: a new array with the inferred dtype, or ``obj`` itself (same object) if the
            conversion is not possible.
    """
    # NOTE: the numpy documentation of ``ndarray.tolist`` states that recreating an array via
    # ``np.array(a.tolist())`` may sometimes lose precision. This was discussed in review of the
    # pull request and judged acceptable for this deprecated code path.
    try:
        result = np.array(obj.tolist())
    except ValueError:
        # NumPy >= 1.24 raises ValueError instead of silently building an object array when the
        # nested lists are ragged; such data cannot be converted, so hand back the input.
        return obj
    if result.dtype != np.dtype(object):
        # TODO: reference the dedicated update tool in this message once it has been introduced.
        state.logger.warning(f"Deprecated data structure! "
                             f"Returned array was converted from dtype='O' to dtype={result.dtype} "
                             f"via `np.array(result.tolist())`.\n"
                             f"Please run rewrite_hdf5() to update this data! "
                             f"To update all your data run update tool.")
        return result
    else:
        return obj

def __setitem__(self, key, value):
"""
Store data inside the HDF5 file
Expand Down
70 changes: 70 additions & 0 deletions tests/generic/test_fileHDFio.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,76 @@ def _check_full_hdf_values(self, hdf):
with self.subTest('content/group/some_entry'):
self.assertEqual(hdf['content/group/some_entry'], 'present')

def test__is_convertable_dtype_object_array(self):
    """Check the convertibility heuristic on ragged, int, float and None-containing object arrays."""
    is_convertable = self.i_o_hdf5._is_convertable_dtype_object_array

    ragged = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    ints = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    floats = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
    with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)

    # Nested lists of unequal length must not be reported as convertable.
    self.assertFalse(is_convertable(ragged))

    # Purely numeric content is convertable.
    self.assertTrue(is_convertable(ints))
    self.assertTrue(is_convertable(floats))

    # Only the corner elements are inspected, so inner ``None`` entries go unnoticed.
    self.assertTrue(
        is_convertable(with_none),
        msg="This array should be considered convertable, since first and last element are numbers.",
    )

def test__convert_dtype_obj_array(self):
    """Check that numeric object arrays are converted and all others are returned as the same object."""
    convert = self.i_o_hdf5._convert_dtype_obj_array

    ragged = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    ints = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    floats = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
    with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)

    # Inputs that cannot be converted must come back as the very same object.
    self.assertIs(convert(ragged), ragged)
    self.assertIs(convert(with_none), with_none)

    # Integer content: same values, native integer dtype.
    converted_ints = convert(ints)
    self.assertTrue(np.array_equal(converted_ints, ints))
    self.assertEqual(converted_ints.dtype, np.dtype(int))

    # Float content: same values, native float dtype.
    converted_floats = convert(floats)
    self.assertTrue(np.array_equal(converted_floats, floats))
    self.assertEqual(converted_floats.dtype, np.dtype(float))

def test_array_type_conversion(self):
    """
    Store object arrays via ``hdf[...] = ...`` and check the dtype of what is read back.

    Numeric arrays written with ``dtype=object`` are expected to be read back with a native int or
    float dtype and unchanged values; the ragged array is expected to stay an object array.
    """
    object_array_with_lists = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    int_array_as_objects_array = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    float_array_as_objects_array = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)

    hdf = self.i_o_hdf5.open("arrays")
    # try/finally so that the group is removed even if storing or reading fails outside a subTest.
    try:
        hdf['object_array_with_lists'] = object_array_with_lists
        hdf['int_array_as_objects_array'] = int_array_as_objects_array
        hdf['float_array_as_objects_array'] = float_array_as_objects_array

        with self.subTest("object_array_with_lists"):
            array = hdf['object_array_with_lists']
            # NOTE(review): the comparison result was never asserted here; it is left unasserted
            # because the element types of the read-back ragged array are not known from this
            # test alone - confirm and turn this into an assertion.
            np.array_equal(array, object_array_with_lists)
            self.assertIsInstance(array, np.ndarray)
            self.assertEqual(array.dtype, np.dtype(object))

        # An object array containing ``None`` is not covered: storing it via
        # ``hdf['object_array_with_none'] = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)``
        # raised ``TypeError: Object dtype dtype('O') has no native HDF5 equivalent``.

        with self.subTest('int_array_as_objects_array'):
            array = hdf['int_array_as_objects_array']
            self.assertIsInstance(array, np.ndarray)
            self.assertTrue(np.array_equal(array, int_array_as_objects_array))
            self.assertEqual(array.dtype, np.dtype(int))

        with self.subTest('float_array_as_objects_array'):
            array = hdf['float_array_as_objects_array']
            self.assertIsInstance(array, np.ndarray)
            self.assertTrue(np.array_equal(array, float_array_as_objects_array))
            self.assertEqual(array.dtype, np.dtype(float))
    finally:
        hdf.remove_group()

def test_get_item(self):
self._check_full_hdf_values(self.full_hdf5)
# Test leaving to pyiron Project at hdf file location:
Expand Down