Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert dtype=object arrays if possible #518

Merged
merged 9 commits into from
Nov 18, 2021
29 changes: 29 additions & 0 deletions pyiron_base/generic/hdfio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Classes to map the Python objects to HDF5 data structures
"""

import numbers
import h5py
import os
from collections.abc import MutableMapping
Expand All @@ -16,6 +17,7 @@
import sys
from typing import Union
from pyiron_base.interfaces.has_groups import HasGroups
from pyiron_base.state import state

__author__ = "Joerg Neugebauer, Jan Janssen"
__copyright__ = (
Expand Down Expand Up @@ -144,6 +146,8 @@ def __getitem__(self, item):
if item in self.list_groups():
with self.open(item) as hdf_item:
obj = hdf_item.copy()
if self._is_convertable_dtype_object_array(obj):
obj = self._convert_dtype_obj_array(obj)
return obj
raise ValueError("Unknown item: {} {} {}".format(item, self.file_name, self.h5_path))
else:
Expand Down Expand Up @@ -175,6 +179,31 @@ def __getitem__(self, item):
hdf_object.h5_path = "/".join(item_abs_lst[:-1])
return hdf_object[item_abs_lst[-1]]

#TODO: remove this function upon 1.0.0 release
@staticmethod
def _is_convertable_dtype_object_array(obj):
    """
    Heuristically decide whether a dtype=object array holds plain numbers.

    Only the first and the last element are inspected, so an array with
    non-numeric entries in between (e.g. ``None``) is still reported as
    convertable.

    Args:
        obj: arbitrary object, typically the value read from the HDF5 file

    Returns:
        bool: True if `obj` is a non-empty, non-ragged numpy array of
            dtype=object whose first and last elements are numbers
    """
    if not isinstance(obj, np.ndarray) or obj.dtype != np.dtype(object):
        return False
    if obj.size == 0:
        # Nothing to inspect; indexing element 0 / -1 below would raise IndexError.
        return False
    first_element = obj[(0,) * obj.ndim]
    last_element = obj[(-1,) * obj.ndim]
    return (
        isinstance(first_element, numbers.Number)
        and isinstance(last_element, numbers.Number)
        and not _is_ragged_array(obj)
    )

#TODO: remove this function upon 1.0.0 release
@staticmethod
def _convert_dtype_obj_array(obj: np.ndarray):
    """
    Try to convert a dtype=object array into an array with a native dtype.

    The conversion is done via ``np.array(obj.tolist())``. If numpy cannot
    find a non-object dtype for the data, the input is returned unchanged.

    Args:
        obj (numpy.ndarray): array to convert

    Returns:
        numpy.ndarray: a new array with a non-object dtype if the conversion
            succeeded, otherwise `obj` itself (same object)
    """
    # NOTE (from code review): numpy's documentation of ``ndarray.tolist``
    # states that ``np.array(a.tolist())`` "may sometimes lose precision".
    # This was judged acceptable here.
    try:
        result = np.array(obj.tolist())
    except ValueError:
        # Recent numpy versions raise for ragged nested sequences instead of
        # falling back to dtype=object; such data is not convertible.
        return obj
    if result.dtype == np.dtype(object):
        return obj
    state.logger.warning(f"Deprecated data structure! "
                         f"Returned array was converted from dtype='O' to dtype={result.dtype} "
                         f"via `np.array(result.tolist())`.\n"
                         f"Please run rewrite_hdf5() to update this data! "
                         f"To update all your data run update_scripts/pyiron_base_0.3_to_0.4.py")
    return result

def __setitem__(self, key, value):
"""
Store data inside the HDF5 file
Expand Down
70 changes: 70 additions & 0 deletions tests/generic/test_fileHDFio.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,76 @@ def _check_full_hdf_values(self, hdf):
with self.subTest('content/group/some_entry'):
self.assertEqual(hdf['content/group/some_entry'], 'present')

def test__is_convertable_dtype_object_array(self):
    """Check which dtype=object arrays are reported as convertable."""
    is_convertable = self.i_o_hdf5._is_convertable_dtype_object_array

    # Ragged nested lists must not be considered convertable.
    ragged = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    self.assertFalse(is_convertable(ragged))

    # Purely numeric content stored as objects is convertable.
    ints_as_objects = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    self.assertTrue(is_convertable(ints_as_objects))

    floats_as_objects = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
    self.assertTrue(is_convertable(floats_as_objects))

    # None entries in the middle are not detected by the check.
    with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
    self.assertTrue(
        is_convertable(with_none),
        msg="This array should be considered convertable, since first and last element are numbers.",
    )

def test__convert_dtype_obj_array(self):
    """Check that only numeric dtype=object arrays are actually converted."""
    convert = self.i_o_hdf5._convert_dtype_obj_array

    # Arrays without a native dtype must come back as the very same object.
    ragged = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    self.assertIs(convert(ragged), ragged)

    with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
    self.assertIs(convert(with_none), with_none)

    # Numeric content is converted to the matching native dtype, values unchanged.
    ints_as_objects = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    converted = convert(ints_as_objects)
    self.assertTrue(np.array_equal(converted, ints_as_objects))
    self.assertEqual(converted.dtype, np.dtype(int))

    floats_as_objects = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
    converted = convert(floats_as_objects)
    self.assertTrue(np.array_equal(converted, floats_as_objects))
    self.assertEqual(converted.dtype, np.dtype(float))

def test_array_type_conversion(self):
    """
    Round-trip dtype=object arrays through HDF5 and check dtype and values.

    Numeric arrays written with dtype=object are expected to be read back
    with a native dtype; the ragged array is expected to stay dtype=object.
    """
    object_array_with_lists = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
    int_array_as_objects_array = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
    float_array_as_objects_array = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)

    hdf = self.i_o_hdf5.open("arrays")

    hdf['object_array_with_lists'] = object_array_with_lists
    hdf['int_array_as_objects_array'] = int_array_as_objects_array
    hdf['float_array_as_objects_array'] = float_array_as_objects_array

    with self.subTest("object_array_with_lists"):
        array = hdf['object_array_with_lists']
        self.assertIsInstance(array, np.ndarray)
        self.assertTrue(array.dtype == np.dtype(object))
        # Compare entry by entry: the entries are sequences themselves, so a
        # single np.array_equal on the outer object array is not reliable.
        self.assertEqual(len(array), len(object_array_with_lists))
        for stored, expected in zip(array, object_array_with_lists):
            self.assertTrue(np.array_equal(stored, expected))

    # NOTE: an object array containing None, e.g.
    #   np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
    # cannot be tested here, since writing it fails with
    #   TypeError: Object dtype dtype('O') has no native HDF5 equivalent

    with self.subTest('int_array_as_objects_array'):
        array = hdf['int_array_as_objects_array']
        self.assertIsInstance(array, np.ndarray)
        self.assertTrue(array.dtype == np.dtype(int))
        self.assertTrue(np.array_equal(array, int_array_as_objects_array))

    with self.subTest('float_array_as_objects_array'):
        array = hdf['float_array_as_objects_array']
        self.assertIsInstance(array, np.ndarray)
        self.assertTrue(array.dtype == np.dtype(float))
        self.assertTrue(np.array_equal(array, float_array_as_objects_array))

    hdf.remove_group()

def test_get_item(self):
self._check_full_hdf_values(self.full_hdf5)
# Test leaving to pyiron Project at hdf file location:
Expand Down
70 changes: 70 additions & 0 deletions update_scripts/pyiron_base_0.3_to_0.4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
pyiron_base<=0.3.10 has a bug that writes all arrays with dtype=object, even
numeric ones. As a fix, pyiron_base=0.4.0 introduces a conversion when reading
such arrays, but does not automatically save them. This conversion script
simply goes over all jobs and rewrites their HDF5 files: since the data is
read with the correct dtype, rewriting it stores it with the correct dtype.
"""

import os
import re
import stat
import sys
import subprocess

from pyiron_base import Project

from tqdm import tqdm

def detect_bug(file_name):
    """
    Check whether an HDF5 file contains at least one group laid out like

        /foo        Group
        /foo/data   Dataset {...}
        /foo/index  Dataset {...}

    which is how h5io stores dtype=object arrays. If a file has no such
    group, there is no need to rewrite it. If it has one, it may be either a
    corrupted record from our bug or a legitimate dtype=object array; in that
    case the file is simply rewritten anyway.

    The file is inspected with the external ``h5ls -r`` command, which is
    invoked without a shell so that paths containing spaces or shell
    metacharacters are passed through unchanged.

    Args:
        file_name (str): path to the HDF5 file

    Returns:
        bool: True if such a group was found; False otherwise, including the
            case that ``h5ls`` could not be run or printed no listing
    """
    try:
        listing = subprocess.run(
            ["h5ls", "-r", str(file_name)],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            universal_newlines=True,
        ).stdout
    except OSError as err:
        # Without h5ls nothing can be detected. Keep returning False as
        # before, but do not stay silent about it.
        import warnings

        warnings.warn(
            f"Could not run h5ls ({err}); no file will be detected as affected.",
            RuntimeWarning,
        )
        return False

    lines = listing.splitlines()
    # A match needs the two lines following the group line, so the last two
    # lines can never start one.
    for i, line in enumerate(lines[:-2]):
        if not line.endswith("Group"):
            continue
        # Escape the name: group names may contain regex metacharacters.
        group = re.escape(line.split()[0])
        data_match = re.match(group + r"/data[ \t]*Dataset \{.*\}$", lines[i + 1])
        index_match = re.match(group + r"/index[ \t]*Dataset \{.*\}$", lines[i + 2])
        if data_match and index_match:
            return True
    return False


if __name__ == "__main__":
    # Usage: pyiron_base_0.3_to_0.4.py <project_path>
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <project_path>")
    project_path = sys.argv[1]

    # The combined size of all HDF5 files below the project path is the total
    # of the progress bar. Walk the tree directly instead of shelling out to
    # `find`/`wc`, which breaks on special characters and on empty output.
    total_size = 0
    for dir_path, _, file_names in os.walk(project_path):
        for file_name in file_names:
            if not file_name.endswith(".h5"):
                continue
            try:
                total_size += os.path.getsize(os.path.join(dir_path, file_name))
            except OSError:
                # File vanished or is unreadable; it only affects the
                # progress estimate, so ignore it here.
                continue

    pr = Project(project_path)
    n_skip = 0  # jobs whose file does not show the dtype=object layout
    n_err = 0   # jobs with a missing file or a failed rewrite
    with tqdm(total=total_size, unit="B", unit_scale=True) as t:
        for j in pr.iter_jobs(convert_to_object=False, recursive=True, progress=False):
            hdf_file = j.project_hdf5.file_name
            try:
                file_size = os.path.getsize(hdf_file)
            except FileNotFoundError:
                n_err += 1
                tqdm.write(f"Job {j.name}/{j.id} is in the database, but points to non-existing HDF5 file {hdf_file}!")
                # The size of a missing file is unknown, so the progress bar
                # is not advanced for this job.
                continue

            if detect_bug(hdf_file):
                try:
                    j.project_hdf5.rewrite_hdf5(j.name)
                except Exception as e:
                    # Best effort: report the failure and carry on with the
                    # remaining jobs.
                    n_err += 1
                    tqdm.write(f"WARNING: rewriting job {j.name}/{j.id} failed with {e}")
            else:
                n_skip += 1
            t.update(file_size)
    print(f"Errors: {n_err}\tSkipped: {n_skip}")