pyiron · niklassiemer · Nov 18, 2021 · Nov 15, 2021 · Nov 15, 2021 · Nov 16, 2021
diff --git a/pyiron_base/generic/hdfio.py b/pyiron_base/generic/hdfio.py
@@ -5,6 +5,7 @@
 Classes to map the Python objects to HDF5 data structures
 """
 
+import numbers
 import h5py
 import os
 from collections.abc import MutableMapping
@@ -16,6 +17,7 @@
 import sys
 from typing import Union
 from pyiron_base.interfaces.has_groups import HasGroups
+from pyiron_base.state import state
 
 __author__ = "Joerg Neugebauer, Jan Janssen"
 __copyright__ = (
@@ -144,6 +146,8 @@ def __getitem__(self, item):
                 if item in self.list_groups():
                     with self.open(item) as hdf_item:
                         obj = hdf_item.copy()
+                        if self._is_convertable_dtype_object_array(obj):
+                            obj = self._convert_dtype_obj_array(obj)
                         return obj
                 raise ValueError("Unknown item: {} {} {}".format(item, self.file_name, self.h5_path))
             else:
@@ -175,6 +179,31 @@ def __getitem__(self, item):
                     hdf_object.h5_path = "/".join(item_abs_lst[:-1])
                     return hdf_object[item_abs_lst[-1]]
 
+    #TODO: remove this function upon 1.0.0 release
+    @staticmethod
+    def _is_convertable_dtype_object_array(obj):
+        """
+        Check whether `obj` is a dtype=object array that looks like plain numeric data.
+
+        Only the first and the last element are inspected, so an array with
+        non-numeric entries in between is still reported as convertable.
+
+        Args:
+            obj: object to check
+
+        Returns:
+            bool: True if `obj` is a non-empty numpy array of dtype=object whose first and
+                last elements are numbers and which `_is_ragged_array` does not report as
+                ragged; False otherwise
+        """
+        if not isinstance(obj, np.ndarray) or obj.dtype != np.dtype(object):
+            return False
+        if obj.size == 0:
+            # Nothing to probe: indexing element 0 of an empty array would raise IndexError.
+            return False
+        first_element = obj[(0,) * obj.ndim]
+        last_element = obj[(-1,) * obj.ndim]
+        return (
+            isinstance(first_element, numbers.Number)
+            and isinstance(last_element, numbers.Number)
+            and not _is_ragged_array(obj)
+        )
+
+    #TODO: remove this function upon 1.0.0 release
+    @staticmethod
+    def _convert_dtype_obj_array(obj: np.ndarray):
+        """
+        Try to convert a dtype=object array into an array with a native dtype.
+
+        The conversion is attempted via `np.array(obj.tolist())`.  If it yields a
+        non-object dtype, a deprecation warning is logged and the converted array is
+        returned; otherwise the input is handed back untouched.
+
+        Args:
+            obj (numpy.ndarray): array to convert
+
+        Returns:
+            numpy.ndarray: the converted array, or `obj` itself (same object) if no
+                native dtype could be derived
+        """
+        try:
+            result = np.array(obj.tolist())
+        except ValueError:
+            # Recent numpy versions raise for ragged nested lists instead of silently
+            # falling back to dtype=object; such data cannot be converted.
+            return obj
+        if result.dtype == np.dtype(object):
+            return obj
+        state.logger.warning(f"Deprecated data structure! "
+                             f"Returned array was converted from dtype='O' to dtype={result.dtype} "
+                             f"via `np.array(result.tolist())`.\n"
+                             f"Please run rewrite_hdf5() to update this data! "
+                             f"To update all your data run update_scripts/pyiron_base_0.3_to_0.4.py")
+        return result
+
     def __setitem__(self, key, value):
         """
         Store data inside the HDF5 file

diff --git a/tests/generic/test_fileHDFio.py b/tests/generic/test_fileHDFio.py
@@ -110,6 +110,76 @@ def _check_full_hdf_values(self, hdf):
         with self.subTest('content/group/some_entry'):
             self.assertEqual(hdf['content/group/some_entry'], 'present')
 
+    def test__is_convertable_dtype_object_array(self):
+        # The check under test only probes the first and the last element of the array.
+        is_convertable = self.i_o_hdf5._is_convertable_dtype_object_array
+
+        # Ragged nested lists: first element is a list, hence not convertable.
+        ragged_lists = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
+        self.assertFalse(is_convertable(ragged_lists))
+
+        # Purely numeric data stored with dtype=object.
+        ints_as_objects = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
+        self.assertTrue(is_convertable(ints_as_objects))
+        floats_as_objects = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
+        self.assertTrue(is_convertable(floats_as_objects))
+
+        # None entries in the interior are not seen by the check.
+        numbers_with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
+        self.assertTrue(
+            is_convertable(numbers_with_none),
+            msg="This array should be considered convertable, since first and last element are numbers.",
+        )
+
+    def test__convert_dtype_obj_array(self):
+        convert = self.i_o_hdf5._convert_dtype_obj_array
+
+        # Arrays without a native dtype must be handed back as the very same object.
+        ragged_lists = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
+        numbers_with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
+        for unconvertable in (ragged_lists, numbers_with_none):
+            self.assertIs(convert(unconvertable), unconvertable)
+
+        # Numeric arrays must keep their values and come back with a native dtype.
+        ints_as_objects = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
+        floats_as_objects = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
+        for source, expected_type in ((ints_as_objects, int), (floats_as_objects, float)):
+            converted = convert(source)
+            self.assertTrue(np.array_equal(converted, source))
+            self.assertEqual(converted.dtype, np.dtype(expected_type))
+
+    def test_array_type_conversion(self):
+        # Round trip through the HDF5 file: numeric arrays written with dtype=object
+        # must be read back with a native dtype, ragged ones must stay dtype=object.
+        object_array_with_lists = np.array([[[1, 2, 3], [2, 3, 4]], [[4, 5, 6]]], dtype=object)
+        int_array_as_objects_array = np.array([[1, 2, 3], [3, 4, 5]], dtype=object)
+        float_array_as_objects_array = np.array([[1.1, 1.3, 1.5], [2, 2.1, 2.2]], dtype=object)
+
+        hdf = self.i_o_hdf5.open("arrays")
+
+        hdf['object_array_with_lists'] = object_array_with_lists
+        hdf['int_array_as_objects_array'] = int_array_as_objects_array
+        hdf['float_array_as_objects_array'] = float_array_as_objects_array
+
+        with self.subTest("object_array_with_lists"):
+            array = hdf['object_array_with_lists']
+            self.assertIsInstance(array, np.ndarray)
+            self.assertEqual(array.dtype, np.dtype(object))
+            # Compare entry by entry so the check works no matter whether the nested
+            # entries are read back as lists or as arrays (assumption: the outer
+            # length is preserved by the round trip - TODO confirm).
+            self.assertEqual(len(array), len(object_array_with_lists))
+            for read_entry, expected_entry in zip(array, object_array_with_lists):
+                self.assertTrue(np.array_equal(read_entry, expected_entry))
+
+        #  Storing an object array containing None is not covered, because writing it failed with:
+        #  TypeError: Object dtype dtype('O') has no native HDF5 equivalent
+        #
+        # object_array_with_none = np.array([[1.1, None, 1.5], [None, 2.1, 2.2]], dtype=object)
+        # hdf['object_array_with_none'] = object_array_with_none
+        # with self.subTest("object_array_with_none"):
+        #     array = hdf['object_array_with_none']
+        #     np.array_equal(array, object_array_with_none)
+        #     self.assertIsInstance(array, np.ndarray)
+        #     self.assertTrue(array.dtype == np.dtype(object))
+
+        with self.subTest('int_array_as_objects_array'):
+            array = hdf['int_array_as_objects_array']
+            self.assertIsInstance(array, np.ndarray)
+            self.assertEqual(array.dtype, np.dtype(int))
+            # The comparison result used to be discarded; assert it.
+            self.assertTrue(np.array_equal(array, int_array_as_objects_array))
+
+        with self.subTest('float_array_as_objects_array'):
+            array = hdf['float_array_as_objects_array']
+            self.assertIsInstance(array, np.ndarray)
+            self.assertEqual(array.dtype, np.dtype(float))
+            self.assertTrue(np.array_equal(array, float_array_as_objects_array))
+
+        hdf.remove_group()
+
     def test_get_item(self):
         self._check_full_hdf_values(self.full_hdf5)
         # Test leaving to pyiron Project at hdf file location:

diff --git a/update_scripts/pyiron_base_0.3_to_0.4.py b/update_scripts/pyiron_base_0.3_to_0.4.py
@@ -0,0 +1,70 @@
+"""
+pyiron_base<=0.3.10 has a bug that writes all arrays with dtype=object, even
+numeric ones.  As a fix, pyiron_base=0.4.0 introduces a conversion when reading
+such arrays, but does not automatically save them.  This conversion script
+simply goes over all jobs and rewrites their HDF5 files: since the data is read
+with the correct dtype, rewriting stores it with the correct dtype as well.
+"""
+
+import os
+import re
+import stat
+import sys
+import subprocess
+
+from pyiron_base import Project
+
+from tqdm import tqdm
+
+def detect_bug(file_name):
+    """
+    Checks whether HDF5 file has at least one group setup like
+    /foo       Group
+    /foo/data  Dataset {...}
+    /foo/index Dataset {...}
+    which is how h5io stores dtype=object arrays.  If a file doesn't have any
+    of them there's no need to rewrite them.  If there is it might be a
+    corrupted record from our bug or a legitimate dtype=object array.  In that
+    case just rewrite anyway.
+
+    The structure is read from the output of the external `h5ls -r` command.
+
+    Args:
+        file_name (str): path to the HDF5 file to inspect
+
+    Returns:
+        bool: True if at least one such group is found, False otherwise
+    """
+    # Imported locally so the fix does not depend on the module's import block.
+    import shlex
+
+    # Quote the path: spaces or shell metacharacters must not alter the command.
+    out = subprocess.getoutput(f"h5ls -r {shlex.quote(str(file_name))}")
+    lines = out.split('\n')
+    # The last two lines cannot start a (group, data, index) triple.
+    for i, line in enumerate(lines[:-2]):
+        if not line.endswith("Group"):
+            continue
+        # Escape the group name, it may contain regex metacharacters.
+        group_name = re.escape(line.split()[0])
+        data_match = re.match("^" + group_name + r"/data[ \t]*Dataset \{.*\}$", lines[i + 1])
+        index_match = re.match("^" + group_name + r"/index[ \t]*Dataset \{.*\}$", lines[i + 2])
+        if data_match and index_match:
+            return True
+    return False
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit(f"usage: {sys.argv[0]} PROJECT_PATH")
+
+    # Sum up the size of all *.h5 files below the project path; this is only used
+    # as the total of the progress bar.
+    total_size = 0
+    for dir_path, _, file_names in os.walk(sys.argv[1]):
+        for name in file_names:
+            if not name.endswith(".h5"):
+                continue
+            try:
+                total_size += os.path.getsize(os.path.join(dir_path, name))
+            except OSError:
+                # File vanished or is unreadable (e.g. broken symlink); it simply does
+                # not contribute to the progress total.
+                pass
+
+    pr = Project(sys.argv[1])
+    n_skip = 0
+    n_err = 0
+    with tqdm(total=total_size, unit="B", unit_scale=True) as t:
+        for j in pr.iter_jobs(convert_to_object=False, recursive=True, progress=False):
+            try:
+                file_size = os.stat(j.project_hdf5.file_name)[stat.ST_SIZE]
+            except FileNotFoundError:
+                n_err += 1
+                print(f"Job {j.name}/{j.id} is in the database, but points to non-existing HDF5 file {j.project_hdf5.file_name}!")
+                # No progress update here: the missing file has no size and was never
+                # part of the total (previously this used an unbound/stale file_size).
+                continue
+
+            if detect_bug(j.project_hdf5.file_name):
+                try:
+                    j.project_hdf5.rewrite_hdf5(j.name)
+                except Exception as e:
+                    # Best effort: report the failure and carry on with the next job.
+                    n_err += 1
+                    print(f"WARNING: rewriting job {j.name}/{j.id} failed with {e}")
+            else:
+                n_skip += 1
+            t.update(file_size)
+    print(f"Errors: {n_err}\tSkipped: {n_skip}")