From a0c27361b63e935faa91e5adf9c6b3d1fb5f5c82 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 11 Nov 2025 00:21:46 +0100 Subject: [PATCH 1/2] [df] Fix interaction between RVecDS and Snapshot This commit updates the implementation of the column reader retrieval in RVecDS to use GetColumnReaders with type_info instead of the older version. This in turn allows Snapshot to request the correct column reader when reading a Numpy-based dataset. --- .../pythonizations/test/rdataframe_misc.py | 27 ++++++- tree/dataframe/inc/ROOT/RVecDS.hxx | 71 ++++++++++--------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/bindings/pyroot/pythonizations/test/rdataframe_misc.py b/bindings/pyroot/pythonizations/test/rdataframe_misc.py index 1759451f97ca9..c3d901451753e 100644 --- a/bindings/pyroot/pythonizations/test/rdataframe_misc.py +++ b/bindings/pyroot/pythonizations/test/rdataframe_misc.py @@ -5,6 +5,8 @@ import ROOT +import numpy + class DatasetContext: """A helper class to create the dataset for the tutorial below.""" @@ -129,7 +131,30 @@ def test_ttree_ownership(self): self.assertEqual(rdf.Count().GetValue(), 9) - + def test_regression_gh_20291(self): + """ + Regression test for https://github.com/root-project/root/issues/20291 + """ + # Issues on Windows with contention on file deletion + if platform.system() == "Windows": + return + out_path = "dataframe_misc_regression_gh20291.root" + try: + x, y = numpy.array([1, 2, 3]), numpy.array([4, 5, 6]) + df = ROOT.RDF.FromNumpy({"x": x, "y": y}) + + df.Snapshot("tree", out_path) + + df_out = ROOT.RDataFrame("tree", out_path) + count = df_out.Count() + take_x = df_out.Take["Long64_t"]("x") + take_y = df_out.Take["Long64_t"]("y") + + self.assertEqual(count.GetValue(), 3) + self.assertSequenceEqual(take_x.GetValue(), [1, 2, 3]) + self.assertSequenceEqual(take_y.GetValue(), [4, 5, 6]) + finally: + os.remove(out_path) if __name__ == '__main__': diff --git a/tree/dataframe/inc/ROOT/RVecDS.hxx b/tree/dataframe/inc/ROOT/RVecDS.hxx index 27149c9283301..fdf1c5d9eec74 100644 --- a/tree/dataframe/inc/ROOT/RVecDS.hxx +++ b/tree/dataframe/inc/ROOT/RVecDS.hxx @@ -32,6 +32,14 @@ namespace Internal { namespace RDF { +class R__CLING_PTRCHECK(off) RVecDSColumnReader final : public ROOT::Detail::RDF::RColumnReaderBase { + TPointerHolder *fPtrHolder; + void *GetImpl(Long64_t) final { return fPtrHolder->GetPointer(); } + +public: + RVecDSColumnReader(TPointerHolder *ptrHolder) : fPtrHolder(ptrHolder) {} +}; + //////////////////////////////////////////////////////////////////////////////////////////////// /// \brief A RDataSource implementation which takes a collection of RVecs, which /// are able to adopt data from Numpy arrays @@ -46,46 +54,18 @@ class RVecDS final : public ROOT::RDF::RDataSource { using PointerHolderPtrs_t = std::vector; std::tuple...> fColumns; - const std::vector fColNames; - const std::map fColTypesMap; + std::vector fColNames; + std::unordered_map fColTypesMap; // The role of the fPointerHoldersModels is to be initialised with the pack // of arguments in the constrcutor signature at construction time // Once the number of slots is known, the fPointerHolders are initialised // according to the models. - const PointerHolderPtrs_t fPointerHoldersModels; + PointerHolderPtrs_t fPointerHoldersModels; std::vector fPointerHolders; std::vector> fEntryRanges{}; std::function fDeleteRVecs; - Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id) - { - auto colNameStr = std::string(colName); - // This could be optimised and done statically - const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id); - auto it = fColTypesMap.find(colNameStr); - if (fColTypesMap.end() == it) { - std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source."; - throw std::runtime_error(err); - } - - const auto colIdName = it->second; - if (colIdName != idName) { - std::string err = "Column " + colNameStr + " has type " + colIdName + - " while the id specified is associated to type " + idName; - throw std::runtime_error(err); - } - - const auto colBegin = fColNames.begin(); - const auto colEnd = fColNames.end(); - const auto namesIt = std::find(colBegin, colEnd, colName); - const auto index = std::distance(colBegin, namesIt); - - Record_t ret(fNSlots); - for (auto slot : ROOT::TSeqU(fNSlots)) { - ret[slot] = fPointerHolders[index][slot]->GetPointerAddr(); - } - return ret; - } + Record_t GetColumnReadersImpl(std::string_view, const std::type_info &) { return {}; } size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); } template @@ -146,6 +126,33 @@ public: fDeleteRVecs(); } + std::unique_ptr + GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final + { + auto colNameStr = std::string(colName); + + auto it = fColTypesMap.find(colNameStr); + if (fColTypesMap.end() == it) { + std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source."; + throw std::runtime_error(err); + } + + const auto &colIdName = it->second; + const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id); + if (colIdName != idName) { + std::string err = "Column " + colNameStr + " has type " + colIdName + + " while the id specified is associated to type " + idName; + throw std::runtime_error(err); + } + + if (auto colNameIt = std::find(fColNames.begin(), fColNames.end(), colNameStr); colNameIt != fColNames.end()) { + const auto index = std::distance(fColNames.begin(), colNameIt); + return std::make_unique(fPointerHolders[index][slot]); + } + + throw std::runtime_error("Could not find column name \"" + colNameStr + "\" in available column names."); + } + const std::vector &GetColumnNames() const { return fColNames; } std::vector> GetEntryRanges() From d9099d29f70a8a83dbe5971a786993142f83b0fb Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 11 Nov 2025 00:38:34 +0100 Subject: [PATCH 2/2] ruff format and lint --- .../pythonizations/test/rdataframe_misc.py | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/bindings/pyroot/pythonizations/test/rdataframe_misc.py b/bindings/pyroot/pythonizations/test/rdataframe_misc.py index c3d901451753e..1a1efd21b3ec8 100644 --- a/bindings/pyroot/pythonizations/test/rdataframe_misc.py +++ b/bindings/pyroot/pythonizations/test/rdataframe_misc.py @@ -3,19 +3,14 @@ import platform import unittest -import ROOT - import numpy +import ROOT class DatasetContext: """A helper class to create the dataset for the tutorial below.""" - filenames = [ - "rdataframe_misc_1.root", - "rdataframe_misc_2.root", - "rdataframe_misc_3.root" - ] + filenames = ["rdataframe_misc_1.root", "rdataframe_misc_2.root", "rdataframe_misc_3.root"] treename = "dataset" nentries = 5 @@ -48,6 +43,7 @@ def __exit__(self, *_): for filename in self.filenames: os.remove(filename) + class RDataFrameMisc(unittest.TestCase): """Miscellaneous RDataFrame tests""" @@ -59,10 +55,7 @@ def test_empty_filenames(self): # https://bugs.llvm.org/show_bug.cgi?id=49692 : # llvm JIT fails to catch exceptions on MacOS ARM, so we disable their testing # Also fails on Windows for the same reason - if ( - (platform.processor() != "arm" or platform.mac_ver()[0] == '') and not - platform.system() == "Windows" - ): + if (platform.processor() != "arm" or platform.mac_ver()[0] == "") and not platform.system() == "Windows": # With implicit conversions, cppyy also needs to try dispatching to the various # constructor overloads. The C++ exception will be thrown, but will be incapsulated # in a more generic TypeError telling the user that none of the overloads worked @@ -83,7 +76,7 @@ def _get_rdf(self, dataset): chain.Add(filename) return ROOT.RDataFrame(chain) - + def _get_chain(self, dataset): chain = ROOT.TChain(dataset.treename) for filename in dataset.filenames: @@ -95,9 +88,8 @@ def _define_col(self, rdf): def _filter_x(self, rdf): return rdf.Filter("x > 2") - - def _test_rdf_in_function(self, chain): + def _test_rdf_in_function(self, chain): rdf = ROOT.RDataFrame(chain) meanx = rdf.Mean("x") meany = rdf.Mean("y") @@ -117,8 +109,8 @@ def test_ttree_ownership(self): rdf = self._get_rdf(dataset) npy_dict = rdf.AsNumpy() - self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["x"], numpy.array([1,2,3,4,5]*3))) - self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["y"], numpy.array([2,4,6,8,10]*3))) + self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["x"], numpy.array([1, 2, 3, 4, 5] * 3))) + self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["y"], numpy.array([2, 4, 6, 8, 10] * 3))) chain = self._get_chain(dataset) @@ -157,5 +149,5 @@ def test_regression_gh_20291(self): os.remove(out_path) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main()