From a0c27361b63e935faa91e5adf9c6b3d1fb5f5c82 Mon Sep 17 00:00:00 2001
From: Vincenzo Eduardo Padulano <vincenzo.eduardo.padulano@cern.ch>
Date: Tue, 11 Nov 2025 00:21:46 +0100
Subject: [PATCH 1/2] [df] Fix interaction between RVecDS and Snapshot

This commit updates the implementation of the column reader retrieval in
RVecDS to use GetColumnReaders with type_info instead of the older
version. This in turn allows Snapshot to request the correct column
reader when reading a Numpy-based dataset.
---
 .../pythonizations/test/rdataframe_misc.py    | 27 ++++++-
 tree/dataframe/inc/ROOT/RVecDS.hxx            | 71 ++++++++++---------
 2 files changed, 65 insertions(+), 33 deletions(-)
diff --git a/bindings/pyroot/pythonizations/test/rdataframe_misc.py b/bindings/pyroot/pythonizations/test/rdataframe_misc.py
index 1759451f97ca9..c3d901451753e 100644
--- a/bindings/pyroot/pythonizations/test/rdataframe_misc.py
+++ b/bindings/pyroot/pythonizations/test/rdataframe_misc.py
@@ -5,6 +5,8 @@
 
 import ROOT
 
+import numpy
+
 
 class DatasetContext:
     """A helper class to create the dataset for the tutorial below."""
@@ -129,7 +131,30 @@ def test_ttree_ownership(self):
 
             self.assertEqual(rdf.Count().GetValue(), 9)
 
-
+    def test_regression_gh_20291(self):
+        """
+        Regression test for https://github.com/root-project/root/issues/20291
+        """
+        # Issues on Windows with contention on file deletion
+        if platform.system() == "Windows":
+            return
+        out_path = "dataframe_misc_regression_gh20291.root"
+        try:
+            x, y = numpy.array([1, 2, 3]), numpy.array([4, 5, 6])
+            df = ROOT.RDF.FromNumpy({"x": x, "y": y})
+
+            df.Snapshot("tree", out_path)
+
+            df_out = ROOT.RDataFrame("tree", out_path)
+            count = df_out.Count()
+            take_x = df_out.Take["Long64_t"]("x")
+            take_y = df_out.Take["Long64_t"]("y")
+
+            self.assertEqual(count.GetValue(), 3)
+            self.assertSequenceEqual(take_x.GetValue(), [1, 2, 3])
+            self.assertSequenceEqual(take_y.GetValue(), [4, 5, 6])
+        finally:
+            os.remove(out_path)
 
 
 if __name__ == '__main__':
diff --git a/tree/dataframe/inc/ROOT/RVecDS.hxx b/tree/dataframe/inc/ROOT/RVecDS.hxx
index 27149c9283301..fdf1c5d9eec74 100644
--- a/tree/dataframe/inc/ROOT/RVecDS.hxx
+++ b/tree/dataframe/inc/ROOT/RVecDS.hxx
@@ -32,6 +32,14 @@ namespace Internal {
 
 namespace RDF {
 
+class R__CLING_PTRCHECK(off) RVecDSColumnReader final : public ROOT::Detail::RDF::RColumnReaderBase {
+   TPointerHolder *fPtrHolder;
+   void *GetImpl(Long64_t) final { return fPtrHolder->GetPointer(); }
+
+public:
+   RVecDSColumnReader(TPointerHolder *ptrHolder) : fPtrHolder(ptrHolder) {}
+};
+
 ////////////////////////////////////////////////////////////////////////////////////////////////
 /// \brief A RDataSource implementation which takes a collection of RVecs, which
 /// are able to adopt data from Numpy arrays
@@ -46,46 +54,18 @@ class RVecDS final : public ROOT::RDF::RDataSource {
    using PointerHolderPtrs_t = std::vector<ROOT::Internal::RDF::TPointerHolder *>;
 
    std::tuple<ROOT::RVec<ColumnTypes>...> fColumns;
-   const std::vector<std::string> fColNames;
-   const std::map<std::string, std::string> fColTypesMap;
+   std::vector<std::string> fColNames;
+   std::unordered_map<std::string, std::string> fColTypesMap;
    // The role of the fPointerHoldersModels is to be initialised with the pack
    // of arguments in the constrcutor signature at construction time
    // Once the number of slots is known, the fPointerHolders are initialised
    // according to the models.
-   const PointerHolderPtrs_t fPointerHoldersModels;
+   PointerHolderPtrs_t fPointerHoldersModels;
    std::vector<PointerHolderPtrs_t> fPointerHolders;
    std::vector<std::pair<ULong64_t, ULong64_t>> fEntryRanges{};
    std::function<void()> fDeleteRVecs;
 
-   Record_t GetColumnReadersImpl(std::string_view colName, const std::type_info &id)
-   {
-      auto colNameStr = std::string(colName);
-      // This could be optimised and done statically
-      const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
-      auto it = fColTypesMap.find(colNameStr);
-      if (fColTypesMap.end() == it) {
-         std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
-         throw std::runtime_error(err);
-      }
-
-      const auto colIdName = it->second;
-      if (colIdName != idName) {
-         std::string err = "Column " + colNameStr + " has type " + colIdName +
-                           " while the id specified is associated to type " + idName;
-         throw std::runtime_error(err);
-      }
-
-      const auto colBegin = fColNames.begin();
-      const auto colEnd = fColNames.end();
-      const auto namesIt = std::find(colBegin, colEnd, colName);
-      const auto index = std::distance(colBegin, namesIt);
-
-      Record_t ret(fNSlots);
-      for (auto slot : ROOT::TSeqU(fNSlots)) {
-         ret[slot] = fPointerHolders[index][slot]->GetPointerAddr();
-      }
-      return ret;
-   }
+   Record_t GetColumnReadersImpl(std::string_view, const std::type_info &) { return {}; }
 
    size_t GetEntriesNumber() { return std::get<0>(fColumns).size(); }
    template <std::size_t... S>
@@ -146,6 +126,33 @@ public:
       fDeleteRVecs();
    }
 
+   std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
+   GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &id) final
+   {
+      auto colNameStr = std::string(colName);
+
+      auto it = fColTypesMap.find(colNameStr);
+      if (fColTypesMap.end() == it) {
+         std::string err = "The specified column name, \"" + colNameStr + "\" is not known to the data source.";
+         throw std::runtime_error(err);
+      }
+
+      const auto &colIdName = it->second;
+      const auto idName = ROOT::Internal::RDF::TypeID2TypeName(id);
+      if (colIdName != idName) {
+         std::string err = "Column " + colNameStr + " has type " + colIdName +
+                           " while the id specified is associated to type " + idName;
+         throw std::runtime_error(err);
+      }
+
+      if (auto colNameIt = std::find(fColNames.begin(), fColNames.end(), colNameStr); colNameIt != fColNames.end()) {
+         const auto index = std::distance(fColNames.begin(), colNameIt);
+         return std::make_unique<ROOT::Internal::RDF::RVecDSColumnReader>(fPointerHolders[index][slot]);
+      }
+
+      throw std::runtime_error("Could not find column name \"" + colNameStr + "\" in available column names.");
+   }
+
    const std::vector<std::string> &GetColumnNames() const { return fColNames; }
 
    std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges()

From d9099d29f70a8a83dbe5971a786993142f83b0fb Mon Sep 17 00:00:00 2001
From: Vincenzo Eduardo Padulano <vincenzo.eduardo.padulano@cern.ch>
Date: Tue, 11 Nov 2025 00:38:34 +0100
Subject: [PATCH 2/2] ruff format and lint

---
 .../pythonizations/test/rdataframe_misc.py    | 26 +++++++------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/bindings/pyroot/pythonizations/test/rdataframe_misc.py b/bindings/pyroot/pythonizations/test/rdataframe_misc.py
index c3d901451753e..1a1efd21b3ec8 100644
--- a/bindings/pyroot/pythonizations/test/rdataframe_misc.py
+++ b/bindings/pyroot/pythonizations/test/rdataframe_misc.py
@@ -3,19 +3,14 @@
 import platform
 import unittest
 
-import ROOT
-
 import numpy
+import ROOT
 
 
 class DatasetContext:
     """A helper class to create the dataset for the tutorial below."""
 
-    filenames = [
-        "rdataframe_misc_1.root",
-        "rdataframe_misc_2.root",
-        "rdataframe_misc_3.root"
-    ]
+    filenames = ["rdataframe_misc_1.root", "rdataframe_misc_2.root", "rdataframe_misc_3.root"]
     treename = "dataset"
     nentries = 5
 
@@ -48,6 +43,7 @@ def __exit__(self, *_):
         for filename in self.filenames:
             os.remove(filename)
 
+
 class RDataFrameMisc(unittest.TestCase):
     """Miscellaneous RDataFrame tests"""
 
@@ -59,10 +55,7 @@ def test_empty_filenames(self):
         # https://bugs.llvm.org/show_bug.cgi?id=49692 :
         # llvm JIT fails to catch exceptions on MacOS ARM, so we disable their testing
         # Also fails on Windows for the same reason
-        if (
-            (platform.processor() != "arm" or platform.mac_ver()[0] == '') and not
-            platform.system() == "Windows"
-        ):
+        if (platform.processor() != "arm" or platform.mac_ver()[0] == "") and not platform.system() == "Windows":
             # With implicit conversions, cppyy also needs to try dispatching to the various
             # constructor overloads. The C++ exception will be thrown, but will be incapsulated
             # in a more generic TypeError telling the user that none of the overloads worked
@@ -83,7 +76,7 @@ def _get_rdf(self, dataset):
             chain.Add(filename)
 
         return ROOT.RDataFrame(chain)
-    
+
     def _get_chain(self, dataset):
         chain = ROOT.TChain(dataset.treename)
         for filename in dataset.filenames:
@@ -95,9 +88,8 @@ def _define_col(self, rdf):
 
     def _filter_x(self, rdf):
         return rdf.Filter("x > 2")
-    
-    def _test_rdf_in_function(self, chain):
 
+    def _test_rdf_in_function(self, chain):
         rdf = ROOT.RDataFrame(chain)
         meanx = rdf.Mean("x")
         meany = rdf.Mean("y")
@@ -117,8 +109,8 @@ def test_ttree_ownership(self):
             rdf = self._get_rdf(dataset)
 
             npy_dict = rdf.AsNumpy()
-            self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["x"], numpy.array([1,2,3,4,5]*3)))
-            self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["y"], numpy.array([2,4,6,8,10]*3)))
+            self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["x"], numpy.array([1, 2, 3, 4, 5] * 3)))
+            self.assertIsNone(numpy.testing.assert_array_equal(npy_dict["y"], numpy.array([2, 4, 6, 8, 10] * 3)))
 
             chain = self._get_chain(dataset)
 
@@ -157,5 +149,5 @@ def test_regression_gh_20291(self):
             os.remove(out_path)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()