diff --git a/tree/dataframe/src/RDFSnapshotHelpers.cxx b/tree/dataframe/src/RDFSnapshotHelpers.cxx index 02f1f2e438882..ec06e521cb935 100644 --- a/tree/dataframe/src/RDFSnapshotHelpers.cxx +++ b/tree/dataframe/src/RDFSnapshotHelpers.cxx @@ -1035,7 +1035,7 @@ struct SnapshotOutputWriter { { if (!fBranchToBitmaskMapping.empty()) { fFile->WriteObject(&fBranchToBitmaskMapping, - (std::string{"R_rdf_branchToBitmaskMapping_"} + fTree->GetName()).c_str()); + (std::string{"R_rdf_column_to_bitmask_mapping_"} + fTree->GetName()).c_str()); } if (fTree) { // use AutoSave to flush TTree contents because TTree::Write writes in gDirectory, not in fDirectory diff --git a/tree/dataframe/src/RDataFrame.cxx b/tree/dataframe/src/RDataFrame.cxx index 7e6fa33f5b1d1..45bc4e7524115 100644 --- a/tree/dataframe/src/RDataFrame.cxx +++ b/tree/dataframe/src/RDataFrame.cxx @@ -1265,10 +1265,28 @@ In that case, RDataFrame will snapshot the filtered columns in a memory-efficien default-constructed object in case of classes. If none of the filters pass like in row 6, the entire event is omitted from the snapshot. To tell apart a genuine `0` (like `x` in row 0) from a variation that didn't pass the selection, RDataFrame writes a bitmask for each event, indicating which variations -are valid (see last column). A mapping of column names to this bitmask is placed in the same file as the output dataset, and automatically loaded when -RDataFrame opens a file that was snapshot with variations. -Attempting to read such missing values with RDataFrame will produce an error, but RDataFrame can either skip these values or fill in defaults as -described in the \ref missing-values "section on dealing with missing values". +are valid (see last column). The bitmask is implemented as a 64-bit `std::bitset` in memory, written to the output +dataset as a `std::uint64_t`. For every 64 columns, a new bitmask column is added to the output dataset. 
+ +Each column that might contain invalid values is connected to exactly one bit in one bitmask. A mapping of column names +to the corresponding bitmask is placed in the same file as the output dataset, with a name that follows the pattern +`"R_rdf_column_to_bitmask_mapping_<NAME_OF_THE_DATASET>"`. It is of type +`std::unordered_map<std::string, std::pair<std::string, unsigned int>>`, and maps a column name to the name of the +bitmask column and the index of the relevant bit. For example, in the same file as the dataset "Events" there would be +an object named `R_rdf_column_to_bitmask_mapping_Events`. This object for example would describe a connection such as: + +~~~ +muon_pt --> (R_rdf_mask_Events_0, 42) +~~~ + +which means that the validity of the entries in `muon_pt` is established by the bit `42` in the bitmask found in the +column `R_rdf_mask_Events_0`. + +When RDataFrame opens a file, it checks for the existence of this mapping between columns and bitmasks, and loads it automatically if found. As such, +RDataFrame makes the treatment of the various bitmasks completely transparent to the user. + +In case certain values are labeled invalid by the corresponding bit, this will result in reading a missing value. The semantics of such a scenario follow the +rules described in the \ref missing-values "section on dealing with missing values" and can be dealt with accordingly. \note Snapshot with variations is currently restricted to single-threaded TTree snapshots. @@ -1780,6 +1798,9 @@ more of its entries. For example: - When joining different datasets horizontally according to some index value (e.g. the event number), if the index does not find a match in one or more other datasets for a certain entry. +- If, for a certain event, a column is invalid because it results from a Snapshot + with systematic variations, and that variation didn't pass its filters. For + more details, see \ref snapshot-with-variations. 
For example, suppose that column "y" does not have a value for entry 42: diff --git a/tree/dataframe/src/RTTreeDS.cxx b/tree/dataframe/src/RTTreeDS.cxx index 502abfbddcc2d..824ff39c565a4 100644 --- a/tree/dataframe/src/RTTreeDS.cxx +++ b/tree/dataframe/src/RTTreeDS.cxx @@ -522,7 +522,7 @@ ROOT::Internal::RDF::RTTreeDS::CreateColumnReader(unsigned int /*slot*/, std::st if (TDirectory *treeDir = treeReader->GetTree()->GetDirectory(); treeDir) { using Map_t = std::unordered_map>; const std::string bitmaskMapName = - std::string{"R_rdf_branchToBitmaskMapping_"} + treeReader->GetTree()->GetName(); + std::string{"R_rdf_column_to_bitmask_mapping_"} + treeReader->GetTree()->GetName(); if (Map_t const *columnMaskMap = treeDir->Get(bitmaskMapName.c_str()); columnMaskMap) { if (auto it = columnMaskMap->find(std::string(col)); it != columnMaskMap->end()) { colReader = std::make_unique(*treeReader, std::move(colReader), it->second.first, diff --git a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx index a4ef10fb390ca..7063183e92c8d 100644 --- a/tree/dataframe/test/dataframe_snapshotWithVariations.cxx +++ b/tree/dataframe/test/dataframe_snapshotWithVariations.cxx @@ -219,7 +219,7 @@ TEST(RDFVarySnapshot, Bitmask) ASSERT_NE(branch, nullptr); auto *branchToIndexMap = file.Get>>( - ("R_rdf_branchToBitmaskMapping_" + treename).c_str()); + ("R_rdf_column_to_bitmask_mapping_" + treename).c_str()); ASSERT_NE(branchToIndexMap, nullptr); for (const auto branchName : {"x", "y", "x__xVar_0", "x__xVar_1", "y__xVar_0", "y__xVar_0"}) { ASSERT_NE(branchToIndexMap->find(branchName), branchToIndexMap->end()); @@ -339,8 +339,8 @@ TEST(RDFVarySnapshot, TwoVariationsInSameFile) auto snap2 = rdf.Filter(cuts2, {"x", "y"}).Snapshot(treename2, filename, {"x", "y"}, options); std::unique_ptr file{TFile::Open(filename)}; - EXPECT_NE(file->GetKey(("R_rdf_branchToBitmaskMapping_" + treename1).c_str()), nullptr); - 
EXPECT_NE(file->GetKey(("R_rdf_branchToBitmaskMapping_" + treename2).c_str()), nullptr); + EXPECT_NE(file->GetKey(("R_rdf_column_to_bitmask_mapping_" + treename1).c_str()), nullptr); + EXPECT_NE(file->GetKey(("R_rdf_column_to_bitmask_mapping_" + treename2).c_str()), nullptr); // In Windows, an exception is thrown as expected, but it cannot be caught for the time being: #if !defined(_MSC_VER) || defined(R__ENABLE_BROKEN_WIN_TESTS)