diff --git a/README/ReleaseNotes/v638/index.md b/README/ReleaseNotes/v638/index.md index 59658a2fdea9d..5a39a2a65d270 100644 --- a/README/ReleaseNotes/v638/index.md +++ b/README/ReleaseNotes/v638/index.md @@ -134,9 +134,12 @@ If you want to keep using `TList*` return values, you can write a small adapter RDF uses one copy of each histogram per thread. Now, RDataFrame can reduce the number of clones using `ROOT::RDF::Experimental::ThreadsPerTH3()`. Setting this to numbers such as 8 would share one 3-d histogram among 8 threads, greatly reducing the memory consumption. This might slow down execution if the histograms are filled at very high rates. Use lower number in this case. -- The Snapshot method has been refactored so that it does not need anymore compile-time information (i.e. either template arguments or JIT-ting) to know the input column types. This means that any Snapshot call that specifies the template arguments, e.g. `Snapshot(..., {"intCol", "floatCol"})` is now redundant and the template arguments can safely be removed from the call. At the same time, Snapshot does not need to JIT compile the column types, practically giving huge speedups depending on the number of columns that need to be written to disk. In certain cases (e.g. when writing O(10000) columns) the speedup can be larger than an order of magnitude. The Snapshot template is now deprecated and it will issue a compile-time warning when called. The function overload is scheduled for removal in ROOT 6.40. - Add HistoNSparseD action that fills a sparse N-dimensional histogram. +### Snapshot +- The Snapshot method has been refactored so that it does not need anymore compile-time information (i.e. either template arguments or JIT-ting) to know the input column types. This means that any Snapshot call that specifies the template arguments, e.g. `Snapshot(..., {"intCol", "floatCol"})` is now redundant and the template arguments can safely be removed from the call. 
At the same time, Snapshot does not need to JIT compile the column types, practically giving huge speedups depending on the number of columns that need to be written to disk. In certain cases (e.g. when writing O(10000) columns) the speedup can be larger than an order of magnitude. The Snapshot template is now deprecated and it will issue a compile-time warning when called. The function overload is scheduled for removal in ROOT 6.40. +- The default compression setting for the output dataset used by Snapshot has been changed from 101 (ZLIB level 1, the TTree default) to 505 (ZSTD level 5). This is a better setting on average, and makes more sense for RDataFrame since now the Snapshot operation supports more than just the TTree output data format. This change may result in smaller output file sizes for your analyses that use Snapshot with default settings. During the 6.38 development release cycle, Snapshot will print information about this change once per program run. Starting from 6.40.00, the information will not be printed. The message can be suppressed by setting ROOT_RDF_SNAPSHOT_INFO=0 in your environment or by setting 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc. + ## Python Interface ROOT dropped support for Python 3.8, meaning ROOT now requires at least Python 3.9. diff --git a/config/rootrc.in b/config/rootrc.in index d59be7370c2cc..eae21bf4bed11 100644 --- a/config/rootrc.in +++ b/config/rootrc.in @@ -615,3 +615,9 @@ Rint.Canvas.HighLightColor: 5 # 1 All Branches (default) # Can be overridden by the environment variable ROOT_TTREECACHE_PREFILL # TTreeCache.Prefill: 1 + +# Set whether to show or suppress an info message coming from RDataFrame +# Snapshot informing the user on the change of default output dataset +# compression settings introduced in ROOT 6.38 (1 means show the info, 0 means +# suppress, 1 by default). 
+# ROOT.RDF.Snapshot.Info: 1 diff --git a/roottest/root/dataframe/.rootrc b/roottest/root/dataframe/.rootrc new file mode 100644 index 0000000000000..56607e2f7cddc --- /dev/null +++ b/roottest/root/dataframe/.rootrc @@ -0,0 +1,5 @@ +# First two lines are taken from ROOTTEST_ADD_TESTDIRS in RootMacros.cmake +Rint.History: .root_hist +ACLiC.LinkLibs: 1 +# Suppress the info message from Snapshot to prevent test failures +ROOT.RDF.Snapshot.Info: 0 diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index f6be83e8b9380..c6c228317d303 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -45,6 +45,13 @@ #include "TProfile2D.h" #include "TStatistic.h" +// TODO: Needed to show the info message in Snapshot, remove in 6.40 +#include "ROOT/RLogger.hxx" +#include "ROOT/RVersion.hxx" +#include "TEnv.h" +#include +#include + #include #include #include @@ -1332,6 +1339,26 @@ public: const ColumnNames_t &columnList, const RSnapshotOptions &options = RSnapshotOptions()) { + // TODO: Remove before releasing 6.40.00 +#if ROOT_VERSION_CODE >= ROOT_VERSION(6, 40, 0) + static_assert(false && "Remove information about change of Snapshot defaut compression settings."); +#endif + [[maybe_unused]] static bool once = []() { + if (const char *suppress = std::getenv("ROOT_RDF_SNAPSHOT_INFO")) + if (std::strcmp(suppress, "0") == 0) + return true; + if (const char *suppress = gEnv->GetValue("ROOT.RDF.Snapshot.Info", "1")) + if (std::strcmp(suppress, "0") == 0) + return true; + RLogScopedVerbosity showInfo{ROOT::Detail::RDF::RDFLogChannel(), ROOT::ELogLevel::kInfo}; + R__LOG_INFO(ROOT::Detail::RDF::RDFLogChannel()) + << "\n\tIn ROOT 6.38, the default compression settings of Snapshot have been changed from 101 (ZLIB with " + "compression level 1, the TTree default) to 505 (ZSTD with compression level 5). This change may result " + "in smaller Snapshot output dataset size by default. 
In order to suppress this message, set " + "'ROOT_RDF_SNAPSHOT_INFO=0' in your environment or set 'ROOT.RDF.Snapshot.Info: 0' in your .rootrc " + "file."; + return true; + }(); // like columnList but with `#var` columns removed auto colListNoPoundSizes = RDFInternal::FilterArraySizeColNames(columnList, "Snapshot"); // like columnListWithoutSizeColumns but with aliases resolved diff --git a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx index 6d558d2188e46..2ddd0e7381602 100644 --- a/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx +++ b/tree/dataframe/inc/ROOT/RSnapshotOptions.hxx @@ -46,8 +46,8 @@ struct RSnapshotOptions { } std::string fMode = "RECREATE"; ///< Mode of creation of output file ECAlgo fCompressionAlgorithm = - ROOT::RCompressionSetting::EAlgorithm::kZLIB; ///< Compression algorithm of output file - int fCompressionLevel = 1; ///< Compression level of output file + ROOT::RCompressionSetting::EAlgorithm::kZSTD; ///< Compression algorithm of output file + int fCompressionLevel = 5; ///< Compression level of output file int fAutoFlush = 0; ///< AutoFlush value for output tree int fSplitLevel = 99; ///< Split level of output tree bool fLazy = false; ///< Do not start the event loop when Snapshot is called