From eb202347b5f8cc85d7dccc1dd64fa749f72baa47 Mon Sep 17 00:00:00 2001 From: Emmanouil Date: Thu, 21 Oct 2021 12:36:16 +0200 Subject: [PATCH] [RF] Bugfix (cuda computations always run in default stream) --- roofit/batchcompute/inc/rbc.h | 4 ++-- roofit/batchcompute/src/RooBatchCompute.cu | 8 ++++---- roofit/batchcompute/src/RooBatchCompute.cxx | 4 ++-- roofit/roofit/inc/RooArgusBG.h | 2 +- roofit/roofit/inc/RooBernstein.h | 2 +- roofit/roofit/inc/RooBifurGauss.h | 2 +- roofit/roofit/inc/RooBreitWigner.h | 2 +- roofit/roofit/inc/RooBukinPdf.h | 2 +- roofit/roofit/inc/RooCBShape.h | 2 +- roofit/roofit/inc/RooChebychev.h | 2 +- roofit/roofit/inc/RooChiSquarePdf.h | 2 +- roofit/roofit/inc/RooDstD0BG.h | 2 +- roofit/roofit/inc/RooExponential.h | 2 +- roofit/roofit/inc/RooGamma.h | 2 +- roofit/roofit/inc/RooGaussian.h | 2 +- roofit/roofit/inc/RooJohnson.h | 2 +- roofit/roofit/inc/RooLandau.h | 2 +- roofit/roofit/inc/RooLognormal.h | 2 +- roofit/roofit/inc/RooNovosibirsk.h | 2 +- roofit/roofit/inc/RooPoisson.h | 2 +- roofit/roofit/inc/RooPolynomial.h | 2 +- roofit/roofit/inc/RooVoigtian.h | 2 +- roofit/roofit/src/RooArgusBG.cxx | 5 +++-- roofit/roofit/src/RooBernstein.cxx | 5 +++-- roofit/roofit/src/RooBifurGauss.cxx | 5 +++-- roofit/roofit/src/RooBreitWigner.cxx | 5 +++-- roofit/roofit/src/RooBukinPdf.cxx | 5 +++-- roofit/roofit/src/RooCBShape.cxx | 5 +++-- roofit/roofit/src/RooChebychev.cxx | 5 +++-- roofit/roofit/src/RooChiSquarePdf.cxx | 5 +++-- roofit/roofit/src/RooDstD0BG.cxx | 5 +++-- roofit/roofit/src/RooExponential.cxx | 5 +++-- roofit/roofit/src/RooGamma.cxx | 5 +++-- roofit/roofit/src/RooGaussian.cxx | 5 +++-- roofit/roofit/src/RooJohnson.cxx | 5 +++-- roofit/roofit/src/RooLandau.cxx | 5 +++-- roofit/roofit/src/RooLognormal.cxx | 5 +++-- roofit/roofit/src/RooNovosibirsk.cxx | 5 +++-- roofit/roofit/src/RooPoisson.cxx | 5 +++-- roofit/roofit/src/RooPolynomial.cxx | 5 +++-- roofit/roofit/src/RooVoigtian.cxx | 5 +++-- roofit/roofitcore/inc/RooAbsPdf.h | 2 +- roofit/roofitcore/inc/RooAbsReal.h | 2 +- roofit/roofitcore/inc/RooAddPdf.h | 2 +- roofit/roofitcore/inc/RooFormula.h | 2 +- roofit/roofitcore/inc/RooFormulaVar.h | 4 ++-- roofit/roofitcore/inc/RooGenericPdf.h | 2 +- roofit/roofitcore/inc/RooNLLVarNew.h | 4 ++-- roofit/roofitcore/inc/RooProdPdf.h | 2 +- roofit/roofitcore/src/RooAbsPdf.cxx | 4 ++-- roofit/roofitcore/src/RooAbsReal.cxx | 4 ++-- roofit/roofitcore/src/RooAddPdf.cxx | 5 +++-- roofit/roofitcore/src/RooFitDriver.cxx | 15 ++++++++------- roofit/roofitcore/src/RooFormula.cxx | 2 +- roofit/roofitcore/src/RooGenericPdf.cxx | 4 ++-- roofit/roofitcore/src/RooNLLVarNew.cxx | 16 +++++++++------- roofit/roofitcore/src/RooProdPdf.cxx | 5 +++-- 57 files changed, 124 insertions(+), 100 deletions(-) diff --git a/roofit/batchcompute/inc/rbc.h b/roofit/batchcompute/inc/rbc.h index 80e437272bad6..3ec44068cd8e8 100644 --- a/roofit/batchcompute/inc/rbc.h +++ b/roofit/batchcompute/inc/rbc.h @@ -64,8 +64,8 @@ class RbcInterface { public: virtual ~RbcInterface() = default; virtual void init() { throw std::bad_function_call(); } - virtual void compute(Computer, RestrictArr, size_t, const DataMap&, const VarVector&, const ArgVector& ={}) = 0; - virtual double sumReduce(InputArr, size_t) = 0; + virtual void compute(cudaStream_t*, Computer, RestrictArr, size_t, const DataMap&, const VarVector&, const ArgVector& ={}) = 0; + virtual double sumReduce(cudaStream_t*, InputArr, size_t) = 0; //cuda functions that need to be interfaced virtual void* cudaMalloc(size_t) { throw std::bad_function_call(); } diff --git a/roofit/batchcompute/src/RooBatchCompute.cu b/roofit/batchcompute/src/RooBatchCompute.cu index 8779777e792e7..3dca1ba5376ab 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cu +++ b/roofit/batchcompute/src/RooBatchCompute.cu @@ -51,14 +51,14 @@ class RbcClass : public RbcInterface { \param varData A std::map containing the values of the variables involved in the computation. \param vars A std::vector containing pointers to the variables involved in the computation. \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/ - void compute(Computer computer, RestrictArr output, size_t nEvents, const DataMap& varData, const VarVector& vars, const ArgVector& extraArgs) override + void compute(cudaStream_t* stream, Computer computer, RestrictArr output, size_t nEvents, const DataMap& varData, const VarVector& vars, const ArgVector& extraArgs) override { Batches batches(output, nEvents, varData, vars, extraArgs); - _computeFunctions[computer]<<<128,512>>>(batches); + _computeFunctions[computer]<<<128,512,0,*stream>>>(batches); } /// Return the sum of an input array - double sumReduce(InputArr input, size_t n) override { - return thrust::reduce(thrust::device, input, input+n, 0.0); + double sumReduce(cudaStream_t* stream, InputArr input, size_t n) override { + return thrust::reduce(thrust::cuda::par.on(*stream), input, input+n, 0.0); } //cuda functions diff --git a/roofit/batchcompute/src/RooBatchCompute.cxx b/roofit/batchcompute/src/RooBatchCompute.cxx index 03ef8edd1a90e..1efe9257f235b 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cxx +++ b/roofit/batchcompute/src/RooBatchCompute.cxx @@ -39,7 +39,7 @@ class RbcClass : public RbcInterface { \param varData A std::map containing the values of the variables involved in the computation. \param vars A std::vector containing pointers to the variables involved in the computation. \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/ - void compute(Computer computer, RestrictArr output, size_t nEvents, const DataMap& varData, const VarVector& vars, const ArgVector& extraArgs) override + void compute(cudaStream_t*, Computer computer, RestrictArr output, size_t nEvents, const DataMap& varData, const VarVector& vars, const ArgVector& extraArgs) override { double buffer[maxParams][bufferSize]; ROOT::Internal::TExecutor ex; @@ -72,7 +72,7 @@ class RbcClass : public RbcInterface { ex.Map(task, batchesArr); } /// Return the sum of an input array - double sumReduce(InputArr input, size_t n) override + double sumReduce(cudaStream_t*, InputArr input, size_t n) override { long double sum=0.0; for (size_t i=0; icompute(rbc::ArgusBG, output, nEvents, dataMap, {&*m,&*m0,&*c,&*p,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::ArgusBG, output, nEvents, dataMap, {&*m,&*m0,&*c,&*p,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooBernstein.cxx b/roofit/roofit/src/RooBernstein.cxx index f6cc47a3c0cad..d858674bb91b7 100644 --- a/roofit/roofit/src/RooBernstein.cxx +++ b/roofit/roofit/src/RooBernstein.cxx @@ -142,7 +142,7 @@ Double_t RooBernstein::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Bernstein distribution. -void RooBernstein::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooBernstein::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { const int nCoef = _coefList.size(); std::vector extraArgs(nCoef+2); @@ -151,7 +151,8 @@ void RooBernstein::computeBatch(rbc::RbcInterface* dispatch, double* output, siz extraArgs[nCoef] = _x.min(); extraArgs[nCoef+1] = _x.max(); - dispatch->compute(rbc::Bernstein, output, nEvents, dataMap, {&*_x, &*_norm}, extraArgs); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Bernstein, output, nEvents, dataMap, {&*_x, &*_norm}, extraArgs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooBifurGauss.cxx b/roofit/roofit/src/RooBifurGauss.cxx index a4b518d5e7308..c140626ed5862 100644 --- a/roofit/roofit/src/RooBifurGauss.cxx +++ b/roofit/roofit/src/RooBifurGauss.cxx @@ -80,9 +80,10 @@ Double_t RooBifurGauss::evaluate() const { //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of BifurGauss distribution. -void RooBifurGauss::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooBifurGauss::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::BifurGauss, output, nEvents, dataMap, {&*x,&*mean,&*sigmaL,&*sigmaR,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::BifurGauss, output, nEvents, dataMap, {&*x,&*mean,&*sigmaL,&*sigmaR,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooBreitWigner.cxx b/roofit/roofit/src/RooBreitWigner.cxx index b83791cc121fc..4e9548baf0bdb 100644 --- a/roofit/roofit/src/RooBreitWigner.cxx +++ b/roofit/roofit/src/RooBreitWigner.cxx @@ -66,9 +66,10 @@ Double_t RooBreitWigner::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of BreitWigner distribution. -void RooBreitWigner::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooBreitWigner::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::BreitWigner, output, nEvents, dataMap, {&*x,&*mean,&*width,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::BreitWigner, output, nEvents, dataMap, {&*x,&*mean,&*width,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooBukinPdf.cxx b/roofit/roofit/src/RooBukinPdf.cxx index 725b5e3ffe8fe..27d85ce8162c7 100644 --- a/roofit/roofit/src/RooBukinPdf.cxx +++ b/roofit/roofit/src/RooBukinPdf.cxx @@ -142,7 +142,8 @@ Double_t RooBukinPdf::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Bukin distribution. -void RooBukinPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooBukinPdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Bukin, output, nEvents, dataMap, {&*x,&*Xp,&*sigp,&*xi,&*rho1,&*rho2,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Bukin, output, nEvents, dataMap, {&*x,&*Xp,&*sigp,&*xi,&*rho1,&*rho2,&*_norm}); } diff --git a/roofit/roofit/src/RooCBShape.cxx b/roofit/roofit/src/RooCBShape.cxx index 49cdfc5754cd5..e73578673483d 100644 --- a/roofit/roofit/src/RooCBShape.cxx +++ b/roofit/roofit/src/RooCBShape.cxx @@ -92,9 +92,10 @@ Double_t RooCBShape::evaluate() const { //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Crystal ball Shape distribution. -void RooCBShape::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooCBShape::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::CBShape, output, nEvents, dataMap, {&*m,&*m0,&*sigma,&*alpha,&*n,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::CBShape, output, nEvents, dataMap, {&*m,&*m0,&*sigma,&*alpha,&*n,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooChebychev.cxx b/roofit/roofit/src/RooChebychev.cxx index 065990848e7a6..36fda0387a586 100644 --- a/roofit/roofit/src/RooChebychev.cxx +++ b/roofit/roofit/src/RooChebychev.cxx @@ -184,14 +184,15 @@ Double_t RooChebychev::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Chebychev. -void RooChebychev::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooChebychev::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { rbc::ArgVector extraArgs; for (auto* coef:_coefList) extraArgs.push_back( static_cast(coef)->getVal() ); extraArgs.push_back( _x.min(_refRangeName?_refRangeName->GetName() : nullptr) ); extraArgs.push_back( _x.max(_refRangeName?_refRangeName->GetName() : nullptr) ); - dispatch->compute(rbc::Chebychev, output, nEvents, dataMap, {&*_x,&*_norm}, extraArgs); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Chebychev, output, nEvents, dataMap, {&*_x,&*_norm}, extraArgs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooChiSquarePdf.cxx b/roofit/roofit/src/RooChiSquarePdf.cxx index b3a92831e7ddc..fa05fd29b6503 100644 --- a/roofit/roofit/src/RooChiSquarePdf.cxx +++ b/roofit/roofit/src/RooChiSquarePdf.cxx @@ -64,9 +64,10 @@ Double_t RooChiSquarePdf::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of ChiSquare distribution. -void RooChiSquarePdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooChiSquarePdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::ChiSquare, output, nEvents, dataMap, {&*_x,&*_norm}, {_ndof}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::ChiSquare, output, nEvents, dataMap, {&*_x,&*_norm}, {_ndof}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooDstD0BG.cxx b/roofit/roofit/src/RooDstD0BG.cxx index 551ffd34ae0ab..46f34e84dca7a 100644 --- a/roofit/roofit/src/RooDstD0BG.cxx +++ b/roofit/roofit/src/RooDstD0BG.cxx @@ -81,9 +81,10 @@ Double_t RooDstD0BG::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of D*-D0 mass difference distribution. -void RooDstD0BG::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooDstD0BG::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::DstD0BG, output, nEvents, dataMap, {&*dm,&*dm0,&*C,&*A,&*B,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::DstD0BG, output, nEvents, dataMap, {&*dm,&*dm0,&*C,&*A,&*B,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooExponential.cxx b/roofit/roofit/src/RooExponential.cxx index 3237a26aff9bc..183d396336b71 100644 --- a/roofit/roofit/src/RooExponential.cxx +++ b/roofit/roofit/src/RooExponential.cxx @@ -62,9 +62,10 @@ Double_t RooExponential::evaluate() const{ //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Exponential distribution. -void RooExponential::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooExponential::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Exponential, output, nEvents, dataMap, {&*x,&*c,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Exponential, output, nEvents, dataMap, {&*x,&*c,&*_norm}); } diff --git a/roofit/roofit/src/RooGamma.cxx b/roofit/roofit/src/RooGamma.cxx index b580013001065..66407898881e0 100644 --- a/roofit/roofit/src/RooGamma.cxx +++ b/roofit/roofit/src/RooGamma.cxx @@ -87,9 +87,10 @@ Double_t RooGamma::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Gamma PDF. -void RooGamma::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooGamma::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Gamma, output, nEvents, dataMap, {&*x,&*gamma,&*beta,&*mu,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Gamma, output, nEvents, dataMap, {&*x,&*gamma,&*beta,&*mu,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooGaussian.cxx b/roofit/roofit/src/RooGaussian.cxx index db1f85dc18fa6..ba35721888490 100644 --- a/roofit/roofit/src/RooGaussian.cxx +++ b/roofit/roofit/src/RooGaussian.cxx @@ -64,9 +64,10 @@ Double_t RooGaussian::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Gaussian distribution. -void RooGaussian::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooGaussian::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Gaussian, output, nEvents, dataMap, {&*x,&*mean,&*sigma,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Gaussian, output, nEvents, dataMap, {&*x,&*mean,&*sigma,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooJohnson.cxx b/roofit/roofit/src/RooJohnson.cxx index c45ed08c939f6..d6e327d420b2b 100644 --- a/roofit/roofit/src/RooJohnson.cxx +++ b/roofit/roofit/src/RooJohnson.cxx @@ -112,9 +112,10 @@ double RooJohnson::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of the Johnson distribution. -void RooJohnson::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooJohnson::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Johnson, output, nEvents, dataMap, {&*_mass,&*_mu,&*_lambda,&*_gamma,&*_delta,&*_norm},{_massThreshold}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Johnson, output, nEvents, dataMap, {&*_mass,&*_mu,&*_lambda,&*_gamma,&*_delta,&*_norm},{_massThreshold}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooLandau.cxx b/roofit/roofit/src/RooLandau.cxx index cffa4ae7c5a5e..9be788998c88b 100644 --- a/roofit/roofit/src/RooLandau.cxx +++ b/roofit/roofit/src/RooLandau.cxx @@ -61,9 +61,10 @@ Double_t RooLandau::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Landau distribution. -void RooLandau::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooLandau::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Landau, output, nEvents, dataMap, {&*x,&*mean,&*sigma,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Landau, output, nEvents, dataMap, {&*x,&*mean,&*sigma,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooLognormal.cxx b/roofit/roofit/src/RooLognormal.cxx index d62c924549ff7..61310b497cf7f 100644 --- a/roofit/roofit/src/RooLognormal.cxx +++ b/roofit/roofit/src/RooLognormal.cxx @@ -82,9 +82,10 @@ Double_t RooLognormal::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Lognormal distribution. -void RooLognormal::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooLognormal::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Lognormal, output, nEvents, dataMap, {&*x,&*m0,&*k,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Lognormal, output, nEvents, dataMap, {&*x,&*m0,&*k,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooNovosibirsk.cxx b/roofit/roofit/src/RooNovosibirsk.cxx index 3d7ce7c7e0aaa..a710bbd79d7d2 100644 --- a/roofit/roofit/src/RooNovosibirsk.cxx +++ b/roofit/roofit/src/RooNovosibirsk.cxx @@ -90,9 +90,10 @@ Double_t RooNovosibirsk::evaluate() const } //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Novosibirsk distribution. -void RooNovosibirsk::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooNovosibirsk::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Novosibirsk, output, nEvents, dataMap, {&*x,&*peak,&*width,&*tail,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Novosibirsk, output, nEvents, dataMap, {&*x,&*peak,&*width,&*tail,&*_norm}); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooPoisson.cxx b/roofit/roofit/src/RooPoisson.cxx index f0129162da6c0..b4361bb6a5ce7 100644 --- a/roofit/roofit/src/RooPoisson.cxx +++ b/roofit/roofit/src/RooPoisson.cxx @@ -64,9 +64,10 @@ Double_t RooPoisson::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of the Poisson distribution. -void RooPoisson::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooPoisson::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Poisson, output, nEvents, dataMap, {&*x,&*mean,&*_norm}, + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Poisson, output, nEvents, dataMap, {&*x,&*mean,&*_norm}, {static_cast(_protectNegative), static_cast(_noRounding)}); } diff --git a/roofit/roofit/src/RooPolynomial.cxx b/roofit/roofit/src/RooPolynomial.cxx index 874caf3cea315..fef94fce52ee6 100644 --- a/roofit/roofit/src/RooPolynomial.cxx +++ b/roofit/roofit/src/RooPolynomial.cxx @@ -146,13 +146,14 @@ Double_t RooPolynomial::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Polynomial. -void RooPolynomial::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooPolynomial::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { rbc::ArgVector extraArgs; for (auto* coef:_coefList) extraArgs.push_back( static_cast(coef)->getVal() ); extraArgs.push_back(_lowestOrder); - dispatch->compute(rbc::Polynomial, output, nEvents, dataMap, {&*_x,&*_norm}, extraArgs); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Polynomial, output, nEvents, dataMap, {&*_x,&*_norm}, extraArgs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooVoigtian.cxx b/roofit/roofit/src/RooVoigtian.cxx index 6c43af7b0b315..e0adf1a4ee859 100644 --- a/roofit/roofit/src/RooVoigtian.cxx +++ b/roofit/roofit/src/RooVoigtian.cxx @@ -100,7 +100,8 @@ Double_t RooVoigtian::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Voigtian distribution. -void RooVoigtian::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooVoigtian::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - dispatch->compute(rbc::Voigtian, output, nEvents, dataMap, {&*x,&*mean,&*width,&*sigma,&*_norm}); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::Voigtian, output, nEvents, dataMap, {&*x,&*mean,&*width,&*sigma,&*_norm}); } diff --git a/roofit/roofitcore/inc/RooAbsPdf.h b/roofit/roofitcore/inc/RooAbsPdf.h index 39c69d4320730..8d3de6d146e53 100644 --- a/roofit/roofitcore/inc/RooAbsPdf.h +++ b/roofit/roofitcore/inc/RooAbsPdf.h @@ -236,7 +236,7 @@ class RooAbsPdf : public RooAbsReal { const RooArgSet* normSet = nullptr) const; RooSpan getLogProbabilities(rbc::RunContext& evalData, const RooArgSet* normSet = nullptr) const; - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t size, rbc::DataMap& dataMap) const; + void computeBatch(cudaStream_t*, double* output, size_t size, rbc::DataMap&) const; /// \copydoc getNorm(const RooArgSet*) const Double_t getNorm(const RooArgSet& nset) const { diff --git a/roofit/roofitcore/inc/RooAbsReal.h b/roofit/roofitcore/inc/RooAbsReal.h index 377bc7ddb98cf..4766f827ed167 100644 --- a/roofit/roofitcore/inc/RooAbsReal.h +++ b/roofit/roofitcore/inc/RooAbsReal.h @@ -389,7 +389,7 @@ class RooAbsReal : public RooAbsArg { const RooAbsReal* createPlotProjection(const RooArgSet& depVars, const RooArgSet& projVars, RooArgSet*& cloneSet) const ; const RooAbsReal *createPlotProjection(const RooArgSet &dependentVars, const RooArgSet *projectedVars, RooArgSet *&cloneSet, const char* rangeName=0, const RooArgSet* condObs=0) const; - virtual void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t size, rbc::DataMap& dataMap) const; + virtual void computeBatch(cudaStream_t*, double* output, size_t size, rbc::DataMap&) const; virtual bool canComputeBatchWithCuda() const { return false; } protected: diff --git a/roofit/roofitcore/inc/RooAddPdf.h b/roofit/roofitcore/inc/RooAddPdf.h index 22a07aad1e3ed..5dca917ab6b80 100644 --- a/roofit/roofitcore/inc/RooAddPdf.h +++ b/roofit/roofitcore/inc/RooAddPdf.h @@ -128,7 +128,7 @@ class RooAddPdf : public RooAbsPdf { Double_t evaluate() const override; - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const override; + void computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap&) const override; inline bool canComputeBatchWithCuda() const override { return true; } diff --git a/roofit/roofitcore/inc/RooFormula.h b/roofit/roofitcore/inc/RooFormula.h index 1104bc3075423..69c235d03e417 100644 --- a/roofit/roofitcore/inc/RooFormula.h +++ b/roofit/roofitcore/inc/RooFormula.h @@ -61,7 +61,7 @@ class RooFormula : public TNamed, public RooPrintable { /// Evalute all parameters/observables, and then evaluate formula. Double_t eval(const RooArgSet* nset=0) const; RooSpan evaluateSpan(const RooAbsReal* dataOwner, rbc::RunContext& inputData, const RooArgSet* nset = nullptr) const; - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const; + void computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap&) const; /// DEBUG: Dump state information void dump() const; diff --git a/roofit/roofitcore/inc/RooFormulaVar.h b/roofit/roofitcore/inc/RooFormulaVar.h index 527e1c31bb20f..c6f061128ec53 100644 --- a/roofit/roofitcore/inc/RooFormulaVar.h +++ b/roofit/roofitcore/inc/RooFormulaVar.h @@ -71,9 +71,9 @@ class RooFormulaVar : public RooAbsReal { // Function evaluation virtual Double_t evaluate() const ; RooSpan evaluateSpan(rbc::RunContext& evalData, const RooArgSet* normSet) const; - inline void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const + inline void computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - formula().computeBatch(dispatch, output, nEvents, dataMap); + formula().computeBatch(stream, output, nEvents, dataMap); } diff --git a/roofit/roofitcore/inc/RooGenericPdf.h b/roofit/roofitcore/inc/RooGenericPdf.h index 53ce673320fbe..cb84c3f23dad5 100644 --- a/roofit/roofitcore/inc/RooGenericPdf.h +++ b/roofit/roofitcore/inc/RooGenericPdf.h @@ -50,7 +50,7 @@ class RooGenericPdf : public RooAbsPdf { RooListProxy _actualVars ; virtual Double_t evaluate() const ; RooSpan evaluateSpan(rbc::RunContext& inputData, const RooArgSet* normSet) const; - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const; + void computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap&) const; Bool_t setFormula(const char* formula) ; diff --git a/roofit/roofitcore/inc/RooNLLVarNew.h b/roofit/roofitcore/inc/RooNLLVarNew.h index 256398498be9f..8b317d54de10e 100644 --- a/roofit/roofitcore/inc/RooNLLVarNew.h +++ b/roofit/roofitcore/inc/RooNLLVarNew.h @@ -33,10 +33,10 @@ class RooNLLVarNew : public RooAbsReal { } inline RooAbsPdf* getPdf() const { return &*_pdf; } - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const override; + void computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap&) const override; inline bool canComputeBatchWithCuda() const override { return true; } - double reduce(rbc::RbcInterface* dispatch, const double* input, size_t nEvents) const; + double reduce(cudaStream_t*, const double* input, size_t nEvents) const; protected: RooTemplateProxy _pdf; diff --git a/roofit/roofitcore/inc/RooProdPdf.h b/roofit/roofitcore/inc/RooProdPdf.h index 6596c2f734192..d34941ef7c4e4 100644 --- a/roofit/roofitcore/inc/RooProdPdf.h +++ b/roofit/roofitcore/inc/RooProdPdf.h @@ -100,7 +100,7 @@ class RooProdPdf : public RooAbsPdf { private: Double_t evaluate() const ; - void computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const; + void computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap&) const; inline bool canComputeBatchWithCuda() const { return true; } RooAbsReal* makeCondPdfRatioCorr(RooAbsReal& term, const RooArgSet& termNset, const RooArgSet& termImpSet, const char* normRange, const char* refRange) const ; diff --git a/roofit/roofitcore/src/RooAbsPdf.cxx b/roofit/roofitcore/src/RooAbsPdf.cxx index 99060fa0e971c..000b1561cda3a 100644 --- a/roofit/roofitcore/src/RooAbsPdf.cxx +++ b/roofit/roofitcore/src/RooAbsPdf.cxx @@ -3738,9 +3738,9 @@ pdf. After that, divide by the normalization values found in the dataMap. \param nEvents The number of events to be processed \param dataMap A std::map containing the input data for the computations **/ -void RooAbsPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooAbsPdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - RooAbsReal::computeBatch(dispatch, output, nEvents, dataMap); + RooAbsReal::computeBatch(stream, output, nEvents, dataMap); auto integralSpan = dataMap[_norm]; diff --git a/roofit/roofitcore/src/RooAbsReal.cxx b/roofit/roofitcore/src/RooAbsReal.cxx index 70a31e73be2b7..76a69cef2d64e 100644 --- a/roofit/roofitcore/src/RooAbsReal.cxx +++ b/roofit/roofitcore/src/RooAbsReal.cxx @@ -4888,8 +4888,8 @@ RooSpan RooAbsReal::evaluateSpan(rbc::RunContext& evalData, const RooArg \param output The array where the results are stored \param nEvents The number of events to be processed \param dataMap A std::map containing the input data for the computations -**/ -void RooAbsReal::computeBatch(rbc::RbcInterface*, double* output, size_t nEvents, rbc::DataMap& dataMap) const { +**/ +void RooAbsReal::computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap& dataMap) const { // Find all servers that are serving real numbers to us, retrieve their batch data, // and switch them into "always clean" operating mode, so they return always the last-set value. diff --git a/roofit/roofitcore/src/RooAddPdf.cxx b/roofit/roofitcore/src/RooAddPdf.cxx index b9a201baa21f4..ee45b232d2207 100644 --- a/roofit/roofitcore/src/RooAddPdf.cxx +++ b/roofit/roofitcore/src/RooAddPdf.cxx @@ -785,7 +785,7 @@ Double_t RooAddPdf::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute addition of PDFs in batches. -void RooAddPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooAddPdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { rbc::VarVector pdfs; rbc::ArgVector coefs; @@ -800,7 +800,8 @@ void RooAddPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t static_cast(cache->_suppNormList.at(pdfNo))->getVal() : 1) ); } } - dispatch->compute(rbc::AddPdf, output, nEvents, dataMap, pdfs, coefs); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::AddPdf, output, nEvents, dataMap, pdfs, coefs); } diff --git a/roofit/roofitcore/src/RooFitDriver.cxx b/roofit/roofitcore/src/RooFitDriver.cxx index 405b915e210c1..26d8dd5d2a9da 100644 --- a/roofit/roofitcore/src/RooFitDriver.cxx +++ b/roofit/roofitcore/src/RooFitDriver.cxx @@ -248,11 +248,11 @@ void RooFitDriver::computeCPUNode(const RooAbsReal* node, NodeInfo& info) { if (_getValInvocations==1) { using namespace std::chrono; auto start = steady_clock::now(); - node->computeBatch(rbc::dispatchCPU, buffer, _nEvents, _dataMapCPU); + node->computeBatch(nullptr, buffer, _nEvents, _dataMapCPU); info.cpuTime = duration_cast( steady_clock::now()-start ); } else { - node->computeBatch(rbc::dispatchCPU, buffer, _nEvents, _dataMapCPU); + node->computeBatch(nullptr, buffer, _nEvents, _dataMapCPU); } if (info.copyAfterEvaluation) { @@ -334,13 +334,14 @@ double RooFitDriver::getVal() if (!pNLLVarNew) return 0.0; // recycle the top node's buffer and return the final value - if (_nodeInfos.at(&_topNode).computeInGPU) { + NodeInfo& info = _nodeInfos.at(&_topNode); + if (info.computeInGPU) { _gpuBuffers.push( const_cast( _dataMapCUDA.at(&_topNode).data() )); - return pNLLVarNew->reduce(rbc::dispatchCUDA, _dataMapCUDA.at(&_topNode).data(), _nEvents); + return pNLLVarNew->reduce(info.stream, _dataMapCUDA.at(&_topNode).data(), _nEvents); } else { _cpuBuffers.push( const_cast( _dataMapCPU.at(&_topNode).data() )); - return pNLLVarNew->reduce(rbc::dispatchCPU, _dataMapCPU.at(&_topNode).data(), _nEvents); + return pNLLVarNew->reduce(nullptr, _dataMapCPU.at(&_topNode).data(), _nEvents); } } @@ -394,10 +395,10 @@ void RooFitDriver::assignToGPU(const RooAbsReal* node) using namespace std::chrono; rbc::dispatchCUDA->cudaEventRecord(info.eventStart, info.stream); auto start = steady_clock::now(); - node->computeBatch(rbc::dispatchCUDA, buffer, _nEvents, _dataMapCUDA); + node->computeBatch(info.stream, buffer, _nEvents, _dataMapCUDA); info.cudaTime = duration_cast( steady_clock::now()-start ); } - else node->computeBatch(rbc::dispatchCUDA, buffer, _nEvents, _dataMapCUDA); + else node->computeBatch(info.stream, buffer, _nEvents, _dataMapCUDA); rbc::dispatchCUDA->cudaEventRecord(info.event, info.stream); if (info.copyAfterEvaluation) { diff --git a/roofit/roofitcore/src/RooFormula.cxx b/roofit/roofitcore/src/RooFormula.cxx index d6acad191922c..73bd6d881e872 100644 --- a/roofit/roofitcore/src/RooFormula.cxx +++ b/roofit/roofitcore/src/RooFormula.cxx @@ -409,7 +409,7 @@ RooSpan RooFormula::evaluateSpan(const RooAbsReal* dataOwner, rbc::RunCo return output; } -void RooFormula::computeBatch(rbc::RbcInterface*, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooFormula::computeBatch(cudaStream_t*, double* output, size_t nEvents, rbc::DataMap& dataMap) const { const int nPars=_origList.size(); std::vector> inputSpans(nPars); diff --git a/roofit/roofitcore/src/RooGenericPdf.cxx b/roofit/roofitcore/src/RooGenericPdf.cxx index 4043b5e588051..1560eaa9a9287 100644 --- a/roofit/roofitcore/src/RooGenericPdf.cxx +++ b/roofit/roofitcore/src/RooGenericPdf.cxx @@ -137,9 +137,9 @@ RooSpan RooGenericPdf::evaluateSpan(rbc::RunContext& inputData, const Ro } //////////////////////////////////////////////////////////////////////////////// -void RooGenericPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooGenericPdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { - formula().computeBatch(dispatch, output, nEvents, dataMap); + formula().computeBatch(stream, output, nEvents, dataMap); RooSpan normVal = dataMap.at(&*_norm); for (size_t i=0; i(vars.size()-1)}; - dispatch->compute(rbc::NegativeLogarithms, output, nEvents, dataMap, vars, args); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::NegativeLogarithms, output, nEvents, dataMap, vars, args); if ((_isExtended || _rangeNormTerm) && _sumWeight == 0.0) { if(!_weight) { @@ -120,7 +121,7 @@ void RooNLLVarNew::computeBatch(rbc::RbcInterface* dispatch, double* output, siz } else { auto weightSpan = dataMap[&**_weight]; _sumWeight = weightSpan.size() == 1 ? weightSpan[0] * nEvents - : dispatch->sumReduce(dataMap[&**_weight].data(), nEvents); + : dispatch->sumReduce(stream, dataMap[&**_weight].data(), nEvents); } } if (_rangeNormTerm) { @@ -129,11 +130,11 @@ void RooNLLVarNew::computeBatch(rbc::RbcInterface* dispatch, double* output, siz _sumCorrectionTerm = _sumWeight * rangeNormTermSpan[0]; } else { if(!_weight) { - _sumCorrectionTerm = dispatch->sumReduce(rangeNormTermSpan.data(), nEvents); + _sumCorrectionTerm = dispatch->sumReduce(stream, rangeNormTermSpan.data(), nEvents); } else { auto weightSpan = dataMap[&**_weight]; if(weightSpan.size() == 1) { - _sumCorrectionTerm = weightSpan[0] * dispatch->sumReduce(rangeNormTermSpan.data(), nEvents); + _sumCorrectionTerm = weightSpan[0] * dispatch->sumReduce(stream, rangeNormTermSpan.data(), nEvents); } else { // We don't need to use the library for now because the weights and // correction term integrals are always in the CPU map. @@ -153,9 +154,10 @@ void RooNLLVarNew::computeBatch(rbc::RbcInterface* dispatch, double* output, siz \param input The input array with the nlls to be reduced \param nEvents the number of events to be processed **/ -double RooNLLVarNew::reduce(rbc::RbcInterface* dispatch, const double* input, size_t nEvents) const +double RooNLLVarNew::reduce(cudaStream_t* stream, const double* input, size_t nEvents) const { - double nll = dispatch->sumReduce(input, nEvents); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + double nll = dispatch->sumReduce(stream, input, nEvents); if (_constraints) { nll += _constraints->getVal(); } diff --git a/roofit/roofitcore/src/RooProdPdf.cxx b/roofit/roofitcore/src/RooProdPdf.cxx index cd02c6cc8b62d..400436e1c87cf 100644 --- a/roofit/roofitcore/src/RooProdPdf.cxx +++ b/roofit/roofitcore/src/RooProdPdf.cxx @@ -505,7 +505,7 @@ Double_t RooProdPdf::calculate(const RooProdPdf::CacheElem& cache, Bool_t /*verb //////////////////////////////////////////////////////////////////////////////// /// Evaluate product of PDFs in batch mode. -void RooProdPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_t nEvents, rbc::DataMap& dataMap) const +void RooProdPdf::computeBatch(cudaStream_t* stream, double* output, size_t nEvents, rbc::DataMap& dataMap) const { rbc::VarVector pdfs; for (const RooAbsArg* i:_pdfList) { @@ -518,7 +518,8 @@ void RooProdPdf::computeBatch(rbc::RbcInterface* dispatch, double* output, size_ } rbc::ArgVector special{ static_cast(pdfs.size()) }; pdfs.push_back(&*_norm); - dispatch->compute(rbc::ProdPdf, output, nEvents, dataMap, pdfs, special); + auto dispatch = stream ? rbc::dispatchCUDA : rbc::dispatchCPU; + dispatch->compute(stream, rbc::ProdPdf, output, nEvents, dataMap, pdfs, special); } namespace {