diff --git a/BinInfoUtils.h b/BinInfoUtils.h index 908eca6534c53..7c9481caa3caa 100644 --- a/BinInfoUtils.h +++ b/BinInfoUtils.h @@ -12,18 +12,27 @@ typedef std::pair BinInfo; typedef std::vector> BinInfoLayerMap; typedef std::vector BinInfoMap; +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float downPhi(float phi) { while (phi >= Config::PI) {phi-=Config::TwoPI;} return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float upPhi(float phi) { while (phi <= -Config::PI) {phi+=Config::TwoPI;} return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float normalizedPhi(float phi) { // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! @@ -31,6 +40,9 @@ inline float normalizedPhi(float phi) return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline int getPhiPartition(float phi) { //assume phi is between -PI and PI diff --git a/Config.h b/Config.h index d4db239c5ac2f..cc8b1774c81f7 100644 --- a/Config.h +++ b/Config.h @@ -121,7 +121,7 @@ namespace Config // Config for Hit and BinInfoUtils constexpr int nPhiPart = 1260; constexpr float fPhiFactor = nPhiPart / TwoPI; - constexpr int nEtaPart = 11; + constexpr int nEtaPart = 11; // 1 is better for GPU best_hit constexpr int nEtaBin = 2 * nEtaPart - 1; constexpr float fEtaFull = 2 * Config::fEtaDet; @@ -204,7 +204,7 @@ namespace Config #ifdef __MIC__ #define MPT_SIZE 16 #elif defined USE_CUDA - #define MPT_SIZE 10000 + #define MPT_SIZE 8 #else #define MPT_SIZE 8 #endif diff --git a/Hit.h b/Hit.h index c44ba1d8ea594..a95f3d9ed38bb 100644 --- a/Hit.h +++ b/Hit.h @@ -85,11 +85,17 @@ inline float getInvRad2(float x, float y){ return 1.0f/(x*x + y*y); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float getPhi(float x, float y) { return std::atan2(y,x); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float getTheta(float r, float z){ return std::atan2(r,z); } @@ -217,6 +223,10 @@ class Hit const float* posArray() const {return state_.pos_.Array();} const float* errArray() const {return state_.err_.Array();} +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. SVector3& parameters_nc() {return state_.pos_;} diff --git a/Makefile.config b/Makefile.config index badc9594ca741..15231cdb21ed5 100644 --- a/Makefile.config +++ b/Makefile.config @@ -19,6 +19,7 @@ # OSXGCC5 := yes # To keep Dan's version working # TBB_PREFIX := tbb +# TBB_PREFIX := ${TBBROOT} # 1. Use ROOT or not (never used on MIC) # Comment out to disable root ("yes" is not relevant) @@ -34,9 +35,11 @@ endif # 2.1 Use nvcc to compile cuda code # CUDA compiler -NV := nvcc +CUBROOT=/home/ml15/tools/cub +NV := nvcc -prec-sqrt=true -I${CUBROOT} +#-g -G -lineinfo # Comment out to compile for CPU -# USE_CUDA := -DUSE_CUDA +#USE_CUDA := yes # 3. Optimization # -O3 implies vectorization and simd (but not AVX) @@ -113,12 +116,19 @@ INWARD_FIT := -DINWARDFIT # Derived settings ################################################################ -CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} ${USE_CUDA} -std=c++11 +CPPFLAGS := -I. 
${USE_MATRIPLEX} ${USE_INTRINSICS} -std=c++11 CXXFLAGS := ${OPT} ${OSX_CXXFLAGS} -LDFLAGS_HOST := +LDFLAGS_HOST := LDFLAGS_MIC := +ifdef USE_CUDA +CPPFLAGS += -DUSE_CUDA -I/nfs/opt/cuda/include +#CPPFLAGS += -I/home/ml15/tools/cub +CPPFLAGS += -I${CUBROOT} +LDFLAGS_HOST += -L${CUDALIBDIR} +endif + CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${USE_ETA_SEGMENTATION} ${INWARD_FIT} ${GEN_FLAT_ETA} ifdef USE_VTUNE_NOTIFY @@ -130,7 +140,8 @@ endif endif ifneq ($(CXX),icc) - #CXXFLAGS += -Wall -Wno-unknown-pragmas + CPPFLAGS += -I/opt/rh/python27/root/usr/include + LDFLAGS_HOST += -L/opt/rh/python27/root/usr/lib64 endif ifeq ($(CXX),icc) diff --git a/Math/MatrixRepresentationsStatic.h b/Math/MatrixRepresentationsStatic.h index 21cf5b9c59c46..90ef3ab2ab6a3 100644 --- a/Math/MatrixRepresentationsStatic.h +++ b/Math/MatrixRepresentationsStatic.h @@ -241,6 +241,9 @@ namespace Math { inline T* Array() { return fArray; } inline const T* Array() const { return fArray; } +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** assignment : only symmetric to symmetric allowed diff --git a/Math/SMatrix.h b/Math/SMatrix.h index b14edfccc5d56..82126403ef951 100644 --- a/Math/SMatrix.h +++ b/Math/SMatrix.h @@ -272,6 +272,9 @@ class SMatrix { const T* Array() const; /// return pointer to internal array T* Array(); +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** @name --- STL-like interface --- The iterators access the matrix element in the order how they are diff --git a/Math/SVector.h b/Math/SVector.h index 7de0c830a9538..1186867a7679c 100644 --- a/Math/SVector.h +++ b/Math/SVector.h @@ -185,6 +185,9 @@ class SVector { const T* Array() const; /// return non-const pointer to internal array T* Array(); +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** @name --- STL-like interface --- */ diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm index 45df5311c487a..186029e53bc0d 100644 --- a/Matriplex/GenMul.pm +++ b/Matriplex/GenMul.pm @@ -532,6 +532,75 @@ sub multiply_standard # ---------------------------------------------------------------------- +sub generate_addend_gpu +{ + my ($S, $x, $y) = @_; + + return undef if $S->{$x}{pat} eq '0' or $S->{$y}{pat} eq '0'; + return "1" if $S->{$x}{pat} eq '1' and $S->{$y}{pat} eq '1'; + + my $xstr = sprintf "$S->{$x}{mat}{name}\[%2d*$S->{$x}{mat}{name}N+$S->{$x}{mat}{name}n]", $S->{$x}{idx}; + my $ystr = sprintf "$S->{$y}{mat}{name}\[%2d*$S->{$y}{mat}{name}N+$S->{$y}{mat}{name}n]", $S->{$y}{idx}; + + return $xstr if $S->{$y}{pat} eq '1'; + return $ystr if $S->{$x}{pat} eq '1'; + + return "${xstr}*${ystr}"; +} + +sub multiply_gpu +{ + # Standard mutiplication - outputs unrolled C code, one line + # per target matrix element. + # Arguments: a, b, c -- all GenMul::MBase with right dimensions. + # Does: c = a * b + + check_multiply_arguments(@_); + + my ($S, $a, $b, $c) = @_; + + my $is_c_symmetric = $c->isa("GenMul::MatrixSym"); + + # With no_size_check matrices do not have to be compatible. + my $k_max = $a->{N} <= $b->{M} ? $a->{N} : $b->{M}; + + for (my $i = 0; $i < $c->{M}; ++$i) + { + my $j_max = $is_c_symmetric ? 
$i + 1 : $c->{N}; + + for (my $j = 0; $j < $j_max; ++$j) + { + my $x = $c->idx($i, $j); + + printf "$S->{prefix}$c->{name}\[%2d*$c->{name}N+$c->{name}n\] = ", $x; + + my @sum; + + for (my $k = 0; $k < $k_max; ++$k) + { + $S->generate_indices_and_patterns_for_multiplication($i, $j, $k); + + my $addend = $S->generate_addend_gpu('a', 'b'); + + push @sum, $addend if defined $addend; + } + if (@sum) + { + print join(" + ", @sum), ";"; + } + else + { + print "0;" + } + print "\n"; + } + } + + $S->delete_temporaries(); +} + +# ---------------------------------------------------------------------- + sub load_if_needed { my ($S, $x) = @_; @@ -709,6 +778,7 @@ sub dump_multiply_std_and_intrinsic } print <<"FNORD"; +#ifndef __CUDACC__ #ifdef MPLEX_INTRINSICS for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) @@ -732,6 +802,11 @@ FNORD print <<"FNORD"; } #endif +#else // __CUDACC__ +FNORD + $S->multiply_gpu($a, $b, $c); + print <<"FNORD"; +#endif // __CUDACC__ FNORD diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h index 0853e75618677..eb26e4ae64827 100644 --- a/Matriplex/Matriplex.h +++ b/Matriplex/Matriplex.h @@ -131,6 +131,9 @@ class Matriplex for (int j = 0; j < N; ++j) { fArray[i*N + j] = * (const T*) (arr + i*sizeof(T) + vi[j]); + //if(j==2) { + //printf("cpu -- %d : %d, %f\n", i, vi[j], fArray[i*N+j]); + //} } } } diff --git a/Matrix.h b/Matrix.h index efe0782c8e87f..fa622e80fc165 100644 --- a/Matrix.h +++ b/Matrix.h @@ -50,11 +50,17 @@ inline double dtime() return( tseconds ); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float hipo(float x, float y) { return std::sqrt(x*x + y*y); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline void sincos4(const float x, float& sin, float& cos) { // Had this writen with explicit division by factorial. @@ -74,7 +80,7 @@ inline void sincos4(const float x, float& sin, float& cos) #ifdef __INTEL_COMPILER #define ASSUME_ALIGNED(a, b) __assume_aligned(a, b) #else - #define ASSUME_ALIGNED(a, b) __builtin_assume_aligned(a, b) + #define ASSUME_ALIGNED(a, b) a = static_cast(__builtin_assume_aligned(a, b)) #endif #include "Matriplex/MatriplexSym.h" diff --git a/Track.h b/Track.h index b7d485f1e6a64..334c48bd359f8 100644 --- a/Track.h +++ b/Track.h @@ -140,6 +140,11 @@ class Track const float* posArray() const {return state_.parameters.Array();} const float* errArray() const {return state_.errors.Array();} +//#ifdef USE_CUDA +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. 
SVector6& parameters_nc() {return state_.parameters;} @@ -149,8 +154,17 @@ class Track SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);} SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);} +#if __CUDACC__ + __host__ __device__ +#endif int charge() const {return state_.charge;} +#if __CUDACC__ + __host__ __device__ +#endif float chi2() const {return chi2_;} +#if __CUDACC__ + __host__ __device__ +#endif int label() const {return label_;} float x() const { return state_.parameters[0];} @@ -200,12 +214,18 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif void addHitIdx(int hitIdx,float chi2) { hitIdxArr_[++hitIdxPos_] = hitIdx; if (hitIdx >= 0) { ++nGoodHitIdx_; chi2_+=chi2; } } +#if __CUDACC__ + __host__ __device__ +#endif int getHitIdx(int posHitIdx) const { return hitIdxArr_[posHitIdx]; @@ -222,6 +242,9 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif void setHitIdx(int posHitIdx, int newIdx) { hitIdxArr_[posHitIdx] = newIdx; } @@ -233,6 +256,16 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif + void setNGoodHitIdx(int nHits) { + nGoodHitIdx_ = nHits; + } + +#if __CUDACC__ + __host__ __device__ +#endif void resetHits() { hitIdxPos_ = -1; @@ -250,8 +283,18 @@ class Track } return layers; } + +#if __CUDACC__ + __host__ __device__ +#endif void setCharge(int chg) {state_.charge=chg;} +#if __CUDACC__ + __host__ __device__ +#endif void setChi2(float chi2) {chi2_=chi2;} +#if __CUDACC__ + __host__ __device__ +#endif void setLabel(int lbl) {label_=lbl;} void setState(const TrackState& newState) {state_=newState;} diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu new file mode 100644 index 0000000000000..6d6ee60863365 --- /dev/null +++ b/mkFit/BuilderCU.cu @@ -0,0 +1,78 @@ +#include "BuilderCU.h" + +#include "gpu_utils.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" +#include "FitterCU.h" +#include "Event.h" + + +BuilderCU::BuilderCU() +{ +} + + +BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + setUp(event_of_hits, event, event_of_cands); +} + + +BuilderCU::~BuilderCU() { + tearDown(); +} + + +void BuilderCU::setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + int gplex_size = 1 << 14; + cuFitter = new FitterCU (gplex_size); + cuFitter->allocateDevice(); + cuFitter->allocate_extra_addBestHit(); + cuFitter->createStream(); + cuFitter->setNumberTracks(gplex_size); + + event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + std::vector radii (Config::nLayers); + for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { + radii[ilay] = event->geom_.Radius(ilay); + } + geom_cu.allocate(); + geom_cu.getRadiiFromCPU(&radii[0]); + + event_of_cands_cu.allocGPU(event_of_cands); +} + + +void BuilderCU::tearDown() { + event_of_cands_cu.deallocGPU(); + + geom_cu.deallocate(); + event_of_hits_cu.deallocGPU(); + + cuFitter->destroyStream(); + cuFitter->free_extra_addBestHit(); + cuFitter->freeDevice(); + delete cuFitter; +} + + +void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) +{ + event_of_cands_cu.copyFromCPU(event_of_cands, cuFitter->get_stream()); + + cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); + + event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); + 
cudaStreamSynchronize(cuFitter->get_stream()); + //cudaCheckError(); + + /*size_t free_mem, total_mem;*/ + /*cudaMemGetInfo(&free_mem, &total_mem);*/ + /*fprintf(stderr, "Free: %d\n", free_mem);*/ +} diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h new file mode 100644 index 0000000000000..87439d6be5d5b --- /dev/null +++ b/mkFit/BuilderCU.h @@ -0,0 +1,37 @@ +#ifndef BUILDER_CU_H +#define BUILDER_CU_H + +#include "FitterCU.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" +#include "Geometry.h" +#include "Event.h" + + +// FIXME: Design Issue +// What to do, allocation in ctor, free in dtor? +// not exception-safe +// but manage mem +// or in separate function? +class BuilderCU +{ +public: + BuilderCU(); + BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + ~BuilderCU(); + + void setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + void tearDown(); + void FindTracksBestHit(EventOfCandidates& event_of_cands); +private: + FitterCU *cuFitter; + EventOfHitsCU event_of_hits_cu; + EventOfCandidatesCU event_of_cands_cu; + GeometryCU geom_cu; +}; + + +#endif /* ifndef BUILDER_CU_H */ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index 8a94f3d5f4aaa..cae975636a6df 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -1,5 +1,13 @@ +#include +#include "Config.h" +#include "GeometryCU.h" +#include "reorganize_gplex.h" +#include "fittracks_kernels.h" + +#include "Track.h" + template -void FitterCU::setNumberTracks(idx_t Ntracks) { +void FitterCU::setNumberTracks(const idx_t Ntracks) { N = Ntracks; // Raise an exceptioin when the FitterCU instance is too small @@ -22,16 +30,25 @@ void FitterCU::destroyStream() { template void FitterCU::allocateDevice() { + d_par_iP.allocate(Nalloc, LV); d_par_iC.allocate(Nalloc, LV); + + d_Err_iP.allocate(Nalloc, LS); + d_Err_iC.allocate(Nalloc, LS); + d_inChg.allocate(Nalloc, QI); - d_par_iP.allocate(Nalloc, LV); d_errorProp.allocate(Nalloc, LL); - d_Err_iP.allocate(Nalloc, LS); - d_msPar.allocate(Nalloc, HV); - d_outErr.allocate(Nalloc, LS); - d_msErr.allocate(Nalloc, HS); - cudaCheckError() + cudaMalloc((void**)&d_msPar_arr, Config::nLayers * sizeof(GPlexHV)); + cudaMalloc((void**)&d_msErr_arr, Config::nLayers * sizeof(GPlexHS)); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_msPar[hi].allocate(Nalloc, HV); + d_msErr[hi].allocate(Nalloc, HS); + } + cudaMemcpy(d_msPar_arr, d_msPar, Config::nLayers*sizeof(GPlexHV), cudaMemcpyHostToDevice); + cudaMemcpy(d_msErr_arr, d_msErr, Config::nLayers*sizeof(GPlexHS), cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_maxSize, sizeof(int)); // global maximum + cudaCheckError(); } template @@ -41,125 +58,180 @@ void FitterCU::freeDevice() { d_par_iP.free(); d_errorProp.free(); d_Err_iP.free(); - d_msPar.free(); - d_outErr.free(); - d_msErr.free(); - - cudaCheckError() -} - -template -void FitterCU::sendInParToDevice(const MPlexLV& inPar) { - cudaMemcpy2DAsync(d_par_iC.ptr, d_par_iC.pitch, inPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() + d_Err_iC.free(); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_msPar[hi].free(); + d_msErr[hi].free(); + } + cudaFree(d_msPar_arr); + cudaFree(d_msErr_arr); + cudaFree(d_maxSize); + cudaCheckError(); } template -void FitterCU::sendInErrToDevice(const MPlexLS& inErr) { - cudaMemcpy2DAsync(d_outErr.ptr, d_outErr.pitch, inErr.fArray, N*sizeof(T), - N*sizeof(T), LS, 
cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::kalmanUpdateMerged(const int hit_idx) { + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N); } template -void FitterCU::sendInChgToDevice(const MPlexQI& inChg) { - cudaMemcpy2DAsync(d_inChg.ptr, d_inChg.pitch, inChg.fArray, N*sizeof(T), - N*sizeof(T), QI, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::kalmanUpdate_standalone( + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc) +{ + d_Err_iP.copyAsyncFromHost(stream, psErr); + d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); + d_par_iP.copyAsyncFromHost(stream, psPar); + d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); + + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N_proc); + + d_par_iC.copyAsyncToHost(stream, outPar); + d_Err_iC.copyAsyncToHost(stream, outErr); } template -void FitterCU::sendMsRadToDevice(const MPlexQF& msRad) { - cudaMemcpy2DAsync(d_msRad.ptr, d_msRad.pitch, msRad.fArray, N*sizeof(T), - N*sizeof(T), QF, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::propagationMerged(const int hit_idx) { + propagation_wrapper(stream, d_msPar[hit_idx], d_Err_iC, d_par_iC, d_inChg, + d_par_iP, d_Err_iP, N); } +// FIXME: Temporary. Separate allocations / transfers template -void FitterCU::sendOutParToDevice(const MPlexLV& outPar) { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, outPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::allocate_extra_addBestHit() { + d_outChi2.allocate(Nalloc, QF); + d_XHitPos.allocate(Nalloc, QI); + d_XHitSize.allocate(Nalloc, QI); + d_XHitArr.allocate(Nalloc, GPlexHitIdxMax); + cudaMalloc((void**)&d_HitsIdx_arr, Config::nLayers * sizeof(GPlexQI)); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_HitsIdx[hi].allocate(Nalloc, QI); + } + cudaMemcpy(d_HitsIdx_arr, d_HitsIdx, Config::nLayers*sizeof(GPlexQI), cudaMemcpyHostToDevice); + d_Chi2.allocate(Nalloc, QF); + d_Label.allocate(Nalloc, QI); + cudaCheckError(); } template -void FitterCU::sendOutErrToDevice(const MPlexLS& outErr) { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, outErr.fArray, N*sizeof(T), - N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::free_extra_addBestHit() { + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_HitsIdx[hi].free(); cudaCheckError(); + } + cudaFree(d_HitsIdx_arr); + d_Label.free(); cudaCheckError(); + d_Chi2.free(); cudaCheckError(); + + d_XHitArr.free(); cudaCheckError(); + d_XHitSize.free(); cudaCheckError(); + d_XHitPos.free(); cudaCheckError(); + d_outChi2.free(); cudaCheckError(); } template -void FitterCU::sendMsParToDevice(const MPlexHV& msPar) { - cudaMemcpy2DAsync(d_msPar.ptr, d_msPar.pitch, msPar.fArray, N*sizeof(T), - N*sizeof(T), HV, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::setHitsIdxToZero(const int hit_idx) { + cudaMemset(d_HitsIdx[hit_idx].ptr, 0, Nalloc*sizeof(int)); } template -void FitterCU::sendMsErrToDevice(const MPlexHS& msErr) { - cudaMemcpy2DAsync(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), - N*sizeof(T), HS, cudaMemcpyHostToDevice, stream); - cudaCheckError() -} +void FitterCU::addBestHit(EventOfHitsCU &event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu) { + 
findBestHit_wrapper(stream, event.m_layers_of_hits, + event_of_cands_cu, + d_XHitSize, d_XHitArr, + d_Err_iP, d_par_iP, + d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_outChi2, + d_Chi2, d_HitsIdx_arr, + d_inChg, d_Label, geom_cu, + d_maxSize, N); +} template -void FitterCU::getOutParFromDevice(MPlexLV& outPar) { - cudaMemcpy2DAsync(outPar.fArray, N*sizeof(T), d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp) { + InputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, inputProp, N); } template -void FitterCU::getErrorPropFromDevice(MPlexLL& errorProp) { - cudaMemcpy2DAsync(errorProp.fArray, N*sizeof(T), - d_errorProp.ptr, d_errorProp.pitch, - N*sizeof(T), LL, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp) { + OutputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, outputProp, N); + cudaStreamSynchronize(stream); + cudaCheckError(); } -template -void FitterCU::getOutErrFromDevice(MPlexLS& outErr) { - cudaMemcpy2DAsync(outErr.fArray, N*sizeof(T), d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToHost, stream); - cudaCheckError() -} template -void FitterCU::getMsRadFromDevice(MPlexQF& msRad) { - cudaMemcpy2DAsync(msRad.fArray, N*sizeof(T), d_msRad.ptr, d_msRad.pitch, - N*sizeof(T), QF, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::propagateTracksToR(const float radius, const int N) { + propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + radius, d_Err_iP, d_par_iP, N); } -template -void FitterCU::setOutParFromInPar() { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() -} +#if 1 template -void FitterCU::setOutErrFromInErr() { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() +void FitterCU::propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, const MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP) { + d_Err_iC.copyAsyncFromHost(stream, Err_iC); + d_par_iC.copyAsyncFromHost(stream, par_iC); + //propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + //radius, d_Err_iP, d_par_iP, N); + d_Err_iP.copyAsyncToHost(stream, Err_iP); + d_par_iP.copyAsyncToHost(stream, Par_iP); } template -void FitterCU::kalmanUpdateMerged() { - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_outErr, N); -} +void FitterCU::FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int Nhits) +{ + float etime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); -template -void FitterCU::propagationMerged() { - propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, - d_par_iP, d_errorProp, d_Err_iP, N); -} + cudaEventRecord(start, 0); + + for (int itrack = 0; itrack < num_tracks; itrack += Nalloc) + { + int beg = itrack; + int end = std::min(itrack + Nalloc, num_tracks); + setNumberTracks(end-beg); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, events_of_hits_cu, + d_Err_iC, d_par_iC, + 
d_msErr_arr, d_msPar_arr, + d_inChg, d_Chi2, d_Label, + d_HitsIdx_arr, beg, end, false, N); + fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_errorProp, d_inChg, + Nhits, N); + OutputFittedTracksCU_wrapper(stream, tracks_cu, + d_Err_iC, d_par_iC, + d_inChg, d_Chi2, d_Label, + beg, end, N); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&etime, start, stop); + std::cerr << "CUDA etime: " << etime << " ms.\n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); +} +#else template void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, MPlexHV* msPar, MPlexHS* msErr, int Nhits, @@ -177,9 +249,18 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, setNumberTracks(end-beg); - sendInChgToDevice(Chg); - sendInParToDevice(par_iC); - sendInErrToDevice(err_iC); + Track *tracks_cu; + cudaMalloc((void**)&tracks_cu, tracks.size()*sizeof(Track)); + cudaMemcpy(tracks_cu, &tracks[0], tracks.size()*sizeof(Track), cudaMemcpyHostToDevice); + allocate_extra_addBestHit(); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, d_Err_iC, d_par_iC, d_inChg, + d_Chi2, d_Label, d_HitsIdx_arr, beg, end, false, N); + + + //d_inChg.copyAsyncFromHost(stream, Chg); + //d_par_iC.copyAsyncFromHost(stream, par_iC); + //d_Err_iC.copyAsyncFromHost(stream, err_iC); cudaEventRecord(start, 0); @@ -188,9 +269,10 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, { // Switch outPut and inPut parameters and errors // similar to iC <-> iP in the CPU code. - setOutParFromInPar(); - setOutErrFromInErr(); // d_Err_iP + d_par_iP.copyAsyncFromDevice(stream, d_par_iC); + d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); +#if 0 double time_input = dtime(); int itrack; omp_set_num_threads(Config::numThreadsReorg); @@ -206,22 +288,36 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, msPar[hi].CopyIn(itrack, hit.posArray()); } total_reorg += (dtime() - time_input)*1e3; - - sendMsParToDevice(msPar[hi]); - sendMsErrToDevice(msErr[hi]); - - propagationMerged(); - kalmanUpdateMerged(); +#endif + + d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); + d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); + + propagationMerged(hi); + //MPlexLS err_iP; + //MPlexLV par_iP; + //d_par_iC.copyAsyncToHost(stream, par_iC); + //d_par_iP.copyAsyncToHost(stream, par_iP); + //d_Err_iP.copyAsyncToHost(stream, err_iP); + //propagation_wrapper(stream, d_msPar[hi], d_Err_iC, d_par_iC, d_inChg, + //d_par_iP, d_errorProp, d_Err_iP, N); + kalmanUpdateMerged(hi); + //fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, + //d_Err_iC, d_par_iC, d_errorProp, d_inChg, + //hi, N); } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&etime, start, stop); - std::cerr << "CUDA etime: " << etime << " ms.\n"; - std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + //std::cerr << "CUDA etime: " << etime << " ms.\n"; + //std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + + free_extra_addBestHit(); + cudaFree(tracks_cu); - getOutParFromDevice(par_iC); - getOutErrFromDevice(err_iC); + d_par_iC.copyAsyncToHost(stream, par_iC); + d_Err_iC.copyAsyncToHost(stream, err_iC); cudaStreamSynchronize(stream); // freeDevice(); -> moved to mkFit/mkFit.cc @@ -230,4 +326,4 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, cudaEventDestroy(start); cudaEventDestroy(stop); } - +#endif diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 
a56a912559ea3..41715a7745f9b 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -2,48 +2,36 @@ #define _PROPAGATOR_CU_H_ #include -#include -#include #include "Matrix.h" + +#include "HitStructuresCU.h" +#include "GPlex.h" +#include "GeometryCU.h" +#include "gpu_utils.h" + #include "propagation_kernels.h" #include "kalmanUpdater_kernels.h" -#include "GPlex.h" +#include "computeChi2_kernels.h" +#include "index_selection_kernels.h" +#include "best_hit_kernels.h" + +#include +#include -#define LV 6 -#define QI 1 -#define QF 1 +constexpr int LV = 6; +constexpr int QI = 1; +constexpr int QF = 1; #define LL 36 -#define LS 21 -#define HV 3 -#define HS 6 -#define LH 18 +constexpr int LS = 21; +constexpr int HV = 3; +constexpr int HS = 6; +constexpr int LH = 18; -#define BLOCK_SIZE_X 16 -#define MAX_BLOCKS_X 65535 // CUDA constraint +#define BLOCK_SIZE_X 256 using idx_t = Matriplex::idx_t; -// Macro for checking cuda errors following a cuda launch or api call -// This comes from Jeff Larkins (NVIDIA) -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(0); \ - } \ -} -#if 0 -#define cudaCheckErrorSync() { \ - cudaDeviceSynchronize(); \ - cudaCheckError(); \ -} -#else -#define cudaCheckErrorSync() {} -#endif - -void separate_first_call_for_meaningful_profiling_numbers(); - template class FitterCU { public: @@ -55,54 +43,100 @@ class FitterCU { void createStream(); void destroyStream(); + cudaStream_t& get_stream() { return stream; } - void setNumberTracks(idx_t Ntracks); + int get_Nalloc() const { return Nalloc; } + void setNumberTracks(const idx_t Ntracks); - void sendInParToDevice(const MPlexLV& inPar); - void sendInErrToDevice(const MPlexLS& inErr); - void sendInChgToDevice(const MPlexQI& inChg); - void sendMsRadToDevice(const MPlexQF& msRad); - void sendOutParToDevice(const MPlexLV& outPar); - void sendOutErrToDevice(const MPlexLS& outErr); - void sendMsParToDevice(const MPlexHV& msPar); - - void getErrorPropFromDevice(MPlexLL& errorProp); - void getMsRadFromDevice(MPlexQF& msRad); + void propagationMerged(const int hit_idx); + void kalmanUpdateMerged(const int hit_idx); + void kalmanUpdate_standalone( + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc); - void setOutParFromInPar(); - void setOutErrFromInErr(); +#if 0 + void computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, + float *minChi2, int *bestHit, + LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF&outChi2, int maxSize, int hit_idx, + int NN); +#endif - // updater specfic transfers. 
- void sendMsErrToDevice(const MPlexHS& msErr); - void getOutParFromDevice(MPlexLV& outPar); - void getOutErrFromDevice(MPlexLS& outErr); + void allocate_extra_addBestHit(); + void free_extra_addBestHit(); - void propagationMerged(); - void kalmanUpdateMerged(); +#if 0 + void prepare_addBestHit(); + //const MPlexLS &psErr, const MPlexLV& propPar, + //const MPlexQI &inChg, + //MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + //size_t NN); + void finalize_addBestHit( + MPlexHS *msErr, MPlexHV* msPar, + MPlexLS& Err_iP, MPlexLV& Par_iP, + MPlexQI *HitsIdx, + MPlexQI &Label, + int start_idx, int end_idx); +#endif + void setHitsIdxToZero(const int hit_idx); + +#if 1 + //void addBestHit(EventOfHitsCU& event, const int ilay, const float *radii, int hit_idx); + void addBestHit(EventOfHitsCU& event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu); +#endif + void propagateTracksToR(const float radius, const int N); + void propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, + const MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP); // fitting higher order methods - void FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, - MPlexHV* msPar, MPlexHS* msErr, int Nhits, - std::vector &tracks, int beg, int end, - std::vector &layerHits); + void FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int NHits); + void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp); + void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp); private: // N is the actual size, Nalloc should be >= N, as it is intended // to allocated arrays that can be used for several sets of tracks. idx_t Nalloc; idx_t N; + /* data */ - GPlex d_par_iC; // LV - GPlex d_inChg; // QI - GPlex d_par_iP; // LV - GPlex d_msRad; // QF - GPlex d_errorProp; // LL - GPlex d_Err_iP; - GPlex d_msPar; - - GPlex d_outErr; - GPlex d_msErr; + GPlexLV d_par_iP; // LV + GPlexLV d_par_iC; // LV + + GPlexLS d_Err_iP; // LS + GPlexLS d_Err_iC; // LS + + GPlexQI d_inChg; // QI + GPlexQF d_msRad; // QF + GPlexLL d_errorProp; // LL + + GPlexHV *d_msPar_arr; // completely on the GPU + GPlexHV d_msPar[Config::nLayers]; // on the CPU, with arrays on the GPU + GPlexHS *d_msErr_arr; + GPlexHS d_msErr[Config::nLayers]; + GPlexQI d_XHitPos; // QI : 1D arrary following itracks + GPlexQI d_XHitSize; // QI : " " + GPlexHitIdx d_XHitArr; + + GPlexQF d_outChi2; + GPlexQI *d_HitsIdx_arr; + GPlexQI d_HitsIdx[Config::nLayers]; + GPlexQF d_Chi2; + GPlexQI d_Label; + + int *d_maxSize; // max number of tracks for AddBestHit + // everything run in a stream so multiple instance of FitterCU can // run concurrently on the GPU. cudaStream_t stream; diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index 9dfe3b86e6e02..e5c2c48024b8c 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -4,6 +4,9 @@ #include #include +#include "gpu_utils.h" +#include "Matrix.h" + // GPU implementation of a Matriplex-like structure // The number of tracks is the fast dimension and is padded in order to have // consecutive and aligned memory accesses. 
For cached reads, this result in a @@ -11,23 +14,91 @@ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 // In practice, The number of tracks (ntracks) is set to be MPT_SIZE -template +template struct GPlex { + using value_type = T; T* ptr; - size_t pitch, stride, x, y; + size_t pitch, stride, N, kSize; + + __device__ T operator[](int xx) const { return ptr[xx]; } + __device__ T& operator[](int xx) { return ptr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return ptr[n + i*stride]; } + __device__ T operator()(int n, int i, int j) const { return ptr[n + i*stride]; } - void allocate(size_t ntracks, size_t plex_size) { - x = ntracks; - y = plex_size; - cudaMallocPitch((void**)&ptr, &pitch, x*sizeof(T), y); + void allocate(size_t ntracks, size_t aSize) { + N = ntracks; + kSize = aSize; + cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize); stride = pitch/sizeof(T); // Number of elements } void free() { cudaFree(ptr); - x = 0; y = 0; pitch = 0; stride = 0; + N = 0; kSize = 0; pitch = 0; stride = 0; } //cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), //N*sizeof(T), HS, cudaMemcpyHostToDevice); + + void copyAsyncFromHost(cudaStream_t& stream, const M& mplex) { + cudaMemcpy2DAsync(ptr, pitch, mplex.fArray, N*sizeof(T), + N*sizeof(T), kSize, cudaMemcpyHostToDevice, stream); + cudaCheckError(); + } + void copyAsyncToHost(cudaStream_t& stream, M& mplex) { + cudaMemcpy2DAsync(mplex.fArray, N*sizeof(T), ptr, pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToHost, stream); + cudaCheckError(); + } + void copyAsyncFromDevice(cudaStream_t& stream, GPlex& gplex) { + cudaMemcpy2DAsync(ptr, pitch, gplex.ptr, gplex.pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToDevice, stream); + cudaCheckError(); + } }; +using GPlexLL = GPlex; +using GPlexLV = GPlex; +using GPlexLS = GPlex; + +using GPlexHH = GPlex; +using GPlexHV = GPlex; +using GPlexHS = GPlex; + +using GPlexLH = GPlex; + +using GPlexQF = GPlex; +using GPlexQI = GPlex; + +const int GPlexHitIdxMax = 16; // FIXME: copied and past from MkFitter.h +using GPlexHitIdx = GPlex>; + +template +struct GPlexReg { + __device__ T operator[](int xx) const { return arr[xx]; } + __device__ T& operator[](int xx) { return arr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return arr[i*D2 + j]; } + __device__ T operator()(int n, int i, int j) const { return arr[i*D2 + j]; } + + __device__ void SetVal(T v) + { + for (int i = 0; i < D1; ++i) + { + arr[i] = v; + } + } + + T arr[D1]; +}; + +using GPlexRegLL = GPlexReg; +using GPlexRegLH = GPlexReg; +using GPlexRegHH = GPlexReg; +using GPlexRegLV = GPlexReg; +using GPlexRegHS = GPlexReg; +using GPlexRegHV = GPlexReg; +using GPlexReg2V = GPlexReg; +using GPlexReg2S = GPlexReg; +using GPlexRegQF = GPlexReg; + #endif // _GPLEX_H_ diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h new file mode 100644 index 0000000000000..16cda75eaee87 --- /dev/null +++ b/mkFit/GeometryCU.h @@ -0,0 +1,23 @@ +#ifndef GEOMETRY_CU_H +#define GEOMETRY_CU_H + +#include "gpu_utils.h" + +struct GeometryCU { + float *radii; + + void allocate() { + cudaMalloc((void**)&radii, Config::nLayers * sizeof(float)); + cudaCheckError(); + } + void deallocate() { + cudaFree(radii); + cudaCheckError(); + } + void getRadiiFromCPU(const float *h_radii) { + cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckError(); + } +}; + +#endif /* ifndef GEOMETRY_CU_H */ diff --git a/mkFit/HitStructures.cc b/mkFit/HitStructures.cc index 
9d5703555fe59..427ca92690b1f 100644 --- a/mkFit/HitStructures.cc +++ b/mkFit/HitStructures.cc @@ -189,7 +189,7 @@ void LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, std // zb1 -= 2; if (zb < 0) zb = 0; // zb2 += 2; if (zb >= m_nz) zb = m_nz; - if (dump) + //if (dump) printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); @@ -202,6 +202,7 @@ void LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, std for (int hi = m_phi_bin_infos[zi][pb].first; hi < m_phi_bin_infos[zi][pb].second; ++hi) { + printf("hi : %d\n", hi); // Here could enforce some furhter selection on hits #ifdef LOH_USE_PHI_Z_ARRAYS float ddz = std::abs(z - m_hit_zs[hi]); diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu new file mode 100644 index 0000000000000..3fc3fb797312a --- /dev/null +++ b/mkFit/HitStructuresCU.cu @@ -0,0 +1,215 @@ + +#include +#include + +#include "HitStructuresCU.h" + +void LayerOfHitsCU::alloc_hits(const int size) { + cudaMalloc((void**)&m_hits, sizeof(Hit)*size); + m_capacity = size; +} + +void LayerOfHitsCU::free_hits() { + cudaFree(m_hits); + m_capacity = 0; +} + +void LayerOfHitsCU::alloc_phi_bin_infos(const int nz, const int nphi) { + cudaMalloc((void**)&m_phi_bin_infos, sizeof(PairIntsCU)*nz*nphi); + m_nz = nz; +} + +void LayerOfHitsCU::free_phi_bin_infos() { + cudaFree(m_phi_bin_infos); + m_nz = 0; +} + +void LayerOfHitsCU::copyLayerOfHitsFromCPU(const LayerOfHits &layer, + const cudaStream_t &stream) { + cudaMemcpyAsync(m_hits, layer.m_hits, sizeof(Hit)*m_capacity, + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ + m_zmin = layer.m_zmin; + m_zmax = layer.m_zmax; + m_fz = layer.m_fz; + // FIXME: copy other values + // TODO: probably quite inefficient: + for (int i = 0; i < m_nz; ++i) { + cudaMemcpyAsync(m_phi_bin_infos + i*Config::m_nphi, &(layer.m_phi_bin_infos[i][0]), + sizeof(PairIntsCU)*Config::m_nphi, cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ + } +} + +void LayerOfHitsCU::copyFromCPU(const HitVec hits, const cudaStream_t &stream) +{ + cudaMemcpyAsync(m_hits, &hits[0], sizeof(Hit)*hits.size(), + cudaMemcpyHostToDevice, stream); +} + +void EventOfHitsCU::allocGPU(const EventOfHits &event_of_hits) { + m_n_layers = event_of_hits.m_n_layers; + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU)); + cudaCheckError(); + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. + m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers]; + for (int i = 0; i < m_n_layers; ++i) { + m_layers_of_hits_alloc[i].alloc_hits(event_of_hits.m_layers_of_hits[i].m_capacity); + m_layers_of_hits_alloc[i].alloc_phi_bin_infos( + event_of_hits.m_layers_of_hits[i].m_nz, + Config::m_nphi); + } + /*cudaCheckError();*/ +} + +void EventOfHitsCU::allocGPU(const std::vector &layerHits) +{ + m_n_layers = layerHits.size(); + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU)); + cudaCheckError(); + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. 
+ m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers]; + for (int i = 0; i < m_n_layers; ++i) { + m_layers_of_hits_alloc[i].alloc_hits(layerHits[i].size()); + // no phi_bin_infos -- free-d later + m_layers_of_hits_alloc[i].alloc_phi_bin_infos(1, 1); + } + /*cudaCheckError();*/ +} + +void EventOfHitsCU::deallocGPU() { + for (int i = 0; i < m_n_layers; ++i) { + /*cudaCheckError();*/ + m_layers_of_hits_alloc[i].free_hits(); + m_layers_of_hits_alloc[i].free_phi_bin_infos(); + /*cudaCheckError();*/ + } + cudaFree(m_layers_of_hits); + /*cudaCheckError();*/ + delete[] m_layers_of_hits_alloc; +} + +void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits, + const cudaStream_t &stream) { + for (int i = 0; i < event_of_hits.m_n_layers; i++) { + m_layers_of_hits_alloc[i].copyLayerOfHitsFromCPU(event_of_hits.m_layers_of_hits[i]); + } + /*cudaCheckError();*/ + cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc, + event_of_hits.m_n_layers*sizeof(LayerOfHitsCU), + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EventOfHitsCU::copyFromCPU(const std::vector &layerHits, + const cudaStream_t &stream) { + for (int i = 0; i < layerHits.size(); i++) { + m_layers_of_hits_alloc[i].copyFromCPU(layerHits[i]); + } + cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc, + m_n_layers*sizeof(LayerOfHitsCU), + cudaMemcpyHostToDevice, stream); +} + + +// ============================================================================ + +void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) { + m_real_size = ntracks; + m_fill_index = 0; + + cudaMalloc((void**)&m_candidates, sizeof(Track)*m_real_size); + /*cudaCheckError();*/ +} + + +void EtaBinOfCandidatesCU::free_tracks() { + cudaFree(m_candidates); + /*cudaCheckError();*/ + m_real_size = 0; + m_fill_index = 0; +} + + +void EtaBinOfCandidatesCU::copyFromCPU(const EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream) { + assert (eta_bin.m_fill_index < m_real_size); // or something + m_fill_index = eta_bin.m_fill_index; + + cudaMemcpyAsync(m_candidates, &eta_bin.m_candidates[0], + sizeof(Track)*m_fill_index, cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EtaBinOfCandidatesCU::copyToCPU(EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream) const { + assert (eta_bin.m_fill_index < m_real_size); // or something + + cudaMemcpyAsync(&eta_bin.m_candidates[0], m_candidates, + sizeof(Track)*m_fill_index, cudaMemcpyDeviceToHost, stream); + /*cudaCheckError();*/ +} + +// ============================================================================ + +void EventOfCandidatesCU::allocGPU(const EventOfCandidates &event_of_cands) { + m_n_etabins = Config::nEtaBin; + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_etabins_of_candidates, m_n_etabins*sizeof(EtaBinOfCandidatesCU)); + /*cudaCheckError();*/ + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. 
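// The same two-level allocation pattern is used for every *CU container in this
// file. A minimal sketch with a hypothetical element type (ElemCU, n_elems and
// sizes[] are placeholders, not names from this codebase):
//
//   struct ElemCU { float *d_buf; void alloc(int n) { cudaMalloc((void**)&d_buf, n*sizeof(float)); } };
//
//   ElemCU *d_elems;                         // device array of structs
//   ElemCU *h_mirror = new ElemCU[n_elems];  // host mirror; its members point to device memory
//   cudaMalloc((void**)&d_elems, n_elems*sizeof(ElemCU));
//   for (int i = 0; i < n_elems; ++i) h_mirror[i].alloc(sizes[i]);   // per-element device buffers
//   cudaMemcpy(d_elems, h_mirror, n_elems*sizeof(ElemCU), cudaMemcpyHostToDevice);
//
// In the real containers the copy of the host mirror into the device array is
// done in the copyFromCPU methods below, once the per-element buffers are filled.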
+ m_etabins_of_candidates_alloc = new EtaBinOfCandidatesCU[m_n_etabins]; + for (int i = 0; i < m_n_etabins; ++i) { + const EtaBinOfCandidates& h_etabin = event_of_cands.m_etabins_of_candidates[i]; + m_etabins_of_candidates_alloc[i].alloc_tracks(h_etabin.m_real_size); + } + /*cudaCheckError();*/ +} + + +void EventOfCandidatesCU::deallocGPU() { + for (int i = 0; i < m_n_etabins; ++i) { + /*cudaCheckError();*/ + m_etabins_of_candidates_alloc[i].free_tracks(); + /*cudaCheckError();*/ + } + cudaFree(m_etabins_of_candidates); + /*cudaCheckError();*/ + delete[] m_etabins_of_candidates_alloc; +} + + +void EventOfCandidatesCU::copyFromCPU(const EventOfCandidates& event_of_cands, + const cudaStream_t &stream) { + for (int i = 0; i < m_n_etabins; i++) { + m_etabins_of_candidates_alloc[i].copyFromCPU(event_of_cands.m_etabins_of_candidates[i]); + } + /*cudaCheckError();*/ + cudaMemcpyAsync(m_etabins_of_candidates, m_etabins_of_candidates_alloc, + m_n_etabins*sizeof(EtaBinOfCandidatesCU), + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EventOfCandidatesCU::copyToCPU(EventOfCandidates& event_of_cands, + const cudaStream_t &stream) const { + for (int i = 0; i < m_n_etabins; i++) { + m_etabins_of_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_candidates[i]); + } + /*cudaCheckError();*/ + // We do not need to copy the array of pointers to EventOfCandidatesCU back +} diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h new file mode 100644 index 0000000000000..11146b3f930e0 --- /dev/null +++ b/mkFit/HitStructuresCU.h @@ -0,0 +1,142 @@ +#ifndef _HIT_STRUCTURES_H_ +#define _HIT_STRUCTURES_H_ + +#include "HitStructures.h" +#include "Config.h" +#include "gpu_utils.h" + +template +struct PairCU { + T1 first; + T2 second; +}; + +using PairIntsCU = PairCU; + +class LayerOfHitsCU { + public: + Hit *m_hits; + PairIntsCU *m_phi_bin_infos; + + float m_zmin, m_zmax, m_fz; + int m_nz = 0; + int m_capacity = 0; + + //fixme, these are copies of the ones above, need to merge with a more generic name + float m_rmin, m_rmax, m_fr; + int m_nr = 0; + + // This could be a parameter, layer dependent. 
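// As a quick worked example of the binning below, assuming Config::m_nphi is
// 1024 (which is what the 0x3ff mask implies, i.e. m_nphi - 1):
//   m_fphi          = 1024 / (2 * pi)                      ~ 162.97
//   GetPhiBin(1.0f) = floorf(162.97f * (1.0f + 3.14159f))  ~ floorf(674.97f) = 674
// Phi values outside (-pi, pi] fall outside [0, m_nphi) and are wrapped by
// masking with m_phi_mask, as the comment above GetPhiBin notes.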
+ //static constexpr int m_nphi = 1024; + // Testing bin filling + // static constexpr int m_nphi = 16; + static constexpr float m_fphi = Config::m_nphi / Config::TwoPI; + static constexpr int m_phi_mask = 0x3ff; + + // As above + //static constexpr float m_max_dz = 1; + //static constexpr float m_max_dphi = 0.02; + + LayerOfHitsCU() {}; + ~LayerOfHitsCU() {}; + + void alloc_hits(const int size); + void free_hits(); + + void alloc_phi_bin_infos(const int nz, const int nphi); + void free_phi_bin_infos(); + + void copyLayerOfHitsFromCPU(const LayerOfHits &layer, + const cudaStream_t &stream=0); + void copyFromCPU(const HitVec hits, const cudaStream_t &stream=0); + +#ifdef __CUDACC__ + __device__ +#endif + int GetZBin(const float z) const { return (z - m_zmin) * m_fz; } + +#ifdef __CUDACC__ + __device__ +#endif + int GetZBinChecked(float z) const { + int zb = GetZBin(z); + if (zb < 0) { + zb = 0; + } else { + if (zb >= m_nz) zb = m_nz - 1; + } + return zb; + } + + // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask +#ifdef __CUDACC__ + __device__ +#endif + int GetPhiBin(float phi) const { + return floorf(m_fphi * (phi + Config::PI)); + } +}; + + +class EventOfHitsCU +{ +public: + LayerOfHitsCU *m_layers_of_hits; // the real stuff: on GPU + int m_n_layers; + + // The following array is to be able to allocate GPU arrays from + // the CPU and then copy the address of the GPU ptr to the GPU structure + LayerOfHitsCU *m_layers_of_hits_alloc; + + EventOfHitsCU() : m_n_layers{} {}; + + void allocGPU(const EventOfHits &event_of_hits); + void allocGPU(const std::vector &layerHits); + void deallocGPU(); + void copyFromCPU(const EventOfHits& event_of_hits, + const cudaStream_t &stream=0); + void copyFromCPU(const std::vector &layerHits, + const cudaStream_t &stream=0); +}; + +// ============================================================================ + +class EtaBinOfCandidatesCU +{ +public: + Track *m_candidates; + + int m_real_size; + int m_fill_index; + + void alloc_tracks(const int ntracks); + void free_tracks(); + + void copyFromCPU(const EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0); + void copyToCPU(EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0) const; +}; + + +class EventOfCandidatesCU +{ +public: + EtaBinOfCandidatesCU *m_etabins_of_candidates; // device array + int m_n_etabins; + + // Host array. 
For allocation and transfer purposes + EtaBinOfCandidatesCU *m_etabins_of_candidates_alloc; + + EventOfCandidatesCU() : m_n_etabins{} {}; + + void allocGPU(const EventOfCandidates &event_of_cands); + void deallocGPU(); + void copyFromCPU(const EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0); + void copyToCPU(EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0) const; +}; + +#endif // _HIT_STRUCTURES_H_ + diff --git a/mkFit/KalmanUtilsMPlex.cc b/mkFit/KalmanUtilsMPlex.cc index 71d49be47b250..4ab6c6d18836c 100644 --- a/mkFit/KalmanUtilsMPlex.cc +++ b/mkFit/KalmanUtilsMPlex.cc @@ -855,7 +855,6 @@ void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MP #endif } - void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2, diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h index 678f417d26c69..53234074e2178 100644 --- a/mkFit/KalmanUtilsMPlex.h +++ b/mkFit/KalmanUtilsMPlex.h @@ -4,11 +4,21 @@ #include "Track.h" #include "Matrix.h" +#ifdef USE_CUDA +#include "FitterCU.h" +#endif + void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexLS &outErr, MPlexLV& outPar, const int N_proc); +#ifdef USE_CUDA // FIXME: temporary; move to FitterCU +void computeChi2MPlex_tmp(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexQF& outChi2, + FitterCU& cuFitter); +#endif void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2, diff --git a/mkFit/Makefile b/mkFit/Makefile index eb0c38dea566c..3cfd1cbe24120 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -74,12 +74,21 @@ CU_OBJS := $(CU_SRCS:.cu=.o) LDFLAGS_CU := -lcudart +# To share __device__ function across several source files +# 1) compile with --device-c (works as -c should be) +# 2) Create some kind of dictionary with all the .cu.o +# 3) The dictionary AND the original .o files should be used +# -- Works only for CUDA_VERSION >= 5 # TODO: Clean the "-I.. -std=c++11" ${CU_OBJS}: %.o: %.cu - ${NV} -c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX + ${NV} --device-c -o $@ $< -I.. 
-std=c++11 -DUSE_MATRIPLEX + +CU_LINK := cu_link.o +${CU_LINK}: ${CU_OBJS} + ${NV} --device-link $^ --output-file $@ endif -ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} +ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} ${CU_LINK} ${MKFDEPS}: auto-genmplex @@ -100,6 +109,11 @@ TTree.h: echo "Using dummy rule for TTree.h" endif +ifdef CUBROOT +cub/util_debug.cuh: + echo "Using dummy rule for cub/util_debug.cuh" +endif + ${MKFOBJS}: %.o: %.cc %.d ${CXX} ${CPPFLAGS} ${CXXFLAGS} ${VEC_HOST} -c -o $@ $< diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index a7ed903c32f85..e903d2dce6977 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -3,9 +3,6 @@ #include -#include "HitStructures.h" - - //------------------------------------------------------------------------------ #include "MkFitter.h" @@ -103,6 +100,10 @@ class MkBuilder virtual void FindTracks(); virtual void FindTracksCloneEngine(); virtual void FindTracksCloneEngineTbb(); +#ifdef USE_CUDA + const Event* get_event() const { return m_event; } + const EventOfHits& get_event_of_hits() const { return m_event_of_hits; } +#endif }; #endif diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 28170716f054b..3aee5ab2c26ec 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -5,7 +5,7 @@ #include "KalmanUtilsMPlex.h" #include "ConformalUtilsMPlex.h" #ifdef USE_CUDA -#include "FitterCU.h" +//#include "FitterCU.h" #endif //#define DEBUG @@ -65,7 +65,10 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA + +// FIXME: uncomment when track building is ported to GPU. +#if 0 +//#ifdef USE_CUDA // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -88,7 +91,9 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +// FIXME: uncomment when track building is ported to GPU. +#if 1 +//#ifndef USE_CUDA for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -114,7 +119,8 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -137,7 +143,8 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -168,7 +175,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, int idx[NN] __attribute__((aligned(64))); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -200,7 +208,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. 
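// A minimal sketch of the intended overlap, assuming the per-layer loop and the
// asynchronous GPlex copies from FitterCU::FitTracks (names such as stream,
// d_msPar, d_msErr, beg and end are taken from that context): while the GPU
// propagates and updates layer hi, the host performs the CopyIn for layer hi+1.
//
//   for (int hi = 0; hi < Nhits; ++hi) {
//     d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]);   // enqueue H2D copies
//     d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]);
//     propagationMerged(hi);                              // kernels queued on the same stream
//     kalmanUpdateMerged(hi);
//     if (hi + 1 < Nhits) {
//       for (int i = beg; i < end; ++i) {                 // CPU work overlaps the GPU kernels
//         const Hit &hit = layerHits[hi + 1][tracks[i].getHitIdx(hi + 1)];
//         msErr[hi + 1].CopyIn(i - beg, hit.errArray());
//         msPar[hi + 1].CopyIn(i - beg, hit.posArray());
//       }
//     }
//   }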
-#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = tracks[beg].getHitIdx(hi); @@ -257,7 +266,9 @@ void MkFitter::InputTracksAndHitIdx(const std::vector& tracks, for (int hi = 0; hi < Nhits; ++hi) { - + // MPBL: It does not seem that these values are that dummies + // Not transfering them to the GPU reduces the number of + // nFoundHits in the printouts. HitsIdx[hi](itrack, 0, 0) = trk.getHitIdx(hi);//dummy value for now } @@ -351,7 +362,8 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -378,7 +390,8 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -661,7 +674,7 @@ void MkFitter::SelectHitIndices(const LayerOfHits &layer_of_hits, const int N_pr // const int pb2 = L.GetPhiBin(phi + dphi) + 2; if (dump) - printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", + printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); // MT: One could iterate in "spiral" order, to pick hits close to the center. diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index 611de6f0affe9..d90c42a9db457 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -8,6 +8,11 @@ #include "HitStructures.h" #include "BinInfoUtils.h" +#if USE_CUDA +#include "FitterCU.h" +#include "HitStructuresCU.h" +#endif + //#define DEBUG 1 //#define USE_BOHS diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 733a21c091842..d0ce07b3cbc8a 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -583,6 +583,18 @@ void helixAtRFromIterativeCCS(const MPlexLV& inPar, const MPlexQI& inChg, MPlexL } } +#if 0 +// FIXME: Do not destroy improvements from the devel branch. +// Dan's shared .icc proof of concept should be bring back once everything is cleanly merged. 
+//#pragma omp declare simd simdlen(NN) notinbranch linear(n) +#include "PropagationMPlex.icc" + +void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp) { + errorProp.SetVal(0); + + //#pragma ivdep + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, 0, NN); +#endif void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp, const int N_proc) @@ -921,7 +933,7 @@ void applyMaterialEffects(const MPlexQF &hitsRl, const MPlexQF& hitsXi, void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, const MPlexQI &inChg, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, + MPlexLS &outErr, MPlexLV& outPar, const int N_proc) { const idx_t N = NN; @@ -1003,7 +1015,7 @@ void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, const MPlexQI& inChg, const float r, - MPlexLS& outErr, MPlexLV& outPar, + MPlexLS& outErr, MPlexLV& outPar, const int N_proc) { outErr = inErr; diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc new file mode 100644 index 0000000000000..a9bfd217d063a --- /dev/null +++ b/mkFit/PropagationMPlex.icc @@ -0,0 +1,200 @@ +template +#ifdef __CUDACC__ +__device__ +#endif +static inline void helixAtRFromIterative_impl(const Tf& __restrict__ inPar, const Ti& __restrict__ inChg, + TfLL1& __restrict__ outPar, const Tf11& __restrict__ msRad, + TfLLL& __restrict__ errorProp, int nmin, int nmax) +{ +#pragma simd + for (int n = nmin; n < nmax; ++n) { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float pxin = inPar(n, 3, 0); + const float pyin = inPar(n, 4, 0); + const float pzin = inPar(n, 5, 0); + const float r = msRad(n, 0, 0); + float r0 = hipo(xin, yin); + + dprint(std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar.ConstAt(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pzin << " q=" << inChg.ConstAt(n, 0, 0)); + + if (std::abs(r-r0)<0.0001f) { + dprint("distance less than 1mum, skip"); + continue; + } + const float pt2 = pxin*pxin+pyin*pyin; + const float pt = std::sqrt(pt2); + const float ptinv = 1.f/pt; + const float pt2inv = ptinv*ptinv; + //p=0.3Br => r=p/(0.3*B) + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float invcurvature = 1.f/(pt*k);//in 1./cm + const float ctgTheta=pzin*ptinv; + + //variables to be updated at each iterations + float totalDistance = 0; + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dTDdx = r0>0.0f ? -xin/r0 : 0.0f; + float dTDdy = r0>0.0f ? 
-yin/r0 : 0.0f; + float dTDdpx = 0.f; + float dTDdpy = 0.f; + //5 iterations is a good starting point + //const unsigned int Niter = 10; + // const unsigned int Niter = 5+std::round(r-r0)/2; +#pragma ivdep + for (unsigned int iter=0; iter < Config::Niter; ++iter) { + + dprint("propagation iteration #" << i); + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float px = outPar(n, 3, 0); + const float py = outPar(n, 4, 0); + r0 = hipo(x, y); + + dprint("r0=" << r0 << " pt=" << pt); + + totalDistance += (r-r0); + dprint("distance=" << (r-r0) << " angPath=" << (r-r0)*invcurvature); + + float cosAP, sinAP; + if (Config::useTrigApprox) { // TODO: uncomment + sincos4((r-r0)*invcurvature, sinAP, cosAP); + } else { + cosAP=std::cos((r-r0)*invcurvature); + sinAP=std::sin((r-r0)*invcurvature); + } + + //helix propagation formulas + //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(px*sinAP-py*(1-cosAP)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(py*sinAP+px*(1-cosAP)); + outPar(n, 2, 0) = outPar(n, 2, 0) + (r-r0)*ctgTheta; + outPar(n, 3, 0) = px*cosAP-py*sinAP; + outPar(n, 4, 0) = py*cosAP+px*sinAP; + //outPar(n, 5, 0) = pz; //take this out as it is redundant + + if (iter+1 != Config::Niter && r0 > 0 && std::abs((r-r0)*invcurvature)>0.000000001f) + { + //update derivatives on total distance for next step, where totalDistance+=r-r0 + //now r0 depends on px and py + + dprint("r0=" << 1.f/r0 << " r0inv=" << r0 << " pt=" << pt); + + //update derivative on D + const float dAPdpx = -(r-r0)*invcurvature*px*pt2inv;//r0 is now 1./r0 (this could go above the redefinition of r0!) + const float dAPdpy = -(r-r0)*invcurvature*py*pt2inv; + r0 = 1.f/r0;//WARNING, now r0 is r0inv (one less temporary) + const float dAPdx = -x*r0*invcurvature; + const float dAPdy = -y*r0*invcurvature; + //reduce temporary variables + //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); + //dydx = k*dAPdx*(py*cosAP + px*sinAP); + //dTDdx -= r0*(x*dxdx + y*dydx); + dTDdx -= r0*(x*(1.f + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); + //reuse same temporary variables + //dxdy = k*dAPdy*(px*cosAP - py*sinAP); + //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); + //dTDdy -= r0*(x*dxdy + y*dydy); + dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1.f + k*dAPdy*(py*cosAP + px*sinAP))); + //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); + //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); + //dTDdpx -= r0*(x*dxdpx + y*dydpx); + dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1.f - cosAP + px*sinAP*dAPdpx))); + //dxdpy = k*(px*cosAP*dAPdpy - 1. 
+ cosAP - py*sinAP*dAPdpy); + //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); + //dTDdpy -= r0*(x*dxdpy + y*(dydpy); + dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1.f + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + + } + dprint("iteration end, dump parameters" << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0) << std::endl + << "r=" << std::sqrt( outPar.At(n, 0, 0)*outPar.At(n, 0, 0) + outPar.At(n, 1, 0)*outPar.At(n, 1, 0) ) + << " pT=" << std::sqrt( outPar.At(n, 3, 0)*outPar.At(n, 3, 0) + outPar.At(n, 4, 0)*outPar.At(n, 4, 0) )); + const float TD=totalDistance; + const float TP=TD*invcurvature;//totalAngPath + + dprint("TD=" << TD << " TP=" << TP << " arrived at r=" << std::sqrt(outPar.At(n, 0, 0)*outPar.At(n, 0, 0)+outPar.At(n, 1, 0)*outPar.At(n, 1, 0)) + << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0)); + + const float iC=invcurvature; + const float dCdpx = k*pxin*ptinv; + const float dCdpy = k*pyin*ptinv; + const float dTPdx = dTDdx*iC; + const float dTPdy = dTDdy*iC; + const float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division + const float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division + + float cosTP, sinTP; + if (Config::useTrigApprox) { + sincos4(TP, sinTP, cosTP); + } else { + cosTP = std::cos(TP); + sinTP = std::sin(TP); + } + + //now try to make full jacobian + //derive these to compute jacobian + //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); + //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); + //z = zin + k*TP*pzin; + //px = pxin*cosTP-pyin*sinTP; + //py = pyin*cosTP+pxin*sinTP; + //pz = pzin; + //jacobian + + errorProp(n,0,0) = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; + errorProp(n,0,1) = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; + errorProp(n,0,2) = 0.; + errorProp(n,0,3) = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; + errorProp(n,0,4) = k*(pxin*cosTP*dTPdpy - 1.f + cosTP - pyin*sinTP*dTPdpy);//dxdpy; + errorProp(n,0,5) = 0.; + + errorProp(n,1,0) = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; + errorProp(n,1,1) = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; + errorProp(n,1,2) = 0.; + errorProp(n,1,3) = k*(pyin*cosTP*dTPdpx + 1.f - cosTP + pxin*sinTP*dTPdpx);//dydpx; + errorProp(n,1,4) = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; + errorProp(n,1,5) = 0.; + + errorProp(n,2,0) = k*pzin*dTPdx; //dzdx; + errorProp(n,2,1) = k*pzin*dTPdy; //dzdy; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*pzin*dTPdpx;//dzdpx; + errorProp(n,2,4) = k*pzin*dTPdpy;//dzdpy; + errorProp(n,2,5) = k*TP; //dzdpz; + + errorProp(n,3,0) = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; + errorProp(n,3,1) = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; + errorProp(n,3,2) = 0.; + errorProp(n,3,3) = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; + errorProp(n,3,4) = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; + errorProp(n,3,5) = 0.; + + errorProp(n,4,0) = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; + errorProp(n,4,1) = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; + errorProp(n,4,2) = 0.; + errorProp(n,4,3) = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; + errorProp(n,4,4) = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; + errorProp(n,4,5) = 0.; + + errorProp(n,5,0) = 
0.; + errorProp(n,5,1) = 0.; + errorProp(n,5,2) = 0.; + errorProp(n,5,3) = 0.; + errorProp(n,5,4) = 0.; + errorProp(n,5,5) = 1.f; + } + } +} diff --git a/mkFit/accessors_cu.h b/mkFit/accessors_cu.h new file mode 100644 index 0000000000000..f2bb9d9b07c24 --- /dev/null +++ b/mkFit/accessors_cu.h @@ -0,0 +1,40 @@ +#ifndef ACCESSORS_CU_H +#define ACCESSORS_CU_H + +template <> +__device__ float* SVector3::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SVector6::ArrayCU() { + return fArray; +} + +template <> +__device__ float* ROOT::Math::MatRepSym::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SMatrixSym66::ArrayCU() { + return fRep.ArrayCU(); +} + +__device__ float *Hit::posArrayCU() { + return state_.pos_.ArrayCU(); +} + +__device__ float *Hit::errArrayCU() { + return state_.err_.ArrayCU(); +} + +__device__ float *Track::posArrayCU() { + return state_.parameters.ArrayCU(); +} + +__device__ float *Track::errArrayCU() { + return state_.errors.ArrayCU(); +} + +#endif /* ifndef ACCESSORS_CU_H */ diff --git a/mkFit/array_algorithms_cu.h b/mkFit/array_algorithms_cu.h new file mode 100644 index 0000000000000..439a0a10c6710 --- /dev/null +++ b/mkFit/array_algorithms_cu.h @@ -0,0 +1,59 @@ +#ifndef ARRAY_ALGORITHMS_CU_H +#define ARRAY_ALGORITHMS_CU_H + +#include +#include "atomic_utils.h" +#include "gpu_utils.h" + +template +__device__ void reduceMax_fn(const T *d_in, const int in_size, T *d_max) { + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + // Shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + + for (int i = threadIdx.x + blockIdx.x*blockDim.x; + i < in_size; + i += blockDim.x * gridDim.x) { + // Per-thread tile data + T data[ITEMS_PER_THREAD]; + int block_offset = i - threadIdx.x; + cub::LoadDirectStriped(threadIdx.x, + d_in + block_offset, //blockIdx.x*blockDim.x, + data); + // Compute sum for a single thread block + T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max()); + + // FIXME: Is reduction over block enough, or do we need device-wise reduction + // CPU code reduces on NN (typically 8 or 16) values. So block-wise should + // be good enough. + // Device-wise reductions with atomics are performance killers + //if (threadIdx.x == 0) { + //AtomicMax(d_max, aggregate); + //} + *d_max = aggregate; + } +} + +template +__global__ void reduceMax_kernel(T *d_in, int in_size, T *d_max) { + reduceMax_fn(d_in, in_size, d_max); +} + +template +void max_element_wrapper(T *d_v, int num_elems, T *d_max) { + dim3 block (BLOCK_THREADS, 1, 1); + dim3 grid (std::min(max_blocks_x, (num_elems-1)/BLOCK_THREADS + 1), 1, 1) ; + reduceMax_kernel + <<< grid, block >>> + (d_v, num_elems, d_max); +} +#endif /* ifndef ARRAY_ALGORITHMS_CU_H */ diff --git a/mkFit/atomic_utils.h b/mkFit/atomic_utils.h new file mode 100644 index 0000000000000..c5f82d99bc50d --- /dev/null +++ b/mkFit/atomic_utils.h @@ -0,0 +1,102 @@ +#ifndef ATOMIC_UTILS_H +#define ATOMIC_UTILS_H + +/* Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// AtomicMAX code from: +// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu + +#define uint64 unsigned long long + + template +__device__ void AtomicMax(T * const address, const T value) +{ + atomicMax(address, value); +} + +/** + * @brief Compute the maximum of 2 single-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(float * const address, const float value) +{ + if (* address >= value) + { + return; + } + + int * const address_as_i = (int *)address; + int old = * address_as_i, assumed; + + do + { + assumed = old; + if (__int_as_float(assumed) >= value) + { + break; + } + // The value stored at address_as_i might have changed since it has been loaded + // into old and into assumed. + old = atomicCAS(address_as_i, assumed, __float_as_int(value)); + } while (assumed != old); // the other threads did not change address_as_i, so the + // atomicCAS return action makes sense. 
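    // The loop above is the usual compare-and-swap retry pattern: atomicCAS only
    // installs __float_as_int(value) if the word still holds `assumed`; if some
    // other thread updated it first, the fresh value is re-read and the loop
    // either stops (the stored maximum already beats `value`) or retries. The
    // early return at the top merely skips the CAS when the stored value wins.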
+} + +/** + * @brief Compute the maximum of 2 double-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(double * const address, const double value) +{ + if (* address >= value) + { + return; + } + + uint64 * const address_as_i = (uint64 *)address; + uint64 old = * address_as_i, assumed; + + do + { + assumed = old; + if (__longlong_as_double(assumed) >= value) + { + break; + } + + old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); + } while (assumed != old); +} + +#endif /* ifndef ATOMIC_UTILS_H */ diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu new file mode 100644 index 0000000000000..931aba62d18a9 --- /dev/null +++ b/mkFit/best_hit_kernels.cu @@ -0,0 +1,305 @@ +#include "best_hit_kernels.h" + +#include "computeChi2_kernels.h" +#include "index_selection_kernels.h" +#include "reorganize_gplex.h" +#include "array_algorithms_cu.h" +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +#include +#include +#include +#include + +#define BLOCK_SIZE_X 256 + + +__device__ void getNewBestHitChi2_fn( + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const float *outChi2, float &minChi2, + int &bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + if (hit_cnt < XHitSize[itrack]) { + float chi2 = fabs(outChi2[itrack]); + if (chi2 < minChi2) { + minChi2 = chi2; + bestHit = XHitArr(itrack, hit_cnt, 0); + } + } + } +} + + +__global__ void getNewBestHitChi2_kernel( + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const float *outChi2, float *minChi2, + int *bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, minChi2[itrack], bestHit[itrack], hit_cnt, N); + } +} + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> + (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N); +} + + +void fill_array_cu(float *array, const int size, const float value) { + thrust::device_ptr d_ptr(array); + thrust::fill(d_ptr, d_ptr + size, value); +} + + +__device__ void updateTracksWithBestHit_fn(Hit *hits, + const float minChi2, const int bestHit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + if (bestHit >= 0) + { + Hit &hit = hits[ bestHit ]; + const float &chi2_local = minChi2; + + for (int i = 0; i < msErr.kSize; ++i) { + msErr(itrack, i, 0) = hit.errArrayCU()[i]; + } + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = hit.posArrayCU()[i]; + } + Chi2[itrack] += chi2_local; + HitsIdx[itrack] = bestHit; + } + else + { + /*msErr[Nhits].SetDiagonal3x3(itrack, 666);*/ + msErr(itrack, 0, 0) = 666; + msErr(itrack, 1, 0) = 0; + msErr(itrack, 2, 0) = 666; + msErr(itrack, 3, 0) = 0; + msErr(itrack, 4, 0) = 0; + msErr(itrack, 5, 0) = 666; + + for (int i = 0; 
i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = propPar(itrack, i, 0); + } + HitsIdx[itrack] = -1; + // Don't update chi2 + } + } +} + + +__global__ void updateTracksWithBestHit_kernel(Hit *hits, + const float *minChi2, const int *bestHit, + GPlexHS msErr, GPlexHV msPar, const GPlexLV propPar, + GPlexQF Chi2, GPlexQI HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + updateTracksWithBestHit_fn + (hits, minChi2[itrack], bestHit[itrack], + msErr, msPar, propPar, Chi2, HitsIdx, N); + } +} + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N); +} + + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N) { + thrust::device_ptr d_ptr(d_XHitSize.ptr); + int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); + maxSize = std::min(maxSize, Config::maxHitsConsidered); + + return maxSize; +} + + +__device__ void bestHit_fn( + Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int itrack, const int N) { + + /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ + int bestHit_reg = -1; + float minChi2_reg = Config::chi2Cut; + + if (itrack < N) + HitsIdx[itrack] = 0; + + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) + { + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); +#if 0 + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} +#endif + computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); + } + updateTracksWithBestHit_fn + (hits, + minChi2_reg, bestHit_reg, + msErr, msPar, propPar, + Chi2, HitsIdx, + N); +} + + +__global__ void bestHit_kernel( + Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, + GPlexQF Chi2, GPlexQI HitsIdx, + const int maxSize, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + bestHit_fn(hits, XHitSize, XHitArr, + propErr, msErr, msPar, + propPar, outChi2, + Chi2, HitsIdx, + maxSize, itrack, N); +} + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + bestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, XHitSize, XHitArr, + propErr, msErr, msPar, propPar, outChi2, + /*propErr.ptr, propErr.stride,*/ + /*msErr.ptr, msErr.stride, msErr.kSize,*/ + /*msPar.ptr, msPar.stride, msPar.kSize,*/ + 
/*outChi2.ptr, outChi2.stride,*/ + Chi2, HitsIdx, + maxSize, N); +} + + +template +__global__ void findBestHit_kernel(LayerOfHitsCU *layers, + EtaBinOfCandidatesCU *etabin_of_cands, + GPlexQI XHitSize, GPlexHitIdx XHitArr, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexLS Err_iC, GPlexLV Par_iC, + GPlexQF outChi2, + GPlexQF Chi2, GPlexQI *HitsIdx_arr, + GPlexQI inChg, GPlexQI Label, GeometryCU geom, + int *maxSize, int gplex_size, + const int nlayers_per_seed) { + for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) { + for (int beg = 0; beg < etabin_of_cands[ebin].m_fill_index; beg += gplex_size) { + int end = min(beg + gplex_size, etabin_of_cands[ebin].m_fill_index); + int N = end - beg; + + int tidx = threadIdx.x + blockDim.x*blockIdx.x; + int itrack = beg + tidx; + + if (itrack < end) { + + InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP, + inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); + + for (int ilay = nlayers_per_seed; ilay < Config::nLayers; ++ilay) + { + int hit_idx = ilay; + GPlexHS &msErr = msErr_arr[hit_idx]; + GPlexHV &msPar = msPar_arr[hit_idx]; + GPlexQI &HitsIdx = HitsIdx_arr[hit_idx]; + + float *radii = geom.radii; + + LayerOfHitsCU &layer = layers[ilay]; + + int maxSize_block; + selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, tidx, N); + // FIXME: Is reduction over block enough, or do we need device-wise reduction + reduceMax_fn + (XHitSize.ptr, XHitSize.N, &maxSize_block); + bestHit_fn(layer.m_hits, XHitSize, XHitArr, + Err_iP, msErr, msPar, Par_iP, outChi2, + Chi2, HitsIdx, maxSize_block, tidx, N); + kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, tidx, N); + if (ilay+1 < Config::nLayers) { + float radius = radii[ilay+1]; + propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, tidx, N); + } + } + OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates, + Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); + } + } + } +} + + +void findBestHit_wrapper(cudaStream_t &stream, + LayerOfHitsCU *layers, + EventOfCandidatesCU &event_of_cands_cu, + GPlexQI &XHitSize, GPlexHitIdx &XHitArr, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &Par_iC, + GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI *HitsIdx, + GPlexQI &inChg, GPlexQI &Label, + GeometryCU &geom, + int *maxSize, int N) { + /*int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,*/ + /*max_blocks_x);*/ + int gridx = (N-1)/BLOCK_SIZE_X + 1; + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + if (gridx > max_blocks_x) { + throw std::runtime_error("The matriplex size should be chosen such " + "that gplex_size*BLOCK_SIZE_X <= max_blocks_x"); + } + // The loop over tracks is taking care of the case where there is more tracks + // than available threads. + // We should actually not throw an exception, GPlex should be allocated with + // a smaller size in MkFitter. 
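  // Spelled out, the launch-size arithmetic guarded above is (a sketch, not
  // part of the patch):
  //   int gridx = (N - 1) / BLOCK_SIZE_X + 1;   // blocks needed to cover N tracks
  //   assert(gridx <= max_blocks_x);            // otherwise allocate a smaller GPlex
  // i.e. the constraint is N <= max_blocks_x * BLOCK_SIZE_X; the exception
  // message above states the inequality the other way around.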
+ + findBestHit_kernel <<< grid, block, 0, stream >>> + (layers, event_of_cands_cu.m_etabins_of_candidates, + XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, + Err_iC, Par_iC, outChi2, Chi2, HitsIdx, inChg, Label, geom, maxSize, N, + Config::nlayers_per_seed); +} diff --git a/mkFit/best_hit_kernels.h b/mkFit/best_hit_kernels.h new file mode 100644 index 0000000000000..8cf6a421820fe --- /dev/null +++ b/mkFit/best_hit_kernels.h @@ -0,0 +1,49 @@ +#ifndef BEST_HIT_KERNELS_H +#define BEST_HIT_KERNELS_H + + +#include "GPlex.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N); + + +void fill_array_cu(float *array, const int size, const float value); + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI& HitsIdx, const int N); + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N); + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI& HitsIdx, + const int maxSize, const int N); + + +void findBestHit_wrapper(cudaStream_t &stream, + LayerOfHitsCU *layers, + EventOfCandidatesCU &event_of_cands_cu, + GPlexQI &XHitSize, GPlexHitIdx &XHitArr, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &Par_iC, + GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI *HitsIdx, + GPlexQI &inChg, GPlexQI &Label, + GeometryCU &geom, + int *maxSize, int N); + + +#endif /* ifndef BEST_HIT_KERNELS_H */ diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index 22f066fd8c15f..2b83225218a14 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -7,6 +7,13 @@ #include "BinInfoUtils.h" #include "MkBuilder.h" + +#ifdef USE_CUDA +#include "FitterCU.h" +#include "BuilderCU.h" +#include "check_gpu_hit_structures.h" +#endif + #include "MkBuilderEndcap.h" #include @@ -99,9 +106,20 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_resume(); #endif +#if USE_CUDA + //check_event_of_hits_gpu(builder.get_event_of_hits()); + //check_event_of_cands_gpu(event_of_cands); + BuilderCU builder_cu(builder.get_event_of_hits(), builder.get_event(), + event_of_cands); +#endif + double time = dtime(); +#if USE_CUDA + builder_cu.FindTracksBestHit(event_of_cands); +#else builder.FindTracksBestHit(event_of_cands); +#endif time = dtime() - time; @@ -109,8 +127,11 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_pause(); #endif - if (!Config::normal_val) {builder.quality_output_besthit(event_of_cands);} - else {builder.root_val_besthit(event_of_cands);} + if (!Config::normal_val) { + builder.quality_output_besthit(event_of_cands); + } else { + builder.root_val_besthit(event_of_cands); + } builder.end_event(); @@ -248,3 +269,69 @@ double runBuildingTestPlexTbb(Event& ev, EventTmp& ev_tmp) return time; } + + +//============================================================================== +// runAllBuildTestPlexBestHitGPU +//============================================================================== + +#if USE_CUDA +double runAllBuildingTestPlexBestHitGPU(std::vector &events) +{ + + int num_builders 
= events.size(); + std::vector> builder_ptrs(num_builders); + std::vector event_of_cands_vec(num_builders); + std::vector builder_cu_vec(num_builders); + + for (int i = 0; i < builder_ptrs.size(); ++i) { + Event &ev = events[i]; + builder_ptrs[i] = std::unique_ptr (make_builder()); + + MkBuilder &builder = * builder_ptrs[i].get(); + + builder.begin_event(&ev, 0, __func__); + + if (Config::findSeeds) {builder.find_seeds();} + else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit + + builder.fit_seeds_tbb(); + + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + builder.find_tracks_load_seeds(event_of_cands); + + BuilderCU &builder_cu = builder_cu_vec[i]; + builder_cu.setUp(builder.get_event_of_hits(), builder.get_event(), + event_of_cands); + } + + //omp_set_num_threads(Config::numThreadsEvents); + //std::cerr << "num threads "<< omp_get_num_threads() << std::endl; +//#pragma omp parallel for reduction(+:total_time) + //for (int i = 0; i < builder_ptrs.size(); ++i) { + double time = dtime(); + tbb::parallel_for(size_t(0), builder_ptrs.size(), [&](size_t i) { + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + BuilderCU &builder_cu = builder_cu_vec[i]; + MkBuilder &builder = * builder_ptrs[i].get(); + + builder_cu.FindTracksBestHit(event_of_cands); + }); + time = dtime() - time; + + for (int i = 0; i < builder_ptrs.size(); ++i) { + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + BuilderCU &builder_cu = builder_cu_vec[i]; + MkBuilder &builder = * builder_ptrs[i].get(); + if (!Config::normal_val) { + builder.quality_output_besthit(event_of_cands); + } else { + builder.root_val_besthit(event_of_cands); + } + + builder.end_event(); + } + + return time; +} +#endif diff --git a/mkFit/buildtestMPlex.h b/mkFit/buildtestMPlex.h index 25b636014e108..bb082ad4a07c6 100644 --- a/mkFit/buildtestMPlex.h +++ b/mkFit/buildtestMPlex.h @@ -13,4 +13,8 @@ double runBuildingTestPlexCloneEngine(Event& ev, EventTmp& evtmp); double runBuildingTestPlexTbb(Event& ev, EventTmp& evtmp); +#if USE_CUDA +double runAllBuildingTestPlexBestHitGPU(std::vector &events); +#endif + #endif diff --git a/mkFit/check_gpu_hit_structures.cu b/mkFit/check_gpu_hit_structures.cu new file mode 100644 index 0000000000000..5918fa740518d --- /dev/null +++ b/mkFit/check_gpu_hit_structures.cu @@ -0,0 +1,147 @@ +#include "check_gpu_hit_structures.h" + +/*#include "reorganize_gplex.cu"*/ +#include "Hit.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "reorganize_gplex.h" +#include "gpu_utils.h" + +#include + + +__global__ void get_hit_pos_and_err(LayerOfHitsCU *layers, + int ilay, int hit_idx, float *pos, float *err, int pos_size, int err_size) { + if (threadIdx.x + blockDim.x * blockIdx.x == 0) { + LayerOfHitsCU &layer = layers[ilay]; + Hit &hit = layer.m_hits[hit_idx]; + float *posArray = get_posArray(hit); + float *errArray = get_errArray(hit); + for (int i = 0; i < pos_size; ++i) { + pos[i] = posArray[i]; + } + for (int i = 0; i < err_size; ++i) { + err[i] = errArray[i]; + } + } +} + + +void compare_carrays(const float *h_a, const float *d_a, + const float prec, const int n) +{ + for (int i = 0; i < n; ++i) { + // should be relative comparison, verify if div by 0 will happen + if (std::abs(h_a[i] - d_a[i]) > prec) { + std::cerr << i << " : " << h_a[i] << " / " << d_a[i] << std::endl; + } + } +} + + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits) +{ + EventOfHitsCU event_of_hits_cu; + 
event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + constexpr int pos_size = 3; + constexpr int err_size = 6; + + float *d_pos, *d_err; + float pos[pos_size], err[err_size]; + + cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); + cudaMalloc((void**)&d_err, err_size*sizeof(float)); + + dim3 grid(1, 1, 1); + dim3 block(1, 1, 1); + + int ilay = 2; + int hit_idx = 3; + + get_hit_pos_and_err <<< grid, block >>> + (event_of_hits_cu.m_layers_of_hits, ilay, hit_idx, d_pos, d_err, pos_size, err_size); + + cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); + + //std::cerr << "pos ......................\n"; + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].posArray(), + pos, 1e-3, pos_size); + //std::cerr << "err ......................\n"; + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].errArray(), + err, 1e-3, err_size); + + cudaFree(d_pos); + cudaFree(d_err); + + event_of_hits_cu.deallocGPU(); +} + + +__global__ void get_cand_pos_and_err(EtaBinOfCandidatesCU *etabin_of_cands, + const int ebin, const int itrack, float *pos, float *err, + const int pos_size, const int err_size) +{ + if (threadIdx.x + blockDim.x * blockIdx.x == 0) { + Track &track = etabin_of_cands[ebin].m_candidates[itrack]; + float *posArray = get_posArray(track); + float *errArray = get_errArray(track); + + for (int i = 0; i < pos_size; ++i) { + pos[i] = posArray[i]; + } + for (int i = 0; i < err_size; ++i) { + err[i] = errArray[i]; + } + } +} + + +void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands) +{ + EventOfCandidatesCU event_of_cands_cu; + event_of_cands_cu.allocGPU(event_of_cands); + event_of_cands_cu.copyFromCPU(event_of_cands); + + constexpr int pos_size = 6; + constexpr int err_size = 21; + + float *d_pos, *d_err; + float pos[pos_size], err[err_size]; + + cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); + cudaMalloc((void**)&d_err, err_size*sizeof(float)); + + dim3 grid(1, 1, 1); + dim3 block(1, 1, 1); + + int etabin = std::min(2, Config::nEtaBin-1); + int itrack = 3; + + get_cand_pos_and_err <<< grid, block >>> + (event_of_cands_cu.m_etabins_of_candidates, + etabin, itrack, d_pos, d_err, pos_size, err_size); + cudaCheckErrorSync(); + + cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrorSync(); + cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrorSync(); + + /*std::cerr << "pos ......................\n";*/ + compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].posArray(), + pos, 1e-3, pos_size); + /*std::cerr << "err ......................\n";*/ + compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].errArray(), + err, 1e-3, err_size); + + cudaFree(d_pos); + cudaFree(d_err); + + //event_of_cands_cu.copyToCPU(event_of_cands); + event_of_cands_cu.deallocGPU(); + + cudaCheckErrorSync(); +} diff --git a/mkFit/check_gpu_hit_structures.h b/mkFit/check_gpu_hit_structures.h new file mode 100644 index 0000000000000..29e6cdfcce39b --- /dev/null +++ b/mkFit/check_gpu_hit_structures.h @@ -0,0 +1,9 @@ +#ifndef CHECK_GPU_HIT_STRUCTURE_H +#define CHECK_GPU_HIT_STRUCTURE_H + +#include "HitStructures.h" + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits); +void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands); + +#endif /* ifndef CHECK_GPU_HIT_STRUCTURE_H */ diff --git 
a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu new file mode 100644 index 0000000000000..dc8d95d870640 --- /dev/null +++ b/mkFit/computeChi2_kernels.cu @@ -0,0 +1,146 @@ +#include "computeChi2_kernels.h" + +#include "GPlex.h" +#include "kalmanUpdater_kernels.h" +#include "gpu_utils.h" + +#include + +#define L 6 +#define HS 6 +#define HV 3 +#define BLOCK_SIZE_X 256 + + +__device__ void chi2Similarity_fn( + const GPlexReg2V &a, + const GPlexReg2S &c, // in registers + float *d, const size_t dN, + const int n) { + + //int n = threadIdx.x + blockIdx.x * blockDim.x; + + // manually subrtact into local vars -- 3 of them + /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/ + /*float x1 = a[1 * aN + n] - b[1 * aN + n];*/ + /*float x2 = a[2 * aN + n] - b[2 * aN + n];*/ + /*d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 +*/ + /*2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2);*/ + d[0 * dN + n] = c[0]*a[0]*a[0] + + c[2]*a[1]*a[1] + + 2*( c[1]*a[1]*a[0]); +} + + +__device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 + const float r01,//r01 + const GPlexRegHV &a ,//res_glo + GPlexReg2V &b )//res_loc +{ + + // res_loc = rotT * res_glo + // B = R * A + b[0] = r00*a[0] + r01*a[1]; + b[1] = a[2]; +} + + +__device__ void ProjectResErr_fn(const float a00, + const float a01, + const GPlexRegHS &b, + GPlexRegHH &c) +{ + // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym + + // Based on script generation and adapted to custom sizes. + c[ 0] = a00*b[ 0] + a01*b[ 1]; + c[ 1] = a00*b[ 1] + a01*b[ 2]; + c[ 2] = a00*b[ 3] + a01*b[ 4]; + c[ 3] = b[ 3]; + c[ 4] = b[ 4]; + c[ 5] = b[ 5]; + c[ 6] = a01*b[ 0] - a00*b[ 1]; + c[ 7] = a01*b[ 1] - a00*b[ 2]; + c[ 8] = a01*b[ 3] - a00*b[ 4]; +} + + +__device__ void ProjectResErrTransp_fn(const float a00, + const float a01, const GPlexRegHH &b, GPlexReg2S &c) +{ + // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3 + + // Based on script generation and adapted to custom sizes. 
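  // Net effect of ProjectResErr_fn followed by this transpose step: the 2x2
  // local covariance resErr_loc = rotT * resErr_glo * rotT^T, with the 2x3
  // projection rotT = [[a00, a01, 0], [0, 0, 1]] (the caller passes rotT00,
  // rotT01). Only the (0,0), (0,1) and (1,1) elements are kept in the
  // symmetric GPlexReg2S.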
+ c[ 0] = b[ 0]*a00 + b[ 1]*a01; + c[ 1] = b[ 3]*a00 + b[ 4]*a01; + c[ 2] = b[ 5]; +} + + +__device__ void computeChi2_fn( + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int n, const int N) { + //int n = threadIdx.x + blockIdx.x * blockDim.x; + /*float resErr_reg[HS]; // ~ resErr_glo*/ + GPlexRegHS resErr_reg; + + if (n < N) { + // coordinate change + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); + + /*float res_glo[HV];*/ + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, propPar, res_glo, N, n); + + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ + /*MPlexHH tempHH;*/ + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); + + chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride, n); + } +} + + +__global__ void computeChi2_kernel( + const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, const int N) { + int grid_width = blockDim.x * gridDim.x; + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + for (int z = 0; z < (N-1)/grid_width +1; z++) { + itrack += z*grid_width; + + if (itrack < N) { + computeChi2_fn (propErr, msErr, msPar, propPar, outChi2, itrack, N); + } + } +} + + +void computeChi2_wrapper(cudaStream_t &stream, + const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + computeChi2_kernel <<< grid, block, 0, stream >>> + (propErr, msErr, msPar, propPar, outChi2, N); + } diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h new file mode 100644 index 0000000000000..72f0e1f06976e --- /dev/null +++ b/mkFit/computeChi2_kernels.h @@ -0,0 +1,33 @@ +#ifndef _COMPUTE_CHI2_KERNELS_H_ +#define _COMPUTE_CHI2_KERNELS_H_ + +#include "HitStructuresCU.h" +#include "GPlex.h" +#include "GeometryCU.h" + + +__device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, + const int itrack, const int N); + + +void computeChi2_wrapper(cudaStream_t &stream, + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int N); + +#if 1 +void HitToMs_wrapper(cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, GPlexQI &HitsIdx, int hit_cnt, int N); +#endif + +__device__ void RotateResidulsOnTangentPlane_fn(const float r00, + const float r01, const GPlexRegHV &a, GPlexReg2V &b); + +__device__ void ProjectResErr_fn(const float a00, const float a01, + const GPlexRegHS &b, GPlexRegHH &c); + +__device__ void ProjectResErrTransp_fn(const float a00, const float a01, + const GPlexRegHH &b, GPlexReg2S &c); + +#endif 
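Taken together, these kernels implement the standard local-coordinate chi2: rotate the global residual onto the tangent plane, project the summed covariance the same way, invert the resulting 2x2 symmetric matrix with Cramer's rule, and form r^T * C^-1 * r. A minimal host-side sketch of that final step, for illustration only (the GPU code fuses it in invertCramerSym2x2_fn and chi2Similarity_fn; the helper name here is hypothetical):

// Chi2 of a 2-component residual r against a packed 2x2 symmetric covariance
// C = {C00, C01, C11}, inverted in place with Cramer's rule as the kernel does.
inline float chi2_local_2x2(const float r[2], const float C[3])
{
  const float s   = 1.f / (C[0] * C[2] - C[1] * C[1]);  // 1 / det(C)
  const float i00 =  s * C[2];
  const float i01 = -s * C[1];
  const float i11 =  s * C[0];
  return i00 * r[0] * r[0] + 2.f * i01 * r[0] * r[1] + i11 * r[1] * r[1];
}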
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 9293835b3d909..b5082932d3d7d 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -10,6 +10,7 @@ #include "MkFitter.h" #if USE_CUDA +#include "fittestMPlex.h" #include "FitterCU.h" #endif @@ -24,6 +25,7 @@ #include #include +#include #if defined(USE_VTUNE_PAUSE) #include "ittnotify.h" @@ -182,58 +184,107 @@ double runFittingTestPlex(Event& ev, std::vector& rectracks) } #ifdef USE_CUDA -double runFittingTestPlexGPU(FitterCU &cuFitter, - Event& ev, std::vector& rectracks) +void runAllEventsFittingTestPlexGPU(std::vector& events) { + double s_tmp = 0.0; +#if 0 + In principle, the warmup loop should not be required. + The separate_first_call_for_meaningful_profiling_numbers() function + should be enough. + // Warmup loop + for (int i = 0; i < 1; ++i) { + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + Event &ev = events[0]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + + if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + cuFitter.freeDevice(); + } +#endif + separate_first_call_for_meaningful_profiling_numbers(); + + // Reorgnanization (copyIn) can eventually be multithreaded. + omp_set_nested(1); + + omp_set_num_threads(Config::numThreadsEvents); + double total_gpu_time = dtime(); +#pragma omp parallel reduction(+:s_tmp) + { + int numThreadsEvents = omp_get_num_threads(); + int thr_idx = omp_get_thread_num(); + + // FitterCU is declared here to share allocations and deallocations + // between the multiple events processed by a single thread. + int gplex_size = 10000; + FitterCU cuFitter(gplex_size); + cuFitter.allocateDevice(); + cuFitter.allocate_extra_addBestHit(); + + for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { + int idx = thr_idx; + printf("==============================================================\n"); + printf("Processing event %d with thread %d\n", evt, idx); + Event &ev = events[evt-1]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; + + //if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + + printf("Matriplex fit = %.5f -------------------------------------", tmp); + printf("\n"); + s_tmp += tmp; +#if 0 // 0 for timing, 1 for validation + // Validation crashes for multiple threads. + // It is something in relation to ROOT. Not sure what. + if (omp_get_num_threads() <= 1) { + //if (g_run_fit_std) { + std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; + make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); + //} + } +#endif + } + cuFitter.free_extra_addBestHit(); + cuFitter.freeDevice(); + } + std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n"; +} - std::vector& simtracks = ev.simTracks_; - - const int Nhits = MAX_HITS; - // XXX What if there's a missing / double layer? - // Eventually, should sort track vector by number of hits! - // And pass the number in on each "setup" call. - // Reserves should be made for maximum possible number (but this is just - // measurments errors, params). - // NOTE: MkFitter *MUST* be on heap, not on stack! - // Standard operator new screws up alignment of ALL MPlex memebrs of MkFitter, - // even if one adds attr(aligned(64)) thingy to every possible place. 
+double runFittingTestPlexGPU(FitterCU &cuFitter, + Event& ev, std::vector& rectracks) +{ + std::vector& simtracks = ev.simTracks_; - // MkFitter *mkfp = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits); + cuFitter.createStream(); - MkFitter* mkfp_arr = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits); + Track *tracks_cu; + cudaMalloc((void**)&tracks_cu, simtracks.size()*sizeof(Track)); + cudaMemcpyAsync(tracks_cu, &simtracks[0], simtracks.size()*sizeof(Track), + cudaMemcpyHostToDevice, cuFitter.get_stream()); - int theEnd = simtracks.size(); - double time = dtime(); - int Nstride = NN; + EventOfHitsCU events_of_hits_cu; + events_of_hits_cu.allocGPU(ev.layerHits_); + events_of_hits_cu.copyFromCPU(ev.layerHits_, cuFitter.get_stream()); - for (int itrack = 0; itrack < theEnd; itrack += Nstride) - { - int end = std::min(itrack + Nstride, theEnd); + double time = dtime(); - MkFitter *mkfp = mkfp_arr; + cuFitter.FitTracks(tracks_cu, simtracks.size(), events_of_hits_cu, Config::nLayers); - //double time_input = dtime(); - mkfp->InputTracksAndHits(simtracks, ev.layerHits_, itrack, end); - //std::cerr << "Input time: " << (dtime() - time_input)*1e3 << std::endl; + cudaMemcpy(&rectracks[0], tracks_cu, simtracks.size()*sizeof(Track), cudaMemcpyDeviceToHost); - cuFitter.FitTracks(mkfp->Chg, - mkfp->GetPar0(), - mkfp->GetErr0(), - mkfp->msPar, - mkfp->msErr, - Nhits, - simtracks, itrack, end, ev.layerHits_); + time = dtime() - time; - double time_output = dtime(); - mkfp->OutputFittedTracks(rectracks, itrack, end); - std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl; - } - time = dtime() - time; + events_of_hits_cu.deallocGPU(); + cudaFree(tracks_cu); - _mm_free(mkfp_arr); + cuFitter.destroyStream(); - return time; + return time; } #endif diff --git a/mkFit/fittestMPlex.h b/mkFit/fittestMPlex.h index 5fba69308fb23..2b88d56a17680 100644 --- a/mkFit/fittestMPlex.h +++ b/mkFit/fittestMPlex.h @@ -15,6 +15,7 @@ void make_validation_tree(const char *fname, double runFittingTestPlex(Event& ev, std::vector& rectracks); #ifdef USE_CUDA +void runAllEventsFittingTestPlexGPU(std::vector& events); double runFittingTestPlexGPU(FitterCU &cuFitter, Event& ev, std::vector& rectracks); #endif diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu new file mode 100644 index 0000000000000..8833d3c3809c4 --- /dev/null +++ b/mkFit/fittracks_kernels.cu @@ -0,0 +1,48 @@ +#include "fittracks_kernels.h" + +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +constexpr int BLOCK_SIZE_X = 256; + +__global__ void fittracks_kernel( + GPlexLV par_iP, GPlexLS Err_iP, + GPlexHV *msPar_arr, GPlexHS *msErr_arr, + GPlexLV par_iC, GPlexLS Err_iC, + GPlexLL errorProp, GPlexQI inChg, + const int Nhits, int N) +{ + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + for (int hi = 0; hi < Nhits; ++hi) { + GPlexHV &msPar = msPar_arr[hi]; + GPlexHS &msErr = msErr_arr[hi]; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + + propagation_fn(Err_iC, par_iC, inChg, msPar, Err_iP, par_iP, n, N); + kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N); + } + } +} + +void fittracks_wrapper(cudaStream_t &stream, + GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int Nhits, const int N) +{ + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 
block(BLOCK_SIZE_X, 1, 1); + + fittracks_kernel <<< grid, block, 0, stream >>> + (par_iP, Err_iP, + msPar_arr, msErr_arr, + par_iC, Err_iC, + errorProp, inChg, + Nhits, N); +} diff --git a/mkFit/fittracks_kernels.h b/mkFit/fittracks_kernels.h new file mode 100644 index 0000000000000..2486b80a51e3b --- /dev/null +++ b/mkFit/fittracks_kernels.h @@ -0,0 +1,13 @@ +#ifndef FITTRACKS_KERNELS_H_G3FDJYTX +#define FITTRACKS_KERNELS_H_G3FDJYTX + +#include "GPlex.h" + +void fittracks_wrapper(cudaStream_t &stream, + GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int hit_idx, const int N); + +#endif /* end of include guard: FITTRACKS_KERNELS_H_G3FDJYTX */ diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu new file mode 100644 index 0000000000000..2800aa39bd4b8 --- /dev/null +++ b/mkFit/gpu_utils.cu @@ -0,0 +1,9 @@ +#include "gpu_utils.h" + +void sync_gpu() { + cudaCheckErrorSync(); +} + +void separate_first_call_for_meaningful_profiling_numbers() { + sync_gpu(); +} diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h new file mode 100644 index 0000000000000..3995866b9370a --- /dev/null +++ b/mkFit/gpu_utils.h @@ -0,0 +1,29 @@ +#ifndef GPU_UTILS_H +#define GPU_UTILS_H + +#include + +#include + +#define cudaCheckError() \ + do { \ + cudaError_t e=cudaGetLastError(); \ + CubDebugExit(e) \ + } while(0) + +#define cudaCheckErrorSync() \ + do { \ + cudaDeviceSynchronize(); \ + cudaCheckError(); \ + } while(0) + +// CUDA specific: +// Maximum number of blocks in the X direction of the thread grid. +constexpr int max_blocks_x = INT32_MAX; + +// The first call to a CUDA API function takes the initialization hit. +void separate_first_call_for_meaningful_profiling_numbers(); + +void sync_gpu(); + +#endif /* ifndef GPU_UTILS_H */ diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu new file mode 100644 index 0000000000000..00fa9fb987dce --- /dev/null +++ b/mkFit/index_selection_kernels.cu @@ -0,0 +1,176 @@ +#include "index_selection_kernels.h" +#include "Config.h" +#include "HitStructures.h" +#include "gpu_utils.h" + +#include "stdio.h" + +#define BLOCK_SIZE_X 32 + +constexpr bool tmp_useCMSGeom = false; + +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + bool dump = false; + const float nSigmaPhi = 3; + const float nSigmaZ = 3; + + int xhitsize_tmp = XHitSize[itrack]; + XHitSize[itrack] = 0; + + float z, phi, dz, dphi; + { + const float x = Par(itrack, 0, 0); + const float y = Par(itrack, 1, 0); + + const float r2 = x*x + y*y; + + z = Par(itrack, 2, 0); + phi = getPhi(x, y); + /*dz = nSigmaZ * sqrtf(Err(itrack, 2, 2));*/ + dz = nSigmaZ * sqrtf(Err[5*Err.stride + itrack]); + + const float dphidx = -y/r2, dphidy = x/r2; + /*const float dphi2 = dphidx * dphidx * Err(itrack, 0, 0) +*/ + /*dphidy * dphidy * Err(itrack, 1, 1) +*/ + /*2 * dphidx * dphidy * Err(itrack, 0, 0);*/ + const float dphi2 = dphidx * dphidx * Err[0*Err.stride + itrack] + + dphidy * dphidy * Err[2*Err.stride + itrack] + + 2 * dphidx * dphidy * Err[1*Err.stride + itrack]; + +#ifdef HARD_CHECK + assert(dphi2 >= 0); +#endif + + dphi = nSigmaPhi * sqrtf(fabs(dphi2)); + + if (tmp_useCMSGeom) + { + //now correct for bending and for layer thickness unsing linear approximation + const float deltaR = 
Config::cmsDeltaRad; //fixme! using constant value, to be taken from layer properties + const float r = sqrt(r2); +#ifdef CCSCOORD + //here alpha is the difference between posPhi and momPhi + const float alpha = phi - Par(itrack, 4, 0); + float cosA, sinA; + if (Config::useTrigApprox) { + sincos4(alpha, sinA, cosA); + } else { + cosA = cos(alpha); + sinA = sin(alpha); + } +#else + const float px = Par(itrack, 3, 0); + const float py = Par(itrack, 4, 0); + const float pt = ::sqrt(px*px + py*py); + //here alpha is the difference between posPhi and momPhi + //const float cosA = ( x*px + dy*py ) / (pt*r); + //const float sinA = ( y*px - dx*py ) / (pt*r); + // FIXME dx, dy do not exist: + // does not matter yet for gpu as cms geom is not implemented + const float cosA = ( x*px + y*py ) / (pt*r); + const float sinA = ( y*px - x*py ) / (pt*r); +#endif + //take abs so that we always inflate the window + const float dist = fabs(deltaR*sinA/cosA); + + dphi += dist / r; + } + } + + const LayerOfHitsCU &L = layer_of_hits; + + if (fabs(dz) > Config::m_max_dz) dz = Config::m_max_dz; + if (fabs(dphi) > Config::m_max_dphi) dphi = Config::m_max_dphi; + + const int zb1 = L.GetZBinChecked(z - dz); + const int zb2 = L.GetZBinChecked(z + dz) + 1; + const int pb1 = L.GetPhiBin(phi - dphi); + const int pb2 = L.GetPhiBin(phi + dphi) + 1; + // MT: The extra phi bins give us ~1.5% more good tracks at expense of 10% runtime. + // const int pb1 = L.GetPhiBin(phi - dphi) - 1; + // const int pb2 = L.GetPhiBin(phi + dphi) + 2; + + if (dump) + printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", + z, phi, dz, dphi, zb1, zb2, pb1, pb2); + + // MT: One could iterate in "spiral" order, to pick hits close to the center. + // http://stackoverflow.com/questions/398299/looping-in-a-spiral + // This would then work best with relatively small bin sizes. + // Or, set them up so I can always take 3x3 array around the intersection. + for (int zi = zb1; zi < zb2; ++zi) + { + for (int pi = pb1; pi < pb2; ++pi) + { + const int pb = pi & L.m_phi_mask; + + // MT: The following line is the biggest hog (4% total run time). + // This comes from cache misses, I presume. + // It might make sense to make first loop to extract bin indices + // and issue prefetches at the same time. + // Then enter vectorized loop to actually collect the hits in proper order. + +#if 1 + /*for (int hi = L.m_phi_bin_infos[zi][pb].first; hi < L.m_phi_bin_infos[zi][pb].second; ++hi)*/ + for (int hi = L.m_phi_bin_infos[zi*Config::m_nphi + pb].first; + hi < L.m_phi_bin_infos[zi*Config::m_nphi + pb].second; ++hi) + { + // MT: Access into m_hit_zs and m_hit_phis is 1% run-time each. + +#ifdef LOH_USE_PHI_Z_ARRAYS + float ddz = fabs(z - L.m_hit_zs[hi]); + float ddphi = fabs(phi - L.m_hit_phis[hi]); + if (ddphi > Config::PI) ddphi = Config::TwoPI - ddphi; + + if (dump) + printf(" SHI %3d %4d %4d %5d %6.3f %6.3f %6.4f %7.5f %s\n", + zi, pi, pb, hi, + L.m_hit_zs[hi], L.m_hit_phis[hi], ddz, ddphi, + (ddz < dz && ddphi < dphi) ? "PASS" : "FAIL"); + + // MT: Commenting this check out gives full efficiency ... + // and means our error estimations are wrong! + // Avi says we should have *minimal* search windows per layer. + // Also ... if bins are sufficiently small, we do not need the extra + // checks, see above. + // if (ddz < dz && ddphi < dphi && XHitSize[itrack] < MPlexHitIdxMax) +#endif + // MT: The following check also makes more sense with spiral traversal, + // we'd be taking in closest hits first. 
+ if (XHitSize[itrack] < GPlexHitIdxMax) + { + /*if (itrack == 0)*/ + /*if (XHitArr(itrack, XHitSize[itrack], 0) != hi) {*/ + /*printf("before %d, after %d\n", XHitArr(itrack, XHitSize[itrack], 0), hi);*/ + /*printf("--- %d, after %d\n", XHitSize[itrack], hi);*/ + /*}*/ + XHitArr(itrack, XHitSize[itrack]++, 0) = hi; + } + } +#endif // 0 + } + } + } +} + +__global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits, + const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, itrack, N); +} + +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + selectHitIndices_kernel <<< grid, block, 0, stream >>> + (layer_of_hits, Err, Par, XHitSize, XHitArr, N); +} diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h new file mode 100644 index 0000000000000..801b5a4b068f9 --- /dev/null +++ b/mkFit/index_selection_kernels.h @@ -0,0 +1,15 @@ +#ifndef _INDEX_SELECTION_KERNELS_H_ +#define _INDEX_SELECTION_KERNELS_H_ + +#include "HitStructuresCU.h" +#include "GPlex.h" + +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N); + +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int itrack, const int N); + +#endif // _INDEX_SELECTION_KERNELS_H_ diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index cfd3ea7587608..0c5a44d2db37f 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -1,20 +1,289 @@ #include "Config.h" +#include "Hit.h" #include "kalmanUpdater_kernels.h" +#include "computeChi2_kernels.h" +#include "gpu_utils.h" // TODO: Clean all the hard-coded #define #define LS 21 #define HS 6 #define LH 18 +#define HV 3 #define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint + +/*__device__ float getPhi_fn2(float x, float y)*/ +/*{*/ + /*return atan2(y,x); */ +/*}*/ + +/*__device__ float getTheta_fn(float r, float z){*/ + /*return atan2(r,z);*/ +/*}*/ + +__device__ void subtract_matrix(const float *a, const int aN, + const float *b, const int bN, + float *c, const int cN, + const int size, const int n) { + for (int i = 0; i < size; ++i) { + c[i*cN + n] = a[i*aN + n] - b[i*bN + n]; + + } +} + +__device__ float getHypot_fn(const float x, const float y) +{ + return sqrt(x*x + y*y); +} + +__device__ +void KalmanHTG_fn(const float a00, const float a01, + const GPlexReg2S &b, GPlexRegHH &c) +{ + + // HTG = rot * res_loc + // C = A * B + + // Based on script generation and adapted to custom sizes. 
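  // In matrix form: C = rotT^T * B, with rotT^T = [[a00, 0], [a01, 0], [0, 1]]
  // (3x2) and B the inverted 2x2 local covariance; the product is stored as a
  // 3x3 GPlexRegHH whose third column is explicitly zeroed.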
+ c[ 0] = a00*b[ 0]; + c[ 1] = a00*b[ 1]; + c[ 2] = 0.; + c[ 3] = a01*b[ 0]; + c[ 4] = a01*b[ 1]; + c[ 5] = 0.; + c[ 6] = b[ 1]; + c[ 7] = b[ 2]; + c[ 8] = 0.; +} + +__device__ +void KalmanGain_fn(const GPlexLS &A, const GPlexRegHH &b, GPlexRegLH &c, const int n) +{ + // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3 + using T = float; + float *a = A.ptr; + int aN = A.stride; int an = n; // Global array + int bN = 1; int bn = 0; // Register array + int cN = 1; int cn = 0; + +#include "KalmanGain.ah" +} + +__device__ +void KHMult_fn(const GPlexRegLH &a, + const float b00, + const float b01, + GPlexRegLL &c) +{ + c[ 0] = a[ 0]*b00; + c[ 1] = a[ 0]*b01; + c[ 2] = a[ 1]; + c[ 3] = 0; + c[ 4] = 0; + c[ 5] = 0; + c[ 6] = a[ 3]*b00; + c[ 7] = a[ 3]*b01; + c[ 8] = a[ 4]; + c[ 9] = 0; + c[10] = 0; + c[11] = 0; + c[12] = a[ 6]*b00; + c[13] = a[ 6]*b01; + c[14] = a[ 7]; + c[15] = 0; + c[16] = 0; + c[17] = 0; + c[18] = a[ 9]*b00; + c[19] = a[ 9]*b01; + c[20] = a[10]; + c[21] = 0; + c[22] = 0; + c[23] = 0; + c[24] = a[12]*b00; + c[25] = a[12]*b01; + c[26] = a[13]; + c[27] = 0; + c[28] = 0; + c[29] = 0; + c[30] = a[15]*b00; + c[31] = a[15]*b01; + c[32] = a[16]; + c[33] = 0; + c[34] = 0; + c[35] = 0; +} + +__device__ +void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + using T = float; + int aN = 1; int an = 0; // Register array + T *b = B.ptr; int bN = B.stride; int bn = n; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "KHC.ah" +} + +// +__device__ +void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, const int n) +{ + int aN = a.stride; + typedef float T; + const float pt = getHypot_fn(a[ 3*aN+n], a[ 4*aN+n]); + const float p2 = pt*pt + a[ 5*aN+n]*a[ 5*aN+n]; + // + b[ 0] = a[ 0*aN+n]; + b[ 1] = a[ 1*aN+n]; + b[ 2] = a[ 2*aN+n]; + b[ 3] = 1.0f/pt; + b[ 4] = getPhi(a[ 3*aN+n], a[ 4*aN+n]); //fixme: use trig approx + b[ 5] = getTheta(pt, a[ 5*aN+n]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -a[ 3*aN+n]/(pt*pt*pt); + c[22] = -a[ 4*aN+n]/(pt*pt*pt); + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -a[ 4*aN+n]/(pt*pt); + c[28] = a[ 3*aN+n]/(pt*pt); + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = a[ 3*aN+n]*a[ 5*aN+n]/(pt*p2); + c[34] = a[ 4*aN+n]*a[ 5*aN+n]/(pt*p2); + c[35] = -pt/p2; +} + +__device__ +void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + + // Generated code access arrays with variables cN, cn + // c[i*cN+cn] + int aN = 1; int an = 0; // Register array + int bn = n; // Global array + int cN = 1; int cn = 0; +#include "CCSErr.ah" +} + +__device__ +void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "CCSErrTransp.ah" +} + +__device__ +void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, const int n) +{ + const float cosP = std::cos(a[ 4]); //fixme: use trig approx + const float sinP = std::sin(a[ 4]); + const float cosT = std::cos(a[ 5]); + const float sinT = 
std::sin(a[ 5]); + // + b[ 0*bN+n] = a[ 0]; + b[ 1*bN+n] = a[ 1]; + b[ 2*bN+n] = a[ 2]; + b[ 3*bN+n] = cosP/a[ 3]; + b[ 4*bN+n] = sinP/a[ 3]; + b[ 5*bN+n] = cosT/(sinT*a[ 3]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -cosP/(a[ 3]*a[ 3]); + c[22] = -sinP/a[ 3]; + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -sinP/(a[ 3]*a[ 3]); + c[28] = cosP/a[ 3]; + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = -cosT/(sinT*a[ 3]*a[ 3]); + c[34] = 0.; + c[35] = -1.0f/(sinT*sinT*a[ 3]); +} + +__device__ +void CartesianErr_fn(const GPlexRegLL &a, const float *b, const int bN, GPlexRegLL &c, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + int aN = 1; int an = 0; + int bn = n; + int cN = 1; int cn = 0; + +#include "CartesianErr.ah" +} + +__device__ +void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; + +#include "CartesianErrTransp.ah" +} /// MultKalmanGain //////////////////////////////////////////////////////////// __device__ void upParam_MultKalmanGain_fn( - const float* __restrict__ a, size_t aN, - float* b_reg, float *c, int N, int n) { + const float* __restrict__ a, const size_t aN, + const float* b_reg, float *c, const int N, const int n) { // const T* __restrict__ tells the compiler that it can uses the read-only // cache, without worrying about coherency. 
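+ // (On recent hardware this typically maps to __ldg-style loads through the
+ // read-only/texture data cache; it is a hint to the compiler, not a guarantee.)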
// c -> kalmanGain, in register @@ -101,10 +370,39 @@ __device__ void invertCramerSym_fn(float *a) { a[5] = s*c22; } +__device__ void invertCramerSym2x2_fn(GPlexReg2S &a) { + float det = a[0] * a[2] - a[1] * a[1]; + const float s = float(1) / det; + const float tmp = s * a[2]; + a[1] *= -s; + a[2] = s * a[0]; + a[0] = tmp; +} + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; + /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + + if(n < N) { + c[0] = a[0*aN+n] - b[0*bN+n]; + c[1] = a[1*aN+n] - b[1*bN+n]; + c[2] = a[2*aN+n] - b[2*bN+n]; + } +} + /// AddIntoUpperLeft3x3 ////////////////////////////////////////////////////// -__device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, - const float* __restrict__ b, size_t bN, - float *c, const int N, int n) { +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &C, const int N, const int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if(n < N) { @@ -120,24 +418,25 @@ __device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, /// MultResidualsAdd ////////////////////////////////////////////////////////// __device__ void multResidualsAdd_fn( - float* reg_a, - const float* __restrict__ b, size_t bN, - const float* __restrict__ c, size_t cN, - float *d, size_t dN, int N, int n) { + const GPlexRegLH ®_a, + const GPlexLV __restrict__ &B, + const GPlexReg2V &c, + GPlexLV &D, + const int N, const int n) { // a -> kalmanGain - /*int i = threadIdx.x;*/ - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + using T = float; + const T *b = B.ptr; int bN = B.stride; + T *d = D.ptr; int dN = D.stride; - float x0, x1, x2; - - /*for (int z = 0; z < (N-1)/gridDim.x +1; z++) {*/ - /*n += z*gridDim.x;*/ if (n < N) { + // TODO: This has changed with the introduction of polar coordiantes + // (x0, x1, x2) are not used anymore (see commented values) + // Clean this function's code, once it is better understood. // manually substract into local vars -- 3 of them - x0 = c[0 * cN + n] - b[0 * bN + n]; - x1 = c[1 * cN + n] - b[1 * bN + n]; - x2 = c[2 * cN + n] - b[2 * bN + n]; + const float x0 = c[0]; // - b[0 * bN + n]; + const float x1 = c[1]; //- b[1 * bN + n]; + const float x2 = 0; //c[2] - b[2 * bN + n]; // generate loop (can also write it manually this time, it's not much) // WARNING: highly numerically sensitive expressions. @@ -157,20 +456,40 @@ __device__ void multResidualsAdd_fn( /*}*/ } +__device__ +void MultResidualsAdd_all_reg(const GPlexRegLH &a, + const GPlexRegLV &b, + const GPlexReg2V &c, + GPlexRegLV &d) +{ + // outPar = psPar + kalmanGain*(dPar) + // D = B A C + // where right half of kalman gain is 0 + + // XXX Regenerate with a script. 
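+ // Shapes, as used below: b and d are 6-parameter vectors, c is the
+ // 2-component local residual, and a is the 6x3 Kalman gain laid out
+ // row-major with stride 3; only its first two columns enter the sums,
+ // consistent with the zero right part of the gain noted above.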
+ // generate loop (can also write it manually this time, it's not much) + d[0] = b[0] + a[ 0] * c[0] + a[ 1] * c[1]; + d[1] = b[1] + a[ 3] * c[0] + a[ 4] * c[1]; + d[2] = b[2] + a[ 6] * c[0] + a[ 7] * c[1]; + d[3] = b[3] + a[ 9] * c[0] + a[10] * c[1]; + d[4] = b[4] + a[12] * c[0] + a[13] * c[1]; + d[5] = b[5] + a[15] * c[0] + a[16] * c[1]; +} + /// KalmanGain_x_propErr ////////////////////////////////////////////////////// __device__ void kalmanGain_x_propErr_fn( - float* d_kalmanGain, - const float* __restrict__ d_propErr, size_t stride_propErr, - float *d_outErr, size_t stride_outErr, const int N, int n) { + const float* d_kalmanGain, + const float* __restrict__ d_propErr, const size_t stride_propErr, + float *d_outErr, const size_t stride_outErr, const int N, const int n) { // a = d_kalmanGain, b = d_propErr, c = outErrTemp // c = b - a*b - float *a = d_kalmanGain; + const float *a = d_kalmanGain; const float *b = d_propErr; float *c = d_outErr; /*size_t aN = stride_kalmanGain;*/ - size_t bN = stride_propErr; - size_t cN = stride_outErr; + const size_t bN = stride_propErr; + const size_t cN = stride_outErr; register float reg_c[LS]; @@ -210,68 +529,135 @@ __device__ void kalmanGain_x_propErr_fn( } } -__global__ void kalmanUpdate_kernel( - const float* __restrict__ propErr, size_t propErr_stride, - const float* __restrict__ msErr, size_t msErr_stride, - const float* __restrict__ par_iP, size_t par_iP_stride, - const float* __restrict__ msPar, size_t msPar_stride, - float *par_iC, size_t par_iC_stride, - float *outErr, size_t outErr_stride, - const int N) { - int grid_width = blockDim.x * gridDim.x; +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int n, const int N) { // Note: similar results with propErr kept in registers. // It is read-only so using the read-only cache yields more flexibility // wrt block size without increasing the pressure on registers to much. - int n = threadIdx.x + blockIdx.x * blockDim.x; // There is no need to keep resErr and kalmanGain as global memory arrays. - float resErr_reg[HS]; - float kalmanGain_reg[LH]; + /*float resErr_reg[HS];*/ + GPlexRegHS resErr_reg; + /*float kalmanGain_reg[LH];*/ + + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } + + // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex +#if 0 + if (Config::useCMSGeom) { + propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + } else { + propErr = psErr; + propPar = psPar; + } +#endif + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r); + + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, par_iP, res_glo, N, n); + + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); +#ifndef CCSCOORD + // Move to "polar" coordinates: (x,y,z,1/pT,phi,theta) [can we find a better name?] 
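+ // The "polar" (CCS) parameters produced by ConvertToPolar_fn are, per track,
+ //   (x, y, z, 1/pT, phi, theta) with pT = sqrt(px^2 + py^2),
+ //   phi = atan2(py, px) and theta = atan2(pT, pz);
+ // jac_pol below is the 6x6 Jacobian of that change of variables.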
+ + /*MPlexLV propPar_pol;// propagated parameters in "polar" coordinates*/ + /*MPlexLL jac_pol; // jacobian from cartesian to "polar"*/ + /*ConvertToPolar_fn(propPar,propPar_pol,jac_pol);*/ + /*float propPar_pol[6];*/ + GPlexRegLV propPar_pol; + GPlexRegLL jac_pol; + ConvertToPolar_fn(par_iP, propPar_pol, jac_pol, n); + + GPlexRegLL tempLL; + PolarErr_fn(jac_pol, propErr.ptr, propErr.stride, tempLL, n); + PolarErrTransp_fn(jac_pol, tempLL, propErr, n);// propErr is now propagated errors in "polar" coordinates +#endif + + // Kalman update in "polar" coordinates + GPlexRegLH K; + KalmanHTG_fn(rotT00, rotT01, resErr_loc, tempHH); + KalmanGain_fn(propErr, tempHH, K, n); + +#ifdef CCSCOORD + // FIXME: assuming no polcoord for now + //MultResidualsAdd(K.arr, propPar, res_loc, outPar);// propPar_pol is now the updated parameters in "polar" coordinates + multResidualsAdd_fn(K, par_iP, res_loc, par_iC, N, n);// propPar_pol is now the updated parameters in "polar" coordinates + GPlexRegLL tempLL; +#else + /*MultResidualsAdd(K, propPar_pol, res_loc, propPar_pol);// propPar_pol is now the updated parameters in "polar" coordinates*/ + MultResidualsAdd_all_reg(K, propPar_pol, res_loc, propPar_pol); +#endif + + KHMult_fn(K, rotT00, rotT01, tempLL); + KHC_fn(tempLL, propErr, outErr, n); + /*outErr.Subtract(propErr, outErr);// outErr is in "polar" coordinates now*/ + subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride, + /*propErr.ptr, propErr.stride, LS, n);*/ + outErr.ptr, outErr.stride, LS, n); + +#ifndef CCSCOORD + // Go back to cartesian coordinates + + // jac_pol is now the jacobian from "polar" to cartesian + // outPar -> par_iC + ConvertToCartesian_fn(propPar_pol, par_iC.ptr, par_iC.stride, jac_pol, n); + CartesianErr_fn (jac_pol, outErr.ptr, outErr.stride, tempLL, n); + CartesianErrTransp_fn(jac_pol, tempLL, outErr, n);// outErr is in cartesian coordinates now +#endif +#if 0 + upParam_MultKalmanGain_fn(propErr, propErr_stride, + resErr_reg, kalmanGain_reg, N, n); + multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride, + msPar, msPar_stride, par_iC, par_iC_stride, N, n); + + kalmanGain_x_propErr_fn(kalmanGain_reg, + propErr, propErr_stride, + outErr, outErr_stride, N, n); +#endif + } +} + +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N) { + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { - /*n += z*gridDim.x;*/ n += z*grid_width; - if (n < N) { - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } - addIntoUpperLeft3x3_fn(propErr, propErr_stride, - msErr, msErr_stride, resErr_reg, N, n); - invertCramerSym_fn(resErr_reg); - - upParam_MultKalmanGain_fn(propErr, propErr_stride, - resErr_reg, kalmanGain_reg, N, n); - multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride, - msPar, msPar_stride, par_iC, par_iC_stride, N, n); - - kalmanGain_x_propErr_fn(kalmanGain_reg, - propErr, propErr_stride, - outErr, outErr_stride, N, n); - } + kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, n, N); } } -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, 
const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); kalmanUpdate_kernel <<>> - (d_propErr.ptr, d_propErr.stride, - d_msErr.ptr, d_msErr.stride, - d_par_iP.ptr, d_par_iP.stride, - d_msPar.ptr, d_msPar.stride, - d_par_iC.ptr, d_par_iC.stride, - d_outErr.ptr, d_outErr.stride, - N); + (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N); } -// Should probably not be in this file, but creating a file for -// this oneliner seems overkill. -void separate_first_call_for_meaningful_profiling_numbers() { - cudaDeviceSynchronize(); -} diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h index 3c00760050582..112462eb4c8ba 100644 --- a/mkFit/kalmanUpdater_kernels.h +++ b/mkFit/kalmanUpdater_kernels.h @@ -3,14 +3,36 @@ #include "GPlex.h" -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N); -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits); +//void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, +// float *full_posArray, GPlexHS& msErr, +// float *full_errArray, int *full_hitIdx, int hi, int maxHits, +// int N, int hs, int hv, int Nhits); + +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N); + +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int itrack, const int N); + +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &c, const int N, const int n); + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, const int n); + +__device__ void invertCramerSym_fn(float *a); +__device__ void invertCramerSym2x2_fn(GPlexReg2S &a); #endif // _KALMAN_UPDATER_KERNELS_H_ diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index 5afe3fd38b530..e1c11e6e38f19 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -20,8 +20,13 @@ #ifdef USE_CUDA #include "FitterCU.h" +#include "gpu_utils.h" #endif +#include +//#define DEBUG +#include "Debug.h" + #include #include @@ -184,14 +189,18 @@ void test_standard() EventTmp ev_tmp; #if USE_CUDA + tbb::task_scheduler_init tbb_init(Config::numThreadsFinder); + //tbb::task_scheduler_init tbb_init(tbb::task_scheduler_init::automatic); + + //omp_set_num_threads(Config::numThreadsFinder); // fittest time. Sum of all events. In case of multiple events // being run simultaneously in different streams this time will // be larger than the elapsed time. - double s_tmp = 0.0; std::vector events; std::vector validations(Config::nEvents); + events.reserve(Config::nEvents); // simulations are all performed before the fitting loop. 
// It is mandatory in order to see the benefits of running // multiple streams. @@ -200,6 +209,7 @@ void test_standard() events.emplace_back(geom, val, evt); events.back().Simulate(); events.back().resetLayerHitMap(true); + dprint("Event #" << events.back().evtID() << " simtracks " << events.back().simTracks_.size() << " layerhits " << events.back().layerHits_.size()); } // The first call to a GPU function always take a very long time. @@ -210,67 +220,14 @@ void test_standard() // tell you how much time is spend running cudaDeviceSynchronize(), // use another function). separate_first_call_for_meaningful_profiling_numbers(); -#if 0 - In principle, the warmup loop should not be required. - The separate_first_call_for_meaningful_profiling_numbers() function - should be enough. - // Warmup loop - for (int i = 0; i < 1; ++i) { - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - Event &ev = events[0]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - - if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - cuFitter.freeDevice(); - } -#endif - // Reorgnanization (copyIn) can eventually be multithreaded. - omp_set_nested(1); - - omp_set_num_threads(Config::numThreadsEvents); - double total_gpu_time = dtime(); -#pragma omp parallel reduction(+:s_tmp) - { - int numThreadsEvents = omp_get_num_threads(); - int thr_idx = omp_get_thread_num(); - - // FitterCU is declared here to share allocations and deallocations - // between the multiple events processed by a single thread. - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - - for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { - int idx = thr_idx; - printf("==============================================================\n"); - printf("Processing event %d with thread %d\n", evt, idx); - Event &ev = events[evt-1]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; - - if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - - printf("Matriplex fit = %.5f -------------------------------------", tmp); - printf("\n"); - s_tmp += tmp; -#if 1 // 0 for timing, 1 for validation - // Validation crashes for multiple threads. - // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { - if (g_run_fit_std) { - std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; - make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); - } - } -#endif - } - cuFitter.freeDevice(); - } - std::cerr << "###### Total GPU time: " << dtime() - total_gpu_time << " ######\n"; + if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events); + if (g_run_build_all || g_run_build_bh) { + double total_best_hit_time = 0.; + total_best_hit_time = runAllBuildingTestPlexBestHitGPU(events); + std::cout << "Total best hit time (GPU): " << total_best_hit_time << std::endl; + } #else // MT: task_scheduler_init::automatic doesn't really work (segv!) + we don't // know what to do for non-tbb cases. @@ -310,7 +267,14 @@ void test_standard() for (int b = 0; b < Config::finderReportBestOutOfN; ++b) { +#ifndef USE_CUDA t_cur[0] = (g_run_fit_std) ? runFittingTestPlex(ev, plex_tracks) : 0; +#else + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + t_cur[0] = (g_run_fit_std) ? runFittingTestPlexGPU(cuFitter, ev, plex_tracks) : 0; + cuFitter.freeDevice(); +#endif t_cur[1] = (g_run_build_all || g_run_build_bh) ? 
runBuildingTestPlexBestHit(ev) : 0; t_cur[2] = (g_run_build_all || g_run_build_std) ? runBuildingTestPlex(ev, ev_tmp) : 0; t_cur[3] = (g_run_build_all || g_run_build_ce) ? runBuildingTestPlexCloneEngine(ev, ev_tmp) : 0; @@ -346,6 +310,7 @@ void test_standard() t_sum[0], t_sum[1], t_sum[2], t_sum[3], t_sum[4]); printf("Total event > 1 fit = %.5f --- Build BHMX = %.5f MX = %.5f CEMX = %.5f TBBMX = %.5f\n", t_skip[0], t_skip[1], t_skip[2], t_skip[3], t_skip[4]); + //fflush(stdout); if (g_operation == "read") { diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 6b6f3f8b2d9ec..70f4957177a44 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -1,35 +1,56 @@ #include "Config.h" +#include "Debug.h" #include "propagation_kernels.h" #include +#include "gpu_utils.h" -#define L 6 -#define LL 36 -#define LS 21 +constexpr int L = 6; +constexpr int LL2 = 36; +constexpr int LS = 21; // values from 32 to 512 give good results. // 32 gives slightly better results (on a K40) -#define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint +constexpr int BLOCK_SIZE_X = 32; -__device__ float hipo(float x, float y) { - return sqrt(x*x + y*y); +__device__ +void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, const int n) +{ + // C = A * B + + typedef float T; + /*const idx_t N = NN;*/ + + /*const T *a = A.fArray; ASSUME_ALIGNED(a, 64);*/ + /*const T *b = B.fArray; ASSUME_ALIGNED(b, 64);*/ + /*T *c = C.fArray; ASSUME_ALIGNED(c, 64);*/ + /*float *a = A.ptr;*/ + int aN = 1 ; int an = 0; // Register array + int bN = b.stride; int bn = n; // Global array + int cN = 1; int cn = 0; + +#include "MultHelixProp.ah" } -__device__ void sincos4(float x, float& sin, float& cos) { - // Had this writen with explicit division by factorial. - // The *whole* fitting test ran like 2.5% slower on MIC, sigh. - cos = 1; - sin = x; x *= x * 0.5f; - cos -= x; x *= x * 0.33333333f; - sin -= x; x *= x * 0.25f; - cos += x; + +__device__ +void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, const int n) +{ + // C = B * AT; + + typedef float T; + int aN = 1 ; int an = 0; // Register array + int bN = 1 ; int bn = 0; // Global array + int cN = c.stride; int cn = n; + +#include "MultHelixPropTransp.ah" } // computeJacobianSimple works on values that are in registers. // Registers are thread-private. Thus this function has no notion of // parallelism. It is ran serially by each calling thread. 
__device__ void computeJacobianSimple(float *errorProp, - float s, float k, float p, float pxin, float pyin, float pzin, - float TP, float cosTP, float sinTP, int N) { + const float s, const float k, const float p, + const float pxin, const float pyin, const float pzin, + const float TP, const float cosTP, const float sinTP, const int N) { // std::cout << "total path s=" << s << std::endl; // TD = s*pt/p; @@ -90,225 +111,263 @@ __device__ void computeJacobianSimple(float *errorProp, } /// Compute MsRad ///////////////////////////////////////////////////////////// -// Not passing msRad.stride, as QF == 1 (second dim f msRad) -__device__ void computeMsRad_fn(const float* __restrict__ msPar, - size_t stride_msPar, float* msRad, int N, int n) { +__device__ void assignMsRad_fn(const float r, float* msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { - *msRad = hipo(msPar[n], msPar[n + stride_msPar]); + *msRad = r; } } -__device__ -void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, - int *inChg, float *outPar, size_t outPar_stride, float msRad, - float *errorProp_reg, int N, int n) { - - size_t opN = outPar_stride; - size_t ipN = inPar_stride; - +// Not passing msRad.stride, as QF == 1 (second dim f msRad) +__device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, + GPlexRegQF &msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - - float outPar_reg[5]; - if (n < N) { - for (int j = 0; j < 5; ++j) { - outPar_reg[j] = outPar[n+j*opN]; - } - const float& xin = inPar[n + 0*ipN]; - const float& yin = inPar[n + 1*ipN]; - const float& pxin = inPar[n + 3*ipN]; - const float& pyin = inPar[n + 4*ipN]; - const float& pzin = inPar[n + 5*ipN]; - const float& r = msRad; - float r0 = hipo(xin, yin); - - if (fabs(r-r0)<0.0001) { - // get an identity matrix - computeJacobianSimple(errorProp_reg, 0, 1, 1, 1, 1, 1, 0, 1, 0, N); - return; // continue; - } - float pt2 = pxin*pxin+pyin*pyin; - float pt = sqrt(pt2); - float ptinv = 1./pt; - float pt2inv = ptinv*ptinv; - //p=0.3Br => r=p/(0.3*B) - float k = inChg[n] * 100. / (-0.299792458*Config::Bfield); - float invcurvature = 1./(pt*k);//in 1./cm - float ctgTheta=pzin*ptinv; - - //variables to be updated at each iterations - float totalDistance = 0; - //derivatives initialized to value for first iteration, i.e. distance = r-r0in - float dTDdx = r0>0. ? -xin/r0 : 0.; - float dTDdy = r0>0. ? 
-yin/r0 : 0.; - float dTDdpx = 0.; - float dTDdpy = 0.; - //temporaries used within the loop (declare here to reduce memory operations) - float x = 0.; - float y = 0.; - float px = 0.; - float py = 0.; - float cosAP=0.; - float sinAP=0.; - float dAPdx = 0.; - float dAPdy = 0.; - float dAPdpx = 0.; - float dAPdpy = 0.; - // float dxdvar = 0.; - // float dydvar = 0.; - //5 iterations is a good starting point - //const unsigned int Niter = 10; - // const unsigned int Niter = 5+std::round(r-r0)/2; - for (unsigned int iter=0; iter < Config::Niter; ++iter) { - x = outPar_reg[0]; - y = outPar_reg[1]; - px = outPar_reg[3]; - py = outPar_reg[4]; - r0 = hipo(outPar_reg[0], outPar_reg[1]); - - totalDistance += (r-r0); - if (Config::useTrigApprox) { // TODO: uncomment - sincos4((r-r0)*invcurvature, sinAP, cosAP); - } else { - cosAP=cos((r-r0)*invcurvature); - sinAP=sin((r-r0)*invcurvature); + msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + } +} + +__device__ void helixAtRFromIterativePolar_fn(const GPlexLV& inPar, + const GPlexQI& inChg, GPlexLV& outPar, const GPlexReg& msRad, + GPlexReg& errorProp, const int N, const int n) +{ + errorProp.SetVal(0); + +#pragma simd + //for (int n = 0; n < NN; ++n) + if (n < N) + { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float r = msRad(n, 0, 0); + float r0 = hipo(inPar(n, 0, 0), inPar(n, 1, 0)); + + // if (std::abs(r-r0)<0.0001f) { + // dprint("distance less than 1mum, skip"); + // continue; + // } + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float zin = inPar(n, 2, 0); + const float ipt = inPar(n, 3, 0); + const float phiin = inPar(n, 4, 0); + const float theta = inPar(n, 5, 0); + + dprint_np(n, std::endl << "input parameters" + << " inPar(n, 0, 0)=" << std::setprecision(9) << inPar(n, 0, 0) + << " inPar(n, 1, 0)=" << std::setprecision(9) << inPar(n, 1, 0) + << " inPar(n, 2, 0)=" << std::setprecision(9) << inPar(n, 2, 0) + << " inPar(n, 3, 0)=" << std::setprecision(9) << inPar(n, 3, 0) + << " inPar(n, 4, 0)=" << std::setprecision(9) << inPar(n, 4, 0) + << " inPar(n, 5, 0)=" << std::setprecision(9) << inPar(n, 5, 0) + ); + + const float kinv = 1.f/k; + const float pt = 1.f/ipt; + + float D = 0., cosa = 0., sina = 0., id = 0.; + //no trig approx here, phi can be large + float cosPorT = std::cos(phiin), sinPorT = std::sin(phiin); + float pxin = cosPorT*pt; + float pyin = sinPorT*pt; + + dprint_np(n, std::endl << "k=" << std::setprecision(9) << k << " pxin=" << std::setprecision(9) << pxin << " pyin=" + << std::setprecision(9) << pyin << " cosPorT=" << std::setprecision(9) << cosPorT + << " sinPorT=" << std::setprecision(9) << sinPorT << " pt=" << std::setprecision(9) << pt); + + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dDdx = r0 > 0.f ? -xin/r0 : 0.f; + float dDdy = r0 > 0.f ? 
-yin/r0 : 0.f; + float dDdipt = 0.; + float dDdphi = 0.; + + for (int i = 0; i < Config::Niter; ++i) + { + dprint_np(n, std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pt*std::tan(theta) << " q=" << inChg(n, 0, 0)); + + //compute distance and path for the current iteration + r0 = hipo(outPar(n, 0, 0), outPar(n, 1, 0)); + id = (r-r0); + D+=id; + if (Config::useTrigApprox) { + sincos4(id*ipt*kinv, sina, cosa); + } else { + cosa=std::cos(id*ipt*kinv); + sina=std::sin(id*ipt*kinv); + } + + dprint_np(n, std::endl << "r=" << std::setprecision(9) << r << " r0=" << std::setprecision(9) << r0 + << " id=" << std::setprecision(9) << id << " cosa=" << cosa << " sina=" << sina); + + //update derivatives on total distance + if (i+1 != Config::Niter) { + + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float oor0 = (r0>0.f && std::abs(r-r0)<0.0001f) ? 1.f/r0 : 0.f; + + const float dadipt = id*kinv; + + const float dadx = -x*ipt*kinv*oor0; + const float dady = -y*ipt*kinv*oor0; + + const float pxca = pxin*cosa; + const float pxsa = pxin*sina; + const float pyca = pyin*cosa; + const float pysa = pyin*sina; + + float tmp = k*dadx; + dDdx -= ( x*(1.f + tmp*(pxca - pysa)) + y*tmp*(pyca + pxsa) )*oor0; + tmp = k*dady; + dDdy -= ( x*tmp*(pxca - pysa) + y*(1.f + tmp*(pyca + pxsa)) )*oor0; + //now r0 depends on ipt and phi as well + tmp = dadipt*ipt; + dDdipt -= k*( x*(pxca*tmp - pysa*tmp - pyca - pxsa + pyin) + + y*(pyca*tmp + pxsa*tmp - pysa + pxca - pxin))*pt*oor0; + dDdphi += k*( x*(pysa - pxin + pxca) - y*(pxsa - pyin + pyca))*oor0; + } + + //update parameters + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(pxin*sina - pyin*(1.f-cosa)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(pyin*sina + pxin*(1.f-cosa)); + const float pxinold = pxin;//copy before overwriting + pxin = pxin*cosa - pyin*sina; + pyin = pyin*cosa + pxinold*sina; + + dprint_np(n, std::endl << "outPar(n, 0, 0)=" << outPar(n, 0, 0) << " outPar(n, 1, 0)=" << outPar(n, 1, 0) + << " pxin=" << pxin << " pyin=" << pyin); } - //helix propagation formulas - //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf - outPar_reg[0] = outPar_reg[0] + k*(px*sinAP-py*(1-cosAP)); - outPar_reg[1] = outPar_reg[1] + k*(py*sinAP+px*(1-cosAP)); - outPar_reg[2] = outPar_reg[2] + (r-r0)*ctgTheta; - outPar_reg[3] = px*cosAP-py*sinAP; - outPar_reg[4] = py*cosAP+px*sinAP; - //outPar.At(n, 5, 0) = pz; //take this out as it is redundant - - if (Config::useSimpleJac==0 && - iter +1 != Config::Niter && - r0 > 0 && fabs((r-r0)*invcurvature)>0.000000001) { - //update derivatives on total distance for next step, where totalDistance+=r-r0 - //now r0 depends on px and py - r0 = 1./r0;//WARNING, now r0 is r0inv (one less temporary) - - //update derivative on D - dAPdx = -x*r0*invcurvature; - dAPdy = -y*r0*invcurvature; - dAPdpx = -(r-1./r0)*invcurvature*px*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - dAPdpy = -(r-1./r0)*invcurvature*py*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - //reduce temporary variables - //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); - //dydx = k*dAPdx*(py*cosAP + px*sinAP); - //dTDdx -= r0*(x*dxdx + y*dydx); - dTDdx -= r0*(x*(1 + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); - //reuse same temporary variables - //dxdy = k*dAPdy*(px*cosAP - py*sinAP); - //dydy = 1 
+ k*dAPdy*(py*cosAP + px*sinAP); - //dTDdy -= r0*(x*dxdy + y*dydy); - dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1 + k*dAPdy*(py*cosAP + px*sinAP))); - //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); - //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); - //dTDdpx -= r0*(x*dxdpx + y*dydpx); - dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx))); - //dxdpy = k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy); - //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); - //dTDdpy -= r0*(x*dxdpy + y*(k*dydpy); - dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + const float alpha = D*ipt*kinv; + const float dadx = dDdx*ipt*kinv; + const float dady = dDdy*ipt*kinv; + const float dadipt = (ipt*dDdipt + D)*kinv; + const float dadphi = dDdphi*ipt*kinv; - } - float& TD=totalDistance; - float TP=TD*invcurvature;//totalAngPath - - float& iC=invcurvature; - float dCdpx = k*pxin*ptinv; - float dCdpy = k*pyin*ptinv; - float dTPdx = dTDdx*iC; - float dTPdy = dTDdy*iC; - float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division - float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division - - float cosTP, sinTP; if (Config::useTrigApprox) { - sincos4(TP, sinTP, cosTP); + sincos4(alpha, sina, cosa); } else { - cosTP = cos(TP); - sinTP = sin(TP); + cosa=std::cos(alpha); + sina=std::sin(alpha); } - if (Config::useSimpleJac) { - //assume total path length s as given and with no uncertainty - float p = pt2 + pzin*pzin; - p = sqrt(p); - float s = TD*p*ptinv; - computeJacobianSimple(errorProp_reg, s, k, p, pxin, pyin, pzin, TP, cosTP, sinTP, N); - } else { - //now try to make full jacobian - //derive these to compute jacobian - //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); - //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); - //z = zin + k*TP*pzin; - //px = pxin*cosTP-pyin*sinTP; - //py = pyin*cosTP+pxin*sinTP; - //pz = pzin; - //jacobian - - errorProp_reg[(0*L + 0)] = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; - errorProp_reg[(0*L + 1)] = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; - errorProp_reg[(0*L + 2)] = 0.; - errorProp_reg[(0*L + 3)] = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; - errorProp_reg[(0*L + 4)] = k*(pxin*cosTP*dTPdpy - 1. + cosTP - pyin*sinTP*dTPdpy);//dxdpy; - errorProp_reg[(0*L + 5)] = 0.; - - errorProp_reg[(1*L + 0)] = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; - errorProp_reg[(1*L + 1)] = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; - errorProp_reg[(1*L + 2)] = 0.; - errorProp_reg[(1*L + 3)] = k*(pyin*cosTP*dTPdpx + 1. 
- cosTP + pxin*sinTP*dTPdpx);//dydpx; - errorProp_reg[(1*L + 4)] = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; - errorProp_reg[(1*L + 5)] = 0.; - - errorProp_reg[(2*L + 0)] = k*pzin*dTPdx; //dzdx; - errorProp_reg[(2*L + 1)] = k*pzin*dTPdy; //dzdy; - errorProp_reg[(2*L + 2)] = 1.; - errorProp_reg[(2*L + 3)] = k*pzin*dTPdpx;//dzdpx; - errorProp_reg[(2*L + 4)] = k*pzin*dTPdpy;//dzdpy; - errorProp_reg[(2*L + 5)] = k*TP; //dzdpz; - - errorProp_reg[(3*L + 0)] = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; - errorProp_reg[(3*L + 1)] = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; - errorProp_reg[(3*L + 2)] = 0.; - errorProp_reg[(3*L + 3)] = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; - errorProp_reg[(3*L + 4)] = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; - errorProp_reg[(3*L + 5)] = 0.; - - errorProp_reg[(4*L + 0)] = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; - errorProp_reg[(4*L + 1)] = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; - errorProp_reg[(4*L + 2)] = 0.; - errorProp_reg[(4*L + 3)] = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; - errorProp_reg[(4*L + 4)] = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; - errorProp_reg[(4*L + 5)] = 0.; - - errorProp_reg[(5*L + 0)] = 0.; - errorProp_reg[(5*L + 1)] = 0.; - errorProp_reg[(5*L + 2)] = 0.; - errorProp_reg[(5*L + 3)] = 0.; - errorProp_reg[(5*L + 4)] = 0.; - errorProp_reg[(5*L + 5)] = 1.; + errorProp(n,0,0) = 1.f+k*dadx*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,1) = k*dady*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,2) = 0.f; + errorProp(n,0,3) = k*(cosPorT*(ipt*dadipt*cosa-sina)+sinPorT*((1.f-cosa)-ipt*dadipt*sina))*pt*pt; + errorProp(n,0,4) = k*(cosPorT*dadphi*cosa - sinPorT*dadphi*sina - sinPorT*sina + cosPorT*cosa - cosPorT)*pt; + errorProp(n,0,5) = 0.f; + + errorProp(n,1,0) = k*dadx*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,1) = 1.f+k*dady*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,2) = 0.f; + errorProp(n,1,3) = k*(sinPorT*(ipt*dadipt*cosa-sina)+cosPorT*(ipt*dadipt*sina-(1.f-cosa)))*pt*pt; + errorProp(n,1,4) = k*(sinPorT*dadphi*cosa + cosPorT*dadphi*sina + sinPorT*cosa + cosPorT*sina - sinPorT)*pt; + errorProp(n,1,5) = 0.f; + + //no trig approx here, theta can be large + cosPorT=std::cos(theta); + sinPorT=std::sin(theta); + //redefine sinPorT as 1./sinPorT to reduce the number of temporaries + sinPorT = 1.f/sinPorT; + + outPar(n, 2, 0) = inPar(n, 2, 0) + k*alpha*cosPorT*pt*sinPorT; + + errorProp(n,2,0) = k*cosPorT*dadx*pt*sinPorT; + errorProp(n,2,1) = k*cosPorT*dady*pt*sinPorT; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*cosPorT*(ipt*dadipt-alpha)*pt*pt*sinPorT; + errorProp(n,2,4) = k*dadphi*cosPorT*pt*sinPorT; + errorProp(n,2,5) =-k*alpha*pt*sinPorT*sinPorT; + + outPar(n, 3, 0) = ipt; + + errorProp(n,3,0) = 0.f; + errorProp(n,3,1) = 0.f; + errorProp(n,3,2) = 0.f; + errorProp(n,3,3) = 1.f; + errorProp(n,3,4) = 0.f; + errorProp(n,3,5) = 0.f; + + outPar(n, 4, 0) = inPar(n, 4, 0)+alpha; + + errorProp(n,4,0) = dadx; + errorProp(n,4,1) = dady; + errorProp(n,4,2) = 0.f; + errorProp(n,4,3) = dadipt; + errorProp(n,4,4) = 1.f+dadphi; + errorProp(n,4,5) = 0.f; + + outPar(n, 5, 0) = theta; + + errorProp(n,5,0) = 0.f; + errorProp(n,5,1) = 0.f; + errorProp(n,5,2) = 0.f; + errorProp(n,5,3) = 0.f; + errorProp(n,5,4) = 0.f; + errorProp(n,5,5) = 1.f; + + dprint_np(n, "propagation end, dump parameters" << std::endl + << "pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << std::endl + << "mom = " << std::cos(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << 
std::sin(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << 1./(outPar(n, 3, 0)*tan(outPar(n, 5, 0))) + << " r=" << std::sqrt( outPar(n, 0, 0)*outPar(n, 0, 0) + outPar(n, 1, 0)*outPar(n, 1, 0) ) << " pT=" << 1./std::abs(outPar(n, 3, 0)) << std::endl); + +#ifdef DEBUG + if (n < N_proc) { + dmutex_guard; + std::cout << n << ": jacobian" << std::endl; + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,0,0),errorProp(n,0,1),errorProp(n,0,2),errorProp(n,0,3),errorProp(n,0,4),errorProp(n,0,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,1,0),errorProp(n,1,1),errorProp(n,1,2),errorProp(n,1,3),errorProp(n,1,4),errorProp(n,1,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,2,0),errorProp(n,2,1),errorProp(n,2,2),errorProp(n,2,3),errorProp(n,2,4),errorProp(n,2,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,3,0),errorProp(n,3,1),errorProp(n,3,2),errorProp(n,3,3),errorProp(n,3,4),errorProp(n,3,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,4,0),errorProp(n,4,1),errorProp(n,4,2),errorProp(n,4,3),errorProp(n,4,4),errorProp(n,4,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,5,0),errorProp(n,5,1),errorProp(n,5,2),errorProp(n,5,3),errorProp(n,5,4),errorProp(n,5,5)); } +#endif + } +} + + +#include "PropagationMPlex.icc" + +__device__ +void helixAtRFromIterative_fn(const GPlexLV& inPar, + const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexReg& msRad, + GPlexReg& errorProp, const int N, const int n) { + + GPlexReg outPar; + + if (n < N) { + for (int j = 0; j < 5; ++j) { + outPar[j] = outPar_global(n, j, 0); } + errorProp.SetVal(0); + + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n, n+1); + // Once computations are done. Get values from registers to global memory. for (int j = 0; j < 5; ++j) { - outPar[n + j*opN] = outPar_reg[j]; + outPar_global(n, j, 0) = outPar[j]; } } } /// Similarity //////////////////////////////////////////////////////////////// -__device__ void similarity_fn(float* a, float *b, size_t stride_outErr, - int N, int n) { - size_t bN = stride_outErr; +__device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) { + + size_t bN = b.stride; // Keep most values in registers. - float b_reg[LL]; + float b_reg[LL2]; // To avoid using too many registers, tmp[] as a limited size and is reused. float tmp[6]; @@ -395,46 +454,169 @@ __device__ void similarity_fn(float* a, float *b, size_t stride_outErr, } } -__global__ void propagation_kernel( - const float* __restrict__ msPar, size_t stride_msPar, - float *inPar, size_t inPar_stride, int *inChg, - float *outPar, size_t outPar_stride, float *errorProp, - size_t errorProp_stride, float *outErr, size_t outErr_stride, int N) { +// PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments +__device__ void propagation_fn( + GPlexLS &inErr, GPlexLV &inPar, + GPlexQI &inChg, GPlexHV &msPar, + GPlexLS &outErr, GPlexLV &outPar, + int n, int N) { + + GPlexRegQF msRad_reg; + // Using registers instead of shared memory is ~ 30% faster. 
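+ // errorProp_reg holds the full 6x6 propagation Jacobian (LL2 = 36 floats)
+ // in thread-private registers, one instance per track handled by this thread.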
+ GPlexRegLL errorProp_reg; + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; + } + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; + } +#if 0 + computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); + if (Config::doIterative) { + helixAtRFromIterative_fn(inPar, inPar_stride, + inChg, outPar, outPar_stride, msRad_reg, + errorProp_reg, N, n); + } else { + // TODO: not ported for now. Assuming Config::doIterative + // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); + } + similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); +#endif + computeMsRad_fn(msPar, msRad_reg, N, n); +#ifdef CCSCOORD + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#else + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#endif + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + } +} + + +__global__ void propagation_kernel( + GPlexLS inErr, + GPlexHV msPar, + GPlexLV inPar, GPlexQI inChg, + GPlexLV outPar, + GPlexLS outErr, int N) +{ int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - float msRad_reg; - // Using registers instead of shared memory is ~ 30% faster. - float errorProp_reg[LL]; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { n += z*grid_width; - if (n < N) { - computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); - if (Config::doIterative) { - helixAtRFromIterative_fn(inPar, inPar_stride, - inChg, outPar, outPar_stride, msRad_reg, - errorProp_reg, N, n); - } - similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); - } + propagation_fn(inErr, inPar, inChg, msPar, outErr, outPar, n, N); } } -void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, +void propagation_wrapper(const cudaStream_t& stream, + GPlexHV& msPar, GPlexLS& inErr, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, + GPlexLS& outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); propagation_kernel <<>> - (msPar.ptr, msPar.stride, - inPar.ptr, inPar.stride, inChg.ptr, - outPar.ptr, outPar.stride, errorProp.ptr, - errorProp.stride, outErr.ptr, outErr.stride, N); + (inErr, msPar, inPar, inChg, outPar, outErr, N); } + + +// PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments +// Imposes the radius +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, + const int n, const int N) { +#if 1 + GPlexRegQF msRad_reg; + // Using registers instead of shared memory is ~ 30% faster. 
+ GPlexRegLL errorProp_reg; + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; + } + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; + } + + /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ + msRad_reg(n, 0, 0) = radius; + /*if (n == 0) printf("gpu r = %f\n", radius);*/ + +#ifdef CCSCOORD + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#else + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#endif + // TODO: port me + /*if (Config::useCMSGeom) {*/ + /*MPlexQF hitsRl;*/ + /*MPlexQF hitsXi;*/ + /*for (int n = 0; n < NN; ++n) {*/ + /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ + /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ + /*}*/ + /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ + /*}*/ + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + + // Matriplex version of: + // result.errors = ROOT::Math::Similarity(errorProp, outErr); + + //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl + /*MPlexLL temp;*/ + /*MultHelixProp (errorProp, outErr, temp);*/ + /*MultHelixPropTransp(errorProp, temp, outErr);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + + } +#endif +} + +__global__ void propagationForBuilding_kernel( + const GPlexLS inErr, const GPlexLV inPar, + const GPlexQI inChg, const float radius, + GPlexLS outErr, GPlexLV outPar, + const int N) { + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, n, N); + } +} + +void propagationForBuilding_wrapper(const cudaStream_t& stream, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + propagationForBuilding_kernel<<>> + (inErr, inPar, inChg, radius, outErr, outPar, N); +} + diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index 2fb92f0689f39..aec2e256bf918 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -3,11 +3,29 @@ #include "GPlex.h" -void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, +void propagation_wrapper(const cudaStream_t& stream, + GPlexHV& msPar, GPlexLS& inErr, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, + GPlexLS& outErr, const int N); +void propagationForBuilding_wrapper(const cudaStream_t& stream, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, + const int N); + +__device__ void propagation_fn( + GPlexLS &inErr, GPlexLV &inPar, + GPlexQI &inChg, GPlexHV &msPar, + GPlexLS &outErr, GPlexLV &outPar, + int n, int N); + +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, + const int n, const int N); + #endif // 
_PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize.cu b/mkFit/reorganize.cu deleted file mode 100644 index 80d3b7912cac9..0000000000000 --- a/mkFit/reorganize.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include "reorganize.h" -#include - -__global__ void toMatriplex_kernel(float *dst, int dst_stride, - const float* __restrict__ src, int src_stride, - int N, int LS) { - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N && j < LS) { - if (i==-1) { - printf(" %d, mplex[%f] / lin[%f]\n", j, dst[i+j*dst_stride], src[j+i*src_stride]); - } - dst[i + j*dst_stride] = src[j + i*src_stride]; - /*float diff = fabs((dst[i + j*dst_stride] - src[j + i*src_stride]));*/ - /*if (diff > 1e-3) printf("%f\n", diff);*/ - } -} - -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int N, int LS) { - dim3 block(16, 8, 1); - dim3 grid((N-1)/16 + 1, (LS-1)/8 +1, 1); - toMatriplex_kernel <<>> (dst.ptr, dst.stride, src.ptr, src.stride, N, LS); -} - - -__global__ void reorganizeMs(float *msPar, size_t msPar_stride, - float *full_posArray, - float *msErr, size_t msErr_stride, - float *full_errArray, - int *full_hitIdx, int hi, - int maxHits, - int N, int HS, int HV, int Nhits) { - - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N) { - int hidx = full_hitIdx[i + hi*N]; - if (j < HV) { - /*float tmp1 = msPar[i + msPar_stride*j];*/ - msPar[i + msPar_stride*j] = full_posArray[j + HV*(hidx + hi*maxHits)]; - /*float tmp2 = msPar[i + msPar_stride*j];*/ - - /*if (i==0 && hi == 0) {*/ - /*if (fabs(tmp1 - tmp2) > 1e-3) {*/ - /*printf("i %d, j %d, old: %f, new %f\n", i, j, tmp1, tmp2);*/ - /*}*/ - /*}*/ - } - if (j < HS) { - msErr[i + msErr_stride*j] = full_errArray[j + HS*(hidx + hi*maxHits)]; - } - } -} - -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits) { - dim3 block(16, 6, 1); - dim3 grid((N-1)/16 + 1, (hs-1)/6 +1, 1); - reorganizeMs <<>> (msPar.ptr, msPar.stride, full_posArray, - msErr.ptr, msErr.stride, full_errArray, full_hitIdx, hi, maxHits, N, hs, hv, Nhits); -} diff --git a/mkFit/reorganize.h b/mkFit/reorganize.h deleted file mode 100644 index fd72eb8f91be6..0000000000000 --- a/mkFit/reorganize.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _REORGANIZE_KERNELS_H_ -#define _REORGANIZE_KERNELS_H_ - -#include "GPlex.h" - -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int n, int ls); - -#endif diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu new file mode 100644 index 0000000000000..305ab59ac277f --- /dev/null +++ b/mkFit/reorganize_gplex.cu @@ -0,0 +1,318 @@ +#include "reorganize_gplex.h" +#include + +#include "FitterCU.h" +#include "accessors_cu.h" +#include "Track.h" +#include "gpu_utils.h" + +__device__ float *get_posArray(Hit &hit) { + return hit.posArrayCU(); +} +__device__ float *get_errArray(Hit &hit) { + return hit.errArrayCU(); +} + +__device__ float *get_posArray(Track &track) { + return track.posArrayCU(); +} +__device__ float *get_errArray(Track &track) { + return track.errArrayCU(); +} + +template +__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, + const char *arr, const int *vi, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j +__device__ void SlurpInIdx_fn(GPlexObj to, + const char *arr, const int idx, const int N) { + int j = threadIdx.x 
+ blockDim.x * blockIdx.x; + if (j +__device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int kSize, + const char *arr, const int idx, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j>> + (msErr, msPar, layer.m_hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); + cudaDeviceSynchronize(); +} + + +__device__ void InputTracksCU_fn (Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + Label(itrack, 0, 0) = tracks[i].label(); + Chg(itrack, 0, 0) = tracks[i].charge(); + Chi2(itrack, 0, 0) = tracks[i].chi2(); + SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); + + for (int hi = 0; hi < 3; ++hi) + HitsIdx[hi](itrack, 0, 0) = tracks[i].getHitIdx(hi);//dummy value for now + } +} + + +__global__ void InputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); +} + + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + InputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, + beg, end, N); +} + + +__device__ void InputTracksAndHitsCU_fn (Track *tracks, LayerOfHitsCU *layerHits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + Label(itrack, 0, 0) = tracks[i].label(); + Chg(itrack, 0, 0) = tracks[i].charge(); + Chi2(itrack, 0, 0) = tracks[i].chi2(); + SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); + + // Note Config::nLayers -- not suitable for building + for (int hi = 0; hi < Config::nLayers; ++hi) { + int hidx = tracks[i].getHitIdx(hi); + Hit &hit = layerHits[hi].m_hits[hidx]; + + HitsIdx[hi](itrack, 0, 0) = idx; + if (hidx < 0) continue; + + SlurpInIdx_fn(msErr_arr[hi], (char *)hit.errArrayCU(), 0, N); + SlurpInIdx_fn(msPar_arr[hi], (char *)hit.posArrayCU(), 0, N); + } + } +} + + +__global__ void InputTracksAndHitsCU_kernel(Track *tracks, LayerOfHitsCU *layers, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexHS *msErr_arr, GPlexHV 
*msPar_arr, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + InputTracksAndHitsCU_fn(tracks, layers, Err_iP, Par_iP, msErr_arr, msPar_arr, + Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); +} + + +void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, + Track *tracks, EventOfHitsCU &event_of_hits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, + const bool inputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + InputTracksAndHitsCU_kernel <<< grid, block, 0, stream >>> + (tracks, event_of_hits.m_layers_of_hits, + Err_iP, Par_iP, + msErr_arr, msPar_arr, + Chg, Chi2, Label, HitsIdx, + beg, end, N); +} + + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N, + const bool update_hit_idx) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + SlurpOutIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpOutIdx_fn(Par_iP, varr + off_param, idx, N); + tracks[i].setCharge(Chg(itrack, 0, 0)); + tracks[i].setChi2(Chi2(itrack, 0, 0)); + tracks[i].setLabel(Label(itrack, 0, 0)); + + if (update_hit_idx) { + tracks[i].resetHits(); + /*int nGoodItIdx = 0;*/ + for (int hi = 0; hi < Config::nLayers; ++hi) { + tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.); + // FIXME: We probably want to use registers instead of going for class members: + /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/ + /*tracks[i].setHitIdx(hi, hit_idx);*/ + /*if (hit_idx >= 0) {*/ + /*nGoodItIdx++; */ + /*}*/ + } + /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/ + /*tracks[i].setChi2(0.);*/ + } + } +} + +__global__ void OutputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N, + const bool update_hit_idx=true) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, + beg, end, itrack, N, update_hit_idx); +} + + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool outputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + OutputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); +} + + +void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, + Track *tracks_cu, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + const int beg, const int end, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + 
OutputTracksCU_kernel <<< grid, block, 0, stream >>> + (tracks_cu, Err_iP, Par_iP, Chg, Chi2, Label, nullptr, beg, end, N, false); +} diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h new file mode 100644 index 0000000000000..f59e2b6c0c464 --- /dev/null +++ b/mkFit/reorganize_gplex.h @@ -0,0 +1,74 @@ +#ifndef REORGANIZE_GPLEX_H +#define REORGANIZE_GPLEX_H + +#include "GPlex.h" +#include "Hit.h" +#include "HitStructuresCU.h" +#include "Track.h" + +__device__ float *get_posArray(Hit &hit); +__device__ float *get_errArray(Hit &hit); +__device__ float *get_posArray(Track &track); +__device__ float *get_errArray(Track &track); + +__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, + Hit *hits, const GPlexQI &XHitSize, + const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, + const int itrack, const int N); + +__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + GPlexQI HitsIdx, const int hit_cnt, const int N); + +void HitToMs_wrapper(const cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, const int N); + +__device__ void InputTracksCU_fn(Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N); + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N, + const bool update_hit_idx=true); + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N); + +void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, + Track *tracks, EventOfHitsCU &event_of_hits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, + const bool inputProp, int N); + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, bool outputProp, int N); + + +void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, + Track *tracks_cu, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + const int beg, const int end, int N); + +#endif // REORGANIZE_GPLEX_H
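A minimal sketch, not part of the patch, of how the wrappers introduced above could be chained for a single best-hit layer step. All GPlex buffers, the LayerOfHitsCU and the stream are assumed to have been allocated and filled elsewhere (e.g. through FitterCU / EventOfHitsCU); the function name and the fixed hit_cnt value are placeholders for illustration only.

#include "propagation_kernels.h"
#include "index_selection_kernels.h"
#include "reorganize_gplex.h"
#include "kalmanUpdater_kernels.h"

// Sketch: propagate -> select hit windows -> load one candidate hit -> Kalman update.
void bestHitLayerStep_sketch(cudaStream_t stream, LayerOfHitsCU &layer, float radius,
                             GPlexLS &Err_iC, GPlexLV &Par_iC, GPlexQI &Chg,
                             GPlexLS &Err_iP, GPlexLV &Par_iP,
                             GPlexHS &msErr, GPlexHV &msPar,
                             GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
                             GPlexQI &HitsIdx, int N)
{
  // Propagate the current candidates (Err_iC, Par_iC) to the layer radius.
  propagationForBuilding_wrapper(stream, Err_iC, Par_iC, Chg, radius, Err_iP, Par_iP, N);
  // Build per-track hit-index windows from the layer's phi/eta bins.
  selectHitIndices_wrapper(stream, layer, Err_iP, Par_iP, XHitSize, XHitArr, N);
  // Load candidate hit 0 of each window into the Matriplex-style msErr/msPar
  // (the real best-hit code loops hit_cnt over the window and keeps the best chi2).
  HitToMs_wrapper(stream, msErr, msPar, layer, XHitSize, XHitArr, HitsIdx, 0, N);
  // Kalman-update the propagated state with the loaded hit.
  kalmanUpdate_wrapper(stream, Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, N);
}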