Skip to content

Commit

Permalink
Merge pull request #61 from mpbl/devel
Browse files Browse the repository at this point in the history
Merge gpu best hit
  • Loading branch information
dan131riley committed Oct 14, 2016
2 parents da5624c + 53cf9c0 commit 393b7ea
Show file tree
Hide file tree
Showing 56 changed files with 4,035 additions and 699 deletions.
12 changes: 12 additions & 0 deletions BinInfoUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,37 @@ typedef std::pair<int, int> BinInfo;
typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
typedef std::vector<BinInfoLayerMap> BinInfoMap;

#ifdef __CUDACC__
__host__ __device__
#endif
// Fold phi downward by whole turns until it drops below +PI.
inline float downPhi(float phi)
{
  for ( ; phi >= Config::PI; phi -= Config::TwoPI)
  {
    // keep subtracting full 2*PI turns
  }
  return phi;
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Fold phi upward by whole turns until it rises above -PI.
inline float upPhi(float phi)
{
  for ( ; phi <= -Config::PI; phi += Config::TwoPI)
  {
    // keep adding full 2*PI turns
  }
  return phi;
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Normalize phi by folding out whole turns when |phi| >= PI.
// std::fmod is deliberately avoided: for |phi| beyond the boundary it
// would return a value +pi out of phase.
inline float normalizedPhi(float phi)
{
  if (std::abs(phi) < Config::PI) return phi;
  return (phi > 0) ? downPhi(phi) : upPhi(phi);
}

#ifdef __CUDACC__
__host__ __device__
#endif
inline int getPhiPartition(float phi)
{
//assume phi is between -PI and PI
Expand Down
4 changes: 2 additions & 2 deletions Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ namespace Config
// Config for Hit and BinInfoUtils
constexpr int nPhiPart = 1260;
constexpr float fPhiFactor = nPhiPart / TwoPI;
constexpr int nEtaPart = 11;
constexpr int nEtaPart = 11; // 1 is better for GPU best_hit
constexpr int nEtaBin = 2 * nEtaPart - 1;

constexpr float fEtaFull = 2 * Config::fEtaDet;
Expand Down Expand Up @@ -204,7 +204,7 @@ namespace Config
#ifdef __MIC__
#define MPT_SIZE 16
#elif defined USE_CUDA
#define MPT_SIZE 10000
#define MPT_SIZE 8
#else
#define MPT_SIZE 8
#endif
Expand Down
10 changes: 10 additions & 0 deletions Hit.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,17 @@ inline float getInvRad2(float x, float y){
return 1.0f/(x*x + y*y);
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Azimuthal angle of the point (x, y), via std::atan2(y, x).
inline float getPhi(float x, float y)
{
  const float azimuth = std::atan2(y, x);
  return azimuth;
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Polar angle computed as atan2(r, z).
inline float getTheta(float r, float z){
  const float theta = std::atan2(r, z);
  return theta;
}
Expand Down Expand Up @@ -217,6 +223,10 @@ class Hit

const float* posArray() const {return state_.pos_.Array();}
const float* errArray() const {return state_.err_.Array();}
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector3& parameters_nc() {return state_.pos_;}
Expand Down
21 changes: 16 additions & 5 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# OSXGCC5 := yes
# To keep Dan's version working
# TBB_PREFIX := tbb
# TBB_PREFIX := ${TBBROOT}

# 1. Use ROOT or not (never used on MIC)
# Comment out to disable root ("yes" is not relevant)
Expand All @@ -34,9 +35,11 @@ endif

# 2.1 Use nvcc to compile cuda code
# CUDA compiler
NV := nvcc
CUBROOT=/home/ml15/tools/cub
NV := nvcc -prec-sqrt=true -I${CUBROOT}
#-g -G -lineinfo
# Comment out to compile for CPU
# USE_CUDA := -DUSE_CUDA
#USE_CUDA := yes

# 3. Optimization
# -O3 implies vectorization and simd (but not AVX)
Expand Down Expand Up @@ -113,12 +116,19 @@ INWARD_FIT := -DINWARDFIT
# Derived settings
################################################################

CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} ${USE_CUDA} -std=c++11
CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} -std=c++11
CXXFLAGS := ${OPT} ${OSX_CXXFLAGS}

LDFLAGS_HOST :=
LDFLAGS_HOST :=
LDFLAGS_MIC :=

ifdef USE_CUDA
CPPFLAGS += -DUSE_CUDA -I/nfs/opt/cuda/include
#CPPFLAGS += -I/home/ml15/tools/cub
CPPFLAGS += -I${CUBROOT}
LDFLAGS_HOST += -L${CUDALIBDIR}
endif

CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${USE_ETA_SEGMENTATION} ${INWARD_FIT} ${GEN_FLAT_ETA}

ifdef USE_VTUNE_NOTIFY
Expand All @@ -130,7 +140,8 @@ endif
endif

ifneq ($(CXX),icc)
#CXXFLAGS += -Wall -Wno-unknown-pragmas
CPPFLAGS += -I/opt/rh/python27/root/usr/include
LDFLAGS_HOST += -L/opt/rh/python27/root/usr/lib64
endif

ifeq ($(CXX),icc)
Expand Down
3 changes: 3 additions & 0 deletions Math/MatrixRepresentationsStatic.h
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ namespace Math {
inline T* Array() { return fArray; }

inline const T* Array() const { return fArray; }
#ifdef __CUDACC__
T* ArrayCU();
#endif

/**
assignment : only symmetric to symmetric allowed
Expand Down
3 changes: 3 additions & 0 deletions Math/SMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,9 @@ class SMatrix {
const T* Array() const;
/// return pointer to internal array
T* Array();
#ifdef __CUDACC__
T* ArrayCU();
#endif

/** @name --- STL-like interface ---
The iterators access the matrix element in the order how they are
Expand Down
3 changes: 3 additions & 0 deletions Math/SVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ class SVector {
const T* Array() const;
/// return non-const pointer to internal array
T* Array();
#ifdef __CUDACC__
T* ArrayCU();
#endif

/** @name --- STL-like interface --- */

Expand Down
75 changes: 75 additions & 0 deletions Matriplex/GenMul.pm
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,75 @@ sub multiply_standard

# ----------------------------------------------------------------------

# Build the C-source string for one product term x*y of a matrix-multiply
# sum, using flat "name[idx*nameN + namen]" addressing (presumably the
# GPU-side Matriplex layout -- see multiply_gpu).
# Returns undef when either factor's pattern is '0' (the term vanishes),
# and simplifies away factors whose pattern is the constant '1'.
sub generate_addend_gpu
{
my ($S, $x, $y) = @_;

# A zero factor kills the whole addend; two unit factors reduce to "1".
return undef if $S->{$x}{pat} eq '0' or $S->{$y}{pat} eq '0';
return "1" if $S->{$x}{pat} eq '1' and $S->{$y}{pat} eq '1';

# Emit each operand as "name[ idx*nameN + namen ]" (linearized index).
my $xstr = sprintf "$S->{$x}{mat}{name}\[%2d*$S->{$x}{mat}{name}N+$S->{$x}{mat}{name}n]", $S->{$x}{idx};
my $ystr = sprintf "$S->{$y}{mat}{name}\[%2d*$S->{$y}{mat}{name}N+$S->{$y}{mat}{name}n]", $S->{$y}{idx};

# A unit factor drops out of the product.
return $xstr if $S->{$y}{pat} eq '1';
return $ystr if $S->{$x}{pat} eq '1';

return "${xstr}*${ystr}";
}

sub multiply_gpu
{
# GPU variant of the standard multiplication - outputs unrolled C code,
# one line per target matrix element, with operands addressed through
# generate_addend_gpu's flat "name[idx*nameN + namen]" indexing.
# Arguments: a, b, c -- all GenMul::MBase with right dimensions.
# Does: c = a * b

check_multiply_arguments(@_);

my ($S, $a, $b, $c) = @_;

# For a symmetric result only the lower triangle is emitted.
my $is_c_symmetric = $c->isa("GenMul::MatrixSym");

# With no_size_check matrices do not have to be compatible.
my $k_max = $a->{N} <= $b->{M} ? $a->{N} : $b->{M};

for (my $i = 0; $i < $c->{M}; ++$i)
{
my $j_max = $is_c_symmetric ? $i + 1 : $c->{N};

for (my $j = 0; $j < $j_max; ++$j)
{
# Linear index of c(i,j) in c's storage.
my $x = $c->idx($i, $j);

printf "$S->{prefix}$c->{name}\[%2d*$c->{name}N+$c->{name}n\] = ", $x;

my @sum;

for (my $k = 0; $k < $k_max; ++$k)
{
# Prepares per-factor idx/pat entries for 'a' and 'b' (defined elsewhere
# in this module) before the addend for this k is generated.
$S->generate_indices_and_patterns_for_multiplication($i, $j, $k);

my $addend = $S->generate_addend_gpu('a', 'b');

# Addends that vanished (pattern '0') come back undef and are skipped.
push @sum, $addend if defined $addend;
}
if (@sum)
{
print join(" + ", @sum), ";";
}
else
{
# Every term vanished: the target element is identically zero.
print "0;"
}
print "\n";
}
}

$S->delete_temporaries();
}

# ----------------------------------------------------------------------

sub load_if_needed
{
my ($S, $x) = @_;
Expand Down Expand Up @@ -709,6 +778,7 @@ sub dump_multiply_std_and_intrinsic
}

print <<"FNORD";
#ifndef __CUDACC__
#ifdef MPLEX_INTRINSICS
for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T))
Expand All @@ -732,6 +802,11 @@ FNORD
print <<"FNORD";
}
#endif
#else // __CUDACC__
FNORD
$S->multiply_gpu($a, $b, $c);
print <<"FNORD";
#endif // __CUDACC__
FNORD


Expand Down
3 changes: 3 additions & 0 deletions Matriplex/Matriplex.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,9 @@ class Matriplex
for (int j = 0; j < N; ++j)
{
fArray[i*N + j] = * (const T*) (arr + i*sizeof(T) + vi[j]);
//if(j==2) {
//printf("cpu -- %d : %d, %f\n", i, vi[j], fArray[i*N+j]);
//}
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion Matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,17 @@ inline double dtime()
return( tseconds );
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Hypotenuse: sqrt(x^2 + y^2).
inline float hipo(float x, float y)
{
  const float sum2 = x * x + y * y;
  return std::sqrt(sum2);
}

#ifdef __CUDACC__
__host__ __device__
#endif
inline void sincos4(const float x, float& sin, float& cos)
{
// Had this writen with explicit division by factorial.
Expand All @@ -74,7 +80,7 @@ inline void sincos4(const float x, float& sin, float& cos)
#ifdef __INTEL_COMPILER
#define ASSUME_ALIGNED(a, b) __assume_aligned(a, b)
#else
#define ASSUME_ALIGNED(a, b) __builtin_assume_aligned(a, b)
#define ASSUME_ALIGNED(a, b) a = static_cast<decltype(a)>(__builtin_assume_aligned(a, b))
#endif

#include "Matriplex/MatriplexSym.h"
Expand Down
43 changes: 43 additions & 0 deletions Track.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,11 @@ class Track

const float* posArray() const {return state_.parameters.Array();}
const float* errArray() const {return state_.errors.Array();}
//#ifdef USE_CUDA
#if __CUDACC__
__device__ float* posArrayCU();
__device__ float* errArrayCU();
#endif

// Non-const versions needed for CopyOut of Matriplex.
SVector6& parameters_nc() {return state_.parameters;}
Expand All @@ -149,8 +154,17 @@ class Track
SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);}
SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);}

#ifdef __CUDACC__
__host__ __device__
#endif
// Charge stored in the track state.  Guard changed from "#if" to "#ifdef":
// it must not depend on __CUDACC__ expanding to a value, and this matches
// every other CUDA guard in the codebase.
int charge() const {return state_.charge;}
#ifdef __CUDACC__
__host__ __device__
#endif
// Accumulated chi2 of the good hits attached via addHitIdx.
// ("#ifdef" for consistency with the other CUDA guards.)
float chi2() const {return chi2_;}
#ifdef __CUDACC__
__host__ __device__
#endif
// Track label set via setLabel.  ("#ifdef" for consistency with the
// other CUDA guards.)
int label() const {return label_;}

float x() const { return state_.parameters[0];}
Expand Down Expand Up @@ -200,12 +214,18 @@ class Track
}
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Append hitIdx to the track's hit-index array.  A non-negative index
// counts as a good hit: it bumps nGoodHitIdx_ and accumulates its chi2
// into the track total.  ("#ifdef" for consistency with the other guards.)
void addHitIdx(int hitIdx,float chi2)
{
hitIdxArr_[++hitIdxPos_] = hitIdx;
if (hitIdx >= 0) { ++nGoodHitIdx_; chi2_+=chi2; }
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Hit index stored at position posHitIdx (no bounds checking).
// ("#ifdef" for consistency with the other CUDA guards.)
int getHitIdx(int posHitIdx) const
{
return hitIdxArr_[posHitIdx];
}
Expand All @@ -222,6 +242,9 @@ class Track
}
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Overwrite the hit index stored at position posHitIdx (no bounds checking).
// ("#ifdef" for consistency with the other CUDA guards.)
void setHitIdx(int posHitIdx, int newIdx) {
hitIdxArr_[posHitIdx] = newIdx;
}
Expand All @@ -233,6 +256,16 @@ class Track
}
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Directly set the good-hit counter (used when the count is produced
// externally, e.g. on the GPU).  ("#ifdef" for guard consistency.)
void setNGoodHitIdx(int nHits) {
nGoodHitIdx_ = nHits;
}

#if __CUDACC__
__host__ __device__
#endif
void resetHits()
{
hitIdxPos_ = -1;
Expand All @@ -250,8 +283,18 @@ class Track
}
return layers;
}

#ifdef __CUDACC__
__host__ __device__
#endif
// Set the charge in the track state.  ("#ifdef" for guard consistency.)
void setCharge(int chg) {state_.charge=chg;}
#ifdef __CUDACC__
__host__ __device__
#endif
// Set the accumulated track chi2.  ("#ifdef" for guard consistency.)
void setChi2(float chi2) {chi2_=chi2;}
#ifdef __CUDACC__
__host__ __device__
#endif
// Set the track label.  ("#ifdef" for guard consistency.)
void setLabel(int lbl) {label_=lbl;}

void setState(const TrackState& newState) {state_=newState;}
Expand Down

0 comments on commit 393b7ea

Please sign in to comment.