From d2c106d4203eeb8110a4c16b9c70c974d72dfb12 Mon Sep 17 00:00:00 2001 From: Dan Riley Date: Thu, 19 May 2016 14:47:10 -0400 Subject: [PATCH 01/13] proof of concept for sharing code between host/mic and GPU --- Config.h | 2 +- Makefile.config | 14 +- mkFit/GPlex.h | 6 + mkFit/Makefile | 2 +- mkFit/PropagationMPlex.cc | 206 +------------------------- mkFit/PropagationMPlex.icc | 195 ++++++++++++++++++++++++ mkFit/fittestMPlex.cc | 28 +++- mkFit/kalmanUpdater_kernels.cu | 8 +- mkFit/mkFit.cc | 26 ++-- mkFit/propagation_kernels.cu | 263 ++++++--------------------------- 10 files changed, 303 insertions(+), 447 deletions(-) create mode 100644 mkFit/PropagationMPlex.icc diff --git a/Config.h b/Config.h index 977ea4903204b..98763f200d03e 100644 --- a/Config.h +++ b/Config.h @@ -5,7 +5,7 @@ #include // won't compile on clang gcc for mac OS w/o this! //#define PRINTOUTS_FOR_PLOTS -#define POLCOORD +//#define POLCOORD namespace Config { diff --git a/Makefile.config b/Makefile.config index 19cbe25b94e95..5244e37e48264 100644 --- a/Makefile.config +++ b/Makefile.config @@ -29,7 +29,7 @@ endif # CUDA compiler NV := nvcc # Comment out to compile for CPU -# USE_CUDA := -DUSE_CUDA +USE_CUDA := yes # 3. Optimization # -O3 implies vectorization and simd (but not AVX) @@ -106,12 +106,17 @@ INWARD_FIT := -DINWARDFIT # Derived settings ################################################################ -CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} ${USE_CUDA} -std=c++11 +CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} -std=c++11 CXXFLAGS := ${OPT} ${OSX_CXXFLAGS} -LDFLAGS_HOST := +LDFLAGS_HOST := LDFLAGS_MIC := +ifdef USE_CUDA +CPPFLAGS += -DUSE_CUDA -I/nfs/opt/cuda/include +LDFLAGS_HOST += -L${CUDALIBDIR} +endif + CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${USE_ETA_SEGMENTATION} ${INWARD_FIT} ${GEN_FLAT_ETA} ifdef USE_VTUNE_NOTIFY @@ -123,7 +128,8 @@ endif endif ifneq ($(CXX),icc) - #CXXFLAGS += -Wall -Wno-unknown-pragmas + CPPFLAGS += -I/opt/rh/python27/root/usr/include + LDFLAGS_HOST += -L/opt/rh/python27/root/usr/lib64 endif ifeq ($(CXX),icc) diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index 9dfe3b86e6e02..f450c3c9d4e8d 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -16,6 +16,12 @@ struct GPlex { T* ptr; size_t pitch, stride, x, y; + __device__ T operator[](int xx) const { return ptr[xx]; } + __device__ T& operator[](int xx) { return ptr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return ptr[n + i*stride]; } + __device__ T operator()(int n, int i, int j) const { return ptr[n + i*stride]; } + void allocate(size_t ntracks, size_t plex_size) { x = ntracks; y = plex_size; diff --git a/mkFit/Makefile b/mkFit/Makefile index b6433baaefdaf..6e5017e5b3e72 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -56,7 +56,7 @@ echo: ################################################################ # Should be a lib, really -ABOVE_OBJS := $(patsubst %, ../%.o, Config Matrix Event Hit Track Propagation KalmanUtils Simulation Geometry SimpleGeom fittest buildtest ConformalUtils seedtest BinInfoUtils) +ABOVE_OBJS := $(patsubst %, ../%.o, Config Matrix Event Hit Track Propagation KalmanUtils Simulation Geometry SimpleGeom fittest buildtest ConformalUtils seedtest BinInfoUtils TTreeValidation) ${ABOVE_OBJS}: ${MAKE} -C .. 
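The GPlex accessors added above are what make the host/GPU code sharing in this patch possible: device code can now address a plex element with the same (n, i, j) syntax that Matriplex uses on the host, so a shared body such as the PropagationMPlex.icc file introduced next compiles unchanged for both back ends. A toy kernel illustrating the idea (the kernel itself and the GPlex<float> instantiation are illustrative, not part of the patch):

__global__ void scale_pars_kernel(GPlex<float> par, float f, int N)
{
  // n indexes the track (fast) dimension, so consecutive threads touch
  // consecutive addresses and global-memory reads coalesce
  int n = threadIdx.x + blockIdx.x * blockDim.x;
  if (n < N) {
    for (int i = 0; i < 6; ++i)          // 6 rows of track parameters (LV)
      par(n, i, 0) = f * par(n, i, 0);   // ptr[n + i*stride] under the hood
  }
}

Note that the operator() added above indexes ptr[n + i*stride] and ignores its j argument, so it is only valid for column-vector plexes (j == 0); full matrices kept in registers are handled by the GPlexReg helper introduced in propagation_kernels.cu below.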
diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 85c0fc85e2c57..5f23c74a5621f 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -542,6 +542,8 @@ void helixAtRFromIterativePolar(const MPlexLV& inPar, const MPlexQI& inChg, MPle } } +#include "PropagationMPlex.icc" + void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp) { errorProp.SetVal(0); @@ -549,209 +551,7 @@ void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& #pragma simd for (int n = 0; n < NN; ++n) { - - //initialize erroProp to identity matrix - errorProp(n,0,0) = 1.f; - errorProp(n,1,1) = 1.f; - errorProp(n,2,2) = 1.f; - errorProp(n,3,3) = 1.f; - errorProp(n,4,4) = 1.f; - errorProp(n,5,5) = 1.f; - - const float xin = inPar.ConstAt(n, 0, 0); - const float yin = inPar.ConstAt(n, 1, 0); - const float pxin = inPar.ConstAt(n, 3, 0); - const float pyin = inPar.ConstAt(n, 4, 0); - const float pzin = inPar.ConstAt(n, 5, 0); - const float r = msRad.ConstAt(n, 0, 0); - float r0 = hipo(xin, yin); - - dprint(std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl - << "x=" << xin << " y=" << yin << " z=" << inPar.ConstAt(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pzin << " q=" << inChg.ConstAt(n, 0, 0)); - - if (std::abs(r-r0)<0.0001f) { - dprint("distance less than 1mum, skip"); - continue; - } - - const float pt2 = pxin*pxin+pyin*pyin; - const float pt = std::sqrt(pt2); - const float ptinv = 1.f/pt; - const float pt2inv = ptinv*ptinv; - //p=0.3Br => r=p/(0.3*B) - const float k = inChg.ConstAt(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); - const float invcurvature = 1.f/(pt*k);//in 1./cm - const float ctgTheta=pzin*ptinv; - - dprint("curvature=" << 1.f/invcurvature); - - //variables to be updated at each iterations - float totalDistance = 0; - //derivatives initialized to value for first iteration, i.e. distance = r-r0in - float dTDdx = r0>0.f ? -xin/r0 : 0.f; - float dTDdy = r0>0.f ? -yin/r0 : 0.f; - float dTDdpx = 0.; - float dTDdpy = 0.; - for (unsigned int i=0;i r=p/(0.3*B) + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float invcurvature = 1.f/(pt*k);//in 1./cm + const float ctgTheta=pzin*ptinv; + + //variables to be updated at each iterations + float totalDistance = 0; + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dTDdx = r0>0. ? -xin/r0 : 0.; + float dTDdy = r0>0. ? 
-yin/r0 : 0.; + float dTDdpx = 0.; + float dTDdpy = 0.; + //5 iterations is a good starting point + //const unsigned int Niter = 10; + // const unsigned int Niter = 5+std::round(r-r0)/2; + for (unsigned int iter=0; iter < Config::Niter; ++iter) { + + dprint("propagation iteration #" << i); + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float px = outPar(n, 3, 0); + const float py = outPar(n, 4, 0); + r0 = hipo(x, y); + + dprint("r0=" << r0 << " pt=" << pt); + + totalDistance += (r-r0); + dprint("distance=" << (r-r0) << " angPath=" << (r-r0)*invcurvature); + + float cosAP, sinAP; + if (Config::useTrigApprox) { // TODO: uncomment + sincos4((r-r0)*invcurvature, sinAP, cosAP); + } else { + cosAP=std::cos((r-r0)*invcurvature); + sinAP=std::sin((r-r0)*invcurvature); + } + + //helix propagation formulas + //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(px*sinAP-py*(1-cosAP)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(py*sinAP+px*(1-cosAP)); + outPar(n, 2, 0) = outPar(n, 2, 0) + (r-r0)*ctgTheta; + outPar(n, 3, 0) = px*cosAP-py*sinAP; + outPar(n, 4, 0) = py*cosAP+px*sinAP; + //outPar(n, 5, 0) = pz; //take this out as it is redundant + + if (iter+1 != Config::Niter && r0 > 0 && std::abs((r-r0)*invcurvature)>0.000000001f) + { + //update derivatives on total distance for next step, where totalDistance+=r-r0 + //now r0 depends on px and py + + dprint("r0=" << 1.f/r0 << " r0inv=" << r0 << " pt=" << pt); + + //update derivative on D + const float dAPdpx = -(r-r0)*invcurvature*px*pt2inv;//r0 is now 1./r0 (this could go above the redefinition of r0!) + const float dAPdpy = -(r-r0)*invcurvature*py*pt2inv; + r0 = 1.f/r0;//WARNING, now r0 is r0inv (one less temporary) + const float dAPdx = -x*r0*invcurvature; + const float dAPdy = -y*r0*invcurvature; + //reduce temporary variables + //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); + //dydx = k*dAPdx*(py*cosAP + px*sinAP); + //dTDdx -= r0*(x*dxdx + y*dydx); + dTDdx -= r0*(x*(1.f + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); + //reuse same temporary variables + //dxdy = k*dAPdy*(px*cosAP - py*sinAP); + //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); + //dTDdy -= r0*(x*dxdy + y*dydy); + dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1.f + k*dAPdy*(py*cosAP + px*sinAP))); + //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); + //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); + //dTDdpx -= r0*(x*dxdpx + y*dydpx); + dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1.f - cosAP + px*sinAP*dAPdpx))); + //dxdpy = k*(px*cosAP*dAPdpy - 1. 
+ cosAP - py*sinAP*dAPdpy); + //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); + //dTDdpy -= r0*(x*dxdpy + y*(dydpy); + dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1.f + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + + } + dprint("iteration end, dump parameters" << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0) << std::endl + << "r=" << std::sqrt( outPar.At(n, 0, 0)*outPar.At(n, 0, 0) + outPar.At(n, 1, 0)*outPar.At(n, 1, 0) ) + << " pT=" << std::sqrt( outPar.At(n, 3, 0)*outPar.At(n, 3, 0) + outPar.At(n, 4, 0)*outPar.At(n, 4, 0) )); + const float TD=totalDistance; + const float TP=TD*invcurvature;//totalAngPath + + dprint("TD=" << TD << " TP=" << TP << " arrived at r=" << std::sqrt(outPar.At(n, 0, 0)*outPar.At(n, 0, 0)+outPar.At(n, 1, 0)*outPar.At(n, 1, 0)) + << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0)); + + const float iC=invcurvature; + const float dCdpx = k*pxin*ptinv; + const float dCdpy = k*pyin*ptinv; + const float dTPdx = dTDdx*iC; + const float dTPdy = dTDdy*iC; + const float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division + const float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division + + float cosTP, sinTP; + if (Config::useTrigApprox) { + sincos4(TP, sinTP, cosTP); + } else { + cosTP = std::cos(TP); + sinTP = std::sin(TP); + } + + //now try to make full jacobian + //derive these to compute jacobian + //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); + //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); + //z = zin + k*TP*pzin; + //px = pxin*cosTP-pyin*sinTP; + //py = pyin*cosTP+pxin*sinTP; + //pz = pzin; + //jacobian + + errorProp(n,0,0) = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; + errorProp(n,0,1) = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; + errorProp(n,0,2) = 0.; + errorProp(n,0,3) = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; + errorProp(n,0,4) = k*(pxin*cosTP*dTPdpy - 1.f + cosTP - pyin*sinTP*dTPdpy);//dxdpy; + errorProp(n,0,5) = 0.; + + errorProp(n,1,0) = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; + errorProp(n,1,1) = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; + errorProp(n,1,2) = 0.; + errorProp(n,1,3) = k*(pyin*cosTP*dTPdpx + 1.f - cosTP + pxin*sinTP*dTPdpx);//dydpx; + errorProp(n,1,4) = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; + errorProp(n,1,5) = 0.; + + errorProp(n,2,0) = k*pzin*dTPdx; //dzdx; + errorProp(n,2,1) = k*pzin*dTPdy; //dzdy; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*pzin*dTPdpx;//dzdpx; + errorProp(n,2,4) = k*pzin*dTPdpy;//dzdpy; + errorProp(n,2,5) = k*TP; //dzdpz; + + errorProp(n,3,0) = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; + errorProp(n,3,1) = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; + errorProp(n,3,2) = 0.; + errorProp(n,3,3) = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; + errorProp(n,3,4) = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; + errorProp(n,3,5) = 0.; + + errorProp(n,4,0) = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; + errorProp(n,4,1) = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; + errorProp(n,4,2) = 0.; + errorProp(n,4,3) = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; + errorProp(n,4,4) = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; + errorProp(n,4,5) = 0.; + + errorProp(n,5,0) = 
0.; + errorProp(n,5,1) = 0.; + errorProp(n,5,2) = 0.; + errorProp(n,5,3) = 0.; + errorProp(n,5,4) = 0.; + errorProp(n,5,5) = 1.f; + } + return true; +} diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index d775dca00e3b7..8ad9bf693e0ed 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -16,10 +16,14 @@ #include "TTree.h" #endif +//#define DEBUG +#include + #include #include #include +#include #if defined(USE_VTUNE_PAUSE) #include "ittnotify.h" @@ -31,11 +35,15 @@ void make_validation_tree(const char *fname, std::vector &simtracks, std::vector &rectracks) { -#ifndef NO_ROOT - assert(simtracks.size() == rectracks.size()); float pt_mc, pt_fit, pt_err, chg; + int goodtrk = 0; + +#ifndef NO_ROOT + static std::mutex roolock; + + std::lock_guard rooguard(roolock); TFile *file = TFile::Open(fname, "recreate"); TTree *tree = new TTree("T", "Validation Tree for simple Kalman fitter");; @@ -44,6 +52,7 @@ void make_validation_tree(const char *fname, tree->Branch("pt_fit", &pt_fit, "pt_fit"); tree->Branch("pt_err", &pt_err, "pt_err"); tree->Branch("chg", &chg, "chg"); +#endif #ifdef USE_CUDA std::vector diff_vec; @@ -63,12 +72,18 @@ void make_validation_tree(const char *fname, recerr[3][4]*recp[3]*recp[4] * 2) / pt_fit; chg = simtracks[i].charge(); + +#ifndef NO_ROOT tree->Fill(); +#endif #ifdef USE_CUDA float diff = (pt_mc - pt_fit) / pt_err; if (std::abs(diff) < 5.0f) { diff_vec.push_back(diff); + ++goodtrk; + } else { + dprint("pt_mc, pt_fit, pt_err " << pt_mc << " " << pt_fit << " " << pt_err); } #endif } @@ -84,10 +99,11 @@ void make_validation_tree(const char *fname, std::accumulate(diff_vec.begin(), diff_vec.end(), 0.0) / diff_vec.size()); - std::cerr << "Mean value for (pt_mc-pt_fit)/pt_err: " << mean - << " standard dev: " << stdev << std::endl; + std::cerr << goodtrk << " good tracks, mean pt pull: " << mean + << " standard dev: " << stdev << std::endl; #endif +#ifndef NO_ROOT file->Write(); file->Close(); delete file; @@ -208,11 +224,11 @@ double runFittingTestPlexGPU(FitterCU &cuFitter, Nhits, simtracks, itrack, end, ev.layerHits_); -#ifndef NO_ROOT +//#ifndef NO_ROOT double time_output = dtime(); mkfp->OutputFittedTracks(rectracks, itrack, end); std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl; -#endif +//#endif } time = dtime() - time; diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index cfd3ea7587608..67189817e65af 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -129,15 +129,13 @@ __device__ void multResidualsAdd_fn( /*int i = threadIdx.x;*/ /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - float x0, x1, x2; - /*for (int z = 0; z < (N-1)/gridDim.x +1; z++) {*/ /*n += z*gridDim.x;*/ if (n < N) { // manually substract into local vars -- 3 of them - x0 = c[0 * cN + n] - b[0 * bN + n]; - x1 = c[1 * cN + n] - b[1 * bN + n]; - x2 = c[2 * cN + n] - b[2 * bN + n]; + const float x0 = c[0 * cN + n] - b[0 * bN + n]; + const float x1 = c[1 * cN + n] - b[1 * bN + n]; + const float x2 = c[2 * cN + n] - b[2 * bN + n]; // generate loop (can also write it manually this time, it's not much) // WARNING: highly numerically sensitive expressions. 
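Aside on the fittestMPlex.cc change above: with the GPU fit driving validation from several host threads at once (the single-thread guard is dropped from mkFit.cc in the next hunk), the ROOT output has to be serialized explicitly, which is what the static std::mutex and the std::lock_guard at the top of make_validation_tree do. The pattern in isolation (write_validation is a placeholder name, not a function from the patch):

#include <mutex>

void write_validation(const char* fname)
{
  // one mutex shared by every thread that enters this function
  static std::mutex write_lock;
  // RAII guard: taken here, released automatically on any return or throw,
  // so the whole non-thread-safe ROOT session runs under the lock
  std::lock_guard<std::mutex> guard(write_lock);
  // ... TFile::Open(fname, "recreate"), Branch()/Fill() the TTree,
  // Write() and Close() ...
}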
diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index 505a341b3b32d..76ab882c782a3 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -21,6 +21,9 @@ #include "FitterCU.h" #endif +//#define DEBUG +#include "Debug.h" + #include #include @@ -180,16 +183,16 @@ void test_standard() std::vector events; std::vector validations(Config::nEvents); + events.reserve(Config::nEvents); // simulations are all performed before the fitting loop. // It is mandatory in order to see the benefits of running // multiple streams. for (int evt = 1; evt <= Config::nEvents; ++evt) { printf("Simulating event %d\n", evt); - Event ev(geom, val, evt); - ev.Simulate(); - ev.resetLayerHitMap(true); - - events.push_back(ev); + events.emplace_back(geom, val, evt); + events.back().Simulate(); + events.back().resetLayerHitMap(true); + dprint("Event #" << events.back().evtID() << " simtracks " << events.back().simTracks_.size() << " layerhits " << events.back().layerHits_.size()); } // The first call to a GPU function always take a very long time. @@ -238,22 +241,19 @@ void test_standard() printf("Processing event %d with thread %d\n", evt, idx); Event &ev = events[evt-1]; std::vector plex_tracks_ev; + dprint("cuFitter thread " << thr_idx << " simTracks.size(): " << ev.simTracks_.size()); plex_tracks_ev.resize(ev.simTracks_.size()); double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - printf("Matriplex fit = %.5f -------------------------------------", tmp); + printf("Matriplex fit = %.5f\n-------------------------------------", tmp); printf("\n"); s_tmp += tmp; #if 1 // 0 for timing, 1 for validation - // Validation crashes for multiple threads. - // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { - if (g_run_fit_std) { - std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; - make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); - } + if (g_run_fit_std) { + std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; + make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); } #endif } diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 6b6f3f8b2d9ec..5c25621b74328 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -1,10 +1,30 @@ #include "Config.h" +#include "Debug.h" #include "propagation_kernels.h" #include -#define L 6 -#define LL 36 -#define LS 21 +constexpr int L = 6; +constexpr int LL = 36; +constexpr int LS = 21; + +template +struct GPlexReg { + __device__ T operator[](int xx) const { return arr[xx]; } + __device__ T& operator[](int xx) { return arr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return arr[i*D2 + j]; } + __device__ T operator()(int n, int i, int j) const { return arr[i*D2 + j]; } + + __device__ void SetVal(T v) + { + for (int i = 0; i < D1; ++i) + { + arr[i] = v; + } + } + + T arr[D1]; +}; // values from 32 to 512 give good results. // 32 gives slightly better results (on a K40) @@ -12,7 +32,7 @@ #define MAX_BLOCKS_X 65535 // CUDA constraint __device__ float hipo(float x, float y) { - return sqrt(x*x + y*y); + return std::sqrt(x*x + y*y); } __device__ void sincos4(float x, float& sin, float& cos) { // Had this writen with explicit division by factorial. 
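The hipo and sincos4 helpers quoted at the end of the hunk above mirror the host versions in Matrix.h, so the shared propagation body can call them from device code. For reference, a fourth-order Taylor sincos of the kind the "explicit division by factorial" comment describes, with 1/2!, 1/3! and 1/4! folded into constants (the exact constants and accuracy trade-offs of the real implementation may differ):

__device__ inline void sincos4_sketch(const float x, float& sin, float& cos)
{
  // Taylor expansion around 0; adequate for the small angular steps
  // (r-r0)*invcurvature taken by the iterative helix propagation
  const float x2 = x * x;
  cos = 1.f - 0.5f * x2 + 0.041666667f * x2 * x2; // 1 - x^2/2! + x^4/4!
  sin = x - 0.16666667f * x2 * x;                 // x - x^3/3!
}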
@@ -91,213 +111,36 @@ __device__ void computeJacobianSimple(float *errorProp, /// Compute MsRad ///////////////////////////////////////////////////////////// // Not passing msRad.stride, as QF == 1 (second dim f msRad) -__device__ void computeMsRad_fn(const float* __restrict__ msPar, - size_t stride_msPar, float* msRad, int N, int n) { +__device__ void computeMsRad_fn(const GPlex& __restrict__ msPar, + float* msRad, int N, int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { - *msRad = hipo(msPar[n], msPar[n + stride_msPar]); + *msRad = hipo(msPar.ptr[n], msPar.ptr[n + msPar.stride]); } } -__device__ -void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, - int *inChg, float *outPar, size_t outPar_stride, float msRad, - float *errorProp_reg, int N, int n) { +#include "PropagationMPlex.icc" - size_t opN = outPar_stride; - size_t ipN = inPar_stride; +__device__ +void helixAtRFromIterative_fn(const GPlex& inPar, + const GPlex& inChg, GPlex& outPar_global, const GPlexReg& msRad, + GPlexReg& errorProp, int N, int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - float outPar_reg[5]; + GPlexReg outPar; if (n < N) { for (int j = 0; j < 5; ++j) { - outPar_reg[j] = outPar[n+j*opN]; - } - const float& xin = inPar[n + 0*ipN]; - const float& yin = inPar[n + 1*ipN]; - const float& pxin = inPar[n + 3*ipN]; - const float& pyin = inPar[n + 4*ipN]; - const float& pzin = inPar[n + 5*ipN]; - const float& r = msRad; - float r0 = hipo(xin, yin); - - if (fabs(r-r0)<0.0001) { - // get an identity matrix - computeJacobianSimple(errorProp_reg, 0, 1, 1, 1, 1, 1, 0, 1, 0, N); - return; // continue; - } - float pt2 = pxin*pxin+pyin*pyin; - float pt = sqrt(pt2); - float ptinv = 1./pt; - float pt2inv = ptinv*ptinv; - //p=0.3Br => r=p/(0.3*B) - float k = inChg[n] * 100. / (-0.299792458*Config::Bfield); - float invcurvature = 1./(pt*k);//in 1./cm - float ctgTheta=pzin*ptinv; - - //variables to be updated at each iterations - float totalDistance = 0; - //derivatives initialized to value for first iteration, i.e. distance = r-r0in - float dTDdx = r0>0. ? -xin/r0 : 0.; - float dTDdy = r0>0. ? 
-yin/r0 : 0.; - float dTDdpx = 0.; - float dTDdpy = 0.; - //temporaries used within the loop (declare here to reduce memory operations) - float x = 0.; - float y = 0.; - float px = 0.; - float py = 0.; - float cosAP=0.; - float sinAP=0.; - float dAPdx = 0.; - float dAPdy = 0.; - float dAPdpx = 0.; - float dAPdpy = 0.; - // float dxdvar = 0.; - // float dydvar = 0.; - //5 iterations is a good starting point - //const unsigned int Niter = 10; - // const unsigned int Niter = 5+std::round(r-r0)/2; - for (unsigned int iter=0; iter < Config::Niter; ++iter) { - x = outPar_reg[0]; - y = outPar_reg[1]; - px = outPar_reg[3]; - py = outPar_reg[4]; - r0 = hipo(outPar_reg[0], outPar_reg[1]); - - totalDistance += (r-r0); - if (Config::useTrigApprox) { // TODO: uncomment - sincos4((r-r0)*invcurvature, sinAP, cosAP); - } else { - cosAP=cos((r-r0)*invcurvature); - sinAP=sin((r-r0)*invcurvature); - } - - //helix propagation formulas - //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf - outPar_reg[0] = outPar_reg[0] + k*(px*sinAP-py*(1-cosAP)); - outPar_reg[1] = outPar_reg[1] + k*(py*sinAP+px*(1-cosAP)); - outPar_reg[2] = outPar_reg[2] + (r-r0)*ctgTheta; - outPar_reg[3] = px*cosAP-py*sinAP; - outPar_reg[4] = py*cosAP+px*sinAP; - //outPar.At(n, 5, 0) = pz; //take this out as it is redundant - - if (Config::useSimpleJac==0 && - iter +1 != Config::Niter && - r0 > 0 && fabs((r-r0)*invcurvature)>0.000000001) { - //update derivatives on total distance for next step, where totalDistance+=r-r0 - //now r0 depends on px and py - r0 = 1./r0;//WARNING, now r0 is r0inv (one less temporary) - - //update derivative on D - dAPdx = -x*r0*invcurvature; - dAPdy = -y*r0*invcurvature; - dAPdpx = -(r-1./r0)*invcurvature*px*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - dAPdpy = -(r-1./r0)*invcurvature*py*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - //reduce temporary variables - //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); - //dydx = k*dAPdx*(py*cosAP + px*sinAP); - //dTDdx -= r0*(x*dxdx + y*dydx); - dTDdx -= r0*(x*(1 + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); - //reuse same temporary variables - //dxdy = k*dAPdy*(px*cosAP - py*sinAP); - //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); - //dTDdy -= r0*(x*dxdy + y*dydy); - dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1 + k*dAPdy*(py*cosAP + px*sinAP))); - //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); - //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); - //dTDdpx -= r0*(x*dxdpx + y*dydpx); - dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx))); - //dxdpy = k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy); - //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); - //dTDdpy -= r0*(x*dxdpy + y*(k*dydpy); - dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1. 
+ cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); - - } - float& TD=totalDistance; - float TP=TD*invcurvature;//totalAngPath - - float& iC=invcurvature; - float dCdpx = k*pxin*ptinv; - float dCdpy = k*pyin*ptinv; - float dTPdx = dTDdx*iC; - float dTPdy = dTDdy*iC; - float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division - float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division - - float cosTP, sinTP; - if (Config::useTrigApprox) { - sincos4(TP, sinTP, cosTP); - } else { - cosTP = cos(TP); - sinTP = sin(TP); - } - - if (Config::useSimpleJac) { - //assume total path length s as given and with no uncertainty - float p = pt2 + pzin*pzin; - p = sqrt(p); - float s = TD*p*ptinv; - computeJacobianSimple(errorProp_reg, s, k, p, pxin, pyin, pzin, TP, cosTP, sinTP, N); - } else { - //now try to make full jacobian - //derive these to compute jacobian - //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); - //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); - //z = zin + k*TP*pzin; - //px = pxin*cosTP-pyin*sinTP; - //py = pyin*cosTP+pxin*sinTP; - //pz = pzin; - //jacobian - - errorProp_reg[(0*L + 0)] = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; - errorProp_reg[(0*L + 1)] = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; - errorProp_reg[(0*L + 2)] = 0.; - errorProp_reg[(0*L + 3)] = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; - errorProp_reg[(0*L + 4)] = k*(pxin*cosTP*dTPdpy - 1. + cosTP - pyin*sinTP*dTPdpy);//dxdpy; - errorProp_reg[(0*L + 5)] = 0.; - - errorProp_reg[(1*L + 0)] = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; - errorProp_reg[(1*L + 1)] = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; - errorProp_reg[(1*L + 2)] = 0.; - errorProp_reg[(1*L + 3)] = k*(pyin*cosTP*dTPdpx + 1. - cosTP + pxin*sinTP*dTPdpx);//dydpx; - errorProp_reg[(1*L + 4)] = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; - errorProp_reg[(1*L + 5)] = 0.; - - errorProp_reg[(2*L + 0)] = k*pzin*dTPdx; //dzdx; - errorProp_reg[(2*L + 1)] = k*pzin*dTPdy; //dzdy; - errorProp_reg[(2*L + 2)] = 1.; - errorProp_reg[(2*L + 3)] = k*pzin*dTPdpx;//dzdpx; - errorProp_reg[(2*L + 4)] = k*pzin*dTPdpy;//dzdpy; - errorProp_reg[(2*L + 5)] = k*TP; //dzdpz; - - errorProp_reg[(3*L + 0)] = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; - errorProp_reg[(3*L + 1)] = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; - errorProp_reg[(3*L + 2)] = 0.; - errorProp_reg[(3*L + 3)] = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; - errorProp_reg[(3*L + 4)] = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; - errorProp_reg[(3*L + 5)] = 0.; - - errorProp_reg[(4*L + 0)] = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; - errorProp_reg[(4*L + 1)] = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; - errorProp_reg[(4*L + 2)] = 0.; - errorProp_reg[(4*L + 3)] = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; - errorProp_reg[(4*L + 4)] = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; - errorProp_reg[(4*L + 5)] = 0.; - - errorProp_reg[(5*L + 0)] = 0.; - errorProp_reg[(5*L + 1)] = 0.; - errorProp_reg[(5*L + 2)] = 0.; - errorProp_reg[(5*L + 3)] = 0.; - errorProp_reg[(5*L + 4)] = 0.; - errorProp_reg[(5*L + 5)] = 1.; - } + outPar[j] = outPar_global(n, j, 0); } + errorProp.SetVal(0); + + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n); + // Once computations are done. Get values from registers to global memory. 
for (int j = 0; j < 5; ++j) { - outPar[n + j*opN] = outPar_reg[j]; + outPar_global(n, j, 0) = outPar[j]; } } } @@ -396,27 +239,23 @@ __device__ void similarity_fn(float* a, float *b, size_t stride_outErr, } __global__ void propagation_kernel( - const float* __restrict__ msPar, size_t stride_msPar, - float *inPar, size_t inPar_stride, int *inChg, - float *outPar, size_t outPar_stride, float *errorProp, - size_t errorProp_stride, float *outErr, size_t outErr_stride, int N) { + GPlex msPar, + GPlex inPar, GPlex inChg, + GPlex outPar, GPlex errorProp, + GPlex outErr, int N) { int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - float msRad_reg; + GPlexReg msRad_reg; // Using registers instead of shared memory is ~ 30% faster. - float errorProp_reg[LL]; + GPlexReg errorProp_reg; // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { n += z*grid_width; if (n < N) { - computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); - if (Config::doIterative) { - helixAtRFromIterative_fn(inPar, inPar_stride, - inChg, outPar, outPar_stride, msRad_reg, - errorProp_reg, N, n); - } - similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); + computeMsRad_fn(msPar, msRad_reg.arr, N, n); + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); + similarity_fn(errorProp_reg.arr, outErr.ptr, outErr.stride, N, n); } } } @@ -432,9 +271,5 @@ void propagation_wrapper(cudaStream_t& stream, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); - propagation_kernel <<>> - (msPar.ptr, msPar.stride, - inPar.ptr, inPar.stride, inChg.ptr, - outPar.ptr, outPar.stride, errorProp.ptr, - errorProp.stride, outErr.ptr, outErr.stride, N); + propagation_kernel <<>>(msPar, inPar, inChg, outPar, errorProp, outErr, N); } From 2ba575313cf895b26cda345e0991f39acae44bdf Mon Sep 17 00:00:00 2001 From: Dan Riley Date: Sat, 21 May 2016 07:23:08 -0700 Subject: [PATCH 02/13] icc optimizations --- Matrix.h | 2 +- mkFit/PropagationMPlex.cc | 9 +- mkFit/PropagationMPlex.icc | 365 ++++++++++++++++++----------------- mkFit/propagation_kernels.cu | 2 +- 4 files changed, 190 insertions(+), 188 deletions(-) diff --git a/Matrix.h b/Matrix.h index 4e6953f225734..aec127f56322c 100644 --- a/Matrix.h +++ b/Matrix.h @@ -71,7 +71,7 @@ inline void sincos4(const float x, float& sin, float& cos) #ifdef __INTEL_COMPILER #define ASSUME_ALIGNED(a, b) __assume_aligned(a, b) #else - #define ASSUME_ALIGNED(a, b) __builtin_assume_aligned(a, b) + #define ASSUME_ALIGNED(a, b) a = __builtin_assume_aligned(a, b) #endif #include "Matriplex/MatriplexSym.h" diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 5f23c74a5621f..d2fd96d930b10 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -542,17 +542,14 @@ void helixAtRFromIterativePolar(const MPlexLV& inPar, const MPlexQI& inChg, MPle } } +//#pragma omp declare simd simdlen(NN) notinbranch linear(n) #include "PropagationMPlex.icc" void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp) { - errorProp.SetVal(0); -#pragma simd - for (int n = 0; n < NN; ++n) - { - helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n); - } + //#pragma ivdep + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, 0, NN); #ifdef DEBUG { diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc index 1b5ce0bbd4f71..d1f8f0015f870 100644 --- 
a/mkFit/PropagationMPlex.icc +++ b/mkFit/PropagationMPlex.icc @@ -2,194 +2,199 @@ template r=p/(0.3*B) - const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); - const float invcurvature = 1.f/(pt*k);//in 1./cm - const float ctgTheta=pzin*ptinv; - - //variables to be updated at each iterations - float totalDistance = 0; - //derivatives initialized to value for first iteration, i.e. distance = r-r0in - float dTDdx = r0>0. ? -xin/r0 : 0.; - float dTDdy = r0>0. ? -yin/r0 : 0.; - float dTDdpx = 0.; - float dTDdpy = 0.; - //5 iterations is a good starting point - //const unsigned int Niter = 10; - // const unsigned int Niter = 5+std::round(r-r0)/2; - for (unsigned int iter=0; iter < Config::Niter; ++iter) { - - dprint("propagation iteration #" << i); - const float x = outPar(n, 0, 0); - const float y = outPar(n, 1, 0); - const float px = outPar(n, 3, 0); - const float py = outPar(n, 4, 0); - r0 = hipo(x, y); - - dprint("r0=" << r0 << " pt=" << pt); - - totalDistance += (r-r0); - dprint("distance=" << (r-r0) << " angPath=" << (r-r0)*invcurvature); - - float cosAP, sinAP; - if (Config::useTrigApprox) { // TODO: uncomment - sincos4((r-r0)*invcurvature, sinAP, cosAP); - } else { - cosAP=std::cos((r-r0)*invcurvature); - sinAP=std::sin((r-r0)*invcurvature); - } +#pragma simd + for (int n = nmin; n < nmax; ++n) { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float pxin = inPar(n, 3, 0); + const float pyin = inPar(n, 4, 0); + const float pzin = inPar(n, 5, 0); + const float r = msRad(n, 0, 0); + float r0 = hipo(xin, yin); + + dprint(std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar.ConstAt(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pzin << " q=" << inChg.ConstAt(n, 0, 0)); - //helix propagation formulas - //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf - outPar(n, 0, 0) = outPar(n, 0, 0) + k*(px*sinAP-py*(1-cosAP)); - outPar(n, 1, 0) = outPar(n, 1, 0) + k*(py*sinAP+px*(1-cosAP)); - outPar(n, 2, 0) = outPar(n, 2, 0) + (r-r0)*ctgTheta; - outPar(n, 3, 0) = px*cosAP-py*sinAP; - outPar(n, 4, 0) = py*cosAP+px*sinAP; - //outPar(n, 5, 0) = pz; //take this out as it is redundant - - if (iter+1 != Config::Niter && r0 > 0 && std::abs((r-r0)*invcurvature)>0.000000001f) - { - //update derivatives on total distance for next step, where totalDistance+=r-r0 - //now r0 depends on px and py - - dprint("r0=" << 1.f/r0 << " r0inv=" << r0 << " pt=" << pt); - - //update derivative on D - const float dAPdpx = -(r-r0)*invcurvature*px*pt2inv;//r0 is now 1./r0 (this could go above the redefinition of r0!) 
- const float dAPdpy = -(r-r0)*invcurvature*py*pt2inv; - r0 = 1.f/r0;//WARNING, now r0 is r0inv (one less temporary) - const float dAPdx = -x*r0*invcurvature; - const float dAPdy = -y*r0*invcurvature; - //reduce temporary variables - //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); - //dydx = k*dAPdx*(py*cosAP + px*sinAP); - //dTDdx -= r0*(x*dxdx + y*dydx); - dTDdx -= r0*(x*(1.f + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); - //reuse same temporary variables - //dxdy = k*dAPdy*(px*cosAP - py*sinAP); - //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); - //dTDdy -= r0*(x*dxdy + y*dydy); - dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1.f + k*dAPdy*(py*cosAP + px*sinAP))); - //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); - //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); - //dTDdpx -= r0*(x*dxdpx + y*dydpx); - dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1.f - cosAP + px*sinAP*dAPdpx))); - //dxdpy = k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy); - //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); - //dTDdpy -= r0*(x*dxdpy + y*(dydpy); - dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1.f + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); - - } - dprint("iteration end, dump parameters" << std::endl - << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl - << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0) << std::endl - << "r=" << std::sqrt( outPar.At(n, 0, 0)*outPar.At(n, 0, 0) + outPar.At(n, 1, 0)*outPar.At(n, 1, 0) ) - << " pT=" << std::sqrt( outPar.At(n, 3, 0)*outPar.At(n, 3, 0) + outPar.At(n, 4, 0)*outPar.At(n, 4, 0) )); - const float TD=totalDistance; - const float TP=TD*invcurvature;//totalAngPath + if (std::abs(r-r0)<0.0001f) { + dprint("distance less than 1mum, skip"); + continue; + } + const float pt2 = pxin*pxin+pyin*pyin; + const float pt = std::sqrt(pt2); + const float ptinv = 1.f/pt; + const float pt2inv = ptinv*ptinv; + //p=0.3Br => r=p/(0.3*B) + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float invcurvature = 1.f/(pt*k);//in 1./cm + const float ctgTheta=pzin*ptinv; + + //variables to be updated at each iterations + float totalDistance = 0; + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dTDdx = r0>0.0f ? -xin/r0 : 0.0f; + float dTDdy = r0>0.0f ? 
-yin/r0 : 0.0f; + float dTDdpx = 0.f; + float dTDdpy = 0.f; + //5 iterations is a good starting point + //const unsigned int Niter = 10; + // const unsigned int Niter = 5+std::round(r-r0)/2; +#pragma ivdep + for (unsigned int iter=0; iter < Config::Niter; ++iter) { + + dprint("propagation iteration #" << i); + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float px = outPar(n, 3, 0); + const float py = outPar(n, 4, 0); + r0 = hipo(x, y); + + dprint("r0=" << r0 << " pt=" << pt); + + totalDistance += (r-r0); + dprint("distance=" << (r-r0) << " angPath=" << (r-r0)*invcurvature); + + float cosAP, sinAP; + if (Config::useTrigApprox) { // TODO: uncomment + sincos4((r-r0)*invcurvature, sinAP, cosAP); + } else { + cosAP=std::cos((r-r0)*invcurvature); + sinAP=std::sin((r-r0)*invcurvature); + } + + //helix propagation formulas + //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(px*sinAP-py*(1-cosAP)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(py*sinAP+px*(1-cosAP)); + outPar(n, 2, 0) = outPar(n, 2, 0) + (r-r0)*ctgTheta; + outPar(n, 3, 0) = px*cosAP-py*sinAP; + outPar(n, 4, 0) = py*cosAP+px*sinAP; + //outPar(n, 5, 0) = pz; //take this out as it is redundant + + if (iter+1 != Config::Niter && r0 > 0 && std::abs((r-r0)*invcurvature)>0.000000001f) + { + //update derivatives on total distance for next step, where totalDistance+=r-r0 + //now r0 depends on px and py + + dprint("r0=" << 1.f/r0 << " r0inv=" << r0 << " pt=" << pt); + + //update derivative on D + const float dAPdpx = -(r-r0)*invcurvature*px*pt2inv;//r0 is now 1./r0 (this could go above the redefinition of r0!) + const float dAPdpy = -(r-r0)*invcurvature*py*pt2inv; + r0 = 1.f/r0;//WARNING, now r0 is r0inv (one less temporary) + const float dAPdx = -x*r0*invcurvature; + const float dAPdy = -y*r0*invcurvature; + //reduce temporary variables + //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); + //dydx = k*dAPdx*(py*cosAP + px*sinAP); + //dTDdx -= r0*(x*dxdx + y*dydx); + dTDdx -= r0*(x*(1.f + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); + //reuse same temporary variables + //dxdy = k*dAPdy*(px*cosAP - py*sinAP); + //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); + //dTDdy -= r0*(x*dxdy + y*dydy); + dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1.f + k*dAPdy*(py*cosAP + px*sinAP))); + //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); + //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); + //dTDdpx -= r0*(x*dxdpx + y*dydpx); + dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1.f - cosAP + px*sinAP*dAPdpx))); + //dxdpy = k*(px*cosAP*dAPdpy - 1. 
+ cosAP - py*sinAP*dAPdpy); + //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); + //dTDdpy -= r0*(x*dxdpy + y*(dydpy); + dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1.f + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + + } + dprint("iteration end, dump parameters" << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0) << std::endl + << "r=" << std::sqrt( outPar.At(n, 0, 0)*outPar.At(n, 0, 0) + outPar.At(n, 1, 0)*outPar.At(n, 1, 0) ) + << " pT=" << std::sqrt( outPar.At(n, 3, 0)*outPar.At(n, 3, 0) + outPar.At(n, 4, 0)*outPar.At(n, 4, 0) )); + const float TD=totalDistance; + const float TP=TD*invcurvature;//totalAngPath - dprint("TD=" << TD << " TP=" << TP << " arrived at r=" << std::sqrt(outPar.At(n, 0, 0)*outPar.At(n, 0, 0)+outPar.At(n, 1, 0)*outPar.At(n, 1, 0)) - << std::endl - << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl - << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0)); - - const float iC=invcurvature; - const float dCdpx = k*pxin*ptinv; - const float dCdpy = k*pyin*ptinv; - const float dTPdx = dTDdx*iC; - const float dTPdy = dTDdy*iC; - const float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division - const float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division + dprint("TD=" << TD << " TP=" << TP << " arrived at r=" << std::sqrt(outPar.At(n, 0, 0)*outPar.At(n, 0, 0)+outPar.At(n, 1, 0)*outPar.At(n, 1, 0)) + << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0)); + + const float iC=invcurvature; + const float dCdpx = k*pxin*ptinv; + const float dCdpy = k*pyin*ptinv; + const float dTPdx = dTDdx*iC; + const float dTPdy = dTDdy*iC; + const float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division + const float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division - float cosTP, sinTP; - if (Config::useTrigApprox) { - sincos4(TP, sinTP, cosTP); - } else { - cosTP = std::cos(TP); - sinTP = std::sin(TP); - } - - //now try to make full jacobian - //derive these to compute jacobian - //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); - //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); - //z = zin + k*TP*pzin; - //px = pxin*cosTP-pyin*sinTP; - //py = pyin*cosTP+pxin*sinTP; - //pz = pzin; - //jacobian - - errorProp(n,0,0) = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; - errorProp(n,0,1) = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; - errorProp(n,0,2) = 0.; - errorProp(n,0,3) = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; - errorProp(n,0,4) = k*(pxin*cosTP*dTPdpy - 1.f + cosTP - pyin*sinTP*dTPdpy);//dxdpy; - errorProp(n,0,5) = 0.; + float cosTP, sinTP; + if (Config::useTrigApprox) { + sincos4(TP, sinTP, cosTP); + } else { + cosTP = std::cos(TP); + sinTP = std::sin(TP); + } + + //now try to make full jacobian + //derive these to compute jacobian + //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); + //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); + //z = zin + k*TP*pzin; + //px = pxin*cosTP-pyin*sinTP; + //py = pyin*cosTP+pxin*sinTP; + //pz = pzin; + //jacobian + + errorProp(n,0,0) = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; + errorProp(n,0,1) = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; + 
errorProp(n,0,2) = 0.; + errorProp(n,0,3) = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; + errorProp(n,0,4) = k*(pxin*cosTP*dTPdpy - 1.f + cosTP - pyin*sinTP*dTPdpy);//dxdpy; + errorProp(n,0,5) = 0.; - errorProp(n,1,0) = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; - errorProp(n,1,1) = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; - errorProp(n,1,2) = 0.; - errorProp(n,1,3) = k*(pyin*cosTP*dTPdpx + 1.f - cosTP + pxin*sinTP*dTPdpx);//dydpx; - errorProp(n,1,4) = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; - errorProp(n,1,5) = 0.; + errorProp(n,1,0) = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; + errorProp(n,1,1) = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; + errorProp(n,1,2) = 0.; + errorProp(n,1,3) = k*(pyin*cosTP*dTPdpx + 1.f - cosTP + pxin*sinTP*dTPdpx);//dydpx; + errorProp(n,1,4) = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; + errorProp(n,1,5) = 0.; - errorProp(n,2,0) = k*pzin*dTPdx; //dzdx; - errorProp(n,2,1) = k*pzin*dTPdy; //dzdy; - errorProp(n,2,2) = 1.f; - errorProp(n,2,3) = k*pzin*dTPdpx;//dzdpx; - errorProp(n,2,4) = k*pzin*dTPdpy;//dzdpy; - errorProp(n,2,5) = k*TP; //dzdpz; + errorProp(n,2,0) = k*pzin*dTPdx; //dzdx; + errorProp(n,2,1) = k*pzin*dTPdy; //dzdy; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*pzin*dTPdpx;//dzdpx; + errorProp(n,2,4) = k*pzin*dTPdpy;//dzdpy; + errorProp(n,2,5) = k*TP; //dzdpz; - errorProp(n,3,0) = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; - errorProp(n,3,1) = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; - errorProp(n,3,2) = 0.; - errorProp(n,3,3) = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; - errorProp(n,3,4) = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; - errorProp(n,3,5) = 0.; + errorProp(n,3,0) = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; + errorProp(n,3,1) = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; + errorProp(n,3,2) = 0.; + errorProp(n,3,3) = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; + errorProp(n,3,4) = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; + errorProp(n,3,5) = 0.; - errorProp(n,4,0) = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; - errorProp(n,4,1) = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; - errorProp(n,4,2) = 0.; - errorProp(n,4,3) = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; - errorProp(n,4,4) = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; - errorProp(n,4,5) = 0.; + errorProp(n,4,0) = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; + errorProp(n,4,1) = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; + errorProp(n,4,2) = 0.; + errorProp(n,4,3) = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; + errorProp(n,4,4) = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; + errorProp(n,4,5) = 0.; - errorProp(n,5,0) = 0.; - errorProp(n,5,1) = 0.; - errorProp(n,5,2) = 0.; - errorProp(n,5,3) = 0.; - errorProp(n,5,4) = 0.; - errorProp(n,5,5) = 1.f; + errorProp(n,5,0) = 0.; + errorProp(n,5,1) = 0.; + errorProp(n,5,2) = 0.; + errorProp(n,5,3) = 0.; + errorProp(n,5,4) = 0.; + errorProp(n,5,5) = 1.f; + } } - return true; } diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 5c25621b74328..79a78d06235cc 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -136,7 +136,7 @@ void helixAtRFromIterative_fn(const GPlex& inPar, } errorProp.SetVal(0); - helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n); + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n, n); // Once computations are done. Get values from registers to global memory. 
for (int j = 0; j < 5; ++j) { From 32e4ed81e8bc487bfdfd144aae10ed8c63bc1272 Mon Sep 17 00:00:00 2001 From: Dan Riley Date: Sat, 21 May 2016 10:36:55 -0400 Subject: [PATCH 03/13] fix optimized version for CUDA --- Matrix.h | 2 +- mkFit/PropagationMPlex.icc | 2 +- mkFit/propagation_kernels.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Matrix.h b/Matrix.h index aec127f56322c..a5e2e220dc1a7 100644 --- a/Matrix.h +++ b/Matrix.h @@ -71,7 +71,7 @@ inline void sincos4(const float x, float& sin, float& cos) #ifdef __INTEL_COMPILER #define ASSUME_ALIGNED(a, b) __assume_aligned(a, b) #else - #define ASSUME_ALIGNED(a, b) a = __builtin_assume_aligned(a, b) + #define ASSUME_ALIGNED(a, b) a = static_cast<decltype(a)>(__builtin_assume_aligned(a, b)) #endif #include "Matriplex/MatriplexSym.h" diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc index d1f8f0015f870..a9bfd217d063a 100644 --- a/mkFit/PropagationMPlex.icc +++ b/mkFit/PropagationMPlex.icc @@ -2,7 +2,7 @@ template& inPar, } errorProp.SetVal(0); - helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n, n); + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n, n+1); // Once computations are done. Get values from registers to global memory. for (int j = 0; j < 5; ++j) {
From 3b8ea23e31c839e787767c50e2d98ab3a143a321 Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Thu, 31 Mar 2016 12:45:06 -0400 Subject: [PATCH 04/13] Porting Best Hit to GPU, part 1.

The following steps have been taken:
1) Refactors the CUDA call to fittest to increase mkFit clarity.
2) Starts porting best hit to the GPU arrays of FitterCU in MkBuilder. Tweaks the CUDA compilation to allow separate files for __device__ functions; adds a wrapper for computeChi2gpu: addIntoUpperLeft3x3, invertCramerSym, Chi2Similarity, resErr in registers, no CMS geometry, copy XHitPos to GPU. Adds a hit-structure class for CUDA.
3) Adds tricks to use the Hit/SVector classes on the GPU, trying to minimize the amount of rewriting required and the preprocessing required by nvcc: adds function *declarations* to access private variables to the classes, then *defines* these functions as if they had been declared with __device__ in a .cu file. Save point: figured out that the problem was that the hits array needs to be shifted from one hit_cnt to another one; pitch -> stride, fix that silly mistake. Save point: moves the loop over hits inside cuFitter.addBestHit; compares each new hit with the previous best one (chi2) on the GPU and copies the hit with the lowest chi2.
4) Best hit on GPU: removes data transfers from the loop over hits; puts best_hit in a single routine (still needs more cleaning).
5) Starts merging GPU wrappers for best hit.
6) Loops over hit_cnt inside the bestHit kernel.
7) Adds updateTracks with best hits to the bestHit wrapper.
8) minChi2: global mem -> register.
9) bestHit: global mem -> registers.
10) Decouples the CPU and GPU addBestHit functions.
11) Adds selectHitRange for bestHit to the GPU code.
12) Removes the XHitPos/XHitSize CPU->GPU transfers.
13) Changes d_outErr to d_Err_iC to be coherent with the CPU code; additional unused GPU wrapper.
14) Refactors the GPlex class -- introduces templates. GPlex is now templated with the type of the Matriplex it mirrors. This allows copy methods to be shared and largely reduces boilerplate code; members of GPlex have been renamed to follow the Matriplex convention.
--- Config.h | 2 +- Hit.h | 4 + Makefile.config | 4 +- Math/SVector.h | 3 + mkFit/FitterCU-imp.h | 206 ++++++------ mkFit/FitterCU.h | 65 ++-- mkFit/GPlex.h | 53 ++- mkFit/HitStructuresCU.cu | 47 +++ mkFit/HitStructuresCU.h | 29 ++ mkFit/KalmanUtilsMPlex.cc | 5 +- mkFit/KalmanUtilsMPlex.h | 8 + mkFit/Makefile | 13 +- mkFit/MkBuilder.cc | 53 ++- mkFit/MkBuilder.h | 4 + mkFit/MkFitter.cc | 15 +- mkFit/MkFitter.h | 2 + mkFit/PropagationMPlex.cc | 4 +- mkFit/buildtestMPlex.cc | 4 + mkFit/computeChi2_kernels.cu | 568 +++++++++++++++++++++++++++++++++ mkFit/computeChi2_kernels.h | 42 +++ mkFit/fittestMPlex.cc | 68 ++++ mkFit/fittestMPlex.h | 1 + mkFit/kalmanUpdater_kernels.cu | 17 +- mkFit/kalmanUpdater_kernels.h | 17 +- mkFit/mkFit.cc | 74 +---- mkFit/propagation_kernels.cu | 84 ++++- mkFit/propagation_kernels.h | 15 +- mkFit/reorganize.cu | 15 +- mkFit/reorganize.h | 2 +- 29 files changed, 1195 insertions(+), 229 deletions(-) create mode 100644 mkFit/HitStructuresCU.cu create mode 100644 mkFit/HitStructuresCU.h create mode 100644 mkFit/computeChi2_kernels.cu create mode 100644 mkFit/computeChi2_kernels.h diff --git a/Config.h b/Config.h index 5c106c04c659c..278743ac9e44c 100644 --- a/Config.h +++ b/Config.h @@ -167,7 +167,7 @@ namespace Config #ifdef __MIC__ #define MPT_SIZE 16 #elif defined USE_CUDA - #define MPT_SIZE 10000 + #define MPT_SIZE 8 // 20000 #else #define MPT_SIZE 8 #endif diff --git a/Hit.h b/Hit.h index 5da4ffa2374bc..fc3153b17b646 100644 --- a/Hit.h +++ b/Hit.h @@ -191,6 +191,10 @@ class Hit const float* posArray() const {return state_.pos_.Array();} const float* errArray() const {return state_.err_.Array();} +//#ifdef USE_CUDA + float* posArrayCU(); + float* errArrayCU(); +//#endif // Non-const versions needed for CopyOut of Matriplex. SVector3& parameters_nc() {return state_.pos_;} diff --git a/Makefile.config b/Makefile.config index 99d46ebd8cf23..552cb048fe9b4 100644 --- a/Makefile.config +++ b/Makefile.config @@ -22,14 +22,14 @@ # 2. Use gcc (clang by default on mac) or icc # Comment out to force using standard c++. For mic only icc can be used. ifdef INTEL_LICENSE_FILE -CXX := icc +CXX := icpc endif # 2.1 Use nvcc to compile cuda code # CUDA compiler NV := nvcc # Comment out to compile for CPU - # USE_CUDA := -DUSE_CUDA +USE_CUDA := -DUSE_CUDA # 3.
Optimization # -O3 implies vectorization and simd (but not AVX) diff --git a/Math/SVector.h b/Math/SVector.h index 7de0c830a9538..a81c019499f8b 100644 --- a/Math/SVector.h +++ b/Math/SVector.h @@ -185,6 +185,9 @@ class SVector { const T* Array() const; /// return non-const pointer to internal array T* Array(); +//#ifdef USE_CUDA + T* ArrayCU(); +//#endif /** @name --- STL-like interface --- */ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index 8a94f3d5f4aaa..d65c1e596a922 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -1,3 +1,5 @@ +#include + template void FitterCU::setNumberTracks(idx_t Ntracks) { N = Ntracks; @@ -22,13 +24,15 @@ void FitterCU::destroyStream() { template void FitterCU::allocateDevice() { + d_par_iP.allocate(Nalloc, LV); d_par_iC.allocate(Nalloc, LV); + + d_Err_iP.allocate(Nalloc, LS); + d_Err_iC.allocate(Nalloc, LS); + d_inChg.allocate(Nalloc, QI); - d_par_iP.allocate(Nalloc, LV); d_errorProp.allocate(Nalloc, LL); - d_Err_iP.allocate(Nalloc, LS); d_msPar.allocate(Nalloc, HV); - d_outErr.allocate(Nalloc, LS); d_msErr.allocate(Nalloc, HS); cudaCheckError() @@ -42,123 +46,138 @@ void FitterCU::freeDevice() { d_errorProp.free(); d_Err_iP.free(); d_msPar.free(); - d_outErr.free(); + d_Err_iC.free(); d_msErr.free(); cudaCheckError() } template -void FitterCU::sendInParToDevice(const MPlexLV& inPar) { - cudaMemcpy2DAsync(d_par_iC.ptr, d_par_iC.pitch, inPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::kalmanUpdateMerged() { + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, + d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); } template -void FitterCU::sendInErrToDevice(const MPlexLS& inErr) { - cudaMemcpy2DAsync(d_outErr.ptr, d_outErr.pitch, inErr.fArray, N*sizeof(T), - N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::propagationMerged() { + propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, + //d_par_iP, d_Err_iC, d_Err_iP, N); // TODO: Check outErr/errorProp + d_par_iP, d_errorProp, d_Err_iP, N); } template -void FitterCU::sendInChgToDevice(const MPlexQI& inChg) { - cudaMemcpy2DAsync(d_inChg.ptr, d_inChg.pitch, inChg.fArray, N*sizeof(T), - N*sizeof(T), QI, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::computeChi2gpu( + const MPlexLS &psErr, MPlexHS &msErr, + MPlexHV& msPar, const MPlexLV& propPar, GPlexQF& d_outChi2, int NN) { + + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} + + computeChi2_wrapper(stream, d_Err_iP, d_msErr, //d_resErr, + d_msPar, d_par_iP, d_outChi2, N); } +// FIXME: Temporary. Separate allocations / transfers template -void FitterCU::sendMsRadToDevice(const MPlexQF& msRad) { - cudaMemcpy2DAsync(d_msRad.ptr, d_msRad.pitch, msRad.fArray, N*sizeof(T), - N*sizeof(T), QF, cudaMemcpyHostToDevice, stream); +void FitterCU::allocate_extra_addBestHit() { + d_outChi2.allocate(Nalloc, QF); + d_XHitPos.allocate(Nalloc, QI); + d_XHitSize.allocate(Nalloc, QI); + // FIXME: Make those GPlex-es. 
and use .allocate() + cudaMalloc((void**)&d_HitsIdx, Nalloc*sizeof(int)); cudaCheckError(); + cudaMalloc((void**)&d_Chi2, Nalloc*sizeof(float)); cudaCheckError(); cudaCheckError() } template -void FitterCU::sendOutParToDevice(const MPlexLV& outPar) { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, outPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() -} +void FitterCU::free_extra_addBestHit() { + cudaFree(d_HitsIdx); cudaCheckError(); + cudaFree(d_Chi2); cudaCheckError(); -template -void FitterCU::sendOutErrToDevice(const MPlexLS& outErr) { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, outErr.fArray, N*sizeof(T), - N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - cudaCheckError() + d_XHitPos.free(); cudaCheckError(); + d_XHitSize.free(); cudaCheckError(); + d_outChi2.free(); cudaCheckError(); } +// FIXME: Temporary. Separate allocations / transfers template -void FitterCU::sendMsParToDevice(const MPlexHV& msPar) { - cudaMemcpy2DAsync(d_msPar.ptr, d_msPar.pitch, msPar.fArray, N*sizeof(T), - N*sizeof(T), HV, cudaMemcpyHostToDevice, stream); - cudaCheckError() -} +void FitterCU::prepare_addBestHit( + const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, + size_t NN) { + setNumberTracks(NN); // temporary: should be end - beg -template -void FitterCU::sendMsErrToDevice(const MPlexHS& msErr) { - cudaMemcpy2DAsync(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), - N*sizeof(T), HS, cudaMemcpyHostToDevice, stream); + createStream(); cudaCheckError() -} -template -void FitterCU::getOutParFromDevice(MPlexLV& outPar) { - cudaMemcpy2DAsync(outPar.fArray, N*sizeof(T), d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToHost, stream); - cudaCheckError() + // psErr -> d_Err_iP + cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, psErr.fArray, N*sizeof(T), + N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); + // sendOutParToDevice(propPar); // d_par_iP + d_par_iP.copyAsyncFromHost(stream, propPar); + //sendInChgToDevice(inChg); + d_inChg.copyAsyncFromHost(stream, inChg); } +// TODO: Temporary. 
Separate allocations / transfers template -void FitterCU::getErrorPropFromDevice(MPlexLL& errorProp) { - cudaMemcpy2DAsync(errorProp.fArray, N*sizeof(T), - d_errorProp.ptr, d_errorProp.pitch, - N*sizeof(T), LL, cudaMemcpyDeviceToHost, stream); - cudaCheckError() -} +void FitterCU::finalize_addBestHit( + MPlexHS &msErr, MPlexHV& msPar, + MPlexLS &outErr, MPlexLV &outPar, + MPlexQI &HitsIdx, MPlexQF &Chi2) { + //getOutParFromDevice(outPar); // <- d_par_iC + d_par_iC.copyAsyncToHost(stream, outPar); + //getOutErrFromDevice(outErr); // <- d_Err_iC + d_Err_iC.copyAsyncToHost(stream, outErr); + + // + // Get msPar, msErr, chi2 and HitIdx out from the GPU to the CPU + cudaMemcpy2DAsync(msPar.fArray, N*sizeof(T), d_msPar.ptr, d_msPar.pitch, + N*sizeof(T), HV, cudaMemcpyDeviceToHost, stream); + cudaMemcpy2DAsync(msErr.fArray, N*sizeof(T), d_msErr.ptr, d_msErr.pitch, + N*sizeof(T), HS, cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(HitsIdx.fArray, d_HitsIdx, N*sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(Chi2.fArray, d_Chi2, N*sizeof(float), cudaMemcpyDeviceToHost, stream); -template -void FitterCU::getOutErrFromDevice(MPlexLS& outErr) { - cudaMemcpy2DAsync(outErr.fArray, N*sizeof(T), d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToHost, stream); - cudaCheckError() -} -template -void FitterCU::getMsRadFromDevice(MPlexQF& msRad) { - cudaMemcpy2DAsync(msRad.fArray, N*sizeof(T), d_msRad.ptr, d_msRad.pitch, - N*sizeof(T), QF, cudaMemcpyDeviceToHost, stream); - cudaCheckError() + destroyStream(); } template -void FitterCU::setOutParFromInPar() { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() -} +void FitterCU::addBestHit(BunchOfHitsCU &bunch) { -template -void FitterCU::setOutErrFromInErr() { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() -} + selectHitRanges_wrapper(stream, bunch, d_XHitPos, d_XHitSize, + d_Err_iP, d_par_iP, N); -template -void FitterCU::kalmanUpdateMerged() { + // TODO: get this thing inside bestHit_kernel + int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); + + bestHit_wrapper(stream, bunch, d_XHitPos, + d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, + d_Chi2, d_HitsIdx, + maxSize, N); + + // updateParametersMPlex kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_outErr, N); -} + d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); + //updateParametersMPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], + // Err[iC], Par[iC]); +} +#if 0 template -void FitterCU::propagationMerged() { - propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, - d_par_iP, d_errorProp, d_Err_iP, N); +void FitterCU::propagateTracksToR(float radius, int N) { + //propagateHelixToRMPlex(Err[iC], Par[iC], Chg, R, + //Err[iP], Par[iP], N_proc); + propagationForBuilding_wrapper(stream, radius, + d_par_iC, d_inChg, d_par_iP, d_errorProp, d_Err_iP, N); + //propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, + // d_par_iP, d_errorProp, d_Err_iP, N); } - +#endif template void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, @@ -177,9 +196,12 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, setNumberTracks(end-beg); - sendInChgToDevice(Chg); - sendInParToDevice(par_iC); - sendInErrToDevice(err_iC); + //sendInChgToDevice(Chg); + d_inChg.copyAsyncFromHost(stream, Chg); + 
//sendInParToDevice(par_iC); + d_par_iC.copyAsyncFromHost(stream, par_iC); + //sendInErrToDevice(err_iC); + d_Err_iC.copyAsyncFromHost(stream, err_iC); cudaEventRecord(start, 0); @@ -188,8 +210,10 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, { // Switch outPut and inPut parameters and errors // similar to iC <-> iP in the CPU code. - setOutParFromInPar(); - setOutErrFromInErr(); // d_Err_iP + //setOutParFromInPar(); + d_par_iP.copyAsyncFromDevice(stream, d_par_iC); + //setOutErrFromInErr(); // d_Err_iP + d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); double time_input = dtime(); int itrack; @@ -207,8 +231,10 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, } total_reorg += (dtime() - time_input)*1e3; - sendMsParToDevice(msPar[hi]); - sendMsErrToDevice(msErr[hi]); + //sendMsParToDevice(msPar[hi]); + d_msPar.copyAsyncFromHost(stream, msPar[hi]); + //sendMsErrToDevice(msErr[hi]); + d_msErr.copyAsyncFromHost(stream, msErr[hi]); propagationMerged(); kalmanUpdateMerged(); @@ -220,8 +246,10 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, std::cerr << "CUDA etime: " << etime << " ms.\n"; std::cerr << "Total reorg: " << total_reorg << " ms.\n"; - getOutParFromDevice(par_iC); - getOutErrFromDevice(err_iC); + //getOutParFromDevice(par_iC); + d_par_iC.copyAsyncToHost(stream, par_iC); + //getOutErrFromDevice(err_iC); + d_Err_iC.copyAsyncToHost(stream, err_iC); cudaStreamSynchronize(stream); // freeDevice(); -> moved to mkFit/mkFit.cc diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index a56a912559ea3..c37fbb0abff79 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -8,6 +8,8 @@ #include "Matrix.h" #include "propagation_kernels.h" #include "kalmanUpdater_kernels.h" +#include "computeChi2_kernels.h" +#include "HitStructuresCU.h" #include "GPlex.h" #define LV 6 @@ -58,27 +60,25 @@ class FitterCU { void setNumberTracks(idx_t Ntracks); - void sendInParToDevice(const MPlexLV& inPar); - void sendInErrToDevice(const MPlexLS& inErr); - void sendInChgToDevice(const MPlexQI& inChg); - void sendMsRadToDevice(const MPlexQF& msRad); - void sendOutParToDevice(const MPlexLV& outPar); - void sendOutErrToDevice(const MPlexLS& outErr); - void sendMsParToDevice(const MPlexHV& msPar); - - void getErrorPropFromDevice(MPlexLL& errorProp); - void getMsRadFromDevice(MPlexQF& msRad); + void propagationMerged(); + void kalmanUpdateMerged(); - void setOutParFromInPar(); - void setOutErrFromInErr(); + void computeChi2gpu(const MPlexLS &psErr, MPlexHS &msErr, + MPlexHV& msPar, const MPlexLV& propPar, GPlexQF& d_outChi2, int NN); - // updater specfic transfers. - void sendMsErrToDevice(const MPlexHS& msErr); - void getOutParFromDevice(MPlexLV& outPar); - void getOutErrFromDevice(MPlexLS& outErr); + void allocate_extra_addBestHit(); + void free_extra_addBestHit(); - void propagationMerged(); - void kalmanUpdateMerged(); + void prepare_addBestHit( + const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, + size_t NN); + void finalize_addBestHit( + MPlexHS &msErr, MPlexHV& msPar, + MPlexLS &outErr, MPlexLV &outPar, + MPlexQI &HitsIdx, MPlexQF &Chi2); + + void addBestHit(BunchOfHitsCU &bunch); // fitting higher order methods void FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, @@ -91,18 +91,27 @@ class FitterCU { // to allocated arrays that can be used for several sets of tracks. 
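 // Usage sketch for these device matriplexes (assuming only the GPlex
 // interface defined in GPlex.h below -- allocate() / copyAsyncFromHost() /
 // copyAsyncToHost() / free() -- and a hypothetical host-side MPlexLV
 // named host_par):
 //
 //   GPlexLV d_par;
 //   d_par.allocate(Nalloc, LV);                 // pitched: Nalloc tracks x LV floats
 //   d_par.copyAsyncFromHost(stream, host_par);  // H2D on this fitter's stream
 //   ...                                         // launch kernels on `stream`
 //   d_par.copyAsyncToHost(stream, host_par);    // D2H
 //   cudaStreamSynchronize(stream);              // results valid after sync
 //   d_par.free();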
idx_t Nalloc; idx_t N; + /* data */ - GPlex d_par_iC; // LV - GPlex d_inChg; // QI - GPlex d_par_iP; // LV - GPlex d_msRad; // QF - GPlex d_errorProp; // LL - GPlex d_Err_iP; - GPlex d_msPar; - - GPlex d_outErr; - GPlex d_msErr; + GPlexLV d_par_iP; // LV + GPlexLV d_par_iC; // LV + + GPlexLS d_Err_iP; // LS + GPlexLS d_Err_iC; // LS + + GPlexQI d_inChg; // QI + GPlexQF d_msRad; // QF + GPlexLL d_errorProp; // LL + + GPlexHV d_msPar; + GPlexHS d_msErr; + GPlexQI d_XHitPos; // QI : 1D arrary following itracks + GPlexQI d_XHitSize; // QI : " " + GPlexQF d_outChi2; + int *d_HitsIdx; + float *d_Chi2; + // everything run in a stream so multiple instance of FitterCU can // run concurrently on the GPU. cudaStream_t stream; diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index 9dfe3b86e6e02..087b82c7f28e0 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -4,6 +4,16 @@ #include #include +#include "Matrix.h" + +#define cudaCheckError() { \ + cudaError_t e=cudaGetLastError(); \ + if(e!=cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ + exit(0); \ + } \ +} + // GPU implementation of a Matriplex-like structure // The number of tracks is the fast dimension and is padded in order to have // consecutive and aligned memory accesses. For cached reads, this result in a @@ -11,23 +21,52 @@ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 // In practice, The number of tracks (ntracks) is set to be MPT_SIZE -template +template struct GPlex { T* ptr; - size_t pitch, stride, x, y; + size_t pitch, stride, N, kSize; - void allocate(size_t ntracks, size_t plex_size) { - x = ntracks; - y = plex_size; - cudaMallocPitch((void**)&ptr, &pitch, x*sizeof(T), y); + void allocate(size_t ntracks, size_t aSize) { + N = ntracks; + kSize = aSize; + cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize); stride = pitch/sizeof(T); // Number of elements } void free() { cudaFree(ptr); - x = 0; y = 0; pitch = 0; stride = 0; + N = 0; kSize = 0; pitch = 0; stride = 0; } //cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), //N*sizeof(T), HS, cudaMemcpyHostToDevice); + + void copyAsyncFromHost(cudaStream_t& stream, const M& mplex) { + cudaMemcpy2DAsync(ptr, pitch, mplex.fArray, N*sizeof(T), + N*sizeof(T), kSize, cudaMemcpyHostToDevice, stream); + cudaCheckError(); + } + void copyAsyncToHost(cudaStream_t& stream, M& mplex) { + cudaMemcpy2DAsync(mplex.fArray, N*sizeof(T), ptr, pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToHost, stream); + cudaCheckError(); + } + void copyAsyncFromDevice(cudaStream_t& stream, GPlex& gplex) { + cudaMemcpy2DAsync(ptr, pitch, gplex.ptr, gplex.pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToDevice, stream); + cudaCheckError(); + } }; +using GPlexLL = GPlex; +using GPlexLV = GPlex; +using GPlexLS = GPlex; + +using GPlexHH = GPlex; +using GPlexHV = GPlex; +using GPlexHS = GPlex; + +using GPlexLH = GPlex; + +using GPlexQF = GPlex; +using GPlexQI = GPlex; + #endif // _GPLEX_H_ diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu new file mode 100644 index 0000000000000..2c7ad5d2b57c5 --- /dev/null +++ b/mkFit/HitStructuresCU.cu @@ -0,0 +1,47 @@ + +#include +#include + +#include "HitStructuresCU.h" + +BunchOfHitsCU::BunchOfHitsCU() : + m_real_size {Config::maxHitsPerBunch}, m_fill_index {0} { + cudaMalloc((void**)&m_hits, sizeof(Hit)*m_real_size); +} + +BunchOfHitsCU::~BunchOfHitsCU() { + cudaFree(m_hits); + m_fill_index = 0; +} + +void BunchOfHitsCU::copyBunchOfHitsFromCPU(BunchOfHits& bunch) { + 
m_fill_index = bunch.m_fill_index; + cudaMemcpy(m_hits, bunch.m_hits, sizeof(Hit)*m_fill_index, cudaMemcpyHostToDevice); +} + +void BunchOfHitsCU::allocatePhiBinInfos(int num_phi_bins) { + this->num_phi_bins = num_phi_bins; + cudaMalloc((void**)&m_phi_bin_infos_first, sizeof(int)*num_phi_bins); + cudaMalloc((void**)&m_phi_bin_infos_second, sizeof(int)*num_phi_bins); +} + +void BunchOfHitsCU::freePhiBinInfos() { + cudaFree(m_phi_bin_infos_first); + cudaFree(m_phi_bin_infos_second); +} + +void BunchOfHitsCU::copyPhiBinInfosFromCPU(BunchOfHits &bunch) { + // Strip the bin_infos pairs into two separate vectors + // We cannot use std::pair on the GPU + std::vector first(num_phi_bins); + std::vector second(num_phi_bins); + + for (int i = 0; i < num_phi_bins; ++i) { + std::pair &infos = bunch.m_phi_bin_infos[i]; + first[i] = infos.first; + second[i] = infos.second; + } + + cudaMemcpy(m_phi_bin_infos_first, &first[0], sizeof(int)*num_phi_bins, cudaMemcpyHostToDevice); + cudaMemcpy(m_phi_bin_infos_second, &second[0], sizeof(int)*num_phi_bins, cudaMemcpyHostToDevice); +} diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h new file mode 100644 index 0000000000000..a652ab11bdd3b --- /dev/null +++ b/mkFit/HitStructuresCU.h @@ -0,0 +1,29 @@ +#ifndef _HIT_STRUCTURES_H_ +#define _HIT_STRUCTURES_H_ + +#include "HitStructures.h" +#include "Config.h" + +class BunchOfHitsCU { + public: + Hit *m_hits; + int m_real_size; + int m_fill_index; + + int num_phi_bins; + int *m_phi_bin_infos_first; + int *m_phi_bin_infos_second; + + BunchOfHitsCU(); + ~BunchOfHitsCU(); + + void copyBunchOfHitsFromCPU(BunchOfHits &bunch); + + void allocatePhiBinInfos(int num_phi_bins); + void freePhiBinInfos(); + void copyPhiBinInfosFromCPU(BunchOfHits &bunch); +}; + + +#endif // _HIT_STRUCTURES_H_ + diff --git a/mkFit/KalmanUtilsMPlex.cc b/mkFit/KalmanUtilsMPlex.cc index 51f6a27888532..ecc56226c6397 100644 --- a/mkFit/KalmanUtilsMPlex.cc +++ b/mkFit/KalmanUtilsMPlex.cc @@ -1,6 +1,10 @@ #include "KalmanUtilsMPlex.h" #include "PropagationMPlex.h" +#ifdef USE_CUDA +#include "FitterCU.h" +#endif + namespace { using idx_t = Matriplex::idx_t; @@ -298,7 +302,6 @@ void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MP #endif } - void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2) diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h index 39e969fb4419b..02fe1b9d2aa9e 100644 --- a/mkFit/KalmanUtilsMPlex.h +++ b/mkFit/KalmanUtilsMPlex.h @@ -4,10 +4,18 @@ #include "Track.h" #include "Matrix.h" +#include "FitterCU.h" + void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexLS &outErr, MPlexLV& outPar); +#ifdef USE_CUDA // FIXME: temporary; move to FitterCU +void computeChi2MPlex_tmp(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexQF& outChi2, + FitterCU& cuFitter); +#endif void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2); diff --git a/mkFit/Makefile b/mkFit/Makefile index d2bcc45be65e5..ab550c7c2e6ed 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -73,12 +73,21 @@ CU_OBJS := $(CU_SRCS:.cu=.o) LDFLAGS_CU := -lcudart +# To share __device__ function across several source files +# 1) compile with --device-c (works as -c should be) +# 2) Create some 
kind of dictionary with all the .cu.o +# 3) The dictionary AND the original .o files should be used +# -- Works only for CUDA_VERSION >= 5 # TODO: Clean the "-I.. -std=c++11" ${CU_OBJS}: %.o: %.cu - ${NV} -c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX + ${NV} --device-c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX + +CU_LINK := cu_link.o +${CU_LINK}: ${CU_OBJS} + ${NV} --device-link $^ --output-file $@ endif -ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} +ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} ${CU_LINK} ${MKFDEPS}: auto-genmplex diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc index f46a0daa3eb0e..a2221fef35ccf 100644 --- a/mkFit/MkBuilder.cc +++ b/mkFit/MkBuilder.cc @@ -5,6 +5,10 @@ #include "MkFitter.h" +#ifdef USE_CUDA +#include "FitterCU.h" +#endif + #include namespace @@ -28,10 +32,17 @@ MkBuilder::MkBuilder() : m_event_of_hits(Config::nLayers) { m_mkfp_arr.resize(Config::numThreadsFinder); +#ifdef USE_CUDA + m_cuFitter_arr.resize(Config::numThreadsFinder); +#endif for (int i = 0; i < Config::numThreadsFinder; ++i) { m_mkfp_arr[i] = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(0); +#ifdef USE_CUDA + m_cuFitter_arr[i] = new FitterCU(NN); + m_cuFitter_arr[i]->allocateDevice(); +#endif } } @@ -40,6 +51,9 @@ MkBuilder::~MkBuilder() for (int i = 0; i < Config::numThreadsFinder; ++i) { _mm_free(m_mkfp_arr[i]); +#ifdef USE_CUDA + m_cuFitter_arr[i]->freeDevice(); +#endif } } @@ -218,6 +232,11 @@ void MkBuilder::quality_print() void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) { +#ifdef USE_CUDA // FIXME: temporary; move to FitterCU + m_cuFitter_arr[omp_get_thread_num()]->allocate_extra_addBestHit(); +#endif + + std::cout << "Finding best hits...\n"; // partition recseeds into eta bins for (int iseed = 0; iseed < m_recseeds.size(); ++iseed) { @@ -286,6 +305,34 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) // _mm_prefetch((char*) & bunch_of_hits.m_hits[i], _MM_HINT_T1); // } +#ifdef USE_CUDA + FitterCU *cuFitter = m_cuFitter_arr[omp_get_thread_num()]; + cuFitter->prepare_addBestHit( + mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], + mkfp->Chg, + NN); + + BunchOfHitsCU bunch_of_hits_cu; + bunch_of_hits_cu.copyBunchOfHitsFromCPU(bunch_of_hits); + bunch_of_hits_cu.allocatePhiBinInfos(bunch_of_hits.m_phi_bin_infos.size()); + bunch_of_hits_cu.copyPhiBinInfosFromCPU(bunch_of_hits); + + cuFitter->addBestHit(bunch_of_hits_cu); + + bunch_of_hits_cu.freePhiBinInfos(); + cuFitter->finalize_addBestHit( + mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], + mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], + mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); + + // ... + mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) 
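+      // The per-layer GPU flow above, condensed (a sketch using only the
+      // FitterCU / BunchOfHitsCU calls introduced in this patch):
+      //
+      //   cuFitter->prepare_addBestHit(Err[iP], Par[iP], Chg, NN); // inputs H2D
+      //   bunch_of_hits_cu.copyBunchOfHitsFromCPU(bunch_of_hits);  // hits H2D
+      //   cuFitter->addBestHit(bunch_of_hits_cu);  // select hit + chi2 + update
+      //   cuFitter->finalize_addBestHit(...);      // outputs D2H
+      //
+      // allocate_extra_addBestHit() / free_extra_addBestHit() bracket the whole
+      // seed loop, so only these transfers sit on the per-layer critical path.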
+ + if (ilay + 1 < Config::nLayers) + { + mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack); + } +#else mkfp->SelectHitRanges(bunch_of_hits, end - itrack); // #ifdef PRINTOUTS_FOR_PLOTS @@ -316,13 +363,17 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) #endif } +#endif // USE_CUDA } // end of layer loop mkfp->OutputFittedTracksAndHitIdx(etabin_of_candidates.m_candidates, itrack, end, true); } // end of seed loop - } //end of parallel section over seeds +#ifdef USE_CUDA // FIXME: temporary; move to FitterCU + m_cuFitter_arr[omp_get_thread_num()]->free_extra_addBestHit(); +#endif + } diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index fe03661b21066..152f7c502bbac 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -4,6 +4,7 @@ #include #include "HitStructures.h" +#include "FitterCU.h" class Event; class EventTmp; @@ -18,6 +19,9 @@ class MkBuilder EventOfHits m_event_of_hits; std::vector m_mkfp_arr; +#ifdef USE_CUDA + std::vector*> m_cuFitter_arr; +#endif std::vector m_recseeds; diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 823c8e7c2874b..adf072ca1e73d 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -3,9 +3,6 @@ #include "PropagationMPlex.h" #include "KalmanUtilsMPlex.h" -#ifdef USE_CUDA -#include "FitterCU.h" -#endif #include @@ -40,7 +37,10 @@ void MkFitter::InputTracksAndHits(std::vector& tracks, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA + +// FIXME: uncomment when track building is ported to GPU. +#if 0 +//#ifdef USE_CUDA // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -63,7 +63,9 @@ void MkFitter::InputTracksAndHits(std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +// FIXME: uncomment when track building is ported to GPU. 
+#if 1 +//#ifndef USE_CUDA for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -670,7 +672,6 @@ void MkFitter::SelectHitRanges(BunchOfHits &bunch_of_hits, const int N_proc) phiBinPlus = std::max(0,phiBinPlus); phiBinPlus = std::min(Config::nPhiPart-1,phiBinPlus); - PhiBinInfo_t binInfoMinus = bunch_of_hits.m_phi_bin_infos[phiBinMinus]; PhiBinInfo_t binInfoPlus = bunch_of_hits.m_phi_bin_infos[phiBinPlus]; @@ -922,9 +923,9 @@ void MkFitter::AddBestHit(BunchOfHits &bunch_of_hits) #ifdef DEBUG std::cout << "update parameters" << std::endl; #endif + updateParametersMPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], Err[iC], Par[iC]); - //std::cout << "Par[iP](0,0,0)=" << Par[iP](0,0,0) << " Par[iC](0,0,0)=" << Par[iC](0,0,0)<< std::endl; } diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index c0ad0e9a4cba1..66cc7d142c024 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -8,6 +8,8 @@ #include "HitStructures.h" #include "BinInfoUtils.h" +#include "FitterCU.h" + //#define DEBUG 1 class CandCloner; diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 2315ace35dd42..cbde06d2085fc 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -699,7 +699,7 @@ void applyMaterialEffects(const MPlexQF &hitsRl, const MPlexQF& hitsXi, MPlexLS void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, const MPlexQI &inChg, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar) + MPlexLS &outErr, MPlexLV& outPar) { #ifdef DEBUG const bool dump = false; @@ -793,7 +793,7 @@ void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, const MPlexQI& inChg, const float r, - MPlexLS& outErr, MPlexLV& outPar, + MPlexLS& outErr, MPlexLV& outPar, const int N_proc) { #ifdef DEBUG diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index 76ccdc1cf1693..6dd4775099d00 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -7,6 +7,7 @@ #include "BinInfoUtils.h" #include "MkBuilder.h" +#include "FitterCU.h" #include @@ -73,6 +74,7 @@ double runBuildingTestPlexBestHit(Event& ev) { MkBuilder builder; + std::cout << "Building event...\n"; builder.begin_event(&ev, 0, __func__); double time = dtime(); @@ -81,10 +83,12 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_resume(); #endif + std::cout << "Fitting seeds...\n"; builder.fit_seeds(); EventOfCandidates event_of_cands; + std::cout << "Finding best hits...\n"; builder.FindTracksBestHit(event_of_cands); #ifdef USE_VTUNE_PAUSE diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu new file mode 100644 index 0000000000000..f7fdabcf2b8a2 --- /dev/null +++ b/mkFit/computeChi2_kernels.cu @@ -0,0 +1,568 @@ +#include +#include +#include "GPlex.h" +#include "kalmanUpdater_kernels.h" +#include "computeChi2_kernels.h" +#include "HitStructuresCU.h" + +#include +#include +#include +#include + +#include "Hit.h" + +#define L 6 +#define HS 6 +#define BLOCK_SIZE_X 32 +#define MAX_BLOCKS_X 65535 // CUDA constraint + + +template <> +__device__ float* SVector3::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SVector6::ArrayCU() { + return fArray; +} + +__device__ float *Hit::posArrayCU() { + return state_.pos_.ArrayCU(); +} + +__device__ float *Hit::errArrayCU() { + return state_.err_.ArrayCU(); +} + +__device__ void chi2Similarity_fn( + float *a, size_t aN, + float *b, size_t bN, + float *c, // in registers + float *d, size_t dN) { + + int n = threadIdx.x + 
blockIdx.x * blockDim.x; + + // manually subrtact into local vars -- 3 of them + float x0 = a[0 * aN + n] - b[0 * aN + n]; + float x1 = a[1 * aN + n] - b[1 * aN + n]; + float x2 = a[2 * aN + n] - b[2 * aN + n]; + d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 + + 2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2); +} + +__device__ void computeChi2_fn( + float* propErr, size_t propErr_stride, + float* msErr, size_t msErr_stride, + /*float* resErr, size_t resErr_stride,*/ + float *msPar, size_t msPar_stride, + float *propPar, size_t propPar_stride, + float *outChi2, size_t outChi2_stride, + const int N) { + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + float resErr_reg[HS]; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + if (n < N) { + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } + addIntoUpperLeft3x3_fn(propErr, propErr_stride, + msErr, msErr_stride, resErr_reg, N, n); + invertCramerSym_fn(resErr_reg); + + chi2Similarity_fn(msPar, msPar_stride, + propPar, propPar_stride, resErr_reg, + outChi2, outChi2_stride); + /*for (int j = 0; j < HS; ++j) {*/ + /*resErr[j*resErr_stride + n] = resErr_reg[j];*/ + /*}*/ + } + } +} + +void computeChi2_wrapper(cudaStream_t &stream, + GPlexLS propErr, GPlexHS msErr, // GPlex resErr, + GPlexHV msPar, GPlexLV propPar, GPlexQF outChi2, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); +#if 0 + computeChi2_kernel <<< grid, block, 0, stream >>> + (propErr.ptr, propErr.stride, + msErr.ptr, msErr.stride, + /*resErr.ptr, resErr.stride,*/ + msPar.ptr, msPar.stride, + propPar.ptr, propPar.stride, + outChi2.ptr, outChi2.stride, + N); +#endif + } + +template +__device__ void SlurpIn_fn(float *fArray, int stride, int kSize, + const char *arr, int *vi, int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j(msErr, msErr_stride, msErr_plex_size, varr + (hit_cnt*sizeof(Hit)) + off_error, XHitPos, N); + SlurpIn_fn(msPar, msPar_stride, msPar_plex_size, varr + (hit_cnt*sizeof(Hit)) + off_param, XHitPos, N); + + /*if (j==2) {*/ + /*for (int i = 0; i < msPar_plex_size; ++i) {*/ + /*printf("GPU:Par[%d*N+%d]=%f ", i, j, msPar[i*msErr_stride + j]);*/ + /*}*/ + /*}*/ +} + + +void HitToMs_wrapper(cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, BunchOfHitsCU &bunch, + GPlexQI &XHitPos, int hit_cnt, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); +#if 0 + HitToMs_kernel <<< grid, block, 0 , stream >>> + (msErr.ptr, msErr.stride, msErr.y, + msPar.ptr, msPar.stride, msPar.y, + bunch.m_hits, XHitPos.ptr, hit_cnt, N); +#endif +} + + +__device__ void getNewBestHitChi2_fn(float *outChi2, float &minChi2, + int &bestHit, int hit_cnt, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + float chi2 = fabs(outChi2[itrack]); + if (chi2 < minChi2) { + minChi2 = chi2; + bestHit = hit_cnt; + } + } +} + +void getNewBestHitChi2_wrapper(cudaStream_t &stream, + GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); +#if 0 + getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> + (outChi2.ptr, minChi2, bestHit, hit_cnt, N); +#endif +} + +void fill_array_cu(float *array, int size, int value) { + thrust::device_ptr d_ptr(array); + 
thrust::fill(d_ptr, d_ptr + size, value); +} + + +__device__ void updateTracksWithBestHit_fn(Hit *hits, int *XHitPos, + float minChi2, int bestHit, + float *msErr, int msErr_stride, int msErr_plex_size, + float *msPar, int msPar_stride, int msPar_plex_size, + float *propPar, int propPar_stride, + float *Chi2, int *HitsIdx, + int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + if (bestHit >= 0) + { + const char *varr = (char*) hits; + const int off_error = (char*) hits[0].errArrayCU() - varr; + const int off_param = (char*) hits[0].posArrayCU() - varr; + + /*Hit &hit = hits[ XHitPos.At[itrack] + bestHit[itrack] ];*/ + float &chi2_local = minChi2; + + /*msErr[Nhits].CopyIn(itrack, hit.errArray());*/ + /*SlurpIn_fn(msErr, msErr_stride, msErr_plex_size, + varr + (itrack*sizeof(Hit)) + off_error, XHitPos, N);*/ + /*msPar[Nhits].CopyIn(itrack, hit.posArray());*/ + /*SlurpIn_fn(msPar, msPar_stride, msPar_plex_size, + varr + (itrack*sizeof(Hit)) + off_param, XHitPos, N);*/ + for (int i = 0; i < msErr_plex_size; ++i) { + msErr[i*msErr_stride + itrack] = hits[XHitPos[itrack]+bestHit].errArrayCU()[i]; + } + for (int i = 0; i < msPar_plex_size; ++i) { + msPar[i*msErr_stride + itrack] = hits[XHitPos[itrack]+bestHit].posArrayCU()[i]; + } + /*Chi2(itrack, 0, 0) += chi2_local;*/ + Chi2[itrack] += chi2_local; + /*HitsIdx[Nhits](itrack, 0, 0) = XHitPos.At(itrack, 0, 0) + bestHit[itrack];*/ + HitsIdx[itrack] = XHitPos[itrack] + bestHit; + } + else + { + /*msErr[Nhits].SetDiagonal3x3(itrack, 666);*/ + msErr[0*msErr_stride + itrack] = 666; + msErr[1*msErr_stride + itrack] = 0; + msErr[2*msErr_stride + itrack] = 666; + msErr[3*msErr_stride + itrack] = 0; + msErr[4*msErr_stride + itrack] = 0; + msErr[5*msErr_stride + itrack] = 666; + + /*msPar[Nhits](itrack,0,0) = Par[iP](itrack,0,0);*/ + /*msPar[Nhits](itrack,1,0) = Par[iP](itrack,1,0);*/ + /*msPar[Nhits](itrack,2,0) = Par[iP](itrack,2,0);*/ + for (int i = 0; i < msPar_plex_size; ++i) { + msPar[i*msPar_stride + itrack] = propPar[i*propPar_stride + itrack]; + } + /*HitsIdx[Nhits](itrack, 0, 0) = -1;*/ + HitsIdx[itrack] = -1; + + // Don't update chi2 + } + } +} + +void updateTracksWithBestHit_wrapper(cudaStream_t &stream, + BunchOfHitsCU &bunch, GPlexQI &XHitPos, + float *minChi2, int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, + GPlexLV &propPar, + float *Chi2, int *HitsIdx, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); +/* + updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> + (bunch.m_hits, XHitPos.ptr, + minChi2, best_hit, + msErr.ptr, msErr.stride, msErr.y, + msPar.ptr, msPar.stride, msPar.y, + propPar.ptr, propPar.stride, + Chi2, HitsIdx, + N); +*/ +} + +int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N) { + thrust::device_ptr d_ptr(d_XHitSize.ptr); + int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); + maxSize = std::min(maxSize, Config::maxHitsConsidered); + + return maxSize; +} + +__global__ void bestHit_kernel( + Hit *hits, int *XHitPos, + float* propErr, size_t propErr_stride, + float* msErr, size_t msErr_stride, size_t msErr_plex_size, + float *msPar, size_t msPar_stride, size_t msPar_plex_size, + float *propPar, size_t propPar_stride, + float *outChi2, size_t outChi2_stride, + float *Chi2, int *HitsIdx, + int maxSize, int N) { + + /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ + int bestHit_reg = -1; + float minChi2_reg = 15.f; + + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) + { + 
HitToMs_fn(msErr, msErr_stride, msErr_plex_size, + msPar, msPar_stride, msPar_plex_size, + hits, XHitPos, hit_cnt, N); +#if 0 + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} +#endif + computeChi2_fn(propErr, propErr_stride, + msErr, msErr_stride, + msPar, msPar_stride, + propPar, propPar_stride, + outChi2, outChi2_stride, + N); + getNewBestHitChi2_fn(outChi2, minChi2_reg, bestHit_reg, hit_cnt, N); + } + updateTracksWithBestHit_fn + (hits, XHitPos, + minChi2_reg, bestHit_reg, + msErr, msErr_stride, msErr_plex_size, + msPar, msPar_stride, msPar_plex_size, + propPar, propPar_stride, + Chi2, HitsIdx, + N); +} + + +void bestHit_wrapper(cudaStream_t &stream, + BunchOfHitsCU &bunch, GPlexQI &XHitPos, + GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + GPlexLV &propPar, GPlexQF &outChi2, + float *Chi2, int *HitsIdx, + int maxSize, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + bestHit_kernel <<< grid, block, 0, stream >>> + (bunch.m_hits, XHitPos.ptr, + propErr.ptr, propErr.stride, + msErr.ptr, msErr.stride, msErr.kSize, + msPar.ptr, msPar.stride, msPar.kSize, + propPar.ptr, propPar.stride, + outChi2.ptr, outChi2.stride, + Chi2, HitsIdx, + maxSize, N); +} + +__device__ float downPhi_fn(float phi) { + while (phi >= Config::PI) {phi-=Config::TwoPI;} + return phi; +} + +__device__ float upPhi_fn(float phi) { + while (phi <= -Config::PI) {phi+=Config::TwoPI;} + return phi; +} + +__device__ float normalizedPhi_fn(float phi) { + // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! + if (abs(phi)>=Config::PI) {phi = (phi>0 ? 
downPhi_fn(phi) : upPhi_fn(phi));} + return phi; +} + +__device__ int getPhiPartition_fn(float phi) +{ + //assume phi is between -PI and PI + // if (!(fabs(phi) resErr, + GPlexHV msPar, GPlexLV propPar, GPlexQF outChi2, + const int N); + +void HitToMs_wrapper(cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, BunchOfHitsCU &bunch, + GPlexQI &XHitPos, int hit_cnt, int N); + +void getNewBestHitChi2_wrapper(cudaStream_t &stream, + GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N); + +void fill_array_cu(float *array, int size, int value); + +void updateTracksWithBestHit_wrapper(cudaStream_t &stream, + BunchOfHitsCU &bunch, GPlexQI &XHitPos, + float *minChi2, int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, + GPlexLV &propPar, + float *chi2, int *HitsIdx, int N); + +int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N); + +void bestHit_wrapper(cudaStream_t &stream, + BunchOfHitsCU &bunch, GPlexQI &XHitPos, + GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + GPlexLV &propPar, GPlexQF &outChi2, + float *Chi2, int *HitsIdx, + int maxSize, int N); + +void selectHitRanges_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, + GPlexQI &XHitPos, GPlexQI &XHitSize, + GPlexLS &Err, GPlexLV &Par, + int N); + +#endif diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 3c3d039f2a8fe..aed92e764bad9 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -8,6 +8,7 @@ #include "MkFitter.h" #if USE_CUDA +#include "fittestMPlex.h" #include "FitterCU.h" #endif @@ -151,6 +152,73 @@ double runFittingTestPlex(Event& ev, std::vector& rectracks) } #ifdef USE_CUDA +void runAllEventsFittingTestPlexGPU(std::vector& events) +{ + double s_tmp = 0.0; +#if 0 + In principle, the warmup loop should not be required. + The separate_first_call_for_meaningful_profiling_numbers() function + should be enough. + // Warmup loop + for (int i = 0; i < 1; ++i) { + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + Event &ev = events[0]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + + if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + cuFitter.freeDevice(); + } +#endif + + // Reorgnanization (copyIn) can eventually be multithreaded. + omp_set_nested(1); + + omp_set_num_threads(Config::numThreadsEvents); + double total_gpu_time = dtime(); +#pragma omp parallel reduction(+:s_tmp) + { + int numThreadsEvents = omp_get_num_threads(); + int thr_idx = omp_get_thread_num(); + + // FitterCU is declared here to share allocations and deallocations + // between the multiple events processed by a single thread. + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + + for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { + int idx = thr_idx; + printf("==============================================================\n"); + printf("Processing event %d with thread %d\n", evt, idx); + Event &ev = events[evt-1]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; + + //if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + + printf("Matriplex fit = %.5f -------------------------------------", tmp); + printf("\n"); + s_tmp += tmp; +#if 0 // 0 for timing, 1 for validation + // Validation crashes for multiple threads. + // It is something in relation to ROOT. Not sure what. 
+ if (omp_get_num_threads() <= 1) { + if (g_run_fit_std) { + std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; + make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); + } + } +#endif + } + cuFitter.freeDevice(); + } + std::cerr << "###### Total GPU time: " << dtime() - total_gpu_time << " ######\n"; +} + + double runFittingTestPlexGPU(FitterCU &cuFitter, Event& ev, std::vector& rectracks) { diff --git a/mkFit/fittestMPlex.h b/mkFit/fittestMPlex.h index 5fba69308fb23..2b88d56a17680 100644 --- a/mkFit/fittestMPlex.h +++ b/mkFit/fittestMPlex.h @@ -15,6 +15,7 @@ void make_validation_tree(const char *fname, double runFittingTestPlex(Event& ev, std::vector& rectracks); #ifdef USE_CUDA +void runAllEventsFittingTestPlexGPU(std::vector& events); double runFittingTestPlexGPU(FitterCU &cuFitter, Event& ev, std::vector& rectracks); #endif diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index cfd3ea7587608..c6bb458aaabba 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -235,6 +235,17 @@ __global__ void kalmanUpdate_kernel( for (int j = 0; j < HS; ++j) { resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; } + + // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex +#if 0 + if (Config::useCMSGeom) { + propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + } else { + propErr = psErr; + propPar = psPar; + } +#endif + addIntoUpperLeft3x3_fn(propErr, propErr_stride, msErr, msErr_stride, resErr_reg, N, n); invertCramerSym_fn(resErr_reg); @@ -252,9 +263,9 @@ __global__ void kalmanUpdate_kernel( } void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, + GPlexLS& d_propErr, GPlexHS& d_msErr, + GPlexLV& d_par_iP, GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, MAX_BLOCKS_X); diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h index 3c00760050582..562b1ab3e371a 100644 --- a/mkFit/kalmanUpdater_kernels.h +++ b/mkFit/kalmanUpdater_kernels.h @@ -4,13 +4,20 @@ #include "GPlex.h" void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, + GPlexLS& d_propErr, GPlexHS& d_msErr, + GPlexLV& d_par_iP, GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N); -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, +void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, + float *full_posArray, GPlexHS& msErr, + float *full_errArray, int *full_hitIdx, int hi, int maxHits, int N, int hs, int hv, int Nhits); +__device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, + const float* __restrict__ b, size_t bN, + float *c, const int N, int n); + +__device__ void invertCramerSym_fn(float *a); + #endif // _KALMAN_UPDATER_KERNELS_H_ diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index 64ac9983058ee..a1e7ac51080f9 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -17,6 +17,7 @@ #include "FitterCU.h" #endif +#include #include #if defined(USE_VTUNE_PAUSE) @@ -160,7 +161,6 @@ void test_standard() // fittest time. Sum of all events. In case of multiple events // being run simultaneously in different streams this time will // be larger than the elapsed time. 
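 // Concretely (a condensed sketch of runAllEventsFittingTestPlexGPU in
 // fittestMPlex.cc above; `nthreads` stands for Config::numThreadsEvents):
 // one FitterCU -- and hence one cudaStream_t -- per OpenMP thread, so
 // transfers and kernels from different events can overlap on the device:
 //
 //   #pragma omp parallel
 //   {
 //     FitterCU<float> cuFitter(NN);
 //     cuFitter.allocateDevice();          // reused across this thread's events
 //     for (int evt = thr_idx + 1; evt <= Config::nEvents; evt += nthreads)
 //       runFittingTestPlexGPU(cuFitter, events[evt-1], plex_tracks_ev);
 //     cuFitter.freeDevice();
 //   }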
- double s_tmp = 0.0; std::vector events; std::vector validations(Config::nEvents); @@ -185,67 +185,25 @@ void test_standard() // tell you how much time is spend running cudaDeviceSynchronize(), // use another function). separate_first_call_for_meaningful_profiling_numbers(); -#if 0 - In principle, the warmup loop should not be required. - The separate_first_call_for_meaningful_profiling_numbers() function - should be enough. - // Warmup loop - for (int i = 0; i < 1; ++i) { - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - Event &ev = events[0]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - - if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - cuFitter.freeDevice(); - } -#endif - // Reorgnanization (copyIn) can eventually be multithreaded. - omp_set_nested(1); - - omp_set_num_threads(Config::numThreadsEvents); - double total_gpu_time = dtime(); -#pragma omp parallel reduction(+:s_tmp) + if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events); + + for (int evt = 1; evt <= Config::nEvents; ++evt) { - int numThreadsEvents = omp_get_num_threads(); - int thr_idx = omp_get_thread_num(); - - // FitterCU is declared here to share allocations and deallocations - // between the multiple events processed by a single thread. - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - - for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { - int idx = thr_idx; - printf("==============================================================\n"); - printf("Processing event %d with thread %d\n", evt, idx); - Event &ev = events[evt-1]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; - - if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - - printf("Matriplex fit = %.5f -------------------------------------", tmp); - printf("\n"); - s_tmp += tmp; -#if 1 // 0 for timing, 1 for validation - // Validation crashes for multiple threads. - // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { - if (g_run_fit_std) { - std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; - make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); - } - } -#endif + printf("\n"); + printf("Processing event %d\n", evt); + + Event& ev = events[evt-1]; + + //plex_tracks.resize(ev.simTracks_.size()); + omp_set_num_threads(Config::numThreadsFinder); + + if (g_run_build_bh) { + double my_time = runBuildingTestPlexBestHit(ev); + std::cout << "BestHit -- GPU: " << my_time << std::endl; } - cuFitter.freeDevice(); + std::exit(0); } - std::cerr << "###### Total GPU time: " << dtime() - total_gpu_time << " ######\n"; - #else for (int evt = 1; evt <= Config::nEvents; ++evt) { diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 6b6f3f8b2d9ec..a3ced43524b81 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -11,10 +11,10 @@ #define BLOCK_SIZE_X 32 #define MAX_BLOCKS_X 65535 // CUDA constraint -__device__ float hipo(float x, float y) { +__device__ float hipo_cu(float x, float y) { return sqrt(x*x + y*y); } -__device__ void sincos4(float x, float& sin, float& cos) { +__device__ void sincos4_cu(float x, float& sin, float& cos) { // Had this writen with explicit division by factorial. // The *whole* fitting test ran like 2.5% slower on MIC, sigh. 
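 // For reference, the series being truncated here is presumably the usual
 // small-angle Taylor expansion (the exact factoring below is an
 // implementation choice):
 //
 //   cos(x) ~= 1 - x^2/2! + x^4/4!
 //   sin(x) ~= x - x^3/3!
 //
 // which is adequate for the small helix-angle steps taken per iteration.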
cos = 1; @@ -95,10 +95,18 @@ __device__ void computeMsRad_fn(const float* __restrict__ msPar, size_t stride_msPar, float* msRad, int N, int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { - *msRad = hipo(msPar[n], msPar[n + stride_msPar]); + *msRad = hipo_cu(msPar[n], msPar[n + stride_msPar]); } } +__device__ void assignMsRad_fn(const float r, float* msRad, int N, int n) { + /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + if (n < N) { + *msRad = r; + } +} + + __device__ void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, int *inChg, float *outPar, size_t outPar_stride, float msRad, @@ -121,7 +129,7 @@ void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, const float& pyin = inPar[n + 4*ipN]; const float& pzin = inPar[n + 5*ipN]; const float& r = msRad; - float r0 = hipo(xin, yin); + float r0 = hipo_cu(xin, yin); if (fabs(r-r0)<0.0001) { // get an identity matrix @@ -165,11 +173,11 @@ void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, y = outPar_reg[1]; px = outPar_reg[3]; py = outPar_reg[4]; - r0 = hipo(outPar_reg[0], outPar_reg[1]); + r0 = hipo_cu(outPar_reg[0], outPar_reg[1]); totalDistance += (r-r0); if (Config::useTrigApprox) { // TODO: uncomment - sincos4((r-r0)*invcurvature, sinAP, cosAP); + sincos4_cu((r-r0)*invcurvature, sinAP, cosAP); } else { cosAP=cos((r-r0)*invcurvature); sinAP=sin((r-r0)*invcurvature); @@ -229,7 +237,7 @@ void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, float cosTP, sinTP; if (Config::useTrigApprox) { - sincos4(TP, sinTP, cosTP); + sincos4_cu(TP, sinTP, cosTP); } else { cosTP = cos(TP); sinTP = sin(TP); @@ -395,6 +403,7 @@ __device__ void similarity_fn(float* a, float *b, size_t stride_outErr, } } +// PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments __global__ void propagation_kernel( const float* __restrict__ msPar, size_t stride_msPar, float *inPar, size_t inPar_stride, int *inChg, @@ -415,6 +424,9 @@ __global__ void propagation_kernel( helixAtRFromIterative_fn(inPar, inPar_stride, inChg, outPar, outPar_stride, msRad_reg, errorProp_reg, N, n); + } else { + // TODO: not ported for now. Assuming Config::doIterative + // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); } similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); } @@ -423,10 +435,10 @@ __global__ void propagation_kernel( void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, + GPlexHV& msPar, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, GPlexLL& errorProp, + GPlexLS& outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, MAX_BLOCKS_X); @@ -438,3 +450,53 @@ void propagation_wrapper(cudaStream_t& stream, outPar.ptr, outPar.stride, errorProp.ptr, errorProp.stride, outErr.ptr, outErr.stride, N); } + + +// PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments +// Imposes the radius +__global__ void propagationForBuilding_kernel( + float r, + float *inPar, size_t inPar_stride, int *inChg, + float *outPar, size_t outPar_stride, float *errorProp, + size_t errorProp_stride, float *outErr, size_t outErr_stride, int N) { + + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + float msRad_reg; + // Using registers instead of shared memory is ~ 30% faster. 
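+  // Loop shape used by these kernels: a single launch of at most
+  // MAX_BLOCKS_X * BLOCK_SIZE_X threads walks the whole track range in
+  // grid-sized strides. The canonical grid-stride form, for reference
+  // (a sketch, not the literal code below):
+  //
+  //   for (int n = threadIdx.x + blockIdx.x * blockDim.x;
+  //        n < N;
+  //        n += blockDim.x * gridDim.x) {
+  //     /* process track n */
+  //   }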
+ float errorProp_reg[LL]; + // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + if (n < N) { + assignMsRad_fn(r, &msRad_reg, N, n); + if (Config::doIterative) { + helixAtRFromIterative_fn(inPar, inPar_stride, + inChg, outPar, outPar_stride, msRad_reg, + errorProp_reg, N, n); + } else { + // TODO: not ported for now. Assuming Config::doIterative + // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); + } + similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); + } + } +} + +void propagationForBuilding_wrapper(cudaStream_t& stream, + float radius, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, GPlexLL& errorProp, + GPlexLS& outErr, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + propagationForBuilding_kernel<<>> + (radius, + inPar.ptr, inPar.stride, inChg.ptr, + outPar.ptr, outPar.stride, errorProp.ptr, + errorProp.stride, outErr.ptr, outErr.stride, N); +} + diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index 2fb92f0689f39..c2a44b4a7952c 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -4,10 +4,17 @@ #include "GPlex.h" void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, + GPlexHV& msPar, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, GPlexLL& errorProp, + GPlexLS& outErr, + const int N); + +void propagationForBuilding_wrapper(cudaStream_t& stream, + float radius, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, GPlexLL& errorProp, + GPlexLS& outErr, const int N); #endif // _PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize.cu b/mkFit/reorganize.cu index 80d3b7912cac9..523b86a69b55e 100644 --- a/mkFit/reorganize.cu +++ b/mkFit/reorganize.cu @@ -17,11 +17,11 @@ __global__ void toMatriplex_kernel(float *dst, int dst_stride, } } -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int N, int LS) { - dim3 block(16, 8, 1); - dim3 grid((N-1)/16 + 1, (LS-1)/8 +1, 1); - toMatriplex_kernel <<>> (dst.ptr, dst.stride, src.ptr, src.stride, N, LS); -} +/*void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int N, int LS) {*/ + /*dim3 block(16, 8, 1);*/ + /*dim3 grid((N-1)/16 + 1, (LS-1)/8 +1, 1);*/ + /*toMatriplex_kernel <<>> (dst.ptr, dst.stride, src.ptr, src.stride, N, LS);*/ +/*}*/ __global__ void reorganizeMs(float *msPar, size_t msPar_stride, @@ -54,8 +54,9 @@ __global__ void reorganizeMs(float *msPar, size_t msPar_stride, } } -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, +void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, + float *full_posArray, GPlex& msErr, + float *full_errArray, int *full_hitIdx, int hi, int maxHits, int N, int hs, int hv, int Nhits) { dim3 block(16, 6, 1); dim3 grid((N-1)/16 + 1, (hs-1)/6 +1, 1); diff --git a/mkFit/reorganize.h b/mkFit/reorganize.h index fd72eb8f91be6..70f70b81c819f 100644 --- a/mkFit/reorganize.h +++ b/mkFit/reorganize.h @@ -3,6 +3,6 @@ #include "GPlex.h" -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int n, int ls); +// void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int n, int ls); #endif From 624279355be20b311a4949ffff71d272d6d5b123 Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Wed, 
25 May 2016 21:45:11 -0400
Subject: [PATCH 05/13] Partially fixes change of coordinate system for the GPU routines

- Quick and dirty adaptation; following Dan's way later.
- computeChi2_wrapper reworked.
- Propagates the coordinate change to the Kalman updater on the GPU.
- Adds newBestHit to computeChi2gpu.
- addBestHit working on the GPU again.
- Adapts the Perl multiplication generator for CUDA; uses it in
  kalmanUpdater_kernels.
- Changes kalmanUpdate_kernel arguments to GPlex.
- Changes kernel arguments from raw pointers to GPlex.
- Changes plain arrays to GPlex or to GPlexReg.
- Makes small CPU functions __host__ __device__.
---
 BinInfoUtils.h                 |  12 +
 Hit.h                          |   6 +
 Makefile.config                |   2 +-
 Matriplex/GenMul.pm            |  75 ++++++
 mkFit/FitterCU-imp.h           | 113 +++++++--
 mkFit/FitterCU.h               |  13 +-
 mkFit/GPlex.h                  |  30 +++
 mkFit/MkBuilder.cc             |  29 +++
 mkFit/MkFitter.cc              | 119 ++++++++++
 mkFit/MkFitter.h               |   5 +
 mkFit/computeChi2_kernels.cu   | 360 +++++++++++++++-------------
 mkFit/computeChi2_kernels.h    |  19 +-
 mkFit/kalmanUpdater_kernels.cu | 412 +++++++++++++++++++++++++++++++--
 mkFit/kalmanUpdater_kernels.h  |  11 +-
 mkFit/propagation_kernels.cu   |  62 ++---
 15 files changed, 1015 insertions(+), 253 deletions(-)

diff --git a/BinInfoUtils.h b/BinInfoUtils.h
index 908eca6534c53..7c9481caa3caa 100644
--- a/BinInfoUtils.h
+++ b/BinInfoUtils.h
@@ -12,18 +12,27 @@ typedef std::pair<int, int> BinInfo;
 typedef std::vector<std::vector<BinInfo>> BinInfoLayerMap;
 typedef std::vector<BinInfoLayerMap> BinInfoMap;
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline float downPhi(float phi)
 {
   while (phi >= Config::PI) {phi-=Config::TwoPI;}
   return phi;
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline float upPhi(float phi)
 {
   while (phi <= -Config::PI) {phi+=Config::TwoPI;}
   return phi;
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline float normalizedPhi(float phi)
 {
   // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary!
@@ -31,6 +40,9 @@ inline float normalizedPhi(float phi)
   return phi;
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline int getPhiPartition(float phi)
 {
   //assume phi is between -PI and PI
diff --git a/Hit.h b/Hit.h
index b90c6fb282a0e..dd08d20a271e1 100644
--- a/Hit.h
+++ b/Hit.h
@@ -85,11 +85,17 @@ inline float getInvRad2(float x, float y){
   return 1.0f/(x*x + y*y);
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline float getPhi(float x, float y)
 {
   return std::atan2(y,x);
 }
 
+#ifdef __CUDACC__
+__host__ __device__
+#endif
 inline float getTheta(float r, float z){
   return std::atan2(r,z);
 }
diff --git a/Makefile.config b/Makefile.config
index cd16348fdf0d1..95e3458701991 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -29,7 +29,7 @@ endif
 # CUDA compiler
 NV := nvcc
 # Comment out to compile for CPU
-#USE_CUDA := yes
+USE_CUDA := yes
 
 # 3.
Optimization # -O3 implies vectorization and simd (but not AVX) diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm index e9c1305d5e117..f8713c570e73d 100644 --- a/Matriplex/GenMul.pm +++ b/Matriplex/GenMul.pm @@ -531,6 +531,75 @@ sub multiply_standard # ---------------------------------------------------------------------- +sub generate_addend_gpu +{ + my ($S, $x, $y) = @_; + + return undef if $S->{$x}{pat} eq '0' or $S->{$y}{pat} eq '0'; + return "1" if $S->{$x}{pat} eq '1' and $S->{$y}{pat} eq '1'; + + my $xstr = sprintf "$S->{$x}{mat}{name}\[%2d*$S->{$x}{mat}{name}N+$S->{$x}{mat}{name}n]", $S->{$x}{idx}; + my $ystr = sprintf "$S->{$y}{mat}{name}\[%2d*$S->{$y}{mat}{name}N+$S->{$y}{mat}{name}n]", $S->{$y}{idx}; + + return $xstr if $S->{$y}{pat} eq '1'; + return $ystr if $S->{$x}{pat} eq '1'; + + return "${xstr}*${ystr}"; +} + +sub multiply_gpu +{ + # Standard mutiplication - outputs unrolled C code, one line + # per target matrix element. + # Arguments: a, b, c -- all GenMul::MBase with right dimensions. + # Does: c = a * b + + check_multiply_arguments(@_); + + my ($S, $a, $b, $c) = @_; + + my $is_c_symmetric = $c->isa("GenMul::MatrixSym"); + + # With no_size_check matrices do not have to be compatible. + my $k_max = $a->{N} <= $b->{M} ? $a->{N} : $b->{M}; + + for (my $i = 0; $i < $c->{M}; ++$i) + { + my $j_max = $is_c_symmetric ? $i + 1 : $c->{N}; + + for (my $j = 0; $j < $j_max; ++$j) + { + my $x = $c->idx($i, $j); + + printf "$S->{prefix}$c->{name}\[%2d*$c->{name}N+$c->{name}n\] = ", $x; + + my @sum; + + for (my $k = 0; $k < $k_max; ++$k) + { + $S->generate_indices_and_patterns_for_multiplication($i, $j, $k); + + my $addend = $S->generate_addend_gpu('a', 'b'); + + push @sum, $addend if defined $addend; + } + if (@sum) + { + print join(" + ", @sum), ";"; + } + else + { + print "0;" + } + print "\n"; + } + } + + $S->delete_temporaries(); +} + +# ---------------------------------------------------------------------- + sub load_if_needed { my ($S, $x) = @_; @@ -708,6 +777,7 @@ sub dump_multiply_std_and_intrinsic } print <<"FNORD"; +#ifndef __CUDACC__ #ifdef MPLEX_INTRINSICS for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) @@ -731,6 +801,11 @@ FNORD print <<"FNORD"; } #endif +#else // __CUDACC__ +FNORD + $S->multiply_gpu($a, $b, $c); + print <<"FNORD"; +#endif // __CUDACC__ FNORD diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index d65c1e596a922..b4a47e15770d1 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -58,6 +58,24 @@ void FitterCU::kalmanUpdateMerged() { d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); } +template +void FitterCU::kalmanUpdate_standalone( + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar) +{ + //d_Err_iP.copyAsyncFromHost(stream, psErr); + //d_msErr.copyAsyncFromHost(stream, msErr); + //d_par_iP.copyAsyncFromHost(stream, psPar); + //d_msPar.copyAsyncFromHost(stream, msPar); + + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, + d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); + + //d_par_iC.copyAsyncToHost(stream, outPar); + //d_Err_iC.copyAsyncToHost(stream, outErr); +} + template void FitterCU::propagationMerged() { propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, @@ -66,18 +84,83 @@ void FitterCU::propagationMerged() { } template -void FitterCU::computeChi2gpu( - const MPlexLS &psErr, MPlexHS &msErr, - MPlexHV& msPar, const MPlexLV& propPar, GPlexQF& d_outChi2, int NN) { - - // TODO: add CMSGeom - if (Config::useCMSGeom) { - 
//propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - } else {} - - computeChi2_wrapper(stream, d_Err_iP, d_msErr, //d_resErr, - d_msPar, d_par_iP, d_outChi2, N); +void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, + BunchOfHitsCU &d_bunch, //MPlexQI &XHitPos, MPlexQI &XHitSize, + MPlexQF &Chi2, MPlexQI &HitsIdx, + int NN) { + + //float *d_minChi2; + //int *d_bestHit; + //cudaMalloc((void**)&d_minChi2, NN*sizeof(float)); + //cudaMalloc((void**)&d_bestHit, NN*sizeof(int)); + + //cudaMemcpyAsync(d_minChi2, minChi2, NN*sizeof(float), cudaMemcpyHostToDevice, stream); + //cudaMemcpyAsync(d_bestHit, bestHit, NN*sizeof(int), cudaMemcpyHostToDevice, stream); + + //cudaMemset(d_bestHit, -1, NN*sizeof(int)); + //fill_array_cu(d_minChi2, NN, 15.f); + + //d_Err_iP.copyAsyncFromHost(stream, psErr); + //d_par_iP.copyAsyncFromHost(stream, propPar); + //d_msErr.copyAsyncFromHost(stream, msErr); + //d_msPar.copyAsyncFromHost(stream, msPar); + //d_XHitPos.copyAsyncFromHost(stream, XHitPos); + //d_XHitSize.copyAsyncFromHost(stream, XHitSize); + + //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), + //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); + //cudaMemcpy2DAsync(d_HitsIdx, NN*sizeof(int), HitsIdx.fArray, NN*sizeof(int), + //NN*sizeof(int), 1, cudaMemcpyHostToDevice, stream); + + //cudaStreamSynchronize(stream); + //cudaCheckError(); + + selectHitRanges_wrapper(stream, d_bunch, d_XHitPos, d_XHitSize, + d_Err_iP, d_par_iP, N); + + int maxSize2 = getMaxNumHits_wrapper(d_XHitSize, N); + bestHit_wrapper(stream, d_bunch, d_XHitPos, + d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, + d_Chi2, d_HitsIdx, + maxSize2, N); +#if 0 + for (int hit_cnt = 0; hit_cnt < maxSize2; ++hit_cnt) + { + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} + HitToMs_wrapper(stream, d_msErr, d_msPar, d_bunch, d_XHitPos, hit_cnt, NN); + + computeChi2_wrapper(stream, d_Err_iP, d_msErr, //d_resErr, + d_msPar, d_par_iP, d_outChi2, NN); + + getNewBestHitChi2_wrapper(stream, d_outChi2, d_minChi2, d_bestHit, hit_cnt, NN); + + cudaStreamSynchronize(stream); + cudaCheckError(); + } + updateTracksWithBestHit_wrapper(stream, + d_bunch, d_XHitPos, d_minChi2, d_bestHit, + d_msErr, d_msPar, d_par_iP, d_Chi2, d_HitsIdx, N); +#endif + + //d_outChi2.copyAsyncToHost(stream, outChi2); + //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); + //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); + + //cudaMemcpy2DAsync(Chi2.fArray, NN*sizeof(float), d_Chi2, NN*sizeof(float), + // NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); + //cudaMemcpy2DAsync(HitsIdx.fArray, NN*sizeof(int), d_HitsIdx, NN*sizeof(int), + // NN*sizeof(int), 1, cudaMemcpyDeviceToHost, stream); + + //cudaStreamSynchronize(stream); + //cudaCheckError(); + + //cudaFree(d_minChi2); + //cudaFree(d_bestHit); } // FIXME: Temporary. 
Separate allocations / transfers
@@ -112,7 +195,7 @@ void FitterCU<T>::prepare_addBestHit(
   createStream();
   cudaCheckError()
-
+#if 1
   // psErr -> d_Err_iP
   cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, psErr.fArray, N*sizeof(T),
                     N*sizeof(T), LS, cudaMemcpyHostToDevice, stream);
@@ -120,6 +203,7 @@ void FitterCU<T>::prepare_addBestHit(
   d_par_iP.copyAsyncFromHost(stream, propPar);
   //sendInChgToDevice(inChg);
   d_inChg.copyAsyncFromHost(stream, inChg);
+#endif
 }

 // TODO: Temporary. Separate allocations / transfers
@@ -128,6 +212,7 @@ void FitterCU<T>::finalize_addBestHit(
     MPlexHS &msErr, MPlexHV& msPar,
     MPlexLS &outErr, MPlexLV &outPar,
     MPlexQI &HitsIdx, MPlexQF &Chi2) {
+#if 1
   //getOutParFromDevice(outPar); // <- d_par_iC
   d_par_iC.copyAsyncToHost(stream, outPar);
   //getOutErrFromDevice(outErr); // <- d_Err_iC
@@ -141,7 +226,7 @@ void FitterCU<T>::finalize_addBestHit(
       N*sizeof(T), HS, cudaMemcpyDeviceToHost, stream);
   cudaMemcpyAsync(HitsIdx.fArray, d_HitsIdx, N*sizeof(int), cudaMemcpyDeviceToHost, stream);
   cudaMemcpyAsync(Chi2.fArray, d_Chi2, N*sizeof(float), cudaMemcpyDeviceToHost, stream);
-
+#endif
   destroyStream();
 }

diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h
index c37fbb0abff79..370356d4816ba 100644
--- a/mkFit/FitterCU.h
+++ b/mkFit/FitterCU.h
@@ -62,9 +62,16 @@ class FitterCU {
   void propagationMerged();
   void kalmanUpdateMerged();
-
-  void computeChi2gpu(const MPlexLS &psErr, MPlexHS &msErr,
-      MPlexHV& msPar, const MPlexLV& propPar, GPlexQF& d_outChi2, int NN);
+  void kalmanUpdate_standalone(
+      const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg,
+      const MPlexHS &msErr, const MPlexHV& msPar,
+      MPlexLS &outErr, MPlexLV& outPar);
+
+  void computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar,
+      const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar,
+      BunchOfHitsCU &d_bunch, //MPlexQI &XHitPos, MPlexQI &XHitSize,
+      MPlexQF &Chi2, MPlexQI &HitsIdx,
+      int NN);

   void allocate_extra_addBestHit();
   void free_extra_addBestHit();
diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h
index 2a22524fae3a9..d831c051c650e 100644
--- a/mkFit/GPlex.h
+++ b/mkFit/GPlex.h
@@ -75,4 +75,34 @@ using GPlexLH = GPlex<float>;
 using GPlexQF = GPlex<float>;
 using GPlexQI = GPlex<int>;
+
+template <typename T, int D1, int D2>
+struct GPlexReg {
+  __device__ T  operator[](int xx) const { return arr[xx]; }
+  __device__ T& operator[](int xx)       { return arr[xx]; }
+
+  __device__ T& operator()(int n, int i, int j)       { return arr[i*D2 + j]; }
+  __device__ T  operator()(int n, int i, int j) const { return arr[i*D2 + j]; }
+
+  __device__ void SetVal(T v)
+  {
+    for (int i = 0; i < D1; ++i)
+    {
+      arr[i] = v;
+    }
+  }
+
+  T arr[D1];
+};
+
+using GPlexRegLL = GPlexReg<float, 36, 6>;
+using GPlexRegLH = GPlexReg<float, 18, 3>;
+using GPlexRegHH = GPlexReg<float, 9, 3>;
+using GPlexRegLV = GPlexReg<float, 6, 1>;
+using GPlexRegHS = GPlexReg<float, 6, 1>;
+using GPlexRegHV = GPlexReg<float, 3, 1>;
+using GPlexReg2V = GPlexReg<float, 2, 1>;
+using GPlexReg2S = GPlexReg<float, 3, 1>;
+using GPlexRegQF = GPlexReg<float, 1, 1>;
+
 #endif // _GPLEX_H_
diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc
index 3c6af213c08f7..146935b873689 100644
--- a/mkFit/MkBuilder.cc
+++ b/mkFit/MkBuilder.cc
@@ -340,7 +340,36 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands)
         //make candidates with best hit
         dprint("make new candidates");
+#ifdef USE_CUDA
+        BunchOfHitsCU bunch_of_hits_cu;
+        bunch_of_hits_cu.copyBunchOfHitsFromCPU(bunch_of_hits);
+        bunch_of_hits_cu.allocatePhiBinInfos(bunch_of_hits.m_phi_bin_infos.size());
+        bunch_of_hits_cu.copyPhiBinInfosFromCPU(bunch_of_hits);
+
+        FitterCU<float> cuFitter(NN);
+        cuFitter.allocateDevice();
+        cuFitter.allocate_extra_addBestHit();
+        cuFitter.prepare_addBestHit(
+            mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP],
+            mkfp->Chg,
+            NN);
+
+        //mkfp->AddBestHit_gpu(bunch_of_hits, cuFitter, bunch_of_hits_cu);
+        cuFitter.addBestHit(bunch_of_hits_cu);
+
+        cuFitter.finalize_addBestHit(
+            mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits],
+            mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC],
+            mkfp->HitsIdx[mkfp->Nhits], mkfp->Chi2);
+        cuFitter.free_extra_addBestHit();
+        cuFitter.freeDevice();
+
+        bunch_of_hits_cu.freePhiBinInfos();
+        mkfp->AddBestHit(bunch_of_hits);
+#else
+        mkfp->AddBestHit(bunch_of_hits);
+#endif
         mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?)

         //propagate to layer
diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc
index 20bf89b785d62..0f3f8463bec1b 100644
--- a/mkFit/MkFitter.cc
+++ b/mkFit/MkFitter.cc
@@ -1607,3 +1607,122 @@ void MkFitter::CopyOutParErr(std::vector<std::vector<Track> >& seed_cand_vec,
                              << " etaBin=" << getEtaBin(cand.posEta()));
     }
 }
+
+//////////////////////////////////  //////////////////////////////////
+// Temporary
+#ifdef USE_CUDA
+void MkFitter::AddBestHit_gpu(const BunchOfHits &bunch_of_hits, FitterCU<float> &cuFitter,
+                              BunchOfHitsCU &bunch_of_hits_cu)
+{
+#ifdef USE_CUDA
+  cuFitter.computeChi2gpu(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits],
+                          bunch_of_hits_cu, // XHitPos, XHitSize,
+                          Chi2, HitsIdx[Nhits], NN);
+  cuFitter.kalmanUpdate_standalone(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits],
+                                   Err[iC], Par[iC]);
+#else
+  //fixme solve ambiguity NN vs beg-end
+  float minChi2[NN];
+  std::fill_n(minChi2, NN, Config::chi2Cut);
+  int bestHit[NN];
+  std::fill_n(bestHit, NN, -1);
+
+
+  const char *varr = (char*) bunch_of_hits.m_hits;
+
+  const int off_error = (char*) bunch_of_hits.m_hits[0].errArray() - varr;
+  const int off_param = (char*) bunch_of_hits.m_hits[0].posArray() - varr;
+
+  int idx[NN]      __attribute__((aligned(64)));
+  int idx_chew[NN] __attribute__((aligned(64)));
+
+  int maxSize = -1;
+
+  // Determine maximum number of hits for tracks in the collection.
+  // At the same time prefetch the first set of hits to L1 and the second one to L2.
+  for (int it = 0; it < NN; ++it)
+  {
+    int off = XHitPos.At(it, 0, 0) * sizeof(Hit);
+
+    idx[it]      = off;
+    idx_chew[it] = it*sizeof(Hit);
+
+    maxSize = std::max(maxSize, XHitSize.At(it, 0, 0));
+  }
+
+  // XXXX MT Uber hack to avoid tracks with like 300 hits to process.
+  //fixme this makes results dependent on vector unit size
+  maxSize = std::min(maxSize, Config::maxHitsConsidered);
+
+// Has basically no effect, it seems.
+//#pragma noprefetch
+  for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt, varr += sizeof(Hit))
+  {
+    //fixme what if size is zero???
+    msErr[Nhits].SlurpIn(varr + off_error, idx);
+    msPar[Nhits].SlurpIn(varr + off_param, idx);
+
+    //now compute the chi2 of track state vs hit
+    MPlexQF outChi2;
+    computeChi2MPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], outChi2);
+
+    //update best hit in case chi2<minChi2
+#pragma simd
+    for (int itrack = 0; itrack < NN; ++itrack)
+    {
+      if (hit_cnt >= XHitSize.At(itrack, 0, 0)) continue;
+      const float chi2 = std::abs(outChi2[itrack]);//fixme negative chi2 sometimes...
+      dprint("chi2=" << chi2 << " minChi2[itrack]=" << minChi2[itrack]);
+      if (chi2 < minChi2[itrack])
+      {
+        minChi2[itrack]=chi2;
+        bestHit[itrack]=hit_cnt;
+      }
+    }
+  } // end loop over hits
+
+  //copy in MkFitter the hit with lowest chi2
+  for (int itrack = 0; itrack < NN; ++itrack)
+  {
+    _mm_prefetch((const char*) & bunch_of_hits.m_hits[XHitPos.At(itrack, 0, 0) + bestHit[itrack]], _MM_HINT_T0);
+  }
+
+#pragma simd
+  for (int itrack = 0; itrack < NN; ++itrack)
+  {
+    //fixme decide what to do in case no hit found
+    if (bestHit[itrack] >= 0)
+    {
+      const Hit  &hit  = bunch_of_hits.m_hits[ XHitPos.At(itrack, 0, 0) + bestHit[itrack] ];
+      const float chi2 = minChi2[itrack];
+
+      dprint("ADD BEST HIT FOR TRACK #" << itrack << std::endl
+             << "prop x=" << Par[iP].ConstAt(itrack, 0, 0) << " y=" << Par[iP].ConstAt(itrack, 1, 0) << std::endl
+             << "copy in hit #" << bestHit[itrack] << " x=" << hit.position()[0] << " y=" << hit.position()[1]);
+
+      msErr[Nhits].CopyIn(itrack, hit.errArray());
+      msPar[Nhits].CopyIn(itrack, hit.posArray());
+      Chi2(itrack, 0, 0) += chi2;
+      HitsIdx[Nhits](itrack, 0, 0) = XHitPos.At(itrack, 0, 0) + bestHit[itrack];
+    }
+    else
+    {
+      dprint("ADD FAKE HIT FOR TRACK #" << itrack);
+
+      msErr[Nhits].SetDiagonal3x3(itrack, 666);
+      msPar[Nhits](itrack,0,0) = Par[iP](itrack,0,0);
+      msPar[Nhits](itrack,1,0) = Par[iP](itrack,1,0);
+      msPar[Nhits](itrack,2,0) = Par[iP](itrack,2,0);
+      HitsIdx[Nhits](itrack, 0, 0) = -1;
+
+      // Don't update chi2
+    }
+  }
+
+  //now update the track parameters with this hit (note that some calculations are already done when computing chi2... not sure it's worth caching them?)
+  dprint("update parameters");
+  updateParametersMPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits],
+                        Err[iC], Par[iC]);
+  //std::cout << "Par[iP](0,0,0)=" << Par[iP](0,0,0) << " Par[iC](0,0,0)=" << Par[iC](0,0,0)<< std::endl;
+#endif
+}
+#endif
diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h
index d04783df4f55f..e10d13f69c49c 100644
--- a/mkFit/MkFitter.h
+++ b/mkFit/MkFitter.h
@@ -9,6 +9,7 @@
 #include "BinInfoUtils.h"

 #include "FitterCU.h"
+#include "HitStructuresCU.h"

 //#define DEBUG 1

@@ -109,6 +110,10 @@ struct MkFitter
   void SelectHitRanges(const BunchOfHits &bunch_of_hits, const int N_proc);

   void AddBestHit     (const BunchOfHits &bunch_of_hits);
+#ifdef USE_CUDA
+  void AddBestHit_gpu (const BunchOfHits &bunch_of_hits, FitterCU<float> &cuFitter,
+                       BunchOfHitsCU &bunch_of_hits_cu);
+#endif

   void FindCandidates(const BunchOfHits &bunch_of_hits, std::vector<std::vector<Track> >& tmp_candidates, const int offset, const int N_proc);
diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu
index f7fdabcf2b8a2..ba83a13c53cba 100644
--- a/mkFit/computeChi2_kernels.cu
+++ b/mkFit/computeChi2_kernels.cu
@@ -3,17 +3,19 @@
 #include "GPlex.h"
 #include "kalmanUpdater_kernels.h"
 #include "computeChi2_kernels.h"
-#include "HitStructuresCU.h"

 #include
 #include
 #include
 #include

+#include "HitStructuresCU.h"
+#include "BinInfoUtils.h"
 #include "Hit.h"

 #define L 6
 #define HS 6
+#define HV 3
 #define BLOCK_SIZE_X 32
 #define MAX_BLOCKS_X 65535 // CUDA constraint

@@ -37,90 +39,151 @@ __device__ float *Hit::errArrayCU() {
 }

 __device__ void chi2Similarity_fn(
-    float *a, size_t aN,
-    float *b, size_t bN,
-    float *c, // in registers
+    GPlexReg2V &a,
+    GPlexReg2S &c, // in registers
     float *d, size_t dN) {

   int n = threadIdx.x + blockIdx.x * blockDim.x;

   // manually subtract into local vars -- 3 of them
-  float x0 = a[0 * aN + n] - b[0 * aN + n];
-  float x1 = a[1 * aN + n] - b[1 * aN + n];
-  float x2 = a[2 * 
aN + n]; - d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 + - 2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2); + /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/ + /*float x1 = a[1 * aN + n] - b[1 * aN + n];*/ + /*float x2 = a[2 * aN + n] - b[2 * aN + n];*/ + /*d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 +*/ + /*2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2);*/ + d[0 * dN + n] = c[0]*a[0]*a[0] + + c[2]*a[1]*a[1] + + 2*( c[1]*a[1]*a[0]); +} + +__device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 + float r01,//r01 + GPlexRegHV &a ,//res_glo + GPlexReg2V &b )//res_loc +{ + + // res_loc = rotT * res_glo + // B = R * A + b[0] = r00*a[0] + r01*a[1]; + b[1] = a[2]; +} + +__device__ void ProjectResErr_fn(float a00, + float a01, + GPlexRegHS &b, + GPlexRegHH &c) +{ + // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym + + // Based on script generation and adapted to custom sizes. + c[ 0] = a00*b[ 0] + a01*b[ 1]; + c[ 1] = a00*b[ 1] + a01*b[ 2]; + c[ 2] = a00*b[ 3] + a01*b[ 4]; + c[ 3] = b[ 3]; + c[ 4] = b[ 4]; + c[ 5] = b[ 5]; + c[ 6] = a01*b[ 0] - a00*b[ 1]; + c[ 7] = a01*b[ 1] - a00*b[ 2]; + c[ 8] = a01*b[ 3] - a00*b[ 4]; +} + +__device__ void ProjectResErrTransp_fn(float a00, + float a01, GPlexRegHH &b, GPlexReg2S &c) +{ + // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3 + + // Based on script generation and adapted to custom sizes. + c[ 0] = b[ 0]*a00 + b[ 1]*a01; + c[ 1] = b[ 3]*a00 + b[ 4]*a01; + c[ 2] = b[ 5]; } __device__ void computeChi2_fn( - float* propErr, size_t propErr_stride, - float* msErr, size_t msErr_stride, - /*float* resErr, size_t resErr_stride,*/ - float *msPar, size_t msPar_stride, - float *propPar, size_t propPar_stride, - float *outChi2, size_t outChi2_stride, - const int N) { + GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, + GPlexQF &outChi2, const int N) { int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - float resErr_reg[HS]; + /*float resErr_reg[HS]; // ~ resErr_glo*/ + GPlexRegHS resErr_reg; for (int z = 0; z < (N-1)/grid_width +1; z++) { n += z*grid_width; + if (n < N) { + + // coordinate change + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); + + /*float res_glo[HV];*/ + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, propPar, res_glo, N, n); + for (int j = 0; j < HS; ++j) { resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; } - addIntoUpperLeft3x3_fn(propErr, propErr_stride, - msErr, msErr_stride, resErr_reg, N, n); - invertCramerSym_fn(resErr_reg); - - chi2Similarity_fn(msPar, msPar_stride, - propPar, propPar_stride, resErr_reg, - outChi2, outChi2_stride); - /*for (int j = 0; j < HS; ++j) {*/ - /*resErr[j*resErr_stride + n] = resErr_reg[j];*/ - /*}*/ + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ + /*MPlexHH tempHH;*/ + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); + + chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride); } } } +__global__ void computeChi2_kernel( + GPlexLS propErr, GPlexHS msErr, GPlexHV 
msPar, GPlexLV propPar, + GPlexQF outChi2, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + computeChi2_fn + (propErr, msErr, msPar, propPar, + outChi2, N); + } +} + void computeChi2_wrapper(cudaStream_t &stream, - GPlexLS propErr, GPlexHS msErr, // GPlex resErr, - GPlexHV msPar, GPlexLV propPar, GPlexQF outChi2, + GPlexLS &propErr, GPlexHS &msErr, // GPlex resErr, + GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); -#if 0 computeChi2_kernel <<< grid, block, 0, stream >>> - (propErr.ptr, propErr.stride, - msErr.ptr, msErr.stride, - /*resErr.ptr, resErr.stride,*/ - msPar.ptr, msPar.stride, - propPar.ptr, propPar.stride, - outChi2.ptr, outChi2.stride, - N); -#endif + (propErr, msErr, msPar, propPar, outChi2, N); } -template -__device__ void SlurpIn_fn(float *fArray, int stride, int kSize, +template +__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, const char *arr, int *vi, int N) { int j = threadIdx.x + blockDim.x * blockIdx.x; if (j(msErr, msErr_stride, msErr_plex_size, varr + (hit_cnt*sizeof(Hit)) + off_error, XHitPos, N); - SlurpIn_fn(msPar, msPar_stride, msPar_plex_size, varr + (hit_cnt*sizeof(Hit)) + off_param, XHitPos, N); + /*SlurpIn_fn(msErr.ptr, msErr.stride, msErr.kSize, varr + (hit_cnt*sizeof(Hit)) + off_error, XHitPos.ptr, N);*/ + /*SlurpIn_fn(msPar.ptr, msPar.stride, msPar.kSize, varr + (hit_cnt*sizeof(Hit)) + off_param, XHitPos.ptr, N);*/ + SlurpIn_fn(msErr, varr + (hit_cnt*sizeof(Hit)) + off_error, XHitPos.ptr, N); + SlurpIn_fn(msPar, varr + (hit_cnt*sizeof(Hit)) + off_param, XHitPos.ptr, N); /*if (j==2) {*/ /*for (int i = 0; i < msPar_plex_size; ++i) {*/ @@ -150,6 +215,11 @@ __device__ void HitToMs_fn(float *msErr, int msErr_stride, int msErr_plex_size, /*}*/ } +__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, + Hit *hits, GPlexQI XHitPos, int hit_cnt, int N) { + + HitToMs_fn(msErr, msPar, hits, XHitPos, hit_cnt, N); +} void HitToMs_wrapper(cudaStream_t& stream, GPlexHS &msErr, GPlexHV &msPar, BunchOfHitsCU &bunch, @@ -158,11 +228,9 @@ void HitToMs_wrapper(cudaStream_t& stream, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); -#if 0 +#if 1 HitToMs_kernel <<< grid, block, 0 , stream >>> - (msErr.ptr, msErr.stride, msErr.y, - msPar.ptr, msPar.stride, msPar.y, - bunch.m_hits, XHitPos.ptr, hit_cnt, N); + (msErr, msPar, bunch.m_hits, XHitPos, hit_cnt, N); #endif } @@ -180,31 +248,36 @@ __device__ void getNewBestHitChi2_fn(float *outChi2, float &minChi2, } } +__global__ void getNewBestHitChi2_kernel(float *outChi2, float *minChi2, + int *bestHit, int hit_cnt, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + getNewBestHitChi2_fn(outChi2, minChi2[itrack], bestHit[itrack], hit_cnt, N); + } +} + void getNewBestHitChi2_wrapper(cudaStream_t &stream, GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); -#if 0 +#if 1 getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> (outChi2.ptr, minChi2, bestHit, hit_cnt, N); #endif } -void fill_array_cu(float *array, int size, int value) { +void fill_array_cu(float *array, int size, float value) { thrust::device_ptr d_ptr(array); thrust::fill(d_ptr, d_ptr + size, value); } -__device__ void updateTracksWithBestHit_fn(Hit *hits, int *XHitPos, 
+__device__ void updateTracksWithBestHit_fn(Hit *hits, GPlexQI &XHitPos, float minChi2, int bestHit, - float *msErr, int msErr_stride, int msErr_plex_size, - float *msPar, int msPar_stride, int msPar_plex_size, - float *propPar, int propPar_stride, - float *Chi2, int *HitsIdx, - int N) { + GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, + float *Chi2, int *HitsIdx, int N) { int itrack = threadIdx.x + blockDim.x*blockIdx.x; if (itrack < N) { if (bestHit >= 0) @@ -222,11 +295,11 @@ __device__ void updateTracksWithBestHit_fn(Hit *hits, int *XHitPos, /*msPar[Nhits].CopyIn(itrack, hit.posArray());*/ /*SlurpIn_fn(msPar, msPar_stride, msPar_plex_size, varr + (itrack*sizeof(Hit)) + off_param, XHitPos, N);*/ - for (int i = 0; i < msErr_plex_size; ++i) { - msErr[i*msErr_stride + itrack] = hits[XHitPos[itrack]+bestHit].errArrayCU()[i]; + for (int i = 0; i < msErr.kSize; ++i) { + msErr(itrack, i, 0) = hits[XHitPos[itrack]+bestHit].errArrayCU()[i]; } - for (int i = 0; i < msPar_plex_size; ++i) { - msPar[i*msErr_stride + itrack] = hits[XHitPos[itrack]+bestHit].posArrayCU()[i]; + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = hits[XHitPos[itrack]+bestHit].posArrayCU()[i]; } /*Chi2(itrack, 0, 0) += chi2_local;*/ Chi2[itrack] += chi2_local; @@ -236,18 +309,18 @@ __device__ void updateTracksWithBestHit_fn(Hit *hits, int *XHitPos, else { /*msErr[Nhits].SetDiagonal3x3(itrack, 666);*/ - msErr[0*msErr_stride + itrack] = 666; - msErr[1*msErr_stride + itrack] = 0; - msErr[2*msErr_stride + itrack] = 666; - msErr[3*msErr_stride + itrack] = 0; - msErr[4*msErr_stride + itrack] = 0; - msErr[5*msErr_stride + itrack] = 666; + msErr(itrack, 0, 0) = 666; + msErr(itrack, 1, 0) = 0; + msErr(itrack, 2, 0) = 666; + msErr(itrack, 3, 0) = 0; + msErr(itrack, 4, 0) = 0; + msErr(itrack, 5, 0) = 666; /*msPar[Nhits](itrack,0,0) = Par[iP](itrack,0,0);*/ /*msPar[Nhits](itrack,1,0) = Par[iP](itrack,1,0);*/ /*msPar[Nhits](itrack,2,0) = Par[iP](itrack,2,0);*/ - for (int i = 0; i < msPar_plex_size; ++i) { - msPar[i*msPar_stride + itrack] = propPar[i*propPar_stride + itrack]; + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = propPar(itrack, i, 0); } /*HitsIdx[Nhits](itrack, 0, 0) = -1;*/ HitsIdx[itrack] = -1; @@ -257,6 +330,23 @@ __device__ void updateTracksWithBestHit_fn(Hit *hits, int *XHitPos, } } +__global__ void updateTracksWithBestHit_kernel(Hit *hits, GPlexQI XHitPos, + float *minChi2, int *bestHit, + GPlexHS msErr, GPlexHV msPar, GPlexLV propPar, + float *Chi2, int *HitsIdx, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + updateTracksWithBestHit_fn + (hits, XHitPos, + minChi2[itrack], bestHit[itrack], + msErr, + msPar, + propPar, + Chi2, HitsIdx, + N); + } +} + void updateTracksWithBestHit_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, GPlexQI &XHitPos, float *minChi2, int *best_hit, @@ -267,16 +357,11 @@ void updateTracksWithBestHit_wrapper(cudaStream_t &stream, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); -/* updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> - (bunch.m_hits, XHitPos.ptr, + (bunch.m_hits, XHitPos, minChi2, best_hit, - msErr.ptr, msErr.stride, msErr.y, - msPar.ptr, msPar.stride, msPar.y, - propPar.ptr, propPar.stride, - Chi2, HitsIdx, - N); -*/ + msErr, msPar, propPar, + Chi2, HitsIdx, N); } int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N) { @@ -288,12 +373,14 @@ int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N) { } __global__ void bestHit_kernel( - Hit *hits, int *XHitPos, - float* propErr, 
size_t propErr_stride, - float* msErr, size_t msErr_stride, size_t msErr_plex_size, - float *msPar, size_t msPar_stride, size_t msPar_plex_size, - float *propPar, size_t propPar_stride, - float *outChi2, size_t outChi2_stride, + Hit *hits, GPlexQI XHitPos, + GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, + GPlexLV propPar, GPlexQF outChi2, + /*float* propErr, size_t propErr_stride,*/ + /*float* msErr, size_t msErr_stride, size_t msErr_plex_size,*/ + /*float *msPar, size_t msPar_stride, size_t msPar_plex_size,*/ + /*float *propPar, size_t propPar_stride,*/ + /*float *outChi2, size_t outChi2_stride,*/ float *Chi2, int *HitsIdx, int maxSize, int N) { @@ -303,9 +390,7 @@ __global__ void bestHit_kernel( for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) { - HitToMs_fn(msErr, msErr_stride, msErr_plex_size, - msPar, msPar_stride, msPar_plex_size, - hits, XHitPos, hit_cnt, N); + HitToMs_fn(msErr, msPar, hits, XHitPos, hit_cnt, N); #if 0 // TODO: add CMSGeom if (Config::useCMSGeom) { @@ -313,20 +398,13 @@ __global__ void bestHit_kernel( throw std::runtime_error("useCMSGeom not implemented yet for GPU"); } else {} #endif - computeChi2_fn(propErr, propErr_stride, - msErr, msErr_stride, - msPar, msPar_stride, - propPar, propPar_stride, - outChi2, outChi2_stride, - N); - getNewBestHitChi2_fn(outChi2, minChi2_reg, bestHit_reg, hit_cnt, N); + computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, N); + getNewBestHitChi2_fn(outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); } updateTracksWithBestHit_fn (hits, XHitPos, minChi2_reg, bestHit_reg, - msErr, msErr_stride, msErr_plex_size, - msPar, msPar_stride, msPar_plex_size, - propPar, propPar_stride, + msErr, msPar, propPar, Chi2, HitsIdx, N); } @@ -344,59 +422,19 @@ void bestHit_wrapper(cudaStream_t &stream, dim3 block(BLOCK_SIZE_X, 1, 1); bestHit_kernel <<< grid, block, 0, stream >>> - (bunch.m_hits, XHitPos.ptr, - propErr.ptr, propErr.stride, - msErr.ptr, msErr.stride, msErr.kSize, - msPar.ptr, msPar.stride, msPar.kSize, - propPar.ptr, propPar.stride, - outChi2.ptr, outChi2.stride, + (bunch.m_hits, XHitPos, + propErr, msErr, msPar, propPar, outChi2, + /*propErr.ptr, propErr.stride,*/ + /*msErr.ptr, msErr.stride, msErr.kSize,*/ + /*msPar.ptr, msPar.stride, msPar.kSize,*/ + /*outChi2.ptr, outChi2.stride,*/ Chi2, HitsIdx, maxSize, N); } -__device__ float downPhi_fn(float phi) { - while (phi >= Config::PI) {phi-=Config::TwoPI;} - return phi; -} - -__device__ float upPhi_fn(float phi) { - while (phi <= -Config::PI) {phi+=Config::TwoPI;} - return phi; -} - -__device__ float normalizedPhi_fn(float phi) { - // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! - if (abs(phi)>=Config::PI) {phi = (phi>0 ? 
downPhi_fn(phi) : upPhi_fn(phi));} - return phi; -} - -__device__ int getPhiPartition_fn(float phi) -{ - //assume phi is between -PI and PI - // if (!(fabs(phi) resErr, - GPlexHV msPar, GPlexLV propPar, GPlexQF outChi2, + GPlexLS &propErr, GPlexHS &msErr, // GPlex resErr, + GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, const int N); void HitToMs_wrapper(cudaStream_t& stream, @@ -16,7 +16,7 @@ void HitToMs_wrapper(cudaStream_t& stream, void getNewBestHitChi2_wrapper(cudaStream_t &stream, GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N); -void fill_array_cu(float *array, int size, int value); +void fill_array_cu(float *array, int size, float value); void updateTracksWithBestHit_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, GPlexQI &XHitPos, @@ -39,4 +39,17 @@ void selectHitRanges_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, GPlexLS &Err, GPlexLV &Par, int N); +__device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 + float r01,//r01 + GPlexRegHV &a ,//res_glo + GPlexReg2V &b ); + +__device__ void ProjectResErr_fn(float a00, + float a01, + GPlexRegHS &b, + GPlexRegHH &c); + +__device__ void ProjectResErrTransp_fn(float a00, + float a01, GPlexRegHH &b, GPlexReg2S &c); + #endif diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index 2c0b2d68d0d68..f66470c862d64 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -1,14 +1,281 @@ #include "Config.h" +#include "Hit.h" #include "kalmanUpdater_kernels.h" +#include "computeChi2_kernels.h" // TODO: Clean all the hard-coded #define #define LS 21 #define HS 6 #define LH 18 +#define HV 3 #define BLOCK_SIZE_X 32 #define MAX_BLOCKS_X 65535 // CUDA constraint +/*__device__ float getPhi_fn2(float x, float y)*/ +/*{*/ + /*return atan2(y,x); */ +/*}*/ + +/*__device__ float getTheta_fn(float r, float z){*/ + /*return atan2(r,z);*/ +/*}*/ + +__device__ void subtract_matrix(const float *a, int aN, const float *b, int bN, + float *c, int cN, int size, int n) { + for (int i = 0; i < size; ++i) { + c[i*cN + n] = a[i*aN + n] - b[i*bN + n]; + + } +} + +__device__ float getHypot_fn(float x, float y) +{ + return sqrt(x*x + y*y); +} + +__device__ +void KalmanHTG_fn(float a00, float a01, + const GPlexReg2S &b, GPlexRegHH &c) +{ + + // HTG = rot * res_loc + // C = A * B + + // Based on script generation and adapted to custom sizes. 
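+  // Ht has rows (a00, 0), (a01, 0) and (0, 1), so only the first two columns of C
+  // are filled below; the third column is identically zero.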
+ c[ 0] = a00*b[ 0]; + c[ 1] = a00*b[ 1]; + c[ 2] = 0.; + c[ 3] = a01*b[ 0]; + c[ 4] = a01*b[ 1]; + c[ 5] = 0.; + c[ 6] = b[ 1]; + c[ 7] = b[ 2]; + c[ 8] = 0.; +} + +__device__ +void KalmanGain_fn(const GPlexLS &A, GPlexRegHH &b, GPlexRegLH &c, int n) +{ + // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3 + using T = float; + float *a = A.ptr; + int aN = A.stride; int an = n; // Global array + int bN = 1; int bn = 0; // Register array + int cN = 1; int cn = 0; + +#include "KalmanGain.ah" +} + +__device__ +void KHMult_fn(const GPlexRegLH &a, + const float b00, + const float b01, + GPlexRegLL &c) +{ + c[ 0] = a[ 0]*b00; + c[ 1] = a[ 0]*b01; + c[ 2] = a[ 1]; + c[ 3] = 0; + c[ 4] = 0; + c[ 5] = 0; + c[ 6] = a[ 3]*b00; + c[ 7] = a[ 3]*b01; + c[ 8] = a[ 4]; + c[ 9] = 0; + c[10] = 0; + c[11] = 0; + c[12] = a[ 6]*b00; + c[13] = a[ 6]*b01; + c[14] = a[ 7]; + c[15] = 0; + c[16] = 0; + c[17] = 0; + c[18] = a[ 9]*b00; + c[19] = a[ 9]*b01; + c[20] = a[10]; + c[21] = 0; + c[22] = 0; + c[23] = 0; + c[24] = a[12]*b00; + c[25] = a[12]*b01; + c[26] = a[13]; + c[27] = 0; + c[28] = 0; + c[29] = 0; + c[30] = a[15]*b00; + c[31] = a[15]*b01; + c[32] = a[16]; + c[33] = 0; + c[34] = 0; + c[35] = 0; +} + +__device__ +void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + using T = float; + int aN = 1; int an = 0; // Register array + T *b = B.ptr; int bN = B.stride; int bn = n; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "KHC.ah" +} + +// +__device__ +void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, int n) +{ + int aN = a.stride; + typedef float T; + const float pt = getHypot_fn(a[ 3*aN+n], a[ 4*aN+n]); + const float p2 = pt*pt + a[ 5*aN+n]*a[ 5*aN+n]; + // + b[ 0] = a[ 0*aN+n]; + b[ 1] = a[ 1*aN+n]; + b[ 2] = a[ 2*aN+n]; + b[ 3] = 1.0f/pt; + b[ 4] = getPhi(a[ 3*aN+n], a[ 4*aN+n]); //fixme: use trig approx + b[ 5] = getTheta(pt, a[ 5*aN+n]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -a[ 3*aN+n]/(pt*pt*pt); + c[22] = -a[ 4*aN+n]/(pt*pt*pt); + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -a[ 4*aN+n]/(pt*pt); + c[28] = a[ 3*aN+n]/(pt*pt); + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = a[ 3*aN+n]*a[ 5*aN+n]/(pt*p2); + c[34] = a[ 4*aN+n]*a[ 5*aN+n]/(pt*p2); + c[35] = -pt/p2; +} + +__device__ +void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + + // Generated code access arrays with variables cN, cn + // c[i*cN+cn] + int aN = 1; int an = 0; // Register array + int bn = n; // Global array + int cN = 1; int cn = 0; +#include "PolarErr.ah" +} + +__device__ +void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "PolarErrTransp.ah" +} + +__device__ +void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, int n) +{ + const float cosP = std::cos(a[ 4]); //fixme: use trig approx + const float sinP = std::sin(a[ 4]); + const float cosT = std::cos(a[ 5]); + const float sinT = std::sin(a[ 5]); + // + b[ 0*bN+n] = a[ 0]; 
+ b[ 1*bN+n] = a[ 1]; + b[ 2*bN+n] = a[ 2]; + b[ 3*bN+n] = cosP/a[ 3]; + b[ 4*bN+n] = sinP/a[ 3]; + b[ 5*bN+n] = cosT/(sinT*a[ 3]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -cosP/(a[ 3]*a[ 3]); + c[22] = -sinP/a[ 3]; + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -sinP/(a[ 3]*a[ 3]); + c[28] = cosP/a[ 3]; + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = -cosT/(sinT*a[ 3]*a[ 3]); + c[34] = 0.; + c[35] = -1.0f/(sinT*sinT*a[ 3]); +} + +__device__ +void CartesianErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + int aN = 1; int an = 0; + int bn = n; + int cN = 1; int cn = 0; + +#include "CartesianErr.ah" +} + +__device__ +void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; + +#include "CartesianErrTransp.ah" +} + /// MultKalmanGain //////////////////////////////////////////////////////////// @@ -101,10 +368,39 @@ __device__ void invertCramerSym_fn(float *a) { a[5] = s*c22; } +__device__ void invertCramerSym2x2_fn(GPlexReg2S &a) { + float det = a[0] * a[2] - a[1] * a[1]; + const float s = float(1) / det; + const float tmp = s * a[2]; + a[1] *= -s; + a[2] = s * a[0]; + a[0] = tmp; +} + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; + /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + + if(n < N) { + c[0] = a[0*aN+n] - b[0*bN+n]; + c[1] = a[1*aN+n] - b[1*bN+n]; + c[2] = a[2*aN+n] - b[2*bN+n]; + } +} + /// AddIntoUpperLeft3x3 ////////////////////////////////////////////////////// -__device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, - const float* __restrict__ b, size_t bN, - float *c, const int N, int n) { +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &C, const int N, int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if(n < N) { @@ -155,6 +451,26 @@ __device__ void multResidualsAdd_fn( /*}*/ } +__device__ +void MultResidualsAdd_all_reg(const GPlexRegLH &a, + const GPlexRegLV &b, + const GPlexReg2V &c, + GPlexRegLV &d) +{ + // outPar = psPar + kalmanGain*(dPar) + // D = B A C + // where right half of kalman gain is 0 + + // XXX Regenerate with a script. 
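+  // d = b + K*c with a 2-component local residual c: row i of the 6x3 gain
+  // adds a[3*i]*c[0] + a[3*i+1]*c[1], and the gain's third column is never
+  // referenced, matching the "right half of kalman gain is 0" note above.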
+  // generate loop (can also write it manually this time, it's not much)
+  d[0] = b[0] + a[ 0] * c[0] + a[ 1] * c[1];
+  d[1] = b[1] + a[ 3] * c[0] + a[ 4] * c[1];
+  d[2] = b[2] + a[ 6] * c[0] + a[ 7] * c[1];
+  d[3] = b[3] + a[ 9] * c[0] + a[10] * c[1];
+  d[4] = b[4] + a[12] * c[0] + a[13] * c[1];
+  d[5] = b[5] + a[15] * c[0] + a[16] * c[1];
+}
+
 /// KalmanGain_x_propErr //////////////////////////////////////////////////////
 __device__ void kalmanGain_x_propErr_fn(
     float* d_kalmanGain,
@@ -209,21 +525,18 @@ __device__ void kalmanGain_x_propErr_fn(
 }

 __global__ void kalmanUpdate_kernel(
-      const float* __restrict__ propErr, size_t propErr_stride,
-      const float* __restrict__ msErr, size_t msErr_stride,
-      const float* __restrict__ par_iP, size_t par_iP_stride,
-      const float* __restrict__ msPar, size_t msPar_stride,
-      float *par_iC, size_t par_iC_stride,
-      float *outErr, size_t outErr_stride,
-      const int N) {
+      GPlexLS propErr, const GPlexHS __restrict__ msErr,
+      const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar,
+      GPlexLV par_iC, GPlexLS outErr, const int N) {
   int grid_width = blockDim.x * gridDim.x;
   // Note: similar results with propErr kept in registers.
   //       It is read-only so using the read-only cache yields more flexibility
   //       wrt block size without increasing the pressure on registers too much.
   int n = threadIdx.x + blockIdx.x * blockDim.x;
   // There is no need to keep resErr and kalmanGain as global memory arrays.
-  float resErr_reg[HS];
-  float kalmanGain_reg[LH];
+  /*float resErr_reg[HS];*/
+  GPlexRegHS resErr_reg;
+  /*float kalmanGain_reg[LH];*/

   // If there are more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X
   for (int z = 0; z < (N-1)/grid_width +1; z++) {
@@ -243,11 +556,71 @@ __global__ void kalmanUpdate_kernel(
         propPar = psPar;
       }
 #endif
+      float rotT00;
+      float rotT01;
+      const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0));
+      rotT00 = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r);
+      rotT01 =  (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r);
+
+      GPlexRegHV res_glo;
+      subtractFirst3_fn(msPar, par_iP, res_glo, N, n);
+
+      addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n);
+      GPlexReg2V res_loc;   //position residual in local coordinates
+      RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc);
+      GPlexReg2S resErr_loc; // 2x2 sym
+      GPlexRegHH tempHH;     // 3x3
+      ProjectResErr_fn      (rotT00, rotT01, resErr_reg, tempHH);
+      ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc);
+
+      /*invertCramerSym_fn(resErr_reg);*/
+      invertCramerSym2x2_fn(resErr_loc);
+#ifndef POLCOORD
+      // Move to "polar" coordinates: (x,y,z,1/pT,phi,theta) [can we find a better name?]
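+      // Without POLCOORD the parameters are cartesian, so the change of variables
+      // is done locally: ConvertToPolar_fn fills jac_pol, the 6x6 jacobian of
+      // (x,y,z,px,py,pz) -> (x,y,z,1/pT,phi,theta), used below to transport the
+      // covariance; ConvertToCartesian_fn applies the reverse jacobian after the update.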
+ + /*MPlexLV propPar_pol;// propagated parameters in "polar" coordinates*/ + /*MPlexLL jac_pol; // jacobian from cartesian to "polar"*/ + /*ConvertToPolar_fn(propPar,propPar_pol,jac_pol);*/ + /*float propPar_pol[6];*/ + GPlexRegLV propPar_pol; + GPlexRegLL jac_pol; + ConvertToPolar_fn(par_iP, propPar_pol, jac_pol, n); + + GPlexRegLL tempLL; + PolarErr_fn(jac_pol, propErr.ptr, propErr.stride, tempLL, n); + PolarErrTransp_fn(jac_pol, tempLL, propErr, n);// propErr is now propagated errors in "polar" coordinates +#endif - addIntoUpperLeft3x3_fn(propErr, propErr_stride, - msErr, msErr_stride, resErr_reg, N, n); - invertCramerSym_fn(resErr_reg); + // Kalman update in "polar" coordinates + GPlexRegLH K; + KalmanHTG_fn(rotT00, rotT01, resErr_loc, tempHH); + KalmanGain_fn(propErr, tempHH, K, n); + +#ifdef POLCOORD + // FIXME: assuming no polcoord for now + MultResidualsAdd(K.arr, propPar, res_loc, outPar);// propPar_pol is now the updated parameters in "polar" coordinates + GPlexRegLL tempLL; +#else + /*MultResidualsAdd(K, propPar_pol, res_loc, propPar_pol);// propPar_pol is now the updated parameters in "polar" coordinates*/ + MultResidualsAdd_all_reg(K, propPar_pol, res_loc, propPar_pol); +#endif + KHMult_fn(K, rotT00, rotT01, tempLL); + KHC_fn(tempLL, propErr, outErr, n); + /*outErr.Subtract(propErr, outErr);// outErr is in "polar" coordinates now*/ + subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride, + propErr.ptr, propErr.stride, LS, n); + +#ifndef POLCOORD + // Go back to cartesian coordinates + + // jac_pol is now the jacobian from "polar" to cartesian + // outPar -> par_iC + ConvertToCartesian_fn(propPar_pol, par_iC.ptr, par_iC.stride, jac_pol, n); + CartesianErr_fn (jac_pol, outErr.ptr, outErr.stride, tempLL, n); + CartesianErrTransp_fn(jac_pol, tempLL, outErr, n);// outErr is in cartesian coordinates now +#endif +#if 0 upParam_MultKalmanGain_fn(propErr, propErr_stride, resErr_reg, kalmanGain_reg, N, n); multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride, @@ -256,6 +629,7 @@ __global__ void kalmanUpdate_kernel( kalmanGain_x_propErr_fn(kalmanGain_reg, propErr, propErr_stride, outErr, outErr_stride, N, n); +#endif } } } @@ -270,13 +644,7 @@ void kalmanUpdate_wrapper(cudaStream_t& stream, dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); kalmanUpdate_kernel <<>> - (d_propErr.ptr, d_propErr.stride, - d_msErr.ptr, d_msErr.stride, - d_par_iP.ptr, d_par_iP.stride, - d_msPar.ptr, d_msPar.stride, - d_par_iC.ptr, d_par_iC.stride, - d_outErr.ptr, d_outErr.stride, - N); + (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N); } // Should probably not be in this file, but creating a file for diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h index 562b1ab3e371a..06970601b0162 100644 --- a/mkFit/kalmanUpdater_kernels.h +++ b/mkFit/kalmanUpdater_kernels.h @@ -14,10 +14,15 @@ void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, float *full_errArray, int *full_hitIdx, int hi, int maxHits, int N, int hs, int hv, int Nhits); -__device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, - const float* __restrict__ b, size_t bN, - float *c, const int N, int n); +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &c, const int N, int n); + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, int n); __device__ void invertCramerSym_fn(float *a); +__device__ void 
invertCramerSym2x2_fn(GPlexReg2S &a); #endif // _KALMAN_UPDATER_KERNELS_H_ diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index bdd5b44f93ee3..ca1e15087331a 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -7,45 +7,10 @@ constexpr int L = 6; constexpr int LL2 = 36; constexpr int LS = 21; -template -struct GPlexReg { - __device__ T operator[](int xx) const { return arr[xx]; } - __device__ T& operator[](int xx) { return arr[xx]; } - - __device__ T& operator()(int n, int i, int j) { return arr[i*D2 + j]; } - __device__ T operator()(int n, int i, int j) const { return arr[i*D2 + j]; } - - __device__ void SetVal(T v) - { - for (int i = 0; i < D1; ++i) - { - arr[i] = v; - } - } - - T arr[D1]; -}; - // values from 32 to 512 give good results. // 32 gives slightly better results (on a K40) -#define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint - -#if 0 -__device__ float hipo(float x, float y) { - return std::sqrt(x*x + y*y); -} - -__device__ void sincos4_cu(float x, float& sin, float& cos) { - // Had this writen with explicit division by factorial. - // The *whole* fitting test ran like 2.5% slower on MIC, sigh. - cos = 1; - sin = x; x *= x * 0.5f; - cos -= x; x *= x * 0.33333333f; - sin -= x; x *= x * 0.25f; - cos += x; -} -#endif +constexpr int BLOCK_SIZE_X = 32; +constexpr int MAX_BLOCKS_X = 65535; // CUDA constraint // computeJacobianSimple works on values that are in registers. // Registers are thread-private. Thus this function has no notion of @@ -122,10 +87,10 @@ __device__ void assignMsRad_fn(const float r, float* msRad, int N, int n) { // Not passing msRad.stride, as QF == 1 (second dim f msRad) __device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, - float* msRad, int N, int n) { + GPlexRegQF &msRad, int N, int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { - *msRad = hipo(msPar.ptr[n], msPar.ptr[n + msPar.stride]); + msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); } } @@ -156,9 +121,9 @@ void helixAtRFromIterative_fn(const GPlexLV& inPar, } /// Similarity //////////////////////////////////////////////////////////////// -__device__ void similarity_fn(float* a, float *b, size_t stride_outErr, - int N, int n) { - size_t bN = stride_outErr; +__device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) { + + size_t bN = b.stride; // Keep most values in registers. float b_reg[LL2]; @@ -257,9 +222,9 @@ __global__ void propagation_kernel( int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - GPlexReg msRad_reg; + GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. 
-  GPlexReg errorProp_reg;
+  GPlexRegLL errorProp_reg;
   // If there are more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X
   for (int z = 0; z < (N-1)/grid_width +1; z++) {
     n += z*grid_width;
@@ -276,9 +241,14 @@ __global__ void propagation_kernel(
       }
       similarity_fn(errorProp_reg, outErr, outErr_stride, N, n);
 #endif
-      computeMsRad_fn(msPar, msRad_reg.arr, N, n);
+      computeMsRad_fn(msPar, msRad_reg, N, n);
+#ifdef POLCOORD
+      // FIXME: port me
+      // helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp);
+#else
       helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n);
-      similarity_fn(errorProp_reg.arr, outErr.ptr, outErr.stride, N, n);
+#endif
+      similarity_fn(errorProp_reg, outErr, N, n);
     }
   }
 }

From 65fb82409f0dae803565e76191c3a88049cbf4a1 Mon Sep 17 00:00:00 2001
From: Matthieu Lefebvre
Date: Thu, 16 Jun 2016 15:51:41 -0400
Subject: [PATCH 06/13] New gpu data structure: event of layers of hits (no bunches)

* Everything lives on the GPU
* This is a squash of the following steps:
  - Fixes bestHit_wrapper and kernels for LayerOfHits
  - Adds index selection on the GPU
  - Starts regrouping GPU bestHit routines together
  - Re-includes Dan's PropagationMPlex.icc in the propagation kernels
  - Cleans up HitStructuresCU
---
 Makefile.config                  |   3 +-
 mkFit/FitterCU-imp.h             | 187 +++++++-----------
 mkFit/FitterCU.h                 |  20 +--
 mkFit/GPlex.h                    |   1 +
 mkFit/HitStructures.cc           |   3 +-
 mkFit/HitStructuresCU.cu         |  81 +++++-----
 mkFit/HitStructuresCU.h          |  47 +++++-
 mkFit/MkBuilder.cc               | 245 +++++--------------------------
 mkFit/MkFitter.cc                |   2 +-
 mkFit/PropagationMPlex.cc        |  43 ++++--
 mkFit/computeChi2_kernels.cu     |  56 +++----
 mkFit/computeChi2_kernels.h      |   4 +-
 mkFit/index_selection_kernels.cu | 204 +++++++++++++++++++++++++
 mkFit/index_selection_kernels.h  |  11 ++
 mkFit/propagation_kernels.cu     | 143 ++++++++++++++----
 mkFit/propagation_kernels.h      |   7 +-
 16 files changed, 571 insertions(+), 486 deletions(-)
 create mode 100644 mkFit/index_selection_kernels.cu
 create mode 100644 mkFit/index_selection_kernels.h

diff --git a/Makefile.config b/Makefile.config
index 0c1d054f07f06..3f02cae6bf2f0 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -19,6 +19,7 @@
 # OSXGCC5 := yes

 # To keep Dan's version working
 # TBB_PREFIX := tbb
+#TBB_PREFIX := ${TBBROOT}

 # 1.
Use ROOT or not (never used on MIC) # Comment out to disable root ("yes" is not relevant) @@ -34,7 +35,7 @@ endif # 2.1 Use nvcc to compile cuda code # CUDA compiler -NV := nvcc +NV := nvcc -prec-sqrt=true # Comment out to compile for CPU USE_CUDA := yes diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index 5db230402319f..b6023428dcfa6 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -1,4 +1,5 @@ #include +#include "Config.h" template void FitterCU::setNumberTracks(idx_t Ntracks) { @@ -107,8 +108,8 @@ void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, d_msErr.copyAsyncFromHost(stream, msErr); d_msPar.copyAsyncFromHost(stream, msPar); //d_XHitPos.copyAsyncFromHost(stream, XHitPos); - d_XHitSize.copyAsyncFromHost(stream, XHitSize); - d_XHitArr.copyAsyncFromHost(stream, XHitArr); + //d_XHitSize.copyAsyncFromHost(stream, XHitSize); + //d_XHitArr.copyAsyncFromHost(stream, XHitArr); //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); @@ -146,9 +147,9 @@ void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, updateTracksWithBestHit_wrapper(stream, d_layer, d_minChi2, d_bestHit, d_msErr, d_msPar, d_par_iP, d_Chi2, d_HitsIdx, N); - d_outChi2.copyAsyncToHost(stream, outChi2); - cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); + //d_outChi2.copyAsyncToHost(stream, outChi2); + //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); + //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); cudaMemcpy2DAsync(Chi2.fArray, NN*sizeof(float), d_Chi2, NN*sizeof(float), NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); @@ -170,88 +171,6 @@ void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, } #endif -#if 0 -template -void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, - BunchOfHitsCU &d_bunch, //MPlexQI &XHitPos, MPlexQI &XHitSize, - MPlexQF &Chi2, MPlexQI &HitsIdx, - int NN) { - - //float *d_minChi2; - //int *d_bestHit; - //cudaMalloc((void**)&d_minChi2, NN*sizeof(float)); - //cudaMalloc((void**)&d_bestHit, NN*sizeof(int)); - - //cudaMemcpyAsync(d_minChi2, minChi2, NN*sizeof(float), cudaMemcpyHostToDevice, stream); - //cudaMemcpyAsync(d_bestHit, bestHit, NN*sizeof(int), cudaMemcpyHostToDevice, stream); - - //cudaMemset(d_bestHit, -1, NN*sizeof(int)); - //fill_array_cu(d_minChi2, NN, 15.f); - - //d_Err_iP.copyAsyncFromHost(stream, psErr); - //d_par_iP.copyAsyncFromHost(stream, propPar); - //d_msErr.copyAsyncFromHost(stream, msErr); - //d_msPar.copyAsyncFromHost(stream, msPar); - //d_XHitPos.copyAsyncFromHost(stream, XHitPos); - //d_XHitSize.copyAsyncFromHost(stream, XHitSize); - - //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), - //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); - //cudaMemcpy2DAsync(d_HitsIdx, NN*sizeof(int), HitsIdx.fArray, NN*sizeof(int), - //NN*sizeof(int), 1, cudaMemcpyHostToDevice, stream); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - - selectHitRanges_wrapper(stream, d_bunch, d_XHitPos, d_XHitSize, - d_Err_iP, d_par_iP, N); - - int maxSize2 = getMaxNumHits_wrapper(d_XHitSize, N); - bestHit_wrapper(stream, d_bunch, d_XHitPos, - d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, - d_Chi2, d_HitsIdx, 
- maxSize2, N); -#if 0 - for (int hit_cnt = 0; hit_cnt < maxSize2; ++hit_cnt) - { - // TODO: add CMSGeom - if (Config::useCMSGeom) { - //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - } else {} - HitToMs_wrapper(stream, d_msErr, d_msPar, d_bunch, d_XHitPos, hit_cnt, NN); - - computeChi2_wrapper(stream, d_Err_iP, d_msErr, //d_resErr, - d_msPar, d_par_iP, d_outChi2, NN); - - getNewBestHitChi2_wrapper(stream, d_outChi2, d_minChi2, d_bestHit, hit_cnt, NN); - - cudaStreamSynchronize(stream); - cudaCheckError(); - } - updateTracksWithBestHit_wrapper(stream, - d_bunch, d_XHitPos, d_minChi2, d_bestHit, - d_msErr, d_msPar, d_par_iP, d_Chi2, d_HitsIdx, N); -#endif - - //d_outChi2.copyAsyncToHost(stream, outChi2); - //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); - //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); - - //cudaMemcpy2DAsync(Chi2.fArray, NN*sizeof(float), d_Chi2, NN*sizeof(float), - // NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); - //cudaMemcpy2DAsync(HitsIdx.fArray, NN*sizeof(int), d_HitsIdx, NN*sizeof(int), - // NN*sizeof(int), 1, cudaMemcpyDeviceToHost, stream); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - - //cudaFree(d_minChi2); - //cudaFree(d_bestHit); -} -#endif - // FIXME: Temporary. Separate allocations / transfers template void FitterCU::allocate_extra_addBestHit() { @@ -267,12 +186,14 @@ void FitterCU::allocate_extra_addBestHit() { template void FitterCU::free_extra_addBestHit() { + destroyStream(); + cudaFree(d_HitsIdx); cudaCheckError(); cudaFree(d_Chi2); cudaCheckError(); - d_XHitPos.free(); cudaCheckError(); - d_XHitSize.free(); cudaCheckError(); d_XHitArr.free(); cudaCheckError(); + d_XHitSize.free(); cudaCheckError(); + d_XHitPos.free(); cudaCheckError(); d_outChi2.free(); cudaCheckError(); } @@ -280,7 +201,8 @@ void FitterCU::free_extra_addBestHit() { template void FitterCU::prepare_addBestHit( const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, + const MPlexQI &inChg, + MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, size_t NN) { setNumberTracks(NN); // temporary: should be end - beg @@ -288,12 +210,16 @@ void FitterCU::prepare_addBestHit( cudaCheckError() #if 1 // psErr -> d_Err_iP - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, psErr.fArray, N*sizeof(T), - N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - // sendOutParToDevice(propPar); // d_par_iP + //cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, psErr.fArray, N*sizeof(T), + //N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); + d_Err_iP.copyAsyncFromHost(stream, psErr); d_par_iP.copyAsyncFromHost(stream, propPar); - //sendInChgToDevice(inChg); d_inChg.copyAsyncFromHost(stream, inChg); + + //cudaMemset2D(d_XHitSize.ptr, d_XHitSize.pitch, + //0, sizeof(int)*d_XHitSize.N, d_XHitSize.kSize); + //d_XHitSize.copyAsyncFromHost(stream, XHitSize); + //d_XHitArr.copyAsyncFromHost(stream, XHitArr); #endif } @@ -301,25 +227,22 @@ void FitterCU::prepare_addBestHit( template void FitterCU::finalize_addBestHit( MPlexHS &msErr, MPlexHV& msPar, - MPlexLS &outErr, MPlexLV &outPar, + MPlexLS& Err_iC, MPlexLV& Par_iC, + MPlexLS& Err_iP, MPlexLV& Par_iP, MPlexQI &HitsIdx, MPlexQF &Chi2) { #if 1 - //getOutParFromDevice(outPar); // <- d_par_iC - d_par_iC.copyAsyncToHost(stream, outPar); - //getOutErrFromDevice(outErr); // <- d_Err_iC - d_Err_iC.copyAsyncToHost(stream, outErr); + d_par_iC.copyAsyncToHost(stream, 
Par_iC); + d_Err_iC.copyAsyncToHost(stream, Err_iC); - // + d_par_iP.copyAsyncToHost(stream, Par_iP); + d_Err_iP.copyAsyncToHost(stream, Err_iP); + // Get msPar, msErr, chi2 and HitIdx out from the GPU to the CPU - cudaMemcpy2DAsync(msPar.fArray, N*sizeof(T), d_msPar.ptr, d_msPar.pitch, - N*sizeof(T), HV, cudaMemcpyDeviceToHost, stream); - cudaMemcpy2DAsync(msErr.fArray, N*sizeof(T), d_msErr.ptr, d_msErr.pitch, - N*sizeof(T), HS, cudaMemcpyDeviceToHost, stream); + d_msPar.copyAsyncToHost(stream, msPar); + d_msErr.copyAsyncToHost(stream, msErr); cudaMemcpyAsync(HitsIdx.fArray, d_HitsIdx, N*sizeof(int), cudaMemcpyDeviceToHost, stream); cudaMemcpyAsync(Chi2.fArray, d_Chi2, N*sizeof(float), cudaMemcpyDeviceToHost, stream); #endif - - destroyStream(); } template @@ -327,41 +250,52 @@ void FitterCU::setHitsIdxToZero() { cudaMemset(d_HitsIdx, 0, Nalloc*sizeof(int)); } -#if 0 +#if 1 template -void FitterCU::addBestHit(LayerOfHitsCU &layer) { +void FitterCU::addBestHit(LayerOfHitsCU &layer, const int ilay, const float radius) { - selectHitRanges_wrapper(stream, layer, d_XHitPos, d_XHitSize, - d_Err_iP, d_par_iP, N); + //selectHitRanges_wrapper(stream, layer, d_XHitPos, d_XHitSize, + //d_Err_iP, d_par_iP, N); + selectHitIndices_wrapper(stream, + layer, d_Err_iP, d_par_iP, + d_XHitSize, d_XHitArr, N); // TODO: get this thing inside bestHit_kernel int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); + cudaDeviceSynchronize(); cudaCheckError(); - bestHit_wrapper(stream, bunch, d_XHitPos, + bestHit_wrapper(stream, layer, d_XHitSize, d_XHitArr, d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, d_Chi2, d_HitsIdx, maxSize, N); - - // updateParametersMPlex kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); - //updateParametersMPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], - // Err[iC], Par[iC]); + if (ilay + 1 < Config::nLayers) { + propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + radius, d_Err_iP, d_par_iP, N); + } } #endif -#if 0 +#if 1 template void FitterCU::propagateTracksToR(float radius, int N) { - //propagateHelixToRMPlex(Err[iC], Par[iC], Chg, R, - //Err[iP], Par[iP], N_proc); - propagationForBuilding_wrapper(stream, radius, - d_par_iC, d_inChg, d_par_iP, d_errorProp, d_Err_iP, N); - //propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, - // d_par_iP, d_errorProp, d_Err_iP, N); + propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + radius, d_Err_iP, d_par_iP, N); } #endif +template +void FitterCU::propagateTracksToR_standalone(float radius, int N, + MPlexLS& Err_iC, MPlexLV& par_iC, MPlexQI& inChg, MPlexLS& Err_iP, MPlexLV& Par_iP) { + d_Err_iC.copyAsyncFromHost(stream, Err_iC); + d_par_iC.copyAsyncFromHost(stream, par_iC); + //propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + //radius, d_Err_iP, d_par_iP, N); + d_Err_iP.copyAsyncToHost(stream, Err_iP); + d_par_iP.copyAsyncToHost(stream, Par_iP); +} + template void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, MPlexHV* msPar, MPlexHS* msErr, int Nhits, @@ -379,11 +313,8 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, setNumberTracks(end-beg); - //sendInChgToDevice(Chg); d_inChg.copyAsyncFromHost(stream, Chg); - //sendInParToDevice(par_iC); d_par_iC.copyAsyncFromHost(stream, par_iC); - //sendInErrToDevice(err_iC); d_Err_iC.copyAsyncFromHost(stream, err_iC); cudaEventRecord(start, 0); @@ -393,9 +324,7 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, { // Switch 
outPut and inPut parameters and errors // similar to iC <-> iP in the CPU code. - //setOutParFromInPar(); d_par_iP.copyAsyncFromDevice(stream, d_par_iC); - //setOutErrFromInErr(); // d_Err_iP d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); double time_input = dtime(); @@ -414,9 +343,7 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, } total_reorg += (dtime() - time_input)*1e3; - //sendMsParToDevice(msPar[hi]); d_msPar.copyAsyncFromHost(stream, msPar[hi]); - //sendMsErrToDevice(msErr[hi]); d_msErr.copyAsyncFromHost(stream, msErr[hi]); propagationMerged(); @@ -429,9 +356,7 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, std::cerr << "CUDA etime: " << etime << " ms.\n"; std::cerr << "Total reorg: " << total_reorg << " ms.\n"; - //getOutParFromDevice(par_iC); d_par_iC.copyAsyncToHost(stream, par_iC); - //getOutErrFromDevice(err_iC); d_Err_iC.copyAsyncToHost(stream, err_iC); cudaStreamSynchronize(stream); diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 11d0172e01625..f7c5666882ee6 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -9,6 +9,8 @@ #include "propagation_kernels.h" #include "kalmanUpdater_kernels.h" #include "computeChi2_kernels.h" +#include "index_selection_kernels.h" + #include "HitStructuresCU.h" #include "GPlex.h" @@ -75,13 +77,6 @@ class FitterCU { MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF&outChi2, int maxSize, int NN); #endif -#if 0 - void computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, - BunchOfHitsCU &d_bunch, //MPlexQI &XHitPos, MPlexQI &XHitSize, - MPlexQF &Chi2, MPlexQI &HitsIdx, - int NN); -#endif void allocate_extra_addBestHit(); void free_extra_addBestHit(); @@ -89,16 +84,23 @@ class FitterCU { void prepare_addBestHit( const MPlexLS &psErr, const MPlexLV& propPar, const MPlexQI &inChg, + MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, size_t NN); void finalize_addBestHit( MPlexHS &msErr, MPlexHV& msPar, - MPlexLS &outErr, MPlexLV &outPar, + MPlexLS& Err_iC, MPlexLV& Par_iC, + MPlexLS& Err_iP, MPlexLV& Par_iP, MPlexQI &HitsIdx, MPlexQF &Chi2); void setHitsIdxToZero(); #if 1 - void addBestHit(LayerOfHitsCU &layer_of_hits_cu); + void addBestHit(LayerOfHitsCU &layer_of_hits_cu, const int ilay, const float radius); #endif + void propagateTracksToR(float radius, int N); + void propagateTracksToR_standalone(float radius, int N, + MPlexLS& Err_iC, MPlexLV& par_iC, + MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP); // fitting higher order methods void FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index ae802842e89ab..2892f1e4cfa85 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -23,6 +23,7 @@ // In practice, The number of tracks (ntracks) is set to be MPT_SIZE template struct GPlex { + using value_type = T; T* ptr; size_t pitch, stride, N, kSize; diff --git a/mkFit/HitStructures.cc b/mkFit/HitStructures.cc index 39905bfca5956..bfc3b47808d1e 100644 --- a/mkFit/HitStructures.cc +++ b/mkFit/HitStructures.cc @@ -187,7 +187,7 @@ int LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, bool // zb1 -= 2; if (zb < 0) zb = 0; // zb2 += 2; if (zb >= m_nz) zb = m_nz; - if (dump) + //if (dump) printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); @@ -202,6 +202,7 @@ int LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, bool for (int hi = m_phi_bin_infos[zi][pb].first; hi < 
m_phi_bin_infos[zi][pb].second; ++hi)
       {
+        printf("hi : %d\n", hi);
         // Here could enforce some further selection on hits
 #ifdef LOH_USE_PHI_Z_ARRAYS
         float ddz = std::abs(z - m_hit_zs[hi]);
diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu
index 3381724d7448f..4a19d7a3e5805 100644
--- a/mkFit/HitStructuresCU.cu
+++ b/mkFit/HitStructuresCU.cu
@@ -26,52 +26,57 @@ void LayerOfHitsCU::free_phi_bin_infos() {

 void LayerOfHitsCU::copyLayerOfHitsFromCPU(LayerOfHits &layer) {
   cudaMemcpy(m_hits, layer.m_hits, sizeof(Hit)*m_capacity, cudaMemcpyHostToDevice);
+  cudaCheckError();
+  m_zmin = layer.m_zmin;
+  m_zmax = layer.m_zmax;
+  m_fz = layer.m_fz;
+  // FIXME: copy other values
   // TODO: probably quite inefficient:
   for (int i = 0; i < m_nz; ++i) {
-    cudaMemcpy(m_phi_bin_infos + i*m_nphi, &(layer.m_phi_bin_infos[i][0]), sizeof(int)*m_nphi, cudaMemcpyHostToDevice);
+    cudaMemcpy(m_phi_bin_infos + i*m_nphi, &(layer.m_phi_bin_infos[i][0]),
+               sizeof(PairIntsCU)*m_nphi, cudaMemcpyHostToDevice);
+    cudaCheckError();
   }
 }

-#if 0
-BunchOfHitsCU::BunchOfHitsCU() :
-  m_real_size {Config::maxHitsPerBunch}, m_fill_index {0} {
-  cudaMalloc((void**)&m_hits, sizeof(Hit)*m_real_size);
-}
-
-BunchOfHitsCU::~BunchOfHitsCU() {
-  cudaFree(m_hits);
-  m_fill_index = 0;
-}
-
-void BunchOfHitsCU::copyBunchOfHitsFromCPU(BunchOfHits& bunch) {
-  m_fill_index = bunch.m_fill_index;
-  cudaMemcpy(m_hits, bunch.m_hits, sizeof(Hit)*m_fill_index, cudaMemcpyHostToDevice);
-}
-
-void BunchOfHitsCU::allocatePhiBinInfos(int num_phi_bins) {
-  this->num_phi_bins = num_phi_bins;
-  cudaMalloc((void**)&m_phi_bin_infos_first, sizeof(int)*num_phi_bins);
-  cudaMalloc((void**)&m_phi_bin_infos_second, sizeof(int)*num_phi_bins);
+void EventOfHitsCU::allocGPU(EventOfHits &event_of_hits) {
+  m_n_layers = event_of_hits.m_n_layers;
+  // Allocate the GPU array.
+  // Members' addresses of the array's elements are in the GPU address space.
+  cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU));
+  cudaCheckError();
+  // Allocate the CPU array.
+  // Members' addresses of the array's elements are in the CPU address space.
+  // This allows calling allocate for each array element.
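+  // The device buffers cudaMalloc'ed through this host-side shadow array only
+  // become visible to kernels once copyFromCPU() copies the array of structs
+  // into m_layers_of_hits.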
+  m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers];
+  for (int i = 0; i < m_n_layers; ++i) {
+    m_layers_of_hits_alloc[i].alloc_hits(event_of_hits.m_layers_of_hits[i].m_capacity);
+    m_layers_of_hits_alloc[i].alloc_phi_bin_infos(
+        event_of_hits.m_layers_of_hits[i].m_nz,
+        event_of_hits.m_layers_of_hits[i].m_nphi);
+  }
+  cudaCheckError();
 }

-void BunchOfHitsCU::freePhiBinInfos() {
-  cudaFree(m_phi_bin_infos_first);
-  cudaFree(m_phi_bin_infos_second);
+void EventOfHitsCU::deallocGPU() {
+  for (int i = 0; i < m_n_layers; ++i) {
+    cudaCheckError();
+    m_layers_of_hits_alloc[i].free_hits();
+    m_layers_of_hits_alloc[i].free_phi_bin_infos();
+    cudaCheckError();
+  }
+  cudaFree(m_layers_of_hits);
+  cudaCheckError();
+  delete[] m_layers_of_hits_alloc;
 }

-void BunchOfHitsCU::copyPhiBinInfosFromCPU(BunchOfHits &bunch) {
-  // Strip the bin_infos pairs into two separate vectors
-  // We cannot use std::pair on the GPU
-  std::vector<int> first(num_phi_bins);
-  std::vector<int> second(num_phi_bins);
-
-  for (int i = 0; i < num_phi_bins; ++i) {
-    std::pair<int, int> &infos = bunch.m_phi_bin_infos[i];
-    first[i] = infos.first;
-    second[i] = infos.second;
+void EventOfHitsCU::copyFromCPU(EventOfHits& event_of_hits) {
+  for (int i = 0; i < event_of_hits.m_n_layers; i++) {
+    m_layers_of_hits_alloc[i].copyLayerOfHitsFromCPU(event_of_hits.m_layers_of_hits[i]);
   }
-
-  cudaMemcpy(m_phi_bin_infos_first, &first[0], sizeof(int)*num_phi_bins, cudaMemcpyHostToDevice);
-  cudaMemcpy(m_phi_bin_infos_second, &second[0], sizeof(int)*num_phi_bins, cudaMemcpyHostToDevice);
+  cudaCheckError();
+  cudaMemcpy(m_layers_of_hits, m_layers_of_hits_alloc,
+             event_of_hits.m_n_layers*sizeof(LayerOfHitsCU),
+             cudaMemcpyHostToDevice);
+  cudaCheckError();
 }
-#endif
diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h
index 25ca7f93047ca..3851dabfc59fc 100644
--- a/mkFit/HitStructuresCU.h
+++ b/mkFit/HitStructuresCU.h
@@ -4,6 +4,14 @@
 #include "HitStructures.h"
 #include "Config.h"

+#define cudaCheckError() { \
+  cudaError_t e=cudaGetLastError(); \
+  if(e!=cudaSuccess) { \
+    printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
+    exit(0); \
+  } \
+}
+
 template <typename T1, typename T2>
 struct PairCU {
   T1 first;
   T2 second;
 };

 using PairIntsCU = PairCU<int, int>;

 class LayerOfHitsCU {
  public:
   Hit *m_hits;
-
-  //int num_phi_bins;
-  //int num_z_bins;
-  //int *m_phi_bin_infos_first;
-  //int *m_phi_bin_infos_second;
   PairIntsCU *m_phi_bin_infos;

   float m_zmin, m_zmax, m_fz;
@@ -48,11 +51,39 @@ class LayerOfHitsCU {

   void copyLayerOfHitsFromCPU(LayerOfHits &layer);

-  //void copyBunchOfHitsFromCPU(BunchOfHits &bunch);
+#ifdef __CUDACC__
+  __device__
+#endif
+  int GetZBin(float z) const { return (z - m_zmin) * m_fz; }
+
+#ifdef __CUDACC__
+  __device__
+#endif
+  int GetZBinChecked(float z) const { int zb = GetZBin(z); if (zb < 0) zb = 0; else if (zb >= m_nz) zb = m_nz - 1; return zb; }
+
+  // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask
+#ifdef __CUDACC__
+  __device__
+#endif
+  int GetPhiBin(float phi) const { return floorf(m_fphi * (phi + Config::PI)); }
+};
+
+
+class EventOfHitsCU
+{
+public:
+  LayerOfHitsCU *m_layers_of_hits;  // the real stuff: on GPU
+  int m_n_layers;

-  //void allocatePhiBinInfos(int num_phi_bins);
-  //void freePhiBinInfos();
+  // The following array makes it possible to allocate the per-layer GPU
+  // arrays from the CPU and then copy the device pointers into the
+  // GPU-resident structure.
+  LayerOfHitsCU *m_layers_of_hits_alloc;
+
+  EventOfHitsCU() : m_n_layers{} {};
+
+  void allocGPU(EventOfHits &event_of_hits);
+  void deallocGPU();
void copyFromCPU(EventOfHits& event_of_hits); }; #endif // _HIT_STRUCTURES_H_ diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc index 5cd2dd62f8b84..0896414abb621 100644 --- a/mkFit/MkBuilder.cc +++ b/mkFit/MkBuilder.cc @@ -299,6 +299,18 @@ void MkBuilder::find_tracks_load_seeds(EventOfCandidates& event_of_cands) void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) { +#ifdef USE_CUDA + EventOfHitsCU event_of_hits_cu; + event_of_hits_cu.allocGPU(m_event_of_hits); + event_of_hits_cu.copyFromCPU(m_event_of_hits); + + //printf("cpu: %d -- gpu %d\n", sizeof(LayerOfHits), sizeof(LayerOfHitsCU)); + + LayerOfHits& l = m_event_of_hits.m_layers_of_hits[Config::nlayers_per_seed]; + //printf("info %d\n", l.m_phi_bin_infos[0][10].first); + //printf("cpu: %f, %f, %f\n", l.m_zmin, l.m_zmax, l.m_fz); + +#endif tbb::parallel_for(tbb::blocked_range(0, Config::nEtaBin), [&](const tbb::blocked_range& ebins) { @@ -309,6 +321,11 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) [&](const tbb::blocked_range& tracks) { std::unique_ptr mkfp(g_exe_ctx.m_fitters.GetFromPool(), retfitr); +#ifdef USE_CUDA + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + cuFitter.allocate_extra_addBestHit(); +#endif for (int itrack = tracks.begin(); itrack < tracks.end(); itrack += NN) { int end = std::min(itrack + NN, tracks.end()); @@ -342,35 +359,26 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) //make candidates with best hit dprint("make new candidates"); #ifdef USE_CUDA - LayerOfHitsCU layer_of_hits_cu; - layer_of_hits_cu.alloc_hits(layer_of_hits.m_capacity); - layer_of_hits_cu.alloc_phi_bin_infos(layer_of_hits.m_nz, layer_of_hits.m_nphi); - - layer_of_hits_cu.copyLayerOfHitsFromCPU(layer_of_hits); - - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - cuFitter.allocate_extra_addBestHit(); + cuFitter.setNumberTracks(end-itrack); cuFitter.prepare_addBestHit( mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], - mkfp->Chg, + mkfp->Chg, + mkfp->XHitSize, mkfp ->XHitArr, NN); + + LayerOfHitsCU& layer_of_hits_cu = event_of_hits_cu.m_layers_of_hits_alloc[ilay]; + float radius = (ilay + 1 < Config::nLayers) ? m_event->geom_.Radius(ilay+1) : 0.0; + cuFitter.addBestHit(layer_of_hits_cu, ilay, radius); - //cuFitter.addBestHit(layer_of_hits_cu); - mkfp->AddBestHit_gpu(layer_of_hits, cuFitter, layer_of_hits_cu, end-itrack); - - //cuFitter.finalize_addBestHit( - //mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], - //mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], - //mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); - cuFitter.free_extra_addBestHit(); - cuFitter.freeDevice(); + cuFitter.finalize_addBestHit( + mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], + mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], + mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], + mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); - layer_of_hits_cu.free_phi_bin_infos(); - layer_of_hits_cu.free_hits(); + mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) #else mkfp->AddBestHit(layer_of_hits, end - itrack); -#endif mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) 
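// [Editorial aside, not part of the patch.] The restructuring above hoists the
// FitterCU allocation (allocateDevice / allocate_extra_addBestHit) out of the
// layer loop, so each layer now only stages data and launches kernels. Per
// worker task over a track range the intended call sequence is roughly this
// sketch (assuming, as the comments state, one hit added per layer):
//
//   FitterCU<float> cuFitter(NN);              // once per worker task
//   cuFitter.allocateDevice();
//   cuFitter.allocate_extra_addBestHit();
//   for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) {
//     cuFitter.setNumberTracks(end - itrack);
//     cuFitter.prepare_addBestHit(...);        // H2D: error/parameter GPlex-es
//     cuFitter.addBestHit(layer_of_hits_cu, ilay, radius); // select + chi2 + update
//     cuFitter.finalize_addBestHit(...);       // D2H: updated state, hit indices
//   }
//   cuFitter.free_extra_addBestHit();          // once per worker task
//   cuFitter.freeDevice();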
//propagate to layer @@ -380,48 +388,23 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack); dcall(post_prop_print(ilay, mkfp.get())); } +#endif + //exit(0); } // end of layer loop mkfp->OutputFittedTracksAndHitIdx(etabin_of_candidates.m_candidates, itrack, end, true); } - }); // end of seed loop - } - }); //end of parallel section over seeds -} - -#if 0 -// FIXME: Removing BunchOfHits will yield a simpler data structure for the GPU, but for now it breaks everything #ifdef USE_CUDA - BunchOfHitsCU bunch_of_hits_cu; - bunch_of_hits_cu.copyBunchOfHitsFromCPU(bunch_of_hits); - bunch_of_hits_cu.allocatePhiBinInfos(bunch_of_hits.m_phi_bin_infos.size()); - bunch_of_hits_cu.copyPhiBinInfosFromCPU(bunch_of_hits); - - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - cuFitter.allocate_extra_addBestHit(); - cuFitter.prepare_addBestHit( - mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], - mkfp->Chg, - NN); - - //mkfp->AddBestHit_gpu(bunch_of_hits, cuFitter, bunch_of_hits_cu); - cuFitter.addBestHit(bunch_of_hits_cu); - - cuFitter.finalize_addBestHit( - mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], - mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], - mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); cuFitter.free_extra_addBestHit(); cuFitter.freeDevice(); - - bunch_of_hits_cu.freePhiBinInfos(); - - mkfp->AddBestHit(bunch_of_hits); -#else - mkfp->AddBestHit(bunch_of_hits); #endif + }); // end of seed loop + } + }); //end of parallel section over seeds +#ifdef USE_CUDA + event_of_hits_cu.deallocGPU(); #endif +} //------------------------------------------------------------------------------ @@ -930,155 +913,3 @@ void MkBuilder::FindTracksCloneEngineTbb() } }); } - - -///////////////////////////////////////////// -// Backup: conflicts with tbb. 
Do not know what is going wrong -// keep a version of the previous one calling gpu stuffs -#if 0 -void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) -{ -#ifdef USE_CUDA // FIXME: temporary; move to FitterCU - m_cuFitter_arr[omp_get_thread_num()]->allocate_extra_addBestHit(); -#endif - - std::cout << "Finding best hits...\n"; - // partition recseeds into eta bins - for (int iseed = 0; iseed < m_recseeds.size(); ++iseed) - { - if (m_recseeds[iseed].label() != iseed) - { - printf("Bad label for recseed %d -- %d\n", iseed, m_recseeds[iseed].label()); - } - - event_of_cands.InsertCandidate(m_recseeds[iseed]); - } - - //dump seeds -#ifdef DEBUG - for (int ebin = 0; ebin < Config::nEtaBin; ++ebin) - { - EtaBinOfCandidates &etabin_of_candidates = event_of_cands.m_etabins_of_candidates[ebin]; - - for (int iseed = 0; iseed < etabin_of_candidates.m_fill_index; iseed++) - { - Track& seed = etabin_of_candidates.m_candidates[iseed]; - std::cout << "MX - found seed with nFoundHits=" << seed.nFoundHits() << " chi2=" << seed.chi2() - << " x=" << seed.position()[0] << " y=" << seed.position()[1] << " z=" << seed.position()[2] - << " px=" << seed.momentum()[0] << " py=" << seed.momentum()[1] << " pz=" << seed.momentum()[2] - << " pT=" << sqrt(seed.momentum()[0]*seed.momentum()[0]+seed.momentum()[1]*seed.momentum()[1]) - << std::endl; - } - } -#endif - - //parallel section over seeds; num_threads can of course be smaller - int nseeds = m_recseeds.size(); - -#pragma omp parallel for - for (int ebin = 0; ebin < Config::nEtaBin; ++ebin) - { - // vectorized loop - EtaBinOfCandidates &etabin_of_candidates = event_of_cands.m_etabins_of_candidates[ebin]; - - for (int itrack = 0; itrack < etabin_of_candidates.m_fill_index; itrack += NN) - { - int end = std::min(itrack + NN, etabin_of_candidates.m_fill_index); - -#ifdef DEBUG - std::cout << std::endl; - std::cout << "processing track=" << itrack << " etabin=" << ebin << " findex=" << etabin_of_candidates.m_fill_index << " thn=" << omp_get_thread_num() << std::endl; -#endif - - MkFitter *mkfp = m_mkfp_arr[omp_get_thread_num()]; - - mkfp->SetNhits(3);//just to be sure (is this needed?) - - mkfp->InputTracksAndHitIdx(etabin_of_candidates.m_candidates, itrack, end, true); - - //ok now we start looping over layers - //loop over layers, starting from after the seed - //consider inverting loop order and make layer outer, need to trade off hit prefetching with copy-out of candidates - for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) - { - BunchOfHits &bunch_of_hits = m_event_of_hits.m_layers_of_hits[ilay].m_bunches_of_hits[ebin]; - - // XXX This should actually be done in some other thread for the next layer while - // this thread is crunching the current one. - // For now it's done in MkFitter::AddBestHit(), two loops before the data is needed. 
- // for (int i = 0; i < bunch_of_hits.m_fill_index; ++i) - // { - // _mm_prefetch((char*) & bunch_of_hits.m_hits[i], _MM_HINT_T1); - // } - -#ifdef USE_CUDA - FitterCU *cuFitter = m_cuFitter_arr[omp_get_thread_num()]; - cuFitter->prepare_addBestHit( - mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], - mkfp->Chg, - NN); - - BunchOfHitsCU bunch_of_hits_cu; - bunch_of_hits_cu.copyBunchOfHitsFromCPU(bunch_of_hits); - bunch_of_hits_cu.allocatePhiBinInfos(bunch_of_hits.m_phi_bin_infos.size()); - bunch_of_hits_cu.copyPhiBinInfosFromCPU(bunch_of_hits); - - cuFitter->addBestHit(bunch_of_hits_cu); - - bunch_of_hits_cu.freePhiBinInfos(); - cuFitter->finalize_addBestHit( - mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], - mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], - mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); - - // ... - mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) - - if (ilay + 1 < Config::nLayers) - { - mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack); - } -#else - mkfp->SelectHitRanges(bunch_of_hits, end - itrack); - -// #ifdef PRINTOUTS_FOR_PLOTS -// std::cout << "MX number of hits in window in layer " << ilay << " is " << mkfp->getXHitEnd(0, 0, 0)-mkfp->getXHitBegin(0, 0, 0) << std::endl; -// #endif - - //make candidates with best hit -#ifdef DEBUG - std::cout << "make new candidates" << std::endl; -#endif - mkfp->AddBestHit(bunch_of_hits); - - mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) - - //propagate to layer - // This is sort of a silly fix as no-clone-engine code produces - // zero good tracks with propagate-at-the-end. - // But at least it doesn't crash with uncaught exception :) - if (ilay + 1 < Config::nLayers) - { -#ifdef DEBUG - std::cout << "propagate to lay=" << ilay+2 << " start from x=" << mkfp->getPar(0, 0, 0) << " y=" << mkfp->getPar(0, 0, 1) << " z=" << mkfp->getPar(0, 0, 2)<< " r=" << getHypot(mkfp->getPar(0, 0, 0), mkfp->getPar(0, 0, 1)) - << " px=" << mkfp->getPar(0, 0, 3) << " py=" << mkfp->getPar(0, 0, 4) << " pz=" << mkfp->getPar(0, 0, 5) << " pT=" << getHypot(mkfp->getPar(0, 0, 3), mkfp->getPar(0, 0, 4)) << std::endl; -#endif - mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack); -#ifdef DEBUG - std::cout << "propagate to lay=" << ilay+2 << " arrive at x=" << mkfp->getPar(0, 1, 0) << " y=" << mkfp->getPar(0, 1, 1) << " z=" << mkfp->getPar(0, 1, 2)<< " r=" << getHypot(mkfp->getPar(0, 1, 0), mkfp->getPar(0, 1, 1)) << std::endl; -#endif - } - -#endif // USE_CUDA - } // end of layer loop - - mkfp->OutputFittedTracksAndHitIdx(etabin_of_candidates.m_candidates, itrack, end, true); - - } // end of seed loop - } //end of parallel section over seeds -#ifdef USE_CUDA // FIXME: temporary; move to FitterCU - m_cuFitter_arr[omp_get_thread_num()]->free_extra_addBestHit(); -#endif - -} -#endif diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 3b0d11585c8dc..ef492f413d4f2 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -517,7 +517,7 @@ void MkFitter::SelectHitIndices(const LayerOfHits &layer_of_hits, const int N_pr // const int pb2 = L.GetPhiBin(phi + dphi) + 2; if (dump) - printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", + printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); // MT: One could iterate in "spiral" order, to pick hits close to the center. 
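[Editorial note, not part of the patch series.] For reference, the z/phi
search window that SelectHitIndices builds -- in the CPU code above and in
the new GPU kernel introduced below (index_selection_kernels.cu) -- comes
from first-order error propagation of phi = atan2(y, x). A self-contained
sketch of that math, with hypothetical names (Window, searchWindow) and the
covariance entries passed as plain floats rather than read from a GPlex:

#include <cmath>

struct Window { float dz, dphi; };

// nSigma half-windows in z and phi around a track at (x, y), given the
// position covariance entries czz = cov(z,z), cxx = cov(x,x),
// cyy = cov(y,y), cxy = cov(x,y).
Window searchWindow(float x, float y, float czz,
                    float cxx, float cyy, float cxy, float nSigma)
{
  const float r2 = x*x + y*y;
  const float dphidx = -y / r2, dphidy = x / r2;  // derivatives of atan2(y, x)
  const float dphi2  = dphidx*dphidx*cxx + dphidy*dphidy*cyy
                     + 2.f*dphidx*dphidy*cxy;     // first-order variance of phi
  return { nSigma * std::sqrt(czz),
           nSigma * std::sqrt(std::fabs(dphi2)) };
}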
diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 2939cfe215bc2..7d9339c632a3d 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -592,6 +592,10 @@ void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& const float pzin = inPar.ConstAt(n, 5, 0); const float r = msRad.ConstAt(n, 0, 0); float r0 = hipo(xin, yin); + //if (n==1) printf("cpu %f\n", r0); + //if (n==1) printf("cpu %f\n", xin); + //if (n==1) printf("cpu %f\n", yin); + //if (n==1) printf("cpu %f\n", r0); dprint_np(n, std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl << "x=" << xin << " y=" << yin << " z=" << inPar.ConstAt(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pzin << " q=" << inChg.ConstAt(n, 0, 0)); @@ -996,6 +1000,19 @@ void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, { outErr = inErr; outPar = inPar; + //int kk = 0; + //{ + //printf("\n"); + //printf("outErr %d\n", kk); + //for (int i = 0; i < 1; ++i) { for (int j = 0; j < 1; ++j) + //printf("%8f ", outErr.At(kk,i,j)); printf("\t"); + //} printf("\n"); + + //printf("outPar %d\n", kk); + //for (int i = 0; i < 1; ++i) { + //printf("%8f ", outPar.At(kk,i,0)); printf("\t"); + //} printf("\n"); + //} MPlexLL errorProp; @@ -1004,13 +1021,18 @@ void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, #pragma simd for (int n = 0; n < NN; ++n) { msRad.At(n, 0, 0) = r; + //if (n == 0) printf("cpu r = %f\n", r); } + #ifdef POLCOORD helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); #else helixAtRFromIterative(inPar, inChg, outPar, msRad, errorProp, N_proc); #endif + //for (int i = 0; i < 6; ++i) { + //printf("%8f ", inPar.ConstAt(0,i,0)); printf("\t"); + //} printf("\n"); //add multiple scattering uncertainty and energy loss (FIXME: in this way it is not applied in track fit) if (Config::useCMSGeom) { @@ -1033,22 +1055,11 @@ void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, MultHelixPropTransp(errorProp, temp, outErr); // This dump is now out of its place as similarity is done with matriplex ops. 
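// [Editorial aside, not part of the patch.] The "matriplex ops" referred to
// here compute the covariance similarity transform
//   outErr = errorProp * outErr * errorProp^T
// in two steps: temp = P*C via MultHelixProp, then outErr = temp * P^T via
// MultHelixPropTransp; the GPU port later in this patch mirrors the same
// two-step structure with MultHelixProp_fn / MultHelixPropTransp_fn, one
// matrix per thread. A generic dense 6x6 sketch of what the unrolled .ah
// files compute (the real code packs symmetric matrices into 21 floats):
//
//   float P[36], C[36], T[36], out[36];
//   for (int i = 0; i < 6; ++i)            // T = P * C
//     for (int j = 0; j < 6; ++j) {
//       float s = 0.f;
//       for (int k = 0; k < 6; ++k) s += P[6*i+k] * C[6*k+j];
//       T[6*i+j] = s;
//     }
//   for (int i = 0; i < 6; ++i)            // out = T * P^T
//     for (int j = 0; j < 6; ++j) {
//       float s = 0.f;
//       for (int k = 0; k < 6; ++k) s += T[6*i+k] * P[6*j+k];
//       out[6*i+j] = s;
//     }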
-#ifdef DEBUG - if (debug) { - for (int kk = 0; kk < N_proc; ++kk) - { - dprintf("outErr %d\n", kk); - for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) - dprintf("%8f ", outErr.At(kk,i,j)); printf("\n"); - } dprintf("\n"); - - dprintf("outPar %d\n", kk); - for (int i = 0; i < 6; ++i) { - dprintf("%8f ", outPar.At(kk,i,0)); printf("\n"); - } dprintf("\n"); - } - } -#endif +//#ifdef DEBUG + //if (debug) { + //for (int kk = 0; kk < N_proc; ++kk) + //} +//#endif /* if (fabs(sqrt(outPar[0]*outPar[0]+outPar[1]*outPar[1])-r)>0.0001) { diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu index 7ae1387c25073..29f32205d1be5 100644 --- a/mkFit/computeChi2_kernels.cu +++ b/mkFit/computeChi2_kernels.cu @@ -187,11 +187,7 @@ __device__ void SlurpInIdx_fn(GPlexObj to, // float *fArray, int stride, int kSi int j = threadIdx.x + blockDim.x * blockIdx.x; if (j= 0) { Hit &hit = hits[ bestHit ]; float &chi2_local = minChi2; - /*msErr[Nhits].CopyIn(itrack, hit.errArray());*/ - /*SlurpIn_fn(msErr, msErr_stride, msErr_plex_size, - varr + (itrack*sizeof(Hit)) + off_error, XHitPos, N);*/ - /*msPar[Nhits].CopyIn(itrack, hit.posArray());*/ - /*SlurpIn_fn(msPar, msPar_stride, msPar_plex_size, - varr + (itrack*sizeof(Hit)) + off_param, XHitPos, N);*/ for (int i = 0; i < msErr.kSize; ++i) { msErr(itrack, i, 0) = hit.errArrayCU()[i]; } for (int i = 0; i < msPar.kSize; ++i) { msPar(itrack, i, 0) = hit.posArrayCU()[i]; } - /*Chi2(itrack, 0, 0) += chi2_local;*/ Chi2[itrack] += chi2_local; - /*HitsIdx[Nhits](itrack, 0, 0) = XHitPos.At(itrack, 0, 0) + bestHit[itrack];*/ HitsIdx[itrack] = bestHit; } else @@ -326,18 +312,12 @@ __device__ void updateTracksWithBestHit_fn(Hit *hits, msErr(itrack, 4, 0) = 0; msErr(itrack, 5, 0) = 666; - /*msPar[Nhits](itrack,0,0) = Par[iP](itrack,0,0);*/ - /*msPar[Nhits](itrack,1,0) = Par[iP](itrack,1,0);*/ - /*msPar[Nhits](itrack,2,0) = Par[iP](itrack,2,0);*/ for (int i = 0; i < msPar.kSize; ++i) { msPar(itrack, i, 0) = propPar(itrack, i, 0); } - /*HitsIdx[Nhits](itrack, 0, 0) = -1;*/ HitsIdx[itrack] = -1; - // Don't update chi2 } - /*printf("GPU [%d] -- %d : %f\n", itrack, HitsIdx[itrack], Chi2[itrack]);*/ } } @@ -374,24 +354,22 @@ int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N) { } __global__ void bestHit_kernel( - Hit *hits, GPlexQI XHitPos, + Hit *hits, GPlexQI XHitSize, GPlexHitIdx XHitArr, GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, GPlexLV propPar, GPlexQF outChi2, - /*float* propErr, size_t propErr_stride,*/ - /*float* msErr, size_t msErr_stride, size_t msErr_plex_size,*/ - /*float *msPar, size_t msPar_stride, size_t msPar_plex_size,*/ - /*float *propPar, size_t propPar_stride,*/ - /*float *outChi2, size_t outChi2_stride,*/ float *Chi2, int *HitsIdx, int maxSize, int N) { - /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ + int itrack = threadIdx.x + blockDim.x*blockIdx.x; int bestHit_reg = -1; float minChi2_reg = 15.f; + if (itrack < N) + HitsIdx[itrack] = 0; + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) { - /*HitToMs_fn(msErr, msPar, hits, XHitPos, hit_cnt, N);*/ + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); #if 0 // TODO: add CMSGeom if (Config::useCMSGeom) { @@ -400,19 +378,19 @@ __global__ void bestHit_kernel( } else {} #endif computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, N); - /*getNewBestHitChi2_fn(outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N);*/ + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); } - /*updateTracksWithBestHit_fn*/ - /*(hits, 
XHitPos,*/ - /*minChi2_reg, bestHit_reg,*/ - /*msErr, msPar, propPar,*/ - /*Chi2, HitsIdx,*/ - /*N);*/ + updateTracksWithBestHit_fn + (hits, + minChi2_reg, bestHit_reg, + msErr, msPar, propPar, + Chi2, HitsIdx, + N); } -#if 0 +#if 1 void bestHit_wrapper(cudaStream_t &stream, - BunchOfHitsCU &bunch, GPlexQI &XHitPos, + LayerOfHitsCU &layer, GPlexQI &XHitSize, GPlexHitIdx &XHitArr, GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, float *Chi2, int *HitsIdx, @@ -423,7 +401,7 @@ void bestHit_wrapper(cudaStream_t &stream, dim3 block(BLOCK_SIZE_X, 1, 1); bestHit_kernel <<< grid, block, 0, stream >>> - (bunch.m_hits, XHitPos, + (layer.m_hits, XHitSize, XHitArr, propErr, msErr, msPar, propPar, outChi2, /*propErr.ptr, propErr.stride,*/ /*msErr.ptr, msErr.stride, msErr.kSize,*/ @@ -592,7 +570,7 @@ __global__ void selectHitRanges_kernel(Hit *hits, } #if 0 -void selectHitRanges_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, +void selectHitRanges_wrapper(cudaStream_t &stream, LayerOfHitsCU &layer, GPlexQI &XHitPos, GPlexQI &XHitSize, GPlexLS &Err, GPlexLV &Par, int N) { diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h index 001a68b1b9e44..51c3262728bf9 100644 --- a/mkFit/computeChi2_kernels.h +++ b/mkFit/computeChi2_kernels.h @@ -30,9 +30,9 @@ void updateTracksWithBestHit_wrapper(cudaStream_t &stream, int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N); -#if 0 +#if 1 void bestHit_wrapper(cudaStream_t &stream, - BunchOfHitsCU &bunch, GPlexQI &XHitPos, + LayerOfHitsCU &layer, GPlexQI &XHitSize, GPlexHitIdx &XHitArr, GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, float *Chi2, int *HitsIdx, diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu new file mode 100644 index 0000000000000..6384de6c64cc7 --- /dev/null +++ b/mkFit/index_selection_kernels.cu @@ -0,0 +1,204 @@ +#include "index_selection_kernels.h" +#include "Config.h" +#include "HitStructures.h" + +#include "stdio.h" + +#define BLOCK_SIZE_X 32 +#define MAX_BLOCKS_X 65535 // CUDA constraint + +constexpr bool tmp_useCMSGeom = false; + +#if 0 +__device__ +int GetZBin(float z, const float m_zmin, const float m_fz) { + return (z - m_zmin) * m_fz; +} + +__device__ +int GetZBinChecked(float z, const float m_zmin, const float m_fz, const int m_nz) { + int zb = GetZBin(z); + if (zb < 0) zb = 0; else if (zb >= m_nz) zb = m_nz - 1; return zb; +} +#endif + +__global__ void selectHitIndices_kernel(LayerOfHitsCU layer_of_hits, + GPlexLS Err, GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + /*if (itrack == 0) {*/ + if (itrack < N) { + /*printf("info %d\n", layer_of_hits.m_phi_bin_infos[10].first);*/ + /*LayerOfHitsCU& l = layer_of_hits;*/ + /*printf("gpu: %f, %f, %f\n", l.m_zmin, l.m_zmax, l.m_fz);*/ + /*}*/ + /*if (itrack < N) {*/ + bool dump = false; + const float nSigmaPhi = 3; + const float nSigmaZ = 3; + + int xhitsize_tmp = XHitSize[itrack]; + XHitSize[itrack] = 0; + + float z, phi, dz, dphi; + { + const float x = Par(itrack, 0, 0); + const float y = Par(itrack, 1, 0); + + const float r2 = x*x + y*y; + + z = Par(itrack, 2, 0); + phi = getPhi(x, y); + /*dz = nSigmaZ * sqrtf(Err(itrack, 2, 2));*/ + dz = nSigmaZ * sqrtf(Err[5*Err.stride + itrack]); + + const float dphidx = -y/r2, dphidy = x/r2; + /*const float dphi2 = dphidx * dphidx * Err(itrack, 0, 0) +*/ + /*dphidy * dphidy * Err(itrack, 1, 1) +*/ + /*2 * dphidx * dphidy * Err(itrack, 0, 0);*/ + const float dphi2 = 
dphidx * dphidx * Err[0*Err.stride + itrack] +
+                        dphidy * dphidy * Err[2*Err.stride + itrack] +
+                    2 * dphidx * dphidy * Err[0*Err.stride + itrack];
+
+#ifdef HARD_CHECK
+    assert(dphi2 >= 0);
+#endif
+
+    dphi = nSigmaPhi * sqrtf(fabs(dphi2));
+
+    if (tmp_useCMSGeom)
+    {
+      //now correct for bending and for layer thickness using linear approximation
+      const float deltaR = Config::cmsDeltaRad; //fixme! using constant value, to be taken from layer properties
+      const float r = sqrt(r2);
+#ifdef POLCOORD
+      //here alpha is the difference between posPhi and momPhi
+      const float alpha = phi - Par(itrack, 4, 0);
+      float cosA, sinA;
+      if (Config::useTrigApprox) {
+        sincos4(alpha, sinA, cosA);
+      } else {
+        cosA = cos(alpha);
+        sinA = sin(alpha);
+      }
+#else
+      const float px = Par(itrack, 3, 0);
+      const float py = Par(itrack, 4, 0);
+      const float pt = ::sqrt(px*px + py*py);
+      //here alpha is the difference between posPhi and momPhi
+      //const float cosA = ( x*px + dy*py ) / (pt*r);
+      //const float sinA = ( y*px - dx*py ) / (pt*r);
+      // FIXME dx, dy do not exist:
+      // does not matter yet for gpu as cms geom is not implemented
+      const float cosA = ( x*px + y*py ) / (pt*r);
+      const float sinA = ( y*px - x*py ) / (pt*r);
+#endif
+      //take abs so that we always inflate the window
+      const float dist = fabs(deltaR*sinA/cosA);
+
+      dphi += dist / r;
+    }
+  }
+
+  const LayerOfHitsCU &L = layer_of_hits;
+
+  if (fabs(dz)   > L.m_max_dz)   dz   = L.m_max_dz;
+  if (fabs(dphi) > L.m_max_dphi) dphi = L.m_max_dphi;
+
+  const int zb1 = L.GetZBinChecked(z - dz);
+  const int zb2 = L.GetZBinChecked(z + dz) + 1;
+  const int pb1 = L.GetPhiBin(phi - dphi);
+  const int pb2 = L.GetPhiBin(phi + dphi) + 1;
+  // MT: The extra phi bins give us ~1.5% more good tracks at expense of 10% runtime.
+  // const int pb1 = L.GetPhiBin(phi - dphi) - 1;
+  // const int pb2 = L.GetPhiBin(phi + dphi) + 2;
+
+  if (dump)
+    printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n",
+           z, phi, dz, dphi, zb1, zb2, pb1, pb2);
+
+  /*if (itrack == 0) {*/
+  /*int i1 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].first;*/
+  /*int i2 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].second;*/
+
+  /*printf("gpu: %d, %d\n", i1, i2);*/
+  /*}*/
+
+  // MT: One could iterate in "spiral" order, to pick hits close to the center.
+  //     http://stackoverflow.com/questions/398299/looping-in-a-spiral
+  //     This would then work best with relatively small bin sizes.
+  //     Or, set them up so I can always take a 3x3 array around the intersection.
+  for (int zi = zb1; zi < zb2; ++zi)
+  {
+    for (int pi = pb1; pi < pb2; ++pi)
+    {
+      const int pb = pi & L.m_phi_mask;
+
+      // MT: The following line is the biggest hog (4% total run time).
+      //     This comes from cache misses, I presume.
+      //     It might make sense to make a first loop to extract bin indices
+      //     and issue prefetches at the same time.
+      //     Then enter a vectorized loop to actually collect the hits in proper order.
+
+#if 1
+      /*for (int hi = L.m_phi_bin_infos[zi][pb].first; hi < L.m_phi_bin_infos[zi][pb].second; ++hi)*/
+      for (int hi = L.m_phi_bin_infos[zi*L.m_nphi + pb].first; hi < L.m_phi_bin_infos[zi*L.m_nphi + pb].second; ++hi)
+      {
+        // MT: Access into m_hit_zs and m_hit_phis is 1% run-time each.
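+        // [Editorial aside, not part of the patch.] The `pi & L.m_phi_mask`
+        // wrap in the outer loop works because the number of phi bins is a
+        // power of two, making the AND a branch-free modulo that also handles
+        // the negative bins produced by GetPhiBin(phi - dphi). A minimal
+        // illustration (nbins is a hypothetical value):
+        //
+        //   constexpr int nbins = 64;        // must be a power of two
+        //   constexpr int mask  = nbins - 1; // presumably what m_phi_mask holds
+        //   int pb = raw_bin & mask;         // in [0, nbins) for any int raw_bin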
+ +#ifdef LOH_USE_PHI_Z_ARRAYS + float ddz = fabs(z - L.m_hit_zs[hi]); + float ddphi = fabs(phi - L.m_hit_phis[hi]); + if (ddphi > Config::PI) ddphi = Config::TwoPI - ddphi; + + if (dump) + printf(" SHI %3d %4d %4d %5d %6.3f %6.3f %6.4f %7.5f %s\n", + zi, pi, pb, hi, + L.m_hit_zs[hi], L.m_hit_phis[hi], ddz, ddphi, + (ddz < dz && ddphi < dphi) ? "PASS" : "FAIL"); + + // MT: Commenting this check out gives full efficiency ... + // and means our error estimations are wrong! + // Avi says we should have *minimal* search windows per layer. + // Also ... if bins are sufficiently small, we do not need the extra + // checks, see above. + // if (ddz < dz && ddphi < dphi && XHitSize[itrack] < MPlexHitIdxMax) +#endif + // MT: The following check also makes more sense with spiral traversal, + // we'd be taking in closest hits first. + if (XHitSize[itrack] < GPlexHitIdxMax) + { + /*if (itrack == 0)*/ + /*if (XHitArr(itrack, XHitSize[itrack], 0) != hi) {*/ + /*printf("before %d, after %d\n", XHitArr(itrack, XHitSize[itrack], 0), hi);*/ + /*printf("--- %d, after %d\n", XHitSize[itrack], hi);*/ + /*}*/ + XHitArr(itrack, XHitSize[itrack]++, 0) = hi; + } + } +#endif // 0 + } + } + /*if (itrack == 0)*/ + /*{*/ + /*if (XHitSize[itrack] != xhitsize_tmp) {*/ + /*printf("%d fromCPU %d --fromGPU %d\n", itrack, xhitsize_tmp, XHitSize[itrack]);*/ + /*int i1 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].first;*/ + /*int i2 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].second;*/ + + /*printf("gpu: %d, %d\n", i1, i2);*/ + /*}*/ + /*}*/ + } +} + +void selectHitIndices_wrapper(cudaStream_t& stream, + LayerOfHitsCU& layer_of_hits, GPlexLS& Err, GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + MAX_BLOCKS_X); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + /*printf("Before kernel %d \n", N);*/ + selectHitIndices_kernel <<< grid, block, 0, stream >>> + (layer_of_hits, Err, Par, XHitSize, XHitArr, N); +} diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h new file mode 100644 index 0000000000000..225f1a0b15846 --- /dev/null +++ b/mkFit/index_selection_kernels.h @@ -0,0 +1,11 @@ +#ifndef _INDEX_SELECTION_KERNELS_H_ +#define _INDEX_SELECTION_KERNELS_H_ + +#include "HitStructuresCU.h" +#include "GPlex.h" + +void selectHitIndices_wrapper(cudaStream_t& stream, + LayerOfHitsCU& layer_of_hits, GPlexLS& Err, GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, int N); + +#endif // _INDEX_SELECTION_KERNELS_H_ diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index ca1e15087331a..5c09df58dcc56 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -12,6 +12,38 @@ constexpr int LS = 21; constexpr int BLOCK_SIZE_X = 32; constexpr int MAX_BLOCKS_X = 65535; // CUDA constraint +__device__ +void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, int n) +{ + // C = A * B + + typedef float T; + /*const idx_t N = NN;*/ + + /*const T *a = A.fArray; ASSUME_ALIGNED(a, 64);*/ + /*const T *b = B.fArray; ASSUME_ALIGNED(b, 64);*/ + /*T *c = C.fArray; ASSUME_ALIGNED(c, 64);*/ + /*float *a = A.ptr;*/ + int aN = 1 ; int an = 0; // Register array + int bN = b.stride; int bn = n; // Global array + int cN = 1; int cn = 0; + +#include "MultHelixProp.ah" +} + +__device__ +void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, int n) +{ + // C = B * AT; + + typedef float T; + int aN = 1 ; int an = 0; // Register array + int bN = 1 ; 
int bn = 0; // Global array + int cN = c.stride; int cn = n; + +#include "MultHelixPropTransp.ah" +} + // computeJacobianSimple works on values that are in registers. // Registers are thread-private. Thus this function has no notion of // parallelism. It is ran serially by each calling thread. @@ -101,8 +133,6 @@ void helixAtRFromIterative_fn(const GPlexLV& inPar, const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexReg& msRad, GPlexReg& errorProp, int N, int n) { - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - GPlexReg outPar; if (n < N) { @@ -271,49 +301,104 @@ void propagation_wrapper(cudaStream_t& stream, // PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments // Imposes the radius __global__ void propagationForBuilding_kernel( - float r, - float *inPar, size_t inPar_stride, int *inChg, - float *outPar, size_t outPar_stride, float *errorProp, - size_t errorProp_stride, float *outErr, size_t outErr_stride, int N) { -#if 0 + const GPlexLS inErr, const GPlexLV inPar, + const GPlexQI inChg, const float radius, + GPlexLS outErr, GPlexLV outPar, + const int N) { +#if 1 int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - float msRad_reg; + + GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. - float errorProp_reg[LL2]; + GPlexRegLL errorProp_reg; // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; + /*for (int z = 0; z < (N-1)/grid_width +1; z++) {*/ + /*n += z*grid_width;*/ if (n < N) { - assignMsRad_fn(r, &msRad_reg, N, n); - if (Config::doIterative) { - helixAtRFromIterative_fn(inPar, inPar_stride, - inChg, outPar, outPar_stride, msRad_reg, - errorProp_reg, N, n); - } else { - // TODO: not ported for now. 
Assuming Config::doIterative - // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); + + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; } - similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; + } + /*if (n == 0)*/ + /*{*/ + /*int kk = n;*/ + /*printf("\n");*/ + /*printf("outErrGPU %d\n", kk);*/ + /*for (int i = 0; i < 1; ++i) { for (int j = 0; j < 1; ++j)*/ + /*printf("%8f ", outErr(kk,i,j)); printf("\t");*/ + /*} printf("\n");*/ + + /*printf("outParGPU %d\n", kk);*/ + /*for (int i = 0; i < 1; ++i) {*/ + /*printf("%8f ", outPar(kk,i,0)); printf("\t");*/ + /*} printf("\n");*/ + /*}*/ } - } + + /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ + msRad_reg(n, 0, 0) = radius; + /*if (n == 0) printf("gpu r = %f\n", radius);*/ + +#ifdef POLCOORD + // TODO: port me + helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); +#else + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#endif + /*if(n == 0) {*/ + /*printf("errorProp\n");*/ + /*for (int i = 0; i < 6; ++i) {*/ + /*printf("%8f ", inPar(0,i,0)); printf("\t");*/ + /*} printf("\n");*/ + /*for (int i = 0; i < 6; ++i) {*/ + /*printf("%8f ", outPar(0,i,0)); printf("\t");*/ + /*} printf("\n");*/ + /*}*/ + + // TODO: port me + /*if (Config::useCMSGeom) {*/ + /*MPlexQF hitsRl;*/ + /*MPlexQF hitsXi;*/ + /*for (int n = 0; n < NN; ++n) {*/ + /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ + /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ + /*}*/ + /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ + /*}*/ + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + + // Matriplex version of: + // result.errors = ROOT::Math::Similarity(errorProp, outErr); + + //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl + /*MPlexLL temp;*/ + /*MultHelixProp (errorProp, outErr, temp);*/ + /*MultHelixPropTransp(errorProp, temp, outErr);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + + /*}*/ #endif } void propagationForBuilding_wrapper(cudaStream_t& stream, - float radius, - GPlexLV& inPar, GPlexQI& inChg, - GPlexLV& outPar, GPlexLL& errorProp, - GPlexLS& outErr, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, MAX_BLOCKS_X); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); propagationForBuilding_kernel<<>> - (radius, - inPar.ptr, inPar.stride, inChg.ptr, - outPar.ptr, outPar.stride, errorProp.ptr, - errorProp.stride, outErr.ptr, outErr.stride, N); + (inErr, inPar, inChg, radius, outErr, outPar, N); } diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index c2a44b4a7952c..1e1ada98584d5 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -11,10 +11,9 @@ void propagation_wrapper(cudaStream_t& stream, const int N); void propagationForBuilding_wrapper(cudaStream_t& stream, - float radius, - GPlexLV& inPar, GPlexQI& inChg, - GPlexLV& outPar, GPlexLL& errorProp, - GPlexLS& outErr, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, const int N); #endif // _PROPAGATION_KERNELS_H_ From 
92c9b57bba5d7d0e7e9f2492ce6ee0b76b260311 Mon Sep 17 00:00:00 2001
From: Matthieu Lefebvre
Date: Tue, 28 Jun 2016 10:19:03 -0400
Subject: [PATCH 07/13] BestHit on GPU.

The following steps have been taken (these are messages from squashed
commits):

1) GPU: msErr and msPar arrays of GPlex-s
2) Makes d_HitsIdx a GPlexQI
3) Makes d_Chi2 a GPlexQF
4) Makes d_HitsIdx an array of GPlexQI
5) Moves data transfers outside the loop over layers
6) Loop over layers inside FitterCU::addBestHit
7) Changes kernel bodies to call device functions
8) Merges the bestHit kernel and the kalmanUpdate kernel
9) Moves selectHitIndices and the maxSize computation to the findBestHit kernel
10) Moves ilay and the full array of layers to the GPU
11) Loop over layers on the GPU
12) Adds a Geometry class for the GPU
13) Separates CPU and GPU versions of FindTracksBestHit
14) Moving tracks to the GPU

Partially done:
* accessors_cu.h contains the __device__ functions to access Hit, Track,
  SVector and SMatrix objects on the GPU.
* reorganize_gplex contains functions to reorganize hits and tracks on the
  GPU. If we want to split it, more work will have to be done to avoid
  double definitions of the accessors.
* Hit reorganization was done in a previous commit
* Track reorganization is completely done on the GPU

15) Reorganize GPlex to Tracks on the GPU
16) Adds EventOfCandidatesCU on the GPU and uses it for bestHit
17) BestHit completely ported to GPU
18) Refactors CUDA error management.
19) GPU: Add 'const' qualifiers to constant arguments.
20) Adds streams to data transfers for HitStructuresCU classes.
21) Splits computeChi2_kernels into two: chi2 stuff and bestHit stuff.
---
 Hit.h                              |   7 +-
 Makefile.config                    |   6 +-
 Math/MatrixRepresentationsStatic.h |   3 +
 Math/SMatrix.h                     |   3 +
 Track.h                            |  42 +++
 mkFit/BuilderCU.cu                 |  54 ++++
 mkFit/BuilderCU.h                  |  28 ++
 mkFit/FitterCU-imp.h               | 404 ++++++++++++-----------
 mkFit/FitterCU.h                   | 117 +++----
 mkFit/GPlex.h                      |   9 +-
 mkFit/GeometryCU.h                 |  18 ++
 mkFit/HitStructuresCU.cu           | 134 ++++++--
 mkFit/HitStructuresCU.h            |  76 ++++-
 mkFit/Makefile                     |   5 +
 mkFit/MkBuilder.cc                 | 107 ++++---
 mkFit/MkBuilder.h                  |   5 +
 mkFit/MkFitter.cc                  | 210 +-----------
 mkFit/MkFitter.h                   |   4 -
 mkFit/accessors_cu.h               |  40 +++
 mkFit/array_algorithms_cu.h        |  59 ++++
 mkFit/atomic_utils.h               | 102 ++++++
 mkFit/best_hit_kernels.cu          | 301 +++++++++++++++++
 mkFit/best_hit_kernels.h           |  49 +++
 mkFit/buildtestMPlex.cc            |  13 +-
 mkFit/computeChi2_kernels.cu       | 496 ++---------------------------
 mkFit/computeChi2_kernels.h        |  60 +---
 mkFit/gpu_utils.h                  |  22 ++
 mkFit/index_selection_kernels.cu   |  58 +---
 mkFit/index_selection_kernels.h    |  10 +-
 mkFit/kalmanUpdater_kernels.cu     |  88 ++---
 mkFit/kalmanUpdater_kernels.h      |  28 +-
 mkFit/mkFit.cc                     |   1 +
 mkFit/propagation_kernels.cu       |  68 ++--
 mkFit/propagation_kernels.h        |  10 +-
 mkFit/reorganize.cu                |  65 ----
 mkFit/reorganize.h                 |   8 -
 mkFit/reorganize_gplex.cu          | 208 ++++++++++++
 mkFit/reorganize_gplex.h           |  48 +++
 38 files changed, 1689 insertions(+), 1277 deletions(-)
 create mode 100644 mkFit/BuilderCU.cu
 create mode 100644 mkFit/BuilderCU.h
 create mode 100644 mkFit/GeometryCU.h
 create mode 100644 mkFit/accessors_cu.h
 create mode 100644 mkFit/array_algorithms_cu.h
 create mode 100644 mkFit/atomic_utils.h
 create mode 100644 mkFit/best_hit_kernels.cu
 create mode 100644 mkFit/best_hit_kernels.h
 create mode 100644 mkFit/gpu_utils.h
 delete mode 100644 mkFit/reorganize.cu
 delete mode 100644 mkFit/reorganize.h
 create mode 100644 mkFit/reorganize_gplex.cu
 create mode 100644 mkFit/reorganize_gplex.h

diff --git a/Hit.h b/Hit.h
index dd08d20a271e1..4b2798ceac224 100644
--- a/Hit.h
+++ 
b/Hit.h @@ -202,9 +202,10 @@ class Hit const float* posArray() const {return state_.pos_.Array();} const float* errArray() const {return state_.err_.Array();} //#ifdef USE_CUDA - float* posArrayCU(); - float* errArrayCU(); -//#endif +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. SVector3& parameters_nc() {return state_.pos_;} diff --git a/Makefile.config b/Makefile.config index 3f02cae6bf2f0..8a2966700c5a2 100644 --- a/Makefile.config +++ b/Makefile.config @@ -35,7 +35,9 @@ endif # 2.1 Use nvcc to compile cuda code # CUDA compiler -NV := nvcc -prec-sqrt=true +CUBROOT=/home/ml15/tools/cub +NV := nvcc -prec-sqrt=true -I${CUBROOT} +#-g -G -lineinfo # Comment out to compile for CPU USE_CUDA := yes @@ -122,6 +124,8 @@ LDFLAGS_MIC := ifdef USE_CUDA CPPFLAGS += -DUSE_CUDA -I/nfs/opt/cuda/include +#CPPFLAGS += -I/home/ml15/tools/cub +CPPFLAGS += -I${CUBROOT} LDFLAGS_HOST += -L${CUDALIBDIR} endif diff --git a/Math/MatrixRepresentationsStatic.h b/Math/MatrixRepresentationsStatic.h index 21cf5b9c59c46..9554c37db9137 100644 --- a/Math/MatrixRepresentationsStatic.h +++ b/Math/MatrixRepresentationsStatic.h @@ -241,6 +241,9 @@ namespace Math { inline T* Array() { return fArray; } inline const T* Array() const { return fArray; } +//#ifdef USE_CUDA + T* ArrayCU(); +//#endif /** assignment : only symmetric to symmetric allowed diff --git a/Math/SMatrix.h b/Math/SMatrix.h index b14edfccc5d56..7714195350738 100644 --- a/Math/SMatrix.h +++ b/Math/SMatrix.h @@ -272,6 +272,9 @@ class SMatrix { const T* Array() const; /// return pointer to internal array T* Array(); +//#ifdef USE_CUDA + T* ArrayCU(); +//#endif /** @name --- STL-like interface --- The iterators access the matrix element in the order how they are diff --git a/Track.h b/Track.h index b880fcf044f61..7f4fc6baf3a93 100644 --- a/Track.h +++ b/Track.h @@ -125,6 +125,11 @@ class Track const float* posArray() const {return state_.parameters.Array();} const float* errArray() const {return state_.errors.Array();} +//#ifdef USE_CUDA +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. 
SVector6& parameters_nc() {return state_.parameters;} @@ -134,8 +139,17 @@ class Track SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);} SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);} +#if __CUDACC__ + __host__ __device__ +#endif int charge() const {return state_.charge;} +#if __CUDACC__ + __host__ __device__ +#endif float chi2() const {return chi2_;} +#if __CUDACC__ + __host__ __device__ +#endif int label() const {return label_;} float x() const { return state_.parameters[0];} @@ -169,12 +183,18 @@ class Track return hitsVec; } +#if __CUDACC__ + __host__ __device__ +#endif void addHitIdx(int hitIdx,float chi2) { hitIdxArr_[++hitIdxPos_] = hitIdx; if (hitIdx >= 0) { ++nGoodHitIdx_; chi2_+=chi2; } } +#if __CUDACC__ + __host__ __device__ +#endif int getHitIdx(int posHitIdx) const { return hitIdxArr_[posHitIdx]; @@ -191,6 +211,9 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif void setHitIdx(int posHitIdx, int newIdx) { hitIdxArr_[posHitIdx] = newIdx; } @@ -201,6 +224,16 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif + void setNGoodHitIdx(int nHits) { + nGoodHitIdx_ = nHits; + } + +#if __CUDACC__ + __host__ __device__ +#endif void resetHits() { hitIdxPos_ = -1; @@ -219,8 +252,17 @@ class Track return layers; } +#if __CUDACC__ + __host__ __device__ +#endif void setCharge(int chg) {state_.charge=chg;} +#if __CUDACC__ + __host__ __device__ +#endif void setChi2(float chi2) {chi2_=chi2;} +#if __CUDACC__ + __host__ __device__ +#endif void setLabel(int lbl) {label_=lbl;} void setState(const TrackState& newState) {state_=newState;} diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu new file mode 100644 index 0000000000000..cfc8e3657e009 --- /dev/null +++ b/mkFit/BuilderCU.cu @@ -0,0 +1,54 @@ +#include "BuilderCU.h" + +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" +#include "FitterCU.h" +#include "Event.h" + + +BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + int gplex_size = 1 << 12; + cuFitter = new FitterCU (gplex_size); + cuFitter->allocateDevice(); + cuFitter->allocate_extra_addBestHit(); + cuFitter->createStream(); + cuFitter->setNumberTracks(gplex_size); + + event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + std::vector radii (Config::nLayers); + for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { + radii[ilay] = event->geom_.Radius(ilay); + } + geom_cu.allocate(); + geom_cu.getRadiiFromCPU(&radii[0]); + + event_of_cands_cu.allocGPU(event_of_cands); +} + + +BuilderCU::~BuilderCU() { + event_of_cands_cu.deallocGPU(); + + geom_cu.deallocate(); + event_of_hits_cu.deallocGPU(); + + cuFitter->destroyStream(); + cuFitter->free_extra_addBestHit(); + cuFitter->freeDevice(); + delete cuFitter; +} + + +void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) +{ + event_of_cands_cu.copyFromCPU(event_of_cands, cuFitter->get_stream()); + + cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); + + event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); +} diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h new file mode 100644 index 0000000000000..f3a2421949f5a --- /dev/null +++ b/mkFit/BuilderCU.h @@ -0,0 +1,28 @@ +#ifndef BUILDER_CU_H +#define BUILDER_CU_H + +#include "FitterCU.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" 
+#include "GeometryCU.h" +#include "Geometry.h" +#include "Event.h" + + +class BuilderCU +{ +public: + BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + ~BuilderCU(); + + void FindTracksBestHit(EventOfCandidates& event_of_cands); +private: + FitterCU *cuFitter; + EventOfHitsCU event_of_hits_cu; + EventOfCandidatesCU event_of_cands_cu; + GeometryCU geom_cu; +}; + + +#endif /* ifndef BUILDER_CU_H */ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index b6023428dcfa6..3e75aea115ea5 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -1,8 +1,10 @@ #include #include "Config.h" +#include "GeometryCU.h" +#include "reorganize_gplex.h" template -void FitterCU::setNumberTracks(idx_t Ntracks) { +void FitterCU::setNumberTracks(const idx_t Ntracks) { N = Ntracks; // Raise an exceptioin when the FitterCU instance is too small @@ -33,10 +35,17 @@ void FitterCU::allocateDevice() { d_inChg.allocate(Nalloc, QI); d_errorProp.allocate(Nalloc, LL); - d_msPar.allocate(Nalloc, HV); - d_msErr.allocate(Nalloc, HS); - cudaCheckError() + cudaMalloc((void**)&d_msPar_arr, MAX_HITS * sizeof(GPlexHV)); + cudaMalloc((void**)&d_msErr_arr, MAX_HITS * sizeof(GPlexHS)); + for (int hi = 0; hi < MAX_HITS; ++hi) { + d_msPar[hi].allocate(Nalloc, HV); + d_msErr[hi].allocate(Nalloc, HS); + } + cudaMemcpy(d_msPar_arr, d_msPar, MAX_HITS*sizeof(GPlexHV), cudaMemcpyHostToDevice); + cudaMemcpy(d_msErr_arr, d_msErr, MAX_HITS*sizeof(GPlexHS), cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_maxSize, sizeof(int)); // global maximum + cudaCheckError(); } template @@ -46,131 +55,49 @@ void FitterCU::freeDevice() { d_par_iP.free(); d_errorProp.free(); d_Err_iP.free(); - d_msPar.free(); d_Err_iC.free(); - d_msErr.free(); - - cudaCheckError() + for (int hi = 0; hi < MAX_HITS; ++hi) { + d_msPar[hi].free(); + d_msErr[hi].free(); + } + cudaFree(d_msPar_arr); + cudaFree(d_msErr_arr); + cudaFree(d_maxSize); + cudaCheckError(); } template -void FitterCU::kalmanUpdateMerged() { - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); +void FitterCU::kalmanUpdateMerged(const int hit_idx) { + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N); } template void FitterCU::kalmanUpdate_standalone( - const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, - const MPlexHS &msErr, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, int N_proc) + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc) { d_Err_iP.copyAsyncFromHost(stream, psErr); - d_msErr.copyAsyncFromHost(stream, msErr); + d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); d_par_iP.copyAsyncFromHost(stream, psPar); - d_msPar.copyAsyncFromHost(stream, msPar); + d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_Err_iC, N_proc); + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N_proc); d_par_iC.copyAsyncToHost(stream, outPar); d_Err_iC.copyAsyncToHost(stream, outErr); } template -void FitterCU::propagationMerged() { - propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, +void FitterCU::propagationMerged(const int hit_idx) { + propagation_wrapper(stream, d_msPar[hit_idx], d_par_iC, d_inChg, //d_par_iP, d_Err_iC, d_Err_iP, N); // 
TODO: Check outErr/errorProp d_par_iP, d_errorProp, d_Err_iP, N); } -#if 1 -template -void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, - float *minChi2, int *bestHit, - LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF &outChi2, int maxSize2, int NN) { - - float *d_minChi2; - int *d_bestHit; - cudaMalloc((void**)&d_minChi2, NN*sizeof(float)); - cudaMalloc((void**)&d_bestHit, NN*sizeof(int)); - - cudaMemcpyAsync(d_minChi2, minChi2, NN*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_bestHit, bestHit, NN*sizeof(int), cudaMemcpyHostToDevice, stream); - - cudaMemset(d_bestHit, -1, NN*sizeof(int)); - fill_array_cu(d_minChi2, NN, 15.f); - - d_Err_iP.copyAsyncFromHost(stream, psErr); - d_par_iP.copyAsyncFromHost(stream, propPar); - d_msErr.copyAsyncFromHost(stream, msErr); - d_msPar.copyAsyncFromHost(stream, msPar); - //d_XHitPos.copyAsyncFromHost(stream, XHitPos); - //d_XHitSize.copyAsyncFromHost(stream, XHitSize); - //d_XHitArr.copyAsyncFromHost(stream, XHitArr); - - //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), - //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); - //cudaMemcpy2DAsync(d_HitsIdx, NN*sizeof(int), HitsIdx.fArray, NN*sizeof(int), - //NN*sizeof(int), 1, cudaMemcpyHostToDevice, stream); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - - //selectHitRanges_wrapper(stream, d_bunch, d_XHitPos, d_XHitSize, - //d_Err_iP, d_par_iP, N); - - int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); - //bestHit_wrapper(stream, d_bunch, d_XHitPos, - //d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, - //d_Chi2, d_HitsIdx, - //maxSize2, N); - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - //// TODO: add CMSGeom - //if (Config::useCMSGeom) { - ////propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - //throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - //} else {} - HitToMs_wrapper(stream, d_msErr, d_msPar, d_layer, d_XHitSize, d_XHitArr, d_HitsIdx, hit_cnt, NN); - - computeChi2_wrapper(stream, d_Err_iP, d_msErr, //d_resErr, - d_msPar, d_par_iP, d_outChi2, NN); - - getNewBestHitChi2_wrapper(stream, d_XHitSize, d_XHitArr, d_outChi2, d_minChi2, d_bestHit, hit_cnt, NN); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - } - updateTracksWithBestHit_wrapper(stream, d_layer, d_minChi2, d_bestHit, - d_msErr, d_msPar, d_par_iP, d_Chi2, d_HitsIdx, N); - - //d_outChi2.copyAsyncToHost(stream, outChi2); - //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); - //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); - - cudaMemcpy2DAsync(Chi2.fArray, NN*sizeof(float), d_Chi2, NN*sizeof(float), - NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); - cudaMemcpy2DAsync(HitsIdx.fArray, NN*sizeof(int), d_HitsIdx, NN*sizeof(int), - NN*sizeof(int), 1, cudaMemcpyDeviceToHost, stream); - d_msErr.copyAsyncToHost(stream, msErr); - d_msPar.copyAsyncToHost(stream, msPar); - - - cudaStreamSynchronize(stream); - cudaCheckError(); - //for (int itrack = 0; itrack < NN; ++itrack) - //{ - ////printf("CPU [%d] -- %d : %f\n", itrack, HitsIdx(itrack, 0, 0), Chi2[itrack]); - //} - - cudaFree(d_minChi2); - cudaFree(d_bestHit); -} -#endif - // FIXME: Temporary. 
Separate allocations / transfers template void FitterCU::allocate_extra_addBestHit() { @@ -178,18 +105,24 @@ void FitterCU::allocate_extra_addBestHit() { d_XHitPos.allocate(Nalloc, QI); d_XHitSize.allocate(Nalloc, QI); d_XHitArr.allocate(Nalloc, GPlexHitIdxMax); - // FIXME: Make those GPlex-es. and use .allocate() - cudaMalloc((void**)&d_HitsIdx, Nalloc*sizeof(int)); cudaCheckError(); - cudaMalloc((void**)&d_Chi2, Nalloc*sizeof(float)); cudaCheckError(); - cudaCheckError() + cudaMalloc((void**)&d_HitsIdx_arr, MAX_HITS * sizeof(GPlexQI)); + for (int hi = 0; hi < MAX_HITS; ++hi) { + d_HitsIdx[hi].allocate(Nalloc, QI); + } + cudaMemcpy(d_HitsIdx_arr, d_HitsIdx, MAX_HITS*sizeof(GPlexQI), cudaMemcpyHostToDevice); + d_Chi2.allocate(Nalloc, QF); + d_Label.allocate(Nalloc, QI); + cudaCheckError(); } template void FitterCU::free_extra_addBestHit() { - destroyStream(); - - cudaFree(d_HitsIdx); cudaCheckError(); - cudaFree(d_Chi2); cudaCheckError(); + for (int hi = 0; hi < MAX_HITS; ++hi) { + d_HitsIdx[hi].free(); cudaCheckError(); + } + cudaFree(d_HitsIdx_arr); + d_Label.free(); cudaCheckError(); + d_Chi2.free(); cudaCheckError(); d_XHitArr.free(); cudaCheckError(); d_XHitSize.free(); cudaCheckError(); @@ -197,97 +130,56 @@ void FitterCU::free_extra_addBestHit() { d_outChi2.free(); cudaCheckError(); } -// FIXME: Temporary. Separate allocations / transfers template -void FitterCU::prepare_addBestHit( - const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, - MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - size_t NN) { - setNumberTracks(NN); // temporary: should be end - beg - - createStream(); - cudaCheckError() -#if 1 - // psErr -> d_Err_iP - //cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, psErr.fArray, N*sizeof(T), - //N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - d_Err_iP.copyAsyncFromHost(stream, psErr); - d_par_iP.copyAsyncFromHost(stream, propPar); - d_inChg.copyAsyncFromHost(stream, inChg); - - //cudaMemset2D(d_XHitSize.ptr, d_XHitSize.pitch, - //0, sizeof(int)*d_XHitSize.N, d_XHitSize.kSize); - //d_XHitSize.copyAsyncFromHost(stream, XHitSize); - //d_XHitArr.copyAsyncFromHost(stream, XHitArr); -#endif +void FitterCU::setHitsIdxToZero(const int hit_idx) { + cudaMemset(d_HitsIdx[hit_idx].ptr, 0, Nalloc*sizeof(int)); } -// TODO: Temporary. 
Separate allocations / transfers template -void FitterCU::finalize_addBestHit( - MPlexHS &msErr, MPlexHV& msPar, - MPlexLS& Err_iC, MPlexLV& Par_iC, - MPlexLS& Err_iP, MPlexLV& Par_iP, - MPlexQI &HitsIdx, MPlexQF &Chi2) { -#if 1 - d_par_iC.copyAsyncToHost(stream, Par_iC); - d_Err_iC.copyAsyncToHost(stream, Err_iC); - - d_par_iP.copyAsyncToHost(stream, Par_iP); - d_Err_iP.copyAsyncToHost(stream, Err_iP); - - // Get msPar, msErr, chi2 and HitIdx out from the GPU to the CPU - d_msPar.copyAsyncToHost(stream, msPar); - d_msErr.copyAsyncToHost(stream, msErr); - cudaMemcpyAsync(HitsIdx.fArray, d_HitsIdx, N*sizeof(int), cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(Chi2.fArray, d_Chi2, N*sizeof(float), cudaMemcpyDeviceToHost, stream); -#endif -} +void FitterCU::addBestHit(EventOfHitsCU &event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu) { + findBestHit_wrapper(stream, event.m_layers_of_hits, + event_of_cands_cu, + d_XHitSize, d_XHitArr, + d_Err_iP, d_par_iP, + d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_outChi2, + d_Chi2, d_HitsIdx_arr, + d_inChg, d_Label, geom_cu, + d_maxSize, N); +} template -void FitterCU::setHitsIdxToZero() { - cudaMemset(d_HitsIdx, 0, Nalloc*sizeof(int)); +void FitterCU::InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp) { + InputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, inputProp, N); } -#if 1 template -void FitterCU::addBestHit(LayerOfHitsCU &layer, const int ilay, const float radius) { - - //selectHitRanges_wrapper(stream, layer, d_XHitPos, d_XHitSize, - //d_Err_iP, d_par_iP, N); - selectHitIndices_wrapper(stream, - layer, d_Err_iP, d_par_iP, - d_XHitSize, d_XHitArr, N); +void FitterCU::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp) { + OutputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, outputProp, N); + cudaStreamSynchronize(stream); + cudaCheckError(); +} - // TODO: get this thing inside bestHit_kernel - int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); - cudaDeviceSynchronize(); cudaCheckError(); - - bestHit_wrapper(stream, layer, d_XHitSize, d_XHitArr, - d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, - d_Chi2, d_HitsIdx, - maxSize, N); - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_Err_iC, N); - if (ilay + 1 < Config::nLayers) { - propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, - radius, d_Err_iP, d_par_iP, N); - } -} -#endif #if 1 template -void FitterCU::propagateTracksToR(float radius, int N) { +void FitterCU::propagateTracksToR(const float radius, const int N) { propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, radius, d_Err_iP, d_par_iP, N); } #endif template -void FitterCU::propagateTracksToR_standalone(float radius, int N, - MPlexLS& Err_iC, MPlexLV& par_iC, MPlexQI& inChg, MPlexLS& Err_iP, MPlexLV& Par_iP) { +void FitterCU::propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, const MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP) { d_Err_iC.copyAsyncFromHost(stream, Err_iC); d_par_iC.copyAsyncFromHost(stream, par_iC); //propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, @@ -343,11 +235,11 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, } total_reorg += (dtime() - time_input)*1e3; - 
d_msPar.copyAsyncFromHost(stream, msPar[hi]); - d_msErr.copyAsyncFromHost(stream, msErr[hi]); + d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); + d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); - propagationMerged(); - kalmanUpdateMerged(); + propagationMerged(hi); + kalmanUpdateMerged(hi); } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); @@ -367,3 +259,133 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, cudaEventDestroy(stop); } +/////////////////////////////////////////////////////////////////////////////// +// Backup function: temps that have been deactivated +/////////////////////////////////////////////////////////////////////////////// + +#if 0 +template +void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, + float *minChi2, int *bestHit, + LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF &outChi2, int maxSize2, int hit_idx, int NN) { + + float *d_minChi2; + int *d_bestHit; + cudaMalloc((void**)&d_minChi2, NN*sizeof(float)); + cudaMalloc((void**)&d_bestHit, NN*sizeof(int)); + + cudaMemcpyAsync(d_minChi2, minChi2, NN*sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bestHit, bestHit, NN*sizeof(int), cudaMemcpyHostToDevice, stream); + + cudaMemset(d_bestHit, -1, NN*sizeof(int)); + fill_array_cu(d_minChi2, NN, 15.f); + + d_Err_iP.copyAsyncFromHost(stream, psErr); + d_par_iP.copyAsyncFromHost(stream, propPar); + d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); + d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); + //d_XHitPos.copyAsyncFromHost(stream, XHitPos); + //d_XHitSize.copyAsyncFromHost(stream, XHitSize); + //d_XHitArr.copyAsyncFromHost(stream, XHitArr); + + //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), + //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); + //cudaMemcpy2DAsync(d_HitsIdx, NN*sizeof(int), HitsIdx.fArray, NN*sizeof(int), + //NN*sizeof(int), 1, cudaMemcpyHostToDevice, stream); + + //cudaStreamSynchronize(stream); + //cudaCheckError(); + + //selectHitRanges_wrapper(stream, d_bunch, d_XHitPos, d_XHitSize, + //d_Err_iP, d_par_iP, N); + + int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); + //bestHit_wrapper(stream, d_bunch, d_XHitPos, + //d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, + //d_Chi2, d_HitsIdx, + //maxSize2, N); + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) + { + //// TODO: add CMSGeom + //if (Config::useCMSGeom) { + ////propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + //throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + //} else {} + HitToMs_wrapper(stream, d_msErr[hit_idx], d_msPar[hit_idx], d_layer, + d_XHitSize, d_XHitArr, d_HitsIdx[hit_idx], hit_cnt, NN); + + computeChi2_wrapper(stream, d_Err_iP, d_msErr[hit_idx], //d_resErr, + d_msPar[hit_idx], d_par_iP, d_outChi2, NN); + + getNewBestHitChi2_wrapper(stream, d_XHitSize, d_XHitArr, d_outChi2, d_minChi2, d_bestHit, hit_cnt, NN); + + //cudaStreamSynchronize(stream); + //cudaCheckError(); + } + updateTracksWithBestHit_wrapper(stream, d_layer, d_minChi2, d_bestHit, + d_msErr[hit_idx], d_msPar[hit_idx], d_par_iP, d_Chi2, d_HitsIdx[hit_idx], N); + + //d_outChi2.copyAsyncToHost(stream, outChi2); + //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); + //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); + + //cudaMemcpy2DAsync(Chi2.fArray, 
NN*sizeof(float), d_Chi2, NN*sizeof(float), + // NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); + //cudaMemcpy2DAsync(HitsIdx.fArray, NN*sizeof(int), d_HitsIdx, NN*sizeof(int), + // NN*sizeof(int), 1, cudaMemcpyDeviceToHost, stream); + d_Chi2.copyAsyncToHost(stream, Chi2); + d_HitsIdx[hit_idx].copyAsyncToHost(stream, HitsIdx); + d_msErr[hit_idx].copyAsyncToHost(stream, msErr); + d_msPar[hit_idx].copyAsyncToHost(stream, msPar); + + + cudaStreamSynchronize(stream); + cudaCheckError(); + //for (int itrack = 0; itrack < NN; ++itrack) + //{ + ////printf("CPU [%d] -- %d : %f\n", itrack, HitsIdx(itrack, 0, 0), Chi2[itrack]); + //} + + cudaFree(d_minChi2); + cudaFree(d_bestHit); +} + +// FIXME: Temporary. Separate allocations / transfers +template +void FitterCU::prepare_addBestHit() { + //const MPlexLS &psErr, const MPlexLV& propPar, + //const MPlexQI &inChg, + //MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + //size_t num_tracks) { + //setNumberTracks(num_tracks); // temporary: should be end - beg + + //createStream(); + //cudaCheckError(); + // psErr -> d_Err_iP + //d_Err_iP.copyAsyncFromHost(stream, psErr); + //d_par_iP.copyAsyncFromHost(stream, propPar); + //d_inChg.copyAsyncFromHost(stream, inChg); +} + +// TODO: Temporary. Separate allocations / transfers +template +void FitterCU::finalize_addBestHit( + MPlexHS *msErr, MPlexHV* msPar, + MPlexLS& Err_iP, MPlexLV& Par_iP, + MPlexQI *HitsIdx, + MPlexQI &Label, + int start_idx, int end_idx) { + d_par_iP.copyAsyncToHost(stream, Par_iP); + d_Err_iP.copyAsyncToHost(stream, Err_iP); + d_Label.copyAsyncToHost(stream, Label); + + // Get msPar, msErr, chi2 and HitIdx out from the GPU to the CPU + for (int hit_idx = start_idx; hit_idx < end_idx; ++hit_idx) { + d_msPar[hit_idx].copyAsyncToHost(stream, msPar[hit_idx]); + d_msErr[hit_idx].copyAsyncToHost(stream, msErr[hit_idx]); + d_HitsIdx[hit_idx].copyAsyncToHost(stream, HitsIdx[hit_idx]); + } +} +#endif diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index f7c5666882ee6..0eaffc2df01f7 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -2,50 +2,36 @@ #define _PROPAGATOR_CU_H_ #include -#include -#include #include "Matrix.h" + +#include "HitStructuresCU.h" +#include "GPlex.h" +#include "GeometryCU.h" +#include "gpu_utils.h" + #include "propagation_kernels.h" #include "kalmanUpdater_kernels.h" #include "computeChi2_kernels.h" #include "index_selection_kernels.h" +#include "best_hit_kernels.h" -#include "HitStructuresCU.h" -#include "GPlex.h" +#include +#include -#define LV 6 -#define QI 1 -#define QF 1 +constexpr int LV = 6; +constexpr int QI = 1; +constexpr int QF = 1; #define LL 36 -#define LS 21 -#define HV 3 -#define HS 6 -#define LH 18 +constexpr int LS = 21; +constexpr int HV = 3; +constexpr int HS = 6; +constexpr int LH = 18; -#define BLOCK_SIZE_X 16 -#define MAX_BLOCKS_X 65535 // CUDA constraint +#define BLOCK_SIZE_X 256 using idx_t = Matriplex::idx_t; -// Macro for checking cuda errors following a cuda launch or api call -// This comes from Jeff Larkins (NVIDIA) -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(0); \ - } \ -} -#if 0 -#define cudaCheckErrorSync() { \ - cudaDeviceSynchronize(); \ - cudaCheckError(); \ -} -#else -#define cudaCheckErrorSync() {} -#endif - void separate_first_call_for_meaningful_profiling_numbers(); template @@ -59,47 +45,54 @@ class FitterCU { void createStream(); void destroyStream(); + cudaStream_t& get_stream() { 
return stream; } - void setNumberTracks(idx_t Ntracks); + void setNumberTracks(const idx_t Ntracks); - void propagationMerged(); - void kalmanUpdateMerged(); + void propagationMerged(const int hit_idx); + void kalmanUpdateMerged(const int hit_idx); void kalmanUpdate_standalone( - const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, - const MPlexHS &msErr, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, int N_proc); + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc); -#if 1 +#if 0 void computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, float *minChi2, int *bestHit, LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF&outChi2, int maxSize, + MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF&outChi2, int maxSize, int hit_idx, int NN); #endif void allocate_extra_addBestHit(); void free_extra_addBestHit(); - void prepare_addBestHit( - const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, - MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - size_t NN); +#if 0 + void prepare_addBestHit(); + //const MPlexLS &psErr, const MPlexLV& propPar, + //const MPlexQI &inChg, + //MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + //size_t NN); void finalize_addBestHit( - MPlexHS &msErr, MPlexHV& msPar, - MPlexLS& Err_iC, MPlexLV& Par_iC, + MPlexHS *msErr, MPlexHV* msPar, MPlexLS& Err_iP, MPlexLV& Par_iP, - MPlexQI &HitsIdx, MPlexQF &Chi2); - void setHitsIdxToZero(); + MPlexQI *HitsIdx, + MPlexQI &Label, + int start_idx, int end_idx); +#endif + void setHitsIdxToZero(const int hit_idx); #if 1 - void addBestHit(LayerOfHitsCU &layer_of_hits_cu, const int ilay, const float radius); + //void addBestHit(EventOfHitsCU& event, const int ilay, const float *radii, int hit_idx); + void addBestHit(EventOfHitsCU& event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu); #endif - void propagateTracksToR(float radius, int N); - void propagateTracksToR_standalone(float radius, int N, - MPlexLS& Err_iC, MPlexLV& par_iC, - MPlexQI& inChg, + void propagateTracksToR(const float radius, const int N); + void propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, + const MPlexQI& inChg, MPlexLS& Err_iP, MPlexLV& Par_iP); // fitting higher order methods @@ -107,6 +100,10 @@ class FitterCU { MPlexHV* msPar, MPlexHS* msErr, int Nhits, std::vector &tracks, int beg, int end, std::vector &layerHits); + void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp); + void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp); private: // N is the actual size, Nalloc should be >= N, as it is intended @@ -125,16 +122,22 @@ class FitterCU { GPlexQF d_msRad; // QF GPlexLL d_errorProp; // LL - GPlexHV d_msPar; - GPlexHS d_msErr; + GPlexHV *d_msPar_arr; // completely on the GPU + GPlexHV d_msPar[MAX_HITS]; // on the CPU, with arrays on the GPU + GPlexHS *d_msErr_arr; + GPlexHS d_msErr[MAX_HITS]; GPlexQI d_XHitPos; // QI : 1D arrary following itracks GPlexQI d_XHitSize; // QI : " " GPlexHitIdx d_XHitArr; GPlexQF d_outChi2; - int *d_HitsIdx; - float *d_Chi2; + GPlexQI *d_HitsIdx_arr; + GPlexQI d_HitsIdx[MAX_HITS]; + GPlexQF d_Chi2; + GPlexQI d_Label; + + int *d_maxSize; // max 
number of tracks for AddBestHit // everything run in a stream so multiple instance of FitterCU can // run concurrently on the GPU. diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index 2892f1e4cfa85..e5c2c48024b8c 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -4,16 +4,9 @@ #include #include +#include "gpu_utils.h" #include "Matrix.h" -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(0); \ - } \ -} - // GPU implementation of a Matriplex-like structure // The number of tracks is the fast dimension and is padded in order to have // consecutive and aligned memory accesses. For cached reads, this result in a diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h new file mode 100644 index 0000000000000..2269f79c6186c --- /dev/null +++ b/mkFit/GeometryCU.h @@ -0,0 +1,18 @@ +#ifndef GEOMETRY_CU_H +#define GEOMETRY_CU_H + +struct GeometryCU { + float *radii; + + void allocate() { + cudaMalloc((void**)&radii, Config::nLayers * sizeof(float)); + } + void deallocate() { + cudaFree(radii); + } + void getRadiiFromCPU(const float *h_radii) { + cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice); + } +}; + +#endif /* ifndef GEOMETRY_CU_H */ diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu index 4a19d7a3e5805..02ef3f92f6804 100644 --- a/mkFit/HitStructuresCU.cu +++ b/mkFit/HitStructuresCU.cu @@ -4,7 +4,7 @@ #include "HitStructuresCU.h" -void LayerOfHitsCU::alloc_hits(int size) { +void LayerOfHitsCU::alloc_hits(const int size) { cudaMalloc((void**)&m_hits, sizeof(Hit)*size); m_capacity = size; } @@ -14,7 +14,7 @@ void LayerOfHitsCU::free_hits() { m_capacity = 0; } -void LayerOfHitsCU::alloc_phi_bin_infos(int nz, int nphi) { +void LayerOfHitsCU::alloc_phi_bin_infos(const int nz, const int nphi) { cudaMalloc((void**)&m_phi_bin_infos, sizeof(PairIntsCU)*nz*nphi); m_nz = nz; } @@ -24,22 +24,24 @@ void LayerOfHitsCU::free_phi_bin_infos() { m_nz = 0; } -void LayerOfHitsCU::copyLayerOfHitsFromCPU(LayerOfHits &layer) { - cudaMemcpy(m_hits, layer.m_hits, sizeof(Hit)*m_capacity, cudaMemcpyHostToDevice); - cudaCheckError(); +void LayerOfHitsCU::copyLayerOfHitsFromCPU(const LayerOfHits &layer, + const cudaStream_t &stream) { + cudaMemcpyAsync(m_hits, layer.m_hits, sizeof(Hit)*m_capacity, + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ m_zmin = layer.m_zmin; m_zmax = layer.m_zmax; m_fz = layer.m_fz; // FIXME: copy other values // TODO: probably quite inefficient: for (int i = 0; i < m_nz; ++i) { - cudaMemcpy(m_phi_bin_infos + i*m_nphi, &(layer.m_phi_bin_infos[i][0]), - sizeof(PairIntsCU)*m_nphi, cudaMemcpyHostToDevice); - cudaCheckError(); + cudaMemcpyAsync(m_phi_bin_infos + i*m_nphi, &(layer.m_phi_bin_infos[i][0]), + sizeof(PairIntsCU)*m_nphi, cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ } } -void EventOfHitsCU::allocGPU(EventOfHits &event_of_hits) { +void EventOfHitsCU::allocGPU(const EventOfHits &event_of_hits) { m_n_layers = event_of_hits.m_n_layers; // Allocate GPU array. 
  // Members' addresses of the array's elements are in GPU space
@@ -55,28 +57,122 @@ void EventOfHitsCU::allocGPU(EventOfHits &event_of_hits) {
         event_of_hits.m_layers_of_hits[i].m_nz,
         event_of_hits.m_layers_of_hits[i].m_nphi);
   }
-  cudaCheckError();
+  /*cudaCheckError();*/
 }
 
 void EventOfHitsCU::deallocGPU() {
   for (int i = 0; i < m_n_layers; ++i) {
-    cudaCheckError();
+    /*cudaCheckError();*/
     m_layers_of_hits_alloc[i].free_hits();
     m_layers_of_hits_alloc[i].free_phi_bin_infos();
-    cudaCheckError();
+    /*cudaCheckError();*/
   }
   cudaFree(m_layers_of_hits);
-  cudaCheckError();
+  /*cudaCheckError();*/
   delete[] m_layers_of_hits_alloc;
 }
 
-void EventOfHitsCU::copyFromCPU(EventOfHits& event_of_hits) {
+void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits,
+                                const cudaStream_t &stream) {
   for (int i = 0; i < event_of_hits.m_n_layers; i++) {
-    m_layers_of_hits_alloc[i].copyLayerOfHitsFromCPU(event_of_hits.m_layers_of_hits[i]);
+    // pass `stream` through, so the per-layer copies queue on the same
+    // stream as the cudaMemcpyAsync below instead of the default stream
+    m_layers_of_hits_alloc[i].copyLayerOfHitsFromCPU(event_of_hits.m_layers_of_hits[i],
+                                                     stream);
   }
-  cudaCheckError();
-  cudaMemcpy(m_layers_of_hits, m_layers_of_hits_alloc,
-             event_of_hits.m_n_layers*sizeof(LayerOfHitsCU),
-             cudaMemcpyHostToDevice);
-  cudaCheckError();
+  /*cudaCheckError();*/
+  cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc,
+                  event_of_hits.m_n_layers*sizeof(LayerOfHitsCU),
+                  cudaMemcpyHostToDevice, stream);
+  /*cudaCheckError();*/
+}
+
+// ============================================================================
+
+void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) {
+  m_real_size = ntracks;
+  m_fill_index = 0;
+
+  cudaMalloc((void**)&m_candidates, sizeof(Track)*m_real_size);
+  /*cudaCheckError();*/
+}
+
+
+void EtaBinOfCandidatesCU::free_tracks() {
+  cudaFree(m_candidates);
+  /*cudaCheckError();*/
+  m_real_size = 0;
+  m_fill_index = 0;
+}
+
+
+void EtaBinOfCandidatesCU::copyFromCPU(const EtaBinOfCandidates &eta_bin,
+                                       const cudaStream_t &stream) {
+  assert (eta_bin.m_fill_index < m_real_size);  // fill index must fit in the allocated capacity
+  m_fill_index = eta_bin.m_fill_index;
+
+  cudaMemcpyAsync(m_candidates, &eta_bin.m_candidates[0],
+                  sizeof(Track)*m_fill_index, cudaMemcpyHostToDevice, stream);
+  /*cudaCheckError();*/
+}
+
+
+void EtaBinOfCandidatesCU::copyToCPU(EtaBinOfCandidates &eta_bin,
+                                     const cudaStream_t &stream) const {
+  assert (eta_bin.m_fill_index < m_real_size);  // fill index must fit in the allocated capacity
+
+  cudaMemcpyAsync(&eta_bin.m_candidates[0], m_candidates,
+                  sizeof(Track)*m_fill_index, cudaMemcpyDeviceToHost, stream);
+  /*cudaCheckError();*/
+}
+
+// ============================================================================
+
+void EventOfCandidatesCU::allocGPU(const EventOfCandidates &event_of_cands) {
+  m_n_etabins = Config::nEtaBin;
+  // Allocate GPU array.
+  // Members' addresses of the array's elements are in GPU space.
+  cudaMalloc((void**)&m_etabins_of_candidates, m_n_etabins*sizeof(EtaBinOfCandidatesCU));
+  /*cudaCheckError();*/
+  // Allocate CPU array.
+  // Members' addresses of the array's elements are in CPU space.
+  // This allows the per-element alloc methods to be called from the host.
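+  //
+  // The same two-array pattern, in miniature (hypothetical ThingCU type,
+  // sketch only):
+  //
+  //   ThingCU *d_arr;                                    // device array
+  //   cudaMalloc((void**)&d_arr, n*sizeof(ThingCU));
+  //   ThingCU *h_arr = new ThingCU[n];                   // host mirror
+  //   for (int i = 0; i < n; ++i) h_arr[i].alloc(cap);   // inner device buffers
+  //   cudaMemcpy(d_arr, h_arr, n*sizeof(ThingCU), cudaMemcpyHostToDevice);
+  //
+  // Kernels then index d_arr[i]; the host keeps h_arr for per-element cleanup.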
+  m_etabins_of_candidates_alloc = new EtaBinOfCandidatesCU[m_n_etabins];
+  for (int i = 0; i < m_n_etabins; ++i) {
+    const EtaBinOfCandidates& h_etabin = event_of_cands.m_etabins_of_candidates[i];
+    m_etabins_of_candidates_alloc[i].alloc_tracks(h_etabin.m_real_size);
+  }
+  /*cudaCheckError();*/
+}
+
+
+void EventOfCandidatesCU::deallocGPU() {
+  for (int i = 0; i < m_n_etabins; ++i) {
+    /*cudaCheckError();*/
+    m_etabins_of_candidates_alloc[i].free_tracks();
+    /*cudaCheckError();*/
+  }
+  cudaFree(m_etabins_of_candidates);
+  /*cudaCheckError();*/
+  delete[] m_etabins_of_candidates_alloc;
+}
+
+
+void EventOfCandidatesCU::copyFromCPU(const EventOfCandidates& event_of_cands,
+                                      const cudaStream_t &stream) {
+  for (int i = 0; i < m_n_etabins; i++) {
+    // pass `stream` through so these copies share the caller's stream
+    m_etabins_of_candidates_alloc[i].copyFromCPU(event_of_cands.m_etabins_of_candidates[i],
+                                                 stream);
+  }
+  /*cudaCheckError();*/
+  cudaMemcpyAsync(m_etabins_of_candidates, m_etabins_of_candidates_alloc,
+                  m_n_etabins*sizeof(EtaBinOfCandidatesCU),
+                  cudaMemcpyHostToDevice, stream);
+  /*cudaCheckError();*/
+}
+
+
+void EventOfCandidatesCU::copyToCPU(EventOfCandidates& event_of_cands,
+                                    const cudaStream_t &stream) const {
+  for (int i = 0; i < m_n_etabins; i++) {
+    m_etabins_of_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_candidates[i],
+                                               stream);
+  }
+  /*cudaCheckError();*/
+  // We do not need to copy the array of EtaBinOfCandidatesCU handles back.
+}
diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h
index 3851dabfc59fc..ea3e9649765f2 100644
--- a/mkFit/HitStructuresCU.h
+++ b/mkFit/HitStructuresCU.h
@@ -3,14 +3,7 @@
 #include "HitStructures.h"
 #include "Config.h"
-
-#define cudaCheckError() { \
-  cudaError_t e=cudaGetLastError(); \
-  if(e!=cudaSuccess) { \
-    printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \
-    exit(0); \
-  } \
-}
+#include "gpu_utils.h"
 
 template
 struct PairCU {
@@ -43,29 +36,40 @@ class LayerOfHitsCU {
   LayerOfHitsCU() {};
   ~LayerOfHitsCU() {};
 
-  void alloc_hits(int size);
+  void alloc_hits(const int size);
   void free_hits();
 
-  void alloc_phi_bin_infos(int nz, int nphi);
+  void alloc_phi_bin_infos(const int nz, const int nphi);
   void free_phi_bin_infos();
 
-  void copyLayerOfHitsFromCPU(LayerOfHits &layer);
+  void copyLayerOfHitsFromCPU(const LayerOfHits &layer,
+                              const cudaStream_t &stream=0);
 
 #ifdef __CUDACC__
   __device__
 #endif
-  int GetZBin(float z) const { return (z - m_zmin) * m_fz; }
+  int GetZBin(const float z) const { return (z - m_zmin) * m_fz; }
 
 #ifdef __CUDACC__
   __device__
 #endif
-  int GetZBinChecked(float z) const { int zb = GetZBin(z); if (zb < 0) zb = 0; else if (zb >= m_nz) zb = m_nz - 1; return zb; }
+  int GetZBinChecked(float z) const {
+    int zb = GetZBin(z);
+    if (zb < 0) {
+      zb = 0;
+    } else if (zb >= m_nz) {
+      zb = m_nz - 1;
+    }
+    return zb;
+  }
 
   // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask
 #ifdef __CUDACC__
   __device__
 #endif
-  int GetPhiBin(float phi) const { return floorf(m_fphi * (phi + Config::PI)); }
+  int GetPhiBin(float phi) const {
+    return floorf(m_fphi * (phi + Config::PI));
+  }
 };
 
 
@@ -81,9 +85,49 @@ class EventOfHitsCU
 
   EventOfHitsCU() : m_n_layers{} {};
 
-  void allocGPU(EventOfHits &event_of_hits);
+  void allocGPU(const EventOfHits &event_of_hits);
+  void deallocGPU();
+  void copyFromCPU(const EventOfHits& event_of_hits,
+                   const cudaStream_t &stream=0);
+};
+
+// ============================================================================
+
+class EtaBinOfCandidatesCU
+{
+public:
+  Track *m_candidates;
+
+  int m_real_size;
+  int m_fill_index;
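+  // m_real_size is the device-side capacity in Tracks; m_fill_index counts
+  // the entries actually filled (mirroring EtaBinOfCandidates on the host).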
+ + void alloc_tracks(const int ntracks); + void free_tracks(); + + void copyFromCPU(const EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0); + void copyToCPU(EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0) const; +}; + + +class EventOfCandidatesCU +{ +public: + EtaBinOfCandidatesCU *m_etabins_of_candidates; // device array + int m_n_etabins; + + // Host array. For allocation and transfer purposes + EtaBinOfCandidatesCU *m_etabins_of_candidates_alloc; + + EventOfCandidatesCU() : m_n_etabins{} {}; + + void allocGPU(const EventOfCandidates &event_of_cands); void deallocGPU(); - void copyFromCPU(EventOfHits& event_of_hits); + void copyFromCPU(const EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0); + void copyToCPU(EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0) const; }; #endif // _HIT_STRUCTURES_H_ diff --git a/mkFit/Makefile b/mkFit/Makefile index ef5b67d1ff8f7..0a352f9ae09a2 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -109,6 +109,11 @@ TTree.h: echo "Using dummy rule for TTree.h" endif +ifdef CUBROOT +cub/util_debug.cuh: + echo "Using dummy rule for cub/util_debug.cuh" +endif + ${MKFOBJS}: %.o: %.cc %.d ${CXX} ${CPPFLAGS} ${CXXFLAGS} ${VEC_HOST} -c -o $@ $< diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc index 0896414abb621..27ecaecfb6766 100644 --- a/mkFit/MkBuilder.cc +++ b/mkFit/MkBuilder.cc @@ -7,6 +7,7 @@ #ifdef USE_CUDA #include "FitterCU.h" +#include "GeometryCU.h" #endif //#define DEBUG #include "Debug.h" @@ -299,18 +300,6 @@ void MkBuilder::find_tracks_load_seeds(EventOfCandidates& event_of_cands) void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) { -#ifdef USE_CUDA - EventOfHitsCU event_of_hits_cu; - event_of_hits_cu.allocGPU(m_event_of_hits); - event_of_hits_cu.copyFromCPU(m_event_of_hits); - - //printf("cpu: %d -- gpu %d\n", sizeof(LayerOfHits), sizeof(LayerOfHitsCU)); - - LayerOfHits& l = m_event_of_hits.m_layers_of_hits[Config::nlayers_per_seed]; - //printf("info %d\n", l.m_phi_bin_infos[0][10].first); - //printf("cpu: %f, %f, %f\n", l.m_zmin, l.m_zmax, l.m_fz); - -#endif tbb::parallel_for(tbb::blocked_range(0, Config::nEtaBin), [&](const tbb::blocked_range& ebins) { @@ -321,13 +310,9 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) [&](const tbb::blocked_range& tracks) { std::unique_ptr mkfp(g_exe_ctx.m_fitters.GetFromPool(), retfitr); -#ifdef USE_CUDA - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - cuFitter.allocate_extra_addBestHit(); -#endif for (int itrack = tracks.begin(); itrack < tracks.end(); itrack += NN) { + int end = std::min(itrack + NN, tracks.end()); dprint(std::endl << "processing track=" << itrack << " etabin=" << ebin << " findex=" << etabin_of_candidates.m_fill_index); @@ -358,26 +343,6 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) //make candidates with best hit dprint("make new candidates"); -#ifdef USE_CUDA - cuFitter.setNumberTracks(end-itrack); - cuFitter.prepare_addBestHit( - mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], - mkfp->Chg, - mkfp->XHitSize, mkfp ->XHitArr, - NN); - - LayerOfHitsCU& layer_of_hits_cu = event_of_hits_cu.m_layers_of_hits_alloc[ilay]; - float radius = (ilay + 1 < Config::nLayers) ? 
m_event->geom_.Radius(ilay+1) : 0.0; - cuFitter.addBestHit(layer_of_hits_cu, ilay, radius); - - cuFitter.finalize_addBestHit( - mkfp->msErr[mkfp->Nhits], mkfp->msPar[mkfp->Nhits], - mkfp->Err[mkfp->iC], mkfp->Par[mkfp->iC], - mkfp->Err[mkfp->iP], mkfp->Par[mkfp->iP], - mkfp->HitsIdx[mkfp->Nhits], mkfp-> Chi2); - - mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) -#else mkfp->AddBestHit(layer_of_hits, end - itrack); mkfp->SetNhits(ilay + 1); //here again assuming one hit per layer (is this needed?) @@ -388,23 +353,75 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands) mkfp->PropagateTracksToR(m_event->geom_.Radius(ilay+1), end - itrack); dcall(post_prop_print(ilay, mkfp.get())); } -#endif - //exit(0); - } // end of layer loop mkfp->OutputFittedTracksAndHitIdx(etabin_of_candidates.m_candidates, itrack, end, true); } -#ifdef USE_CUDA - cuFitter.free_extra_addBestHit(); - cuFitter.freeDevice(); -#endif }); // end of seed loop } }); //end of parallel section over seeds +} + #ifdef USE_CUDA +void MkBuilder::FindTracksBestHit_GPU(EventOfCandidates& event_of_cands) +{ + EventOfHitsCU event_of_hits_cu; + event_of_hits_cu.allocGPU(m_event_of_hits); + event_of_hits_cu.copyFromCPU(m_event_of_hits); + + LayerOfHits& l = m_event_of_hits.m_layers_of_hits[Config::nlayers_per_seed]; + + MkFitter* mkfp = m_mkfp_arr[0]; + + int gplex_size = 1 << 12; + FitterCU cuFitter(gplex_size); + cuFitter.allocateDevice(); + cuFitter.allocate_extra_addBestHit(); + cuFitter.createStream(); + cuFitter.setNumberTracks(gplex_size); + + std::vector radii (Config::nLayers); + for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { + radii[ilay] = m_event->geom_.Radius(ilay); + } + GeometryCU geom_cu; + geom_cu.allocate(); + geom_cu.getRadiiFromCPU(&radii[0]); + + EventOfCandidatesCU event_of_cands_cu; + event_of_cands_cu.allocGPU(event_of_cands); + event_of_cands_cu.copyFromCPU(event_of_cands); + + //for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) { + //EtaBinOfCandidates& etabin_of_candidates = event_of_cands.m_etabins_of_candidates[ebin]; + + //EtaBinOfCandidatesCU &etabin_of_cand_cu = event_of_cands_cu.m_etabins_of_candidates_alloc[ebin]; + + // FIXME: Do we actually need this loop, if FitterCU is as wide as etabin + //for (int itrack = 0; itrack < etabin_of_candidates.m_fill_index; itrack += NN) { + //int end = std::min(itrack + NN, etabin_of_candidates.m_fill_index); + + //cuFitter.setNumberTracks(end-itrack); + //cuFitter.InputTracksAndHitIdx(etabin_of_cand_cu, itrack, end, true); + + cuFitter.addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); + + + //cuFitter.OutputTracksAndHitIdx(etabin_of_cand_cu, itrack, end, true); + //} + //} + + event_of_cands_cu.copyToCPU(event_of_cands); + event_of_cands_cu.deallocGPU(); + + geom_cu.deallocate(); + cuFitter.destroyStream(); + cuFitter.free_extra_addBestHit(); + cuFitter.freeDevice(); event_of_hits_cu.deallocGPU(); -#endif + + mkfp->SetNhits(Config::nLayers); } +#endif //------------------------------------------------------------------------------ diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index 4631b9d609987..eaeeaa03753d2 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -137,6 +137,11 @@ class MkBuilder // -------- void FindTracksBestHit(EventOfCandidates& event_of_cands); +#ifdef USE_CUDA + void FindTracksBestHit_GPU(EventOfCandidates& event_of_cands); + const Event* get_event() const { return m_event; } + const EventOfHits& get_event_of_hits() const { return 
m_event_of_hits; } +#endif void FindTracks(); void FindTracksCloneEngine(); void FindTracksCloneEngineTbb(); diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index ef492f413d4f2..d3806e46488d7 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -202,7 +202,6 @@ void MkFitter::InputTracksAndHitIdx(const std::vector& tracks, int itrack = 0; for (int i = beg; i < end; ++i, ++itrack) { - const Track &trk = tracks[i]; Label(itrack, 0, 0) = trk.label(); @@ -215,7 +214,9 @@ void MkFitter::InputTracksAndHitIdx(const std::vector& tracks, for (int hi = 0; hi < Nhits; ++hi) { - + // MPBL: It does not seem that these values are that dummies + // Not transfering them to the GPU reduces the number of + // nFoundHits in the printouts. HitsIdx[hi](itrack, 0, 0) = trk.getHitIdx(hi);//dummy value for now } @@ -1190,208 +1191,3 @@ void MkFitter::CopyOutParErr(std::vector >& seed_cand_vec, } } -////////////////////////////////// ////////////////////////////////// -// Temporary -#ifdef USE_CUDA -void MkFitter::AddBestHit_gpu (const LayerOfHits &layer_of_hits, FitterCU &cuFitter, - LayerOfHitsCU &layer_of_hits_cu, const int N_proc) -{ -#ifdef USE_CUDA - cuFitter.setHitsIdxToZero(); -#endif - - float minChi2[NN]; - int bestHit[NN]; - // MT: fill_n gave me crap on MIC, NN=8,16, doing in maxSize search below. - // Must be a compiler issue. - // std::fill_n(minChi2, NN, Config::chi2Cut); - // std::fill_n(bestHit, NN, -1); - - const char *varr = (char*) layer_of_hits.m_hits; - - const int off_error = (char*) layer_of_hits.m_hits[0].errArray() - varr; - const int off_param = (char*) layer_of_hits.m_hits[0].posArray() - varr; - - int idx[NN] __attribute__((aligned(64))); - -#ifdef USE_CUDA - MPlexQF outChi2; - int maxSize = 0; - cuFitter.computeChi2gpu(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], - minChi2, bestHit, layer_of_hits_cu, XHitSize, XHitArr, Chi2, HitsIdx[Nhits], - outChi2, maxSize, NN); - cuFitter.kalmanUpdate_standalone(Err[iP], Par[iP], Chg, - msErr[Nhits], msPar[Nhits], Err[iC], Par[iC], N_proc); -#else - int maxSize = 0; - - // Determine maximum number of hits for tracks in the collection. - // At the same time prefetch the first set of hits to L1 and the second one to L2. - for (int it = 0; it < NN; ++it) - { - if (it < N_proc) - { - if (XHitSize[it] > 0) - { -#ifndef NO_PREFETCH - _mm_prefetch(varr + XHitArr.At(it, 0, 0) * sizeof(Hit), _MM_HINT_T0); - if (XHitSize[it] > 1) - { - _mm_prefetch(varr + XHitArr.At(it, 1, 0) * sizeof(Hit), _MM_HINT_T1); - } -#endif - maxSize = std::max(maxSize, XHitSize[it]); - } - } - - idx[it] = 0; - bestHit[it] = -1; - minChi2[it] = Config::chi2Cut; - } - -// Has basically no effect, it seems. -//#pragma noprefetch - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - //fixme what if size is zero??? - -#pragma simd - for (int itrack = 0; itrack < N_proc; ++itrack) - { - if (hit_cnt < XHitSize[itrack]) - { - idx[itrack] = XHitArr.At(itrack, hit_cnt, 0) * sizeof(Hit); - //printf("idx_cpu%d : %d : %d\n", itrack, hit_cnt, idx[itrack]); - } - } -#if defined(MIC_INTRINSICS) - __m512i vi = _mm512_load_epi32(idx); -#endif - -#ifndef NO_PREFETCH - // Prefetch to L2 the hits we'll process after two loops iterations. - // Ideally this would be initiated before coming here, for whole bunch_of_hits.m_hits vector. 
- for (int itrack = 0; itrack < N_proc; ++itrack) - { - if (hit_cnt + 2 < XHitSize[itrack]) - { - _mm_prefetch(varr + XHitArr.At(itrack, hit_cnt+2, 0)*sizeof(Hit), _MM_HINT_T1); - } - } -#endif - -#ifdef NO_GATHER - -#pragma simd - for (int itrack = 0; itrack < N_proc; ++itrack) - { - if (hit_cnt < XHitSize[itrack]) - { - const Hit &hit = layer_of_hits.m_hits[XHitArr.At(itrack, hit_cnt, 0)]; - msErr[Nhits].CopyIn(itrack, hit.errArray()); - msPar[Nhits].CopyIn(itrack, hit.posArray()); - } - } - -#else //NO_GATHER - -#if defined(MIC_INTRINSICS) - msErr[Nhits].SlurpIn(varr + off_error, vi); - msPar[Nhits].SlurpIn(varr + off_param, vi); -#else - //printf("mserr\n"); - msErr[Nhits].SlurpIn(varr + off_error, idx); - //printf("msPar\n"); - msPar[Nhits].SlurpIn(varr + off_param, idx); - - //for (int i = 0; i < msErr[Nhits].kSize; ++i) { - //int itrack = 0; - //printf("cpu -- %d : %f\n", itrack, msErr[Nhits].At(itrack, i, 0)); - //} - -#endif -#endif //NO_GATHER - - //now compute the chi2 of track state vs hit - MPlexQF outChi2; - computeChi2MPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], outChi2, N_proc); - -#ifndef NO_PREFETCH - // Prefetch to L1 the hits we'll process in the next loop iteration. - for (int itrack = 0; itrack < N_proc; ++itrack) - { - if (hit_cnt + 1 < XHitSize[itrack]) - { - _mm_prefetch(varr + XHitArr.At(itrack, hit_cnt+1, 0)*sizeof(Hit), _MM_HINT_T0); - } - } -#endif - - //update best hit in case chi2= 0) - { - const Hit &hit = layer_of_hits.m_hits[ bestHit[itrack] ]; - const float chi2 = minChi2[itrack]; - - dprint("ADD BEST HIT FOR TRACK #" << itrack << std::endl - << "prop x=" << Par[iP].ConstAt(itrack, 0, 0) << " y=" << Par[iP].ConstAt(itrack, 1, 0) << std::endl - << "copy in hit #" << bestHit[itrack] << " x=" << hit.position()[0] << " y=" << hit.position()[1]); - - msErr[Nhits].CopyIn(itrack, hit.errArray()); - msPar[Nhits].CopyIn(itrack, hit.posArray()); - Chi2(itrack, 0, 0) += chi2; - HitsIdx[Nhits](itrack, 0, 0) = bestHit[itrack]; - } - else - { - dprint("ADD FAKE HIT FOR TRACK #" << itrack); - - msErr[Nhits].SetDiagonal3x3(itrack, 666); - msPar[Nhits](itrack,0,0) = Par[iP](itrack,0,0); - msPar[Nhits](itrack,1,0) = Par[iP](itrack,1,0); - msPar[Nhits](itrack,2,0) = Par[iP](itrack,2,0); - HitsIdx[Nhits](itrack, 0, 0) = -1; - - // Don't update chi2 - } - //printf("CPU [%d] -- %d : %f\n", itrack, HitsIdx[Nhits](itrack, 0, 0), Chi2[itrack]); - } - - //now update the track parameters with this hit (note that some calculations are already done when computing chi2... not sure it's worth caching them?) 
- dprint("update parameters"); - updateParametersMPlex(Err[iP], Par[iP], Chg, msErr[Nhits], msPar[Nhits], - Err[iC], Par[iC], N_proc); - //std::cout << "Par[iP](0,0,0)=" << Par[iP](0,0,0) << " Par[iC](0,0,0)=" << Par[iC](0,0,0)<< std::endl; -#endif // USE_CUDA -} -#endif // USE_CUDA diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index 8b48f04350a62..c6fa30123a62b 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -99,10 +99,6 @@ struct MkFitter void SelectHitIndices(const LayerOfHits &layer_of_hits, const int N_proc, bool dump=false); -#ifdef USE_CUDA - void AddBestHit_gpu (const LayerOfHits &layer_of_hits, FitterCU &cuFitter, - LayerOfHitsCU &layer_of_hits_cu, const int N_proc); -#endif void AddBestHit (const LayerOfHits &layer_of_hits, const int N_proc); void FindCandidates(const LayerOfHits &layer_of_hits, std::vector >& tmp_candidates, diff --git a/mkFit/accessors_cu.h b/mkFit/accessors_cu.h new file mode 100644 index 0000000000000..f2bb9d9b07c24 --- /dev/null +++ b/mkFit/accessors_cu.h @@ -0,0 +1,40 @@ +#ifndef ACCESSORS_CU_H +#define ACCESSORS_CU_H + +template <> +__device__ float* SVector3::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SVector6::ArrayCU() { + return fArray; +} + +template <> +__device__ float* ROOT::Math::MatRepSym::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SMatrixSym66::ArrayCU() { + return fRep.ArrayCU(); +} + +__device__ float *Hit::posArrayCU() { + return state_.pos_.ArrayCU(); +} + +__device__ float *Hit::errArrayCU() { + return state_.err_.ArrayCU(); +} + +__device__ float *Track::posArrayCU() { + return state_.parameters.ArrayCU(); +} + +__device__ float *Track::errArrayCU() { + return state_.errors.ArrayCU(); +} + +#endif /* ifndef ACCESSORS_CU_H */ diff --git a/mkFit/array_algorithms_cu.h b/mkFit/array_algorithms_cu.h new file mode 100644 index 0000000000000..439a0a10c6710 --- /dev/null +++ b/mkFit/array_algorithms_cu.h @@ -0,0 +1,59 @@ +#ifndef ARRAY_ALGORITHMS_CU_H +#define ARRAY_ALGORITHMS_CU_H + +#include +#include "atomic_utils.h" +#include "gpu_utils.h" + +template +__device__ void reduceMax_fn(const T *d_in, const int in_size, T *d_max) { + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + // Shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + + for (int i = threadIdx.x + blockIdx.x*blockDim.x; + i < in_size; + i += blockDim.x * gridDim.x) { + // Per-thread tile data + T data[ITEMS_PER_THREAD]; + int block_offset = i - threadIdx.x; + cub::LoadDirectStriped(threadIdx.x, + d_in + block_offset, //blockIdx.x*blockDim.x, + data); + // Compute sum for a single thread block + T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max()); + + // FIXME: Is reduction over block enough, or do we need device-wise reduction + // CPU code reduces on NN (typically 8 or 16) values. So block-wise should + // be good enough. 
+ // Device-wise reductions with atomics are performance killers + //if (threadIdx.x == 0) { + //AtomicMax(d_max, aggregate); + //} + *d_max = aggregate; + } +} + +template +__global__ void reduceMax_kernel(T *d_in, int in_size, T *d_max) { + reduceMax_fn(d_in, in_size, d_max); +} + +template +void max_element_wrapper(T *d_v, int num_elems, T *d_max) { + dim3 block (BLOCK_THREADS, 1, 1); + dim3 grid (std::min(max_blocks_x, (num_elems-1)/BLOCK_THREADS + 1), 1, 1) ; + reduceMax_kernel + <<< grid, block >>> + (d_v, num_elems, d_max); +} +#endif /* ifndef ARRAY_ALGORITHMS_CU_H */ diff --git a/mkFit/atomic_utils.h b/mkFit/atomic_utils.h new file mode 100644 index 0000000000000..c5f82d99bc50d --- /dev/null +++ b/mkFit/atomic_utils.h @@ -0,0 +1,102 @@ +#ifndef ATOMIC_UTILS_H +#define ATOMIC_UTILS_H + +/* Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// AtomicMAX code from: +// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu + +#define uint64 unsigned long long + + template +__device__ void AtomicMax(T * const address, const T value) +{ + atomicMax(address, value); +} + +/** + * @brief Compute the maximum of 2 single-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(float * const address, const float value) +{ + if (* address >= value) + { + return; + } + + int * const address_as_i = (int *)address; + int old = * address_as_i, assumed; + + do + { + assumed = old; + if (__int_as_float(assumed) >= value) + { + break; + } + // The value stored at address_as_i might have changed since it has been loaded + // into old and into assumed. 
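+    // atomicCAS writes __float_as_int(value) only if *address_as_i still
+    // equals `assumed`; either way it returns what it found there, so a
+    // failed swap feeds the freshly observed value into the next re-test.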
+ old = atomicCAS(address_as_i, assumed, __float_as_int(value)); + } while (assumed != old); // the other threads did not change address_as_i, so the + // atomicCAS return action makes sense. +} + +/** + * @brief Compute the maximum of 2 double-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(double * const address, const double value) +{ + if (* address >= value) + { + return; + } + + uint64 * const address_as_i = (uint64 *)address; + uint64 old = * address_as_i, assumed; + + do + { + assumed = old; + if (__longlong_as_double(assumed) >= value) + { + break; + } + + old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); + } while (assumed != old); +} + +#endif /* ifndef ATOMIC_UTILS_H */ diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu new file mode 100644 index 0000000000000..1ef5b797acadd --- /dev/null +++ b/mkFit/best_hit_kernels.cu @@ -0,0 +1,301 @@ +#include "best_hit_kernels.h" + +#include "computeChi2_kernels.h" +#include "index_selection_kernels.h" +#include "reorganize_gplex.h" +#include "array_algorithms_cu.h" +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +#include +#include +#include +#include + +#define BLOCK_SIZE_X 256 + + +__device__ void getNewBestHitChi2_fn( + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const float *outChi2, float &minChi2, + int &bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + if (hit_cnt < XHitSize[itrack]) { + float chi2 = fabs(outChi2[itrack]); + if (chi2 < minChi2) { + minChi2 = chi2; + bestHit = XHitArr(itrack, hit_cnt, 0); + } + } + } +} + + +__global__ void getNewBestHitChi2_kernel( + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const float *outChi2, float *minChi2, + int *bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, minChi2[itrack], bestHit[itrack], hit_cnt, N); + } +} + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> + (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N); +} + + +void fill_array_cu(float *array, const int size, const float value) { + thrust::device_ptr d_ptr(array); + thrust::fill(d_ptr, d_ptr + size, value); +} + + +__device__ void updateTracksWithBestHit_fn(Hit *hits, + const float minChi2, const int bestHit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + if (bestHit >= 0) + { + Hit &hit = hits[ bestHit ]; + const float &chi2_local = minChi2; + + for (int i = 0; i < msErr.kSize; ++i) { + msErr(itrack, i, 0) = hit.errArrayCU()[i]; + } + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = hit.posArrayCU()[i]; + } + Chi2[itrack] += chi2_local; + HitsIdx[itrack] = bestHit; + } + else + { + /*msErr[Nhits].SetDiagonal3x3(itrack, 
666);*/ + msErr(itrack, 0, 0) = 666; + msErr(itrack, 1, 0) = 0; + msErr(itrack, 2, 0) = 666; + msErr(itrack, 3, 0) = 0; + msErr(itrack, 4, 0) = 0; + msErr(itrack, 5, 0) = 666; + + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = propPar(itrack, i, 0); + } + HitsIdx[itrack] = -1; + // Don't update chi2 + } + } +} + + +__global__ void updateTracksWithBestHit_kernel(Hit *hits, + const float *minChi2, const int *bestHit, + GPlexHS msErr, GPlexHV msPar, const GPlexLV propPar, + GPlexQF Chi2, GPlexQI HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + updateTracksWithBestHit_fn + (hits, minChi2[itrack], bestHit[itrack], + msErr, msPar, propPar, Chi2, HitsIdx, N); + } +} + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N); +} + + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N) { + thrust::device_ptr d_ptr(d_XHitSize.ptr); + int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); + maxSize = std::min(maxSize, Config::maxHitsConsidered); + + return maxSize; +} + + +__device__ void bestHit_fn( + Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int N) { + + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + int bestHit_reg = -1; + float minChi2_reg = 15.f; + + if (itrack < N) + HitsIdx[itrack] = 0; + + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) + { + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); +#if 0 + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} +#endif + computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, N); + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); + } + updateTracksWithBestHit_fn + (hits, + minChi2_reg, bestHit_reg, + msErr, msPar, propPar, + Chi2, HitsIdx, + N); +} + + +__global__ void bestHit_kernel( + Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, + GPlexQF Chi2, GPlexQI HitsIdx, + const int maxSize, const int N) { + bestHit_fn(hits, XHitSize, XHitArr, + propErr, msErr, msPar, + propPar, outChi2, + Chi2, HitsIdx, + maxSize, N); +} + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + bestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, XHitSize, XHitArr, + propErr, msErr, msPar, propPar, outChi2, + /*propErr.ptr, propErr.stride,*/ + 
    /*msErr.ptr, msErr.stride, msErr.kSize,*/
+       /*msPar.ptr, msPar.stride, msPar.kSize,*/
+       /*outChi2.ptr, outChi2.stride,*/
+       Chi2, HitsIdx,
+       maxSize, N);
+}
+
+
+template
+__global__ void findBestHit_kernel(LayerOfHitsCU *layers,
+    EtaBinOfCandidatesCU *etabin_of_cands,
+    GPlexQI XHitSize, GPlexHitIdx XHitArr,
+    GPlexLS Err_iP, GPlexLV Par_iP,
+    GPlexHS *msErr_arr, GPlexHV *msPar_arr,
+    GPlexLS Err_iC, GPlexLV Par_iC,
+    GPlexQF outChi2,
+    GPlexQF Chi2, GPlexQI *HitsIdx_arr,
+    GPlexQI inChg, GPlexQI Label, GeometryCU geom,
+    int *maxSize, int gplex_size) {
+  for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) {
+    for (int itrack = 0; itrack < etabin_of_cands[ebin].m_fill_index; itrack += gplex_size) {
+      int end = min(itrack + gplex_size, etabin_of_cands[ebin].m_fill_index);
+      int N = end - itrack;
+
+      if (threadIdx.x + blockDim.x * blockIdx.x < N) {
+
+        InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP,
+                         inChg, Chi2, Label, HitsIdx_arr, itrack, end, N);
+
+        for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay)
+        {
+          int hit_idx = ilay;
+          GPlexHS &msErr = msErr_arr[hit_idx];
+          GPlexHV &msPar = msPar_arr[hit_idx];
+          GPlexQI &HitsIdx = HitsIdx_arr[hit_idx];
+
+          float *radii = geom.radii;
+
+          LayerOfHitsCU &layer = layers[ilay];
+
+          int maxSize_block;
+          selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, N);
+          // FIXME: Is reduction over block enough, or do we need device-wise reduction
+          reduceMax_fn
+            (XHitSize.ptr, XHitSize.N, &maxSize_block);
+          bestHit_fn(layer.m_hits, XHitSize, XHitArr,
+                     Err_iP, msErr, msPar,
+                     Par_iP, outChi2,
+                     Chi2, HitsIdx,
+                     maxSize_block, N);
+          kalmanUpdate_fn(Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, N);
+          if (ilay+1 < Config::nLayers) {
+            float radius = radii[ilay+1];
+            propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, N);
+          }
+        }
+        OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates,
+                          Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, itrack, end, N);
+      }
+    }
+  }
+}
+
+
+void findBestHit_wrapper(cudaStream_t &stream,
+    LayerOfHitsCU *layers,
+    EventOfCandidatesCU &event_of_cands_cu,
+    GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
+    GPlexLS &Err_iP, GPlexLV &Par_iP,
+    GPlexHS *msErr, GPlexHV *msPar,
+    GPlexLS &Err_iC, GPlexLV &Par_iC,
+    GPlexQF &outChi2,
+    GPlexQF &Chi2, GPlexQI *HitsIdx,
+    GPlexQI &inChg, GPlexQI &Label,
+    GeometryCU &geom,
+    int *maxSize, int N) {
+  /*int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,*/
+                        /*max_blocks_x);*/
+  int gridx = (N-1)/BLOCK_SIZE_X + 1;
+  dim3 grid(gridx, 1, 1);
+  dim3 block(BLOCK_SIZE_X, 1, 1);
+
+  if (gridx > max_blocks_x) {
+    throw std::runtime_error("The matriplex size should be chosen such "
+        "that gplex_size <= max_blocks_x * BLOCK_SIZE_X");
+  }
+  // The in-kernel loop over tracks takes care of the case where there are
+  // more tracks than available threads.
+  // We should not actually throw here; instead, the GPlex should be
+  // allocated with a smaller size in MkFitter.
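+  //
+  // A gentler fallback (sketch only, not wired in): cap the grid and let a
+  // grid-stride loop cover the remainder, as reduceMax_fn already does:
+  //   int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, max_blocks_x);
+  //   // ...with the kernel then striding by blockDim.x * gridDim.x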
+ + findBestHit_kernel <<< grid, block, 0, stream >>> + (layers, event_of_cands_cu.m_etabins_of_candidates, + XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, + Err_iC, Par_iC, outChi2, Chi2, HitsIdx, inChg, Label, geom, maxSize, N); +} diff --git a/mkFit/best_hit_kernels.h b/mkFit/best_hit_kernels.h new file mode 100644 index 0000000000000..8cf6a421820fe --- /dev/null +++ b/mkFit/best_hit_kernels.h @@ -0,0 +1,49 @@ +#ifndef BEST_HIT_KERNELS_H +#define BEST_HIT_KERNELS_H + + +#include "GPlex.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N); + + +void fill_array_cu(float *array, const int size, const float value); + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI& HitsIdx, const int N); + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N); + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI& HitsIdx, + const int maxSize, const int N); + + +void findBestHit_wrapper(cudaStream_t &stream, + LayerOfHitsCU *layers, + EventOfCandidatesCU &event_of_cands_cu, + GPlexQI &XHitSize, GPlexHitIdx &XHitArr, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &Par_iC, + GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI *HitsIdx, + GPlexQI &inChg, GPlexQI &Label, + GeometryCU &geom, + int *maxSize, int N); + + +#endif /* ifndef BEST_HIT_KERNELS_H */ diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index 690ef8995b3bb..f801c791317ae 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -8,6 +8,7 @@ #include "MkBuilder.h" #include "FitterCU.h" +#include "BuilderCU.h" #include @@ -74,7 +75,7 @@ double runBuildingTestPlexBestHit(Event& ev) { MkBuilder builder; - std::cout << "Building event...\n"; + std::cerr << "Building event...\n"; builder.begin_event(&ev, 0, __func__); builder.fit_seeds_tbb(); @@ -86,10 +87,20 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_resume(); #endif +#if USE_CUDA + BuilderCU builder_cu(builder.get_event_of_hits(), builder.get_event(), + event_of_cands); +#endif + double time = dtime(); std::cout << "Finding best hits...\n"; +#if USE_CUDA + builder_cu.FindTracksBestHit(event_of_cands); + //builder.FindTracksBestHit_GPU(event_of_cands); +#else builder.FindTracksBestHit(event_of_cands); +#endif time = dtime() - time; diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu index 29f32205d1be5..dabc27681c98f 100644 --- a/mkFit/computeChi2_kernels.cu +++ b/mkFit/computeChi2_kernels.cu @@ -1,47 +1,21 @@ -#include -#include -#include "GPlex.h" -#include "kalmanUpdater_kernels.h" #include "computeChi2_kernels.h" -#include -#include -#include -#include +#include "GPlex.h" +#include "kalmanUpdater_kernels.h" +#include "gpu_utils.h" -#include "HitStructuresCU.h" -#include "BinInfoUtils.h" -#include "Hit.h" +#include #define L 6 #define HS 6 #define HV 3 -#define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint - - -template <> -__device__ float* SVector3::ArrayCU() { - return fArray; -} +#define 
BLOCK_SIZE_X 256 -template <> -__device__ float* SVector6::ArrayCU() { - return fArray; -} - -__device__ float *Hit::posArrayCU() { - return state_.pos_.ArrayCU(); -} - -__device__ float *Hit::errArrayCU() { - return state_.err_.ArrayCU(); -} __device__ void chi2Similarity_fn( - GPlexReg2V &a, - GPlexReg2S &c, // in registers - float *d, size_t dN) { + const GPlexReg2V &a, + const GPlexReg2S &c, // in registers + float *d, const size_t dN) { int n = threadIdx.x + blockIdx.x * blockDim.x; @@ -56,9 +30,10 @@ __device__ void chi2Similarity_fn( + 2*( c[1]*a[1]*a[0]); } + __device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 - float r01,//r01 - GPlexRegHV &a ,//res_glo + const float r01,//r01 + const GPlexRegHV &a ,//res_glo GPlexReg2V &b )//res_loc { @@ -68,9 +43,10 @@ __device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 b[1] = a[2]; } -__device__ void ProjectResErr_fn(float a00, - float a01, - GPlexRegHS &b, + +__device__ void ProjectResErr_fn(const float a00, + const float a01, + const GPlexRegHS &b, GPlexRegHH &c) { // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym @@ -87,8 +63,9 @@ __device__ void ProjectResErr_fn(float a00, c[ 8] = a01*b[ 3] - a00*b[ 4]; } -__device__ void ProjectResErrTransp_fn(float a00, - float a01, GPlexRegHH &b, GPlexReg2S &c) + +__device__ void ProjectResErrTransp_fn(const float a00, + const float a01, const GPlexRegHH &b, GPlexReg2S &c) { // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3 @@ -98,9 +75,10 @@ __device__ void ProjectResErrTransp_fn(float a00, c[ 2] = b[ 5]; } + __device__ void computeChi2_fn( - GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, - GPlexQF &outChi2, const int N) { + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int N) { int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; /*float resErr_reg[HS]; // ~ resErr_glo*/ @@ -144,9 +122,10 @@ __device__ void computeChi2_fn( } } + __global__ void computeChi2_kernel( - GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, GPlexLV propPar, - GPlexQF outChi2, const int N) { + const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, const int N) { int itrack = threadIdx.x + blockDim.x*blockIdx.x; if (itrack < N) { computeChi2_fn @@ -155,434 +134,15 @@ __global__ void computeChi2_kernel( } } + void computeChi2_wrapper(cudaStream_t &stream, - GPlexLS &propErr, GPlexHS &msErr, // GPlex resErr, - GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, + const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); computeChi2_kernel <<< grid, block, 0, stream >>> (propErr, msErr, msPar, propPar, outChi2, N); } - -template -__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, - const char *arr, int *vi, int N) { - int j = threadIdx.x + blockDim.x * blockIdx.x; - if (j -__device__ void SlurpInIdx_fn(GPlexObj to, // float *fArray, int stride, int kSize, - const char *arr, int idx, int N) { - int j = threadIdx.x + blockDim.x * blockIdx.x; - if (j>> - (msErr, msPar, layer.m_hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); - cudaDeviceSynchronize(); -#endif -} -#endif - - -__device__ void getNewBestHitChi2_fn( - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - float *outChi2, float &minChi2, - int &bestHit, 
int hit_cnt, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - - if (itrack < N) { - if (hit_cnt < XHitSize[itrack]) { - float chi2 = fabs(outChi2[itrack]); - if (chi2 < minChi2) { - minChi2 = chi2; - /*bestHit = hit_cnt;*/ - bestHit = XHitArr(itrack, hit_cnt, 0); - } - } - } -} - -__global__ void getNewBestHitChi2_kernel( - GPlexQI XHitSize, GPlexHitIdx XHitArr, - float *outChi2, float *minChi2, - int *bestHit, int hit_cnt, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, minChi2[itrack], bestHit[itrack], hit_cnt, N); - } -} - -void getNewBestHitChi2_wrapper(cudaStream_t &stream, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); -#if 1 - getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> - (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N); -#endif -} - -void fill_array_cu(float *array, int size, float value) { - thrust::device_ptr d_ptr(array); - thrust::fill(d_ptr, d_ptr + size, value); -} - - -__device__ void updateTracksWithBestHit_fn(Hit *hits, - float minChi2, int bestHit, - GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, - float *Chi2, int *HitsIdx, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - if (bestHit >= 0) - { - Hit &hit = hits[ bestHit ]; - float &chi2_local = minChi2; - - for (int i = 0; i < msErr.kSize; ++i) { - msErr(itrack, i, 0) = hit.errArrayCU()[i]; - } - for (int i = 0; i < msPar.kSize; ++i) { - msPar(itrack, i, 0) = hit.posArrayCU()[i]; - } - Chi2[itrack] += chi2_local; - HitsIdx[itrack] = bestHit; - } - else - { - /*msErr[Nhits].SetDiagonal3x3(itrack, 666);*/ - msErr(itrack, 0, 0) = 666; - msErr(itrack, 1, 0) = 0; - msErr(itrack, 2, 0) = 666; - msErr(itrack, 3, 0) = 0; - msErr(itrack, 4, 0) = 0; - msErr(itrack, 5, 0) = 666; - - for (int i = 0; i < msPar.kSize; ++i) { - msPar(itrack, i, 0) = propPar(itrack, i, 0); - } - HitsIdx[itrack] = -1; - // Don't update chi2 - } - } -} - -__global__ void updateTracksWithBestHit_kernel(Hit *hits, float *minChi2, int *bestHit, - GPlexHS msErr, GPlexHV msPar, GPlexLV propPar, float *Chi2, int *HitsIdx, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - updateTracksWithBestHit_fn - (hits, minChi2[itrack], bestHit[itrack], - msErr, msPar, propPar, Chi2, HitsIdx, N); - } -} - -#if 1 -void updateTracksWithBestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU &layer, float *minChi2, int *best_hit, - GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, - float *Chi2, int *HitsIdx, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> - (layer.m_hits, minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N); -} -#endif - -int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N) { - thrust::device_ptr d_ptr(d_XHitSize.ptr); - int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); - maxSize = std::min(maxSize, Config::maxHitsConsidered); - - return maxSize; -} - -__global__ void bestHit_kernel( - Hit *hits, GPlexQI XHitSize, GPlexHitIdx XHitArr, - GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, - GPlexLV propPar, GPlexQF outChi2, - float *Chi2, int *HitsIdx, - int maxSize, int N) { - - int itrack = threadIdx.x + 
blockDim.x*blockIdx.x; - int bestHit_reg = -1; - float minChi2_reg = 15.f; - - if (itrack < N) - HitsIdx[itrack] = 0; - - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); -#if 0 - // TODO: add CMSGeom - if (Config::useCMSGeom) { - //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - } else {} -#endif - computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, N); - getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); - } - updateTracksWithBestHit_fn - (hits, - minChi2_reg, bestHit_reg, - msErr, msPar, propPar, - Chi2, HitsIdx, - N); -} - -#if 1 -void bestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU &layer, GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - GPlexLV &propPar, GPlexQF &outChi2, - float *Chi2, int *HitsIdx, - int maxSize, int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - bestHit_kernel <<< grid, block, 0, stream >>> - (layer.m_hits, XHitSize, XHitArr, - propErr, msErr, msPar, propPar, outChi2, - /*propErr.ptr, propErr.stride,*/ - /*msErr.ptr, msErr.stride, msErr.kSize,*/ - /*msPar.ptr, msPar.stride, msPar.kSize,*/ - /*outChi2.ptr, outChi2.stride,*/ - Chi2, HitsIdx, - maxSize, N); -} -#endif - -__global__ void selectHitRanges_kernel(Hit *hits, - int *phi_bin_infos_first, int *phi_bin_infos_second, int bunch_fill_index, - GPlexQI XHitPos, GPlexQI XHitSize, GPlexLS Err, GPlexLV Par, - bool useCMSGeom, int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - // must store hit vector into a data member so it can be used in hit selection. - // or ... can have it passed into other functions. - // somewhat yucky, either way. - - // Also, must store two ints per Matriplex elements ... first index and size. - // These are XPos and XSize - - /*const int iI = iP;*/ - // Hmmh ... this should all be solved by partitioning ... let's try below ... - // - // float eta = getEta(eta_predx,eta_predy,eta_predz); - // //protect against anomalous eta (should go into getEtaPartition maybe?) - // if (fabs(eta) > etaDet) eta = (eta>0 ? etaDet*0.99 : -etaDet*0.99); - // unsigned int etabin = getEtaPartition(eta,etaDet); - - const float predx = Par(itrack, (0*1 + 0), 0); // Par[iI].ConstAt(itrack, 0, 0); - const float predy = Par(itrack, (1*1 + 0), 0); // Par[iI].ConstAt(itrack, 1, 0); - const float predz = Par(itrack, (2*1 + 0), 0); // Par[iI].ConstAt(itrack, 2, 0); - - float phi = getPhi(predx,predy); - - const float px2py2 = predx*predx+predy*predy; // predicted radius^2 - const float dphidx = -predy/px2py2; - const float dphidy = predx/px2py2; - // const float dphi2 = dphidx*dphidx*(Err[iI].ConstAt(itrack, 0, 0) /*propState.errors.At(0,0)*/) + - // dphidy*dphidy*(Err[iI].ConstAt(itrack, 1, 1) /*propState.errors.At(1,1)*/) + - // 2 * dphidx*dphidy*(Err[iI].ConstAt(itrack, 0, 1) /*propState.errors.At(0,1)*/); - const float dphi2 = dphidx*dphidx*Err(itrack, 0, 0) + - dphidy*dphidy*Err(itrack, 2, 0) + - 2 * dphidx*dphidy*Err(itrack, 1, 0); - - const float dphi = sqrtf(fabs(dphi2));//how come I get negative squared errors sometimes? MT -- how small? 
- const float nSigmaDphi = fminf(fmaxf(Config::nSigma*dphi, Config::minDPhi), Config::PI); - //const float nSigmaDphi = Config::nSigma*dphi; - - float dPhiMargin = 0.; - if (useCMSGeom) { - //now correct for bending and for layer thickness unsing linear approximation - /*const float predpx = Par[iP].ConstAt(itrack, 3, 0);*/ - /*const float predpy = Par[iP].ConstAt(itrack, 4, 0);*/ - const float predpx = Par(itrack, (3*1 + 0), 0); - const float predpy = Par(itrack, (4*1 + 0), 0); - float deltaR = Config::cmsDeltaRad; //fixme! using constant vale, to be taken from layer properties - float radius = sqrt(px2py2); - float pt = sqrt(predpx*predpx + predpy*predpy); - float cosTheta = ( predx*predpx + predy*predpy )/(pt*radius); - float hipo = deltaR/cosTheta; - float dist = sqrt(hipo*hipo - deltaR*deltaR); - dPhiMargin = dist/radius; - } - const float dphiMinus = normalizedPhi(phi-nSigmaDphi-dPhiMargin); - const float dphiPlus = normalizedPhi(phi+nSigmaDphi+dPhiMargin); -// FIXME ^ OK - -#ifdef DEBUG - std::ostringstream xout; - bool xout_dump = false; - xout << "--------------------------------------------------------------------------------\n"; - xout << "phi = " << phi << ", dphiMinus = " << dphiMinus << ", dphiPlus = " << dphiPlus << std::endl; - xout << "dphi = " << dphi << ", dphi2 = " << dphi2 << ", nSigmaDphi = " << nSigmaDphi << ", nSigma = " << Config::nSigma << std::endl; -#endif - - int phiBinMinus = getPhiPartition(dphiMinus); - int phiBinPlus = getPhiPartition(dphiPlus); - -#ifdef DEBUG - xout << "phiBinMinus = " << phiBinMinus << ", phiBinPlus = " << phiBinPlus << std::endl; -#endif - - // XXXX are these checks really needed? - phiBinMinus = fmaxf(0,phiBinMinus); - phiBinMinus = fminf(Config::nPhiPart-1,phiBinMinus); - phiBinPlus = fmaxf(0,phiBinPlus); - phiBinPlus = fminf(Config::nPhiPart-1,phiBinPlus); - - //PhiBinInfo_t binInfoMinus = bunch_of_hits.m_phi_bin_infos[phiBinMinus]; - //PhiBinInfo_t binInfoPlus = bunch_of_hits.m_phi_bin_infos[phiBinPlus]; - int binInfoMinus_first = phi_bin_infos_first[phiBinMinus]; - int binInfoMinus_second = phi_bin_infos_second[phiBinMinus]; - int binInfoPlus_first = phi_bin_infos_first[phiBinPlus]; - int binInfoPlus_second = phi_bin_infos_second[phiBinPlus]; - - - /*if (binInfoPlus.first + binInfoPlus.second - binInfoMinus.first > Config::maxHitsConsidered)*/ - if (binInfoPlus_first + binInfoPlus_second - binInfoMinus_first > Config::maxHitsConsidered) - { - // XXXX - // Do something smart to reduce the range. - // I'd go for taking the exact phi bin and then walking left and right ... - // but this gives the wrap-around problem again. - } - - // XXXX - // Hmmh ... maybe the copying of extras should be done on demand. - // BunchOfHits could know how many extras it has already. - // Or Giuseppe is right ... and we should just update the index vector for SlurpIn - // instead of shifting of the base address as is done now. Sigh. - - // fixme: temporary to avoid wrapping - // This is now fixed with Config::maxHitsConsidered extra hits copied to the end + - // changing XHitBegin/End to XHitPos/Size. 
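// Note: the fix described above appends a copy of the first
// Config::maxHitsConsidered hits to the end of the per-layer hit array, so a
// phi range that crosses the 2*pi seam can still be read as one contiguous
// [XHitPos, XHitPos + XHitSize) block; the negative-size branch below then
// compensates for ranges landing in the duplicated tail. A host-side sketch
// of the padding step (hypothetical names, not part of the patch):

#include <algorithm>
#include <vector>

template <typename HitT>
std::vector<HitT> padForPhiWrap(const std::vector<HitT> &hits,
                                int maxHitsConsidered)
{
  std::vector<HitT> padded(hits);
  const size_t n_copy = std::min<size_t>(maxHitsConsidered, hits.size());
  // Duplicate the leading hits past the end: a wrapped range whose end index
  // exceeds hits.size() reads the duplicates instead of wrapping around.
  padded.insert(padded.end(), hits.begin(), hits.begin() + n_copy);
  return padded;
}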
- // Putting all of it into DEBUG -#ifdef DEBUG - if (binInfoMinus > binInfoPlus) - { - // xout_dump = true; - xout << "FIXER IN: phiBinMinus = " << phiBinMinus << ", phiBinPlus = " << phiBinPlus << std::endl; - xout << "FIXER IN: BIMinus.first = " << binInfoMinus.first << ", BIPlus.first = " << binInfoPlus.first << std::endl; - xout << "FIXER IN: BIMinus.second = " << binInfoMinus.second << ", BIPlus.second = " << binInfoPlus.second << std::endl; - - int phibin = getPhiPartition(phi); - - xout << "FIXER : phibin = " << phibin << std::endl; - - // XXXX are those two really needed? - phibin = std::max(0,phibin); - phibin = std::min(Config::nPhiPart-1,phibin); - - xout << "FIXER : phibin = " << phibin << std::endl; - } -#endif - - XHitPos[itrack] = binInfoMinus_first; - XHitSize[itrack] = binInfoPlus_first + binInfoPlus_second - binInfoMinus_first; - if (XHitSize[itrack] < 0) - { - // XXX It would be nice to have BunchOfHits.m_n_real_hits. - /*XHitSize[itrack] += bunch_of_hits.m_fill_index - Config::maxHitsConsidered;*/ - XHitSize[itrack] += bunch_fill_index - Config::maxHitsConsidered; - } - - // XXXX Hack to limit N_hits to maxHitsConsidered. - // Should at least take hits around central bin -- to be explored, esp. with jet presence. - // Strange ... this is worse than just taking first 25 hits !!! - // Comment out for now. Must talk to Giuseppe about this. - // if (XHitSize.At(itrack, 0, 0) > Config::maxHitsConsidered) - // { - // xout_dump = true; - // XHitPos .At(itrack, 0, 0) += (XHitSize.At(itrack, 0, 0) - Config::maxHitsConsidered) / 2; - // XHitSize.At(itrack, 0, 0) = Config::maxHitsConsidered; - // } - -#ifdef DEBUG - xout << "found range firstHit=" << XHitPos.At(itrack, 0, 0) << " size=" << XHitSize.At(itrack, 0, 0) << std::endl; - if (xout_dump) - std::cout << xout.str(); -#endif - - } -} - -#if 0 -void selectHitRanges_wrapper(cudaStream_t &stream, LayerOfHitsCU &layer, - GPlexQI &XHitPos, GPlexQI &XHitSize, - GPlexLS &Err, GPlexLV &Par, - int N) { - int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); - dim3 grid(gridx, 1, 1); - dim3 block(BLOCK_SIZE_X, 1, 1); - - selectHitRanges_kernel <<< grid, block, 0, stream >>> - (bunch.m_hits, bunch.m_phi_bin_infos_first, - bunch.m_phi_bin_infos_second, bunch.m_fill_index, - XHitPos, XHitSize, Err, Par, - Config::useCMSGeom, N); -} -#endif diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h index 51c3262728bf9..c2c1cf1fb4eef 100644 --- a/mkFit/computeChi2_kernels.h +++ b/mkFit/computeChi2_kernels.h @@ -3,60 +3,30 @@ #include "HitStructuresCU.h" #include "GPlex.h" +#include "GeometryCU.h" -void computeChi2_wrapper(cudaStream_t &stream, - GPlexLS &propErr, GPlexHS &msErr, // GPlex resErr, - GPlexHV &msPar, GPlexLV &propPar, GPlexQF &outChi2, - const int N); - -#if 1 -void HitToMs_wrapper(cudaStream_t& stream, - GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, int *HitsIdx, int hit_cnt, int N); -#endif - -void getNewBestHitChi2_wrapper(cudaStream_t &stream, - GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexQF &outChi2, float *minChi2, int *bestHit, int hit_cnt, int N); -void fill_array_cu(float *array, int size, float value); +__device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, const int N); -#if 1 -void updateTracksWithBestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU &, float *minChi2, int *best_hit, - GPlexHS &msErr, GPlexHV &msPar, GPlexLV &propPar, - float *chi2, int 
*HitsIdx, int N); -#endif -int getMaxNumHits_wrapper(GPlexQI d_XHitSize, int N); +void computeChi2_wrapper(cudaStream_t &stream, + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int N); #if 1 -void bestHit_wrapper(cudaStream_t &stream, - LayerOfHitsCU &layer, GPlexQI &XHitSize, GPlexHitIdx &XHitArr, - GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, - GPlexLV &propPar, GPlexQF &outChi2, - float *Chi2, int *HitsIdx, - int maxSize, int N); -#endif - -#if 0 -void selectHitRanges_wrapper(cudaStream_t &stream, BunchOfHitsCU &bunch, - GPlexQI &XHitPos, GPlexQI &XHitSize, - GPlexLS &Err, GPlexLV &Par, - int N); +void HitToMs_wrapper(cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, GPlexQI &HitsIdx, int hit_cnt, int N); #endif -__device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 - float r01,//r01 - GPlexRegHV &a ,//res_glo - GPlexReg2V &b ); +__device__ void RotateResidulsOnTangentPlane_fn(const float r00, + const float r01, const GPlexRegHV &a, GPlexReg2V &b); -__device__ void ProjectResErr_fn(float a00, - float a01, - GPlexRegHS &b, - GPlexRegHH &c); +__device__ void ProjectResErr_fn(const float a00, const float a01, + const GPlexRegHS &b, GPlexRegHH &c); -__device__ void ProjectResErrTransp_fn(float a00, - float a01, GPlexRegHH &b, GPlexReg2S &c); +__device__ void ProjectResErrTransp_fn(const float a00, const float a01, + const GPlexRegHH &b, GPlexReg2S &c); #endif diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h new file mode 100644 index 0000000000000..38190cfb372bf --- /dev/null +++ b/mkFit/gpu_utils.h @@ -0,0 +1,22 @@ +#ifndef GPU_UTILS_H +#define GPU_UTILS_H + +#include + +#define cudaCheckError() \ + do { \ + cudaError_t e=cudaGetLastError(); \ + CubDebugExit(e) \ + } while(0) + +#define cudaCheckErrorSync() \ + do { \ + cudaDeviceSynchronize(); \ + cudaCheckError(); \ + } while(0) + +// CUDA specific: +// Maximum number of blocks in the X direction of the thread grid. 
+constexpr int max_blocks_x = 1 << 15; + +#endif /* ifndef GPU_UTILS_H */ diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu index 6384de6c64cc7..9e3700d8f96a3 100644 --- a/mkFit/index_selection_kernels.cu +++ b/mkFit/index_selection_kernels.cu @@ -1,37 +1,20 @@ #include "index_selection_kernels.h" #include "Config.h" #include "HitStructures.h" +#include "gpu_utils.h" #include "stdio.h" #define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint constexpr bool tmp_useCMSGeom = false; -#if 0 -__device__ -int GetZBin(float z, const float m_zmin, const float m_fz) { - return (z - m_zmin) * m_fz; -} - -__device__ -int GetZBinChecked(float z, const float m_zmin, const float m_fz, const int m_nz) { - int zb = GetZBin(z); - if (zb < 0) zb = 0; else if (zb >= m_nz) zb = m_nz - 1; return zb; -} -#endif - -__global__ void selectHitIndices_kernel(LayerOfHitsCU layer_of_hits, - GPlexLS Err, GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, int N) { +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int N) { int itrack = threadIdx.x + blockDim.x*blockIdx.x; - /*if (itrack == 0) {*/ + if (itrack < N) { - /*printf("info %d\n", layer_of_hits.m_phi_bin_infos[10].first);*/ - /*LayerOfHitsCU& l = layer_of_hits;*/ - /*printf("gpu: %f, %f, %f\n", l.m_zmin, l.m_zmax, l.m_fz);*/ - /*}*/ - /*if (itrack < N) {*/ bool dump = false; const float nSigmaPhi = 3; const float nSigmaZ = 3; @@ -116,13 +99,6 @@ __global__ void selectHitIndices_kernel(LayerOfHitsCU layer_of_hits, printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); - /*if (itrack == 0) {*/ - /*int i1 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].first;*/ - /*int i2 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].second;*/ - - /*printf("gpu: %d, %d\n", i1, i2);*/ - /*}*/ - // MT: One could iterate in "spiral" order, to pick hits close to the center. // http://stackoverflow.com/questions/398299/looping-in-a-spiral // This would then work best with relatively small bin sizes. 
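// Note: one way to realize the "spiral" order suggested above is to visit
// bin offsets 0, +1, -1, +2, -2, ... so the bins nearest the predicted
// position are tried first. A sketch (hypothetical helper, not part of the
// patch; the actual code scans the plain z-by-phi ranges):

__host__ __device__ inline int spiralOffset(int step)
{
  // step = 0, 1, 2, 3, 4, ... maps to offset = 0, +1, -1, +2, -2, ...
  return (step % 2) ? (step + 1) / 2 : -(step / 2);
}

// Usage: b = centerBin + spiralOffset(s), clamping b for z bins and wrapping
// it with m_phi_mask for phi bins, then scan bin b's hits as usual.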
@@ -178,27 +154,21 @@ __global__ void selectHitIndices_kernel(LayerOfHitsCU layer_of_hits, #endif // 0 } } - /*if (itrack == 0)*/ - /*{*/ - /*if (XHitSize[itrack] != xhitsize_tmp) {*/ - /*printf("%d fromCPU %d --fromGPU %d\n", itrack, xhitsize_tmp, XHitSize[itrack]);*/ - /*int i1 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].first;*/ - /*int i2 = L.m_phi_bin_infos[zb1*L.m_nphi + (pb1 & L.m_phi_mask)].second;*/ - - /*printf("gpu: %d, %d\n", i1, i2);*/ - /*}*/ - /*}*/ } } -void selectHitIndices_wrapper(cudaStream_t& stream, - LayerOfHitsCU& layer_of_hits, GPlexLS& Err, GPlexLV& Par, - GPlexQI& XHitSize, GPlexHitIdx& XHitArr, int N) { +__global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits, + const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) { + selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, N); +} + +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); - /*printf("Before kernel %d \n", N);*/ selectHitIndices_kernel <<< grid, block, 0, stream >>> (layer_of_hits, Err, Par, XHitSize, XHitArr, N); } diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h index 225f1a0b15846..ff0cc0ef0ca5c 100644 --- a/mkFit/index_selection_kernels.h +++ b/mkFit/index_selection_kernels.h @@ -4,8 +4,12 @@ #include "HitStructuresCU.h" #include "GPlex.h" -void selectHitIndices_wrapper(cudaStream_t& stream, - LayerOfHitsCU& layer_of_hits, GPlexLS& Err, GPlexLV& Par, - GPlexQI& XHitSize, GPlexHitIdx& XHitArr, int N); +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N); + +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int N); #endif // _INDEX_SELECTION_KERNELS_H_ diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index abdd6e427bc08..f4e694e37da8c 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -2,6 +2,7 @@ #include "Hit.h" #include "kalmanUpdater_kernels.h" #include "computeChi2_kernels.h" +#include "gpu_utils.h" // TODO: Clean all the hard-coded #define #define LS 21 @@ -10,7 +11,6 @@ #define HV 3 #define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint /*__device__ float getPhi_fn2(float x, float y)*/ /*{*/ @@ -21,22 +21,24 @@ /*return atan2(r,z);*/ /*}*/ -__device__ void subtract_matrix(const float *a, int aN, const float *b, int bN, - float *c, int cN, int size, int n) { +__device__ void subtract_matrix(const float *a, const int aN, + const float *b, const int bN, + float *c, const int cN, + const int size, const int n) { for (int i = 0; i < size; ++i) { c[i*cN + n] = a[i*aN + n] - b[i*bN + n]; } } -__device__ float getHypot_fn(float x, float y) +__device__ float getHypot_fn(const float x, const float y) { return sqrt(x*x + y*y); } __device__ -void KalmanHTG_fn(float a00, float a01, - const GPlexReg2S &b, GPlexRegHH &c) +void KalmanHTG_fn(const float a00, const float a01, + const GPlexReg2S &b, GPlexRegHH &c) { // HTG = rot * res_loc @@ -55,7 +57,7 @@ void KalmanHTG_fn(float a00, float a01, } __device__ 
-void KalmanGain_fn(const GPlexLS &A, GPlexRegHH &b, GPlexRegLH &c, int n) +void KalmanGain_fn(const GPlexLS &A, const GPlexRegHH &b, GPlexRegLH &c, const int n) { // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3 using T = float; @@ -69,9 +71,9 @@ void KalmanGain_fn(const GPlexLS &A, GPlexRegHH &b, GPlexRegLH &c, int n) __device__ void KHMult_fn(const GPlexRegLH &a, - const float b00, - const float b01, - GPlexRegLL &c) + const float b00, + const float b01, + GPlexRegLL &c) { c[ 0] = a[ 0]*b00; c[ 1] = a[ 0]*b01; @@ -112,7 +114,7 @@ void KHMult_fn(const GPlexRegLH &a, } __device__ -void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, int n) +void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, const int n) { // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym using T = float; @@ -124,7 +126,7 @@ void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, int n) // __device__ -void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, int n) +void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, const int n) { int aN = a.stride; typedef float T; @@ -177,7 +179,7 @@ void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, int n) } __device__ -void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, int n) +void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, const int n) { // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym @@ -190,7 +192,7 @@ void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, int } __device__ -void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, int n) +void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) { // C = A * B, C is sym, A is 6x6 , B is 6x6 using T = float; @@ -201,7 +203,7 @@ void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, int } __device__ -void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, int n) +void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, const int n) { const float cosP = std::cos(a[ 4]); //fixme: use trig approx const float sinP = std::sin(a[ 4]); @@ -254,7 +256,7 @@ void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, } __device__ -void CartesianErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, int n) +void CartesianErr_fn(const GPlexRegLL &a, const float *b, const int bN, GPlexRegLL &c, const int n) { // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym int aN = 1; int an = 0; @@ -265,7 +267,7 @@ void CartesianErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, } __device__ -void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, int n) +void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) { // C = A * B, C is sym, A is 6x6 , B is 6x6 using T = float; @@ -280,8 +282,8 @@ void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, /// MultKalmanGain //////////////////////////////////////////////////////////// __device__ void upParam_MultKalmanGain_fn( - const float* __restrict__ a, size_t aN, - float* b_reg, float *c, int N, int n) { + const float* __restrict__ a, const size_t aN, + const float* b_reg, float *c, const int N, const int n) { // const T* __restrict__ tells the compiler that it can uses the read-only // cache, without worrying about coherency. 
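// Note: this is the standard CUDA idiom -- qualifying a kernel argument as
// const T* __restrict__ promises no aliasing, which lets nvcc route the
// loads through the read-only data cache (the effect of an explicit
// __ldg()). A minimal self-contained sketch (hypothetical kernel, not part
// of the patch):

__global__ void scale_ro(const float* __restrict__ in,
                         float* __restrict__ out, float a, int n)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    out[i] = a * in[i];  // the load of in[i] is eligible for the RO cache
}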
// c -> kalmanGain, in register @@ -396,7 +398,7 @@ __device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, /// AddIntoUpperLeft3x3 ////////////////////////////////////////////////////// __device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, const GPlexHS __restrict__ &B, - GPlexRegHS &C, const int N, int n) { + GPlexRegHS &C, const int N, const int n) { using T = float; const T *a = A.ptr; int aN = A.stride; const T *b = B.ptr; int bN = B.stride; @@ -416,10 +418,11 @@ __device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, /// MultResidualsAdd ////////////////////////////////////////////////////////// __device__ void multResidualsAdd_fn( - float* reg_a, - const float* __restrict__ b, size_t bN, - const float* __restrict__ c, size_t cN, - float *d, size_t dN, int N, int n) { + const float* reg_a, + const float* __restrict__ b, const size_t bN, + const float* __restrict__ c, const size_t cN, + float *d, const size_t dN, + const int N, const int n) { // a -> kalmanGain /*int i = threadIdx.x;*/ @@ -473,18 +476,18 @@ void MultResidualsAdd_all_reg(const GPlexRegLH &a, /// KalmanGain_x_propErr ////////////////////////////////////////////////////// __device__ void kalmanGain_x_propErr_fn( - float* d_kalmanGain, - const float* __restrict__ d_propErr, size_t stride_propErr, - float *d_outErr, size_t stride_outErr, const int N, int n) { + const float* d_kalmanGain, + const float* __restrict__ d_propErr, const size_t stride_propErr, + float *d_outErr, const size_t stride_outErr, const int N, const int n) { // a = d_kalmanGain, b = d_propErr, c = outErrTemp // c = b - a*b - float *a = d_kalmanGain; + const float *a = d_kalmanGain; const float *b = d_propErr; float *c = d_outErr; /*size_t aN = stride_kalmanGain;*/ - size_t bN = stride_propErr; - size_t cN = stride_outErr; + const size_t bN = stride_propErr; + const size_t cN = stride_outErr; register float reg_c[LS]; @@ -524,10 +527,10 @@ __device__ void kalmanGain_x_propErr_fn( } } -__global__ void kalmanUpdate_kernel( - GPlexLS propErr, const GPlexHS __restrict__ msErr, - const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, - GPlexLV par_iC, GPlexLS outErr, const int N) { +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int N) { int grid_width = blockDim.x * gridDim.x; // Note: similar results with propErr kept in registers. 
// It is read-only so using the read-only cache yields more flexibility @@ -538,7 +541,7 @@ __global__ void kalmanUpdate_kernel( GPlexRegHS resErr_reg; /*float kalmanGain_reg[LH];*/ - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { /*n += z*gridDim.x;*/ n += z*grid_width; @@ -635,13 +638,20 @@ __global__ void kalmanUpdate_kernel( } } -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlexLS& d_propErr, GPlexHS& d_msErr, - GPlexLV& d_par_iP, GPlexHV& d_msPar, +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N) { + kalmanUpdate_fn( propErr, msErr, par_iP, msPar, par_iC, outErr, N); +} + +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); kalmanUpdate_kernel <<>> diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h index 06970601b0162..e93f7e1f360df 100644 --- a/mkFit/kalmanUpdater_kernels.h +++ b/mkFit/kalmanUpdater_kernels.h @@ -3,24 +3,34 @@ #include "GPlex.h" -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlexLS& d_propErr, GPlexHS& d_msErr, - GPlexLV& d_par_iP, GPlexHV& d_msPar, +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N); -void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, - float *full_posArray, GPlexHS& msErr, - float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits); +//void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, +// float *full_posArray, GPlexHS& msErr, +// float *full_errArray, int *full_hitIdx, int hi, int maxHits, +// int N, int hs, int hv, int Nhits); + +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N); + +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int N); __device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, const GPlexHS __restrict__ &B, - GPlexRegHS &c, const int N, int n); + GPlexRegHS &c, const int N, const int n); __device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, const GPlexLV __restrict__ &B, - GPlexRegHV &C, const int N, int n); + GPlexRegHV &C, const int N, const int n); __device__ void invertCramerSym_fn(float *a); __device__ void invertCramerSym2x2_fn(GPlexReg2S &a); diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index c2a2bed0aad63..de066ca17ec05 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -298,6 +298,7 @@ void test_standard() t_sum[0], t_sum[1], t_sum[2], t_sum[3], t_sum[4]); printf("Total event > 1 fit = %.5f --- Build BHMX = %.5f MX = %.5f CEMX = %.5f TBBMX = %.5f\n", t_skip[0], t_skip[1], t_skip[2], t_skip[3], t_skip[4]); + //fflush(stdout); if (g_operation == "read") { diff --git 
a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 5c09df58dcc56..af5670533523f 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -2,6 +2,7 @@ #include "Debug.h" #include "propagation_kernels.h" #include +#include "gpu_utils.h" constexpr int L = 6; constexpr int LL2 = 36; @@ -10,10 +11,9 @@ constexpr int LS = 21; // values from 32 to 512 give good results. // 32 gives slightly better results (on a K40) constexpr int BLOCK_SIZE_X = 32; -constexpr int MAX_BLOCKS_X = 65535; // CUDA constraint __device__ -void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, int n) +void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, const int n) { // C = A * B @@ -32,7 +32,7 @@ void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, int } __device__ -void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, int n) +void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, const int n) { // C = B * AT; @@ -48,8 +48,8 @@ void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c // Registers are thread-private. Thus this function has no notion of // parallelism. It is ran serially by each calling thread. __device__ void computeJacobianSimple(float *errorProp, - float s, float k, float p, float pxin, float pyin, float pzin, - float TP, float cosTP, float sinTP, int N) { + const float s, const float k, const float p, const float pxin, const float pyin, const float pzin, + const float TP, const float cosTP, const float sinTP, const int N) { // std::cout << "total path s=" << s << std::endl; // TD = s*pt/p; @@ -110,7 +110,7 @@ __device__ void computeJacobianSimple(float *errorProp, } /// Compute MsRad ///////////////////////////////////////////////////////////// -__device__ void assignMsRad_fn(const float r, float* msRad, int N, int n) { +__device__ void assignMsRad_fn(const float r, float* msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { *msRad = r; @@ -119,7 +119,7 @@ __device__ void assignMsRad_fn(const float r, float* msRad, int N, int n) { // Not passing msRad.stride, as QF == 1 (second dim f msRad) __device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, - GPlexRegQF &msRad, int N, int n) { + GPlexRegQF &msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); @@ -131,7 +131,7 @@ __device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, __device__ void helixAtRFromIterative_fn(const GPlexLV& inPar, const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexReg& msRad, - GPlexReg& errorProp, int N, int n) { + GPlexReg& errorProp, const int N, const int n) { GPlexReg outPar; @@ -255,7 +255,7 @@ __global__ void propagation_kernel( GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. 
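// Note: "registers instead of shared memory" means each thread keeps its
// small working matrix in a thread-private array, which nvcc can promote to
// registers as long as all indexing is compile-time constant; the
// alternative stages the matrices in a per-block __shared__ tile. A sketch
// of the two layouts, assuming 32-thread blocks (hypothetical kernels, not
// part of the patch):

__global__ void perThreadStorage(float *out)
{
  float errorProp[36] = {0.f};       // thread-private; lives in registers
  out[threadIdx.x] = errorProp[0];   // while indexing stays constant
}

__global__ void sharedStorage(float *out)
{
  __shared__ float tile[32][36];     // one row per thread of a 32-wide block
  tile[threadIdx.x][0] = 0.f;        // block-visible; updates need syncs
  __syncthreads();
  out[threadIdx.x] = tile[threadIdx.x][0];
}

// The register variant avoids bank conflicts and synchronization (~30% here
// per the comment above) at the cost of register pressure: over-large or
// dynamically indexed arrays spill to local memory.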
GPlexRegLL errorProp_reg; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { n += z*grid_width; if (n < N) { @@ -284,14 +284,14 @@ __global__ void propagation_kernel( } -void propagation_wrapper(cudaStream_t& stream, +void propagation_wrapper(const cudaStream_t& stream, GPlexHV& msPar, GPlexLV& inPar, GPlexQI& inChg, GPlexLV& outPar, GPlexLL& errorProp, GPlexLS& outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); propagation_kernel <<>>(msPar, inPar, inChg, outPar, errorProp, outErr, N); @@ -300,10 +300,10 @@ void propagation_wrapper(cudaStream_t& stream, // PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments // Imposes the radius -__global__ void propagationForBuilding_kernel( - const GPlexLS inErr, const GPlexLV inPar, - const GPlexQI inChg, const float radius, - GPlexLS outErr, GPlexLV outPar, +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, const int N) { #if 1 int grid_width = blockDim.x * gridDim.x; @@ -312,7 +312,7 @@ __global__ void propagationForBuilding_kernel( GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. GPlexRegLL errorProp_reg; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X /*for (int z = 0; z < (N-1)/grid_width +1; z++) {*/ /*n += z*grid_width;*/ if (n < N) { @@ -326,20 +326,6 @@ __global__ void propagationForBuilding_kernel( for (int i = 0; i < 36; ++i) { errorProp_reg[i] = 0.0; } - /*if (n == 0)*/ - /*{*/ - /*int kk = n;*/ - /*printf("\n");*/ - /*printf("outErrGPU %d\n", kk);*/ - /*for (int i = 0; i < 1; ++i) { for (int j = 0; j < 1; ++j)*/ - /*printf("%8f ", outErr(kk,i,j)); printf("\t");*/ - /*} printf("\n");*/ - - /*printf("outParGPU %d\n", kk);*/ - /*for (int i = 0; i < 1; ++i) {*/ - /*printf("%8f ", outPar(kk,i,0)); printf("\t");*/ - /*} printf("\n");*/ - /*}*/ } /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ @@ -352,16 +338,6 @@ __global__ void propagationForBuilding_kernel( #else helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #endif - /*if(n == 0) {*/ - /*printf("errorProp\n");*/ - /*for (int i = 0; i < 6; ++i) {*/ - /*printf("%8f ", inPar(0,i,0)); printf("\t");*/ - /*} printf("\n");*/ - /*for (int i = 0; i < 6; ++i) {*/ - /*printf("%8f ", outPar(0,i,0)); printf("\t");*/ - /*} printf("\n");*/ - /*}*/ - // TODO: port me /*if (Config::useCMSGeom) {*/ /*MPlexQF hitsRl;*/ @@ -389,13 +365,21 @@ __global__ void propagationForBuilding_kernel( #endif } -void propagationForBuilding_wrapper(cudaStream_t& stream, +__global__ void propagationForBuilding_kernel( + const GPlexLS inErr, const GPlexLV inPar, + const GPlexQI inChg, const float radius, + GPlexLS outErr, GPlexLV outPar, + const int N) { + propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, N); +} + +void propagationForBuilding_wrapper(const cudaStream_t& stream, const GPlexLS& inErr, const GPlexLV& inPar, const GPlexQI& inChg, const float radius, GPlexLS& outErr, GPlexLV& outPar, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); propagationForBuilding_kernel<<>> diff --git 
a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index 1e1ada98584d5..d64c069cdcabe 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -3,17 +3,23 @@ #include "GPlex.h" -void propagation_wrapper(cudaStream_t& stream, +void propagation_wrapper(const cudaStream_t& stream, GPlexHV& msPar, GPlexLV& inPar, GPlexQI& inChg, GPlexLV& outPar, GPlexLL& errorProp, GPlexLS& outErr, const int N); -void propagationForBuilding_wrapper(cudaStream_t& stream, +void propagationForBuilding_wrapper(const cudaStream_t& stream, const GPlexLS& inErr, const GPlexLV& inPar, const GPlexQI& inChg, const float radius, GPlexLS& outErr, GPlexLV& outPar, const int N); +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, + const int N); + #endif // _PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize.cu b/mkFit/reorganize.cu deleted file mode 100644 index 523b86a69b55e..0000000000000 --- a/mkFit/reorganize.cu +++ /dev/null @@ -1,65 +0,0 @@ -#include "reorganize.h" -#include - -__global__ void toMatriplex_kernel(float *dst, int dst_stride, - const float* __restrict__ src, int src_stride, - int N, int LS) { - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N && j < LS) { - if (i==-1) { - printf(" %d, mplex[%f] / lin[%f]\n", j, dst[i+j*dst_stride], src[j+i*src_stride]); - } - dst[i + j*dst_stride] = src[j + i*src_stride]; - /*float diff = fabs((dst[i + j*dst_stride] - src[j + i*src_stride]));*/ - /*if (diff > 1e-3) printf("%f\n", diff);*/ - } -} - -/*void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int N, int LS) {*/ - /*dim3 block(16, 8, 1);*/ - /*dim3 grid((N-1)/16 + 1, (LS-1)/8 +1, 1);*/ - /*toMatriplex_kernel <<>> (dst.ptr, dst.stride, src.ptr, src.stride, N, LS);*/ -/*}*/ - - -__global__ void reorganizeMs(float *msPar, size_t msPar_stride, - float *full_posArray, - float *msErr, size_t msErr_stride, - float *full_errArray, - int *full_hitIdx, int hi, - int maxHits, - int N, int HS, int HV, int Nhits) { - - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N) { - int hidx = full_hitIdx[i + hi*N]; - if (j < HV) { - /*float tmp1 = msPar[i + msPar_stride*j];*/ - msPar[i + msPar_stride*j] = full_posArray[j + HV*(hidx + hi*maxHits)]; - /*float tmp2 = msPar[i + msPar_stride*j];*/ - - /*if (i==0 && hi == 0) {*/ - /*if (fabs(tmp1 - tmp2) > 1e-3) {*/ - /*printf("i %d, j %d, old: %f, new %f\n", i, j, tmp1, tmp2);*/ - /*}*/ - /*}*/ - } - if (j < HS) { - msErr[i + msErr_stride*j] = full_errArray[j + HS*(hidx + hi*maxHits)]; - } - } -} - -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, - float *full_posArray, GPlex& msErr, - float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits) { - dim3 block(16, 6, 1); - dim3 grid((N-1)/16 + 1, (hs-1)/6 +1, 1); - reorganizeMs <<>> (msPar.ptr, msPar.stride, full_posArray, - msErr.ptr, msErr.stride, full_errArray, full_hitIdx, hi, maxHits, N, hs, hv, Nhits); -} diff --git a/mkFit/reorganize.h b/mkFit/reorganize.h deleted file mode 100644 index 70f70b81c819f..0000000000000 --- a/mkFit/reorganize.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _REORGANIZE_KERNELS_H_ -#define _REORGANIZE_KERNELS_H_ - -#include "GPlex.h" - -// void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int n, int ls); - -#endif diff --git a/mkFit/reorganize_gplex.cu 
b/mkFit/reorganize_gplex.cu new file mode 100644 index 0000000000000..1e0f83671f1f7 --- /dev/null +++ b/mkFit/reorganize_gplex.cu @@ -0,0 +1,208 @@ +#include "reorganize_gplex.h" +#include + +#include "FitterCU.h" +#include "accessors_cu.h" +#include "Track.h" +#include "gpu_utils.h" + + +template +__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, + const char *arr, const int *vi, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j +__device__ void SlurpInIdx_fn(GPlexObj to, + const char *arr, const int idx, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j +__device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int kSize, + const char *arr, const int idx, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j>> + (msErr, msPar, layer.m_hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); + cudaDeviceSynchronize(); +} + + +__device__ void InputTracksCU_fn (Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + Label(itrack, 0, 0) = tracks[i].label(); + Chg(itrack, 0, 0) = tracks[i].charge(); + Chi2(itrack, 0, 0) = tracks[i].chi2(); + SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); + + for (int hi = 0; hi < 3; ++hi) + HitsIdx[hi](itrack, 0, 0) = tracks[i].getHitIdx(hi);//dummy value for now + } +} + +__global__ void InputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); +} + + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + InputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, + beg, end, N); +} + + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + SlurpOutIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpOutIdx_fn(Par_iP, varr + off_param, idx, N); + tracks[i].setCharge(Chg(itrack, 0, 0)); + tracks[i].setChi2(Chi2(itrack, 0, 0)); + tracks[i].setLabel(Label(itrack, 0, 0)); + + // FIXME: Config::nLayers -> NHits + // Needs to find a way to get the NHits + // 
either store it as a class member, or pass it as an argument + tracks[i].resetHits(); + /*int nGoodItIdx = 0;*/ + for (int hi = 0; hi < Config::nLayers; ++hi) { + tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.); + // We probably use registers instead of going for class members: + /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/ + /*tracks[i].setHitIdx(hi, hit_idx);*/ + /*if (hit_idx >= 0) {*/ + /*nGoodItIdx++; */ + /*}*/ + } + /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/ + /*tracks[i].setChi2(0.);*/ + } +} + +__global__ void OutputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); +} + + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool outputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + OutputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); +} diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h new file mode 100644 index 0000000000000..e9b536f379c08 --- /dev/null +++ b/mkFit/reorganize_gplex.h @@ -0,0 +1,48 @@ +#ifndef REORGANIZE_GPLEX_H +#define REORGANIZE_GPLEX_H + +#include "GPlex.h" +#include "Hit.h" +#include "HitStructuresCU.h" + +__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, + Hit *hits, const GPlexQI &XHitSize, + const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, const int N); + +__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + GPlexQI HitsIdx, const int hit_cnt, const int N); + +void HitToMs_wrapper(const cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, const int N); + +__device__ void InputTracksCU_fn(Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, const int N); + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, const int N); + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N); + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool outputProp, int N); + +#endif // REORGANIZE_GPLEX_H From 99409850cf75545552869edb7a6d3365724f675e Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Tue, 9 Aug 2016 11:17:59 -0400 Subject: [PATCH 08/13] Improvement to best hit on gpu 1) Refactores device functions so they compute a single track Moving work decompostion to a higher level to separate decomposition from actual computations. 
2) Fixes indices bug in findBestHit kernel itrack / tidx problem when ntracks > glex_size solved. 3) Fixes to ensure cpu still compiles after the introduction of gpu best hit --- Config.h | 3 +- Math/MatrixRepresentationsStatic.h | 4 +- Math/SMatrix.h | 4 +- mkFit/BuilderCU.cu | 37 ++++++++++- mkFit/BuilderCU.h | 9 +++ mkFit/FitterCU.h | 2 - mkFit/GeometryCU.h | 5 ++ mkFit/KalmanUtilsMPlex.h | 2 + mkFit/MkBuilder.h | 2 + mkFit/MkFitter.h | 2 + mkFit/best_hit_kernels.cu | 38 ++++++------ mkFit/buildtestMPlex.cc | 2 + mkFit/computeChi2_kernels.cu | 84 +++++++++++++------------ mkFit/computeChi2_kernels.h | 3 +- mkFit/fittestMPlex.cc | 2 +- mkFit/gpu_utils.cu | 5 ++ mkFit/gpu_utils.h | 5 ++ mkFit/index_selection_kernels.cu | 7 ++- mkFit/index_selection_kernels.h | 2 +- mkFit/kalmanUpdater_kernels.cu | 99 ++++++++++++++---------------- mkFit/kalmanUpdater_kernels.h | 2 +- mkFit/mkFit.cc | 1 + mkFit/propagation_kernels.cu | 99 +++++++++++++++--------------- mkFit/propagation_kernels.h | 2 +- mkFit/reorganize_gplex.cu | 30 ++++----- mkFit/reorganize_gplex.h | 9 ++- 26 files changed, 264 insertions(+), 196 deletions(-) create mode 100644 mkFit/gpu_utils.cu diff --git a/Config.h b/Config.h index e4c3abe31be0a..b1bc303ae596e 100644 --- a/Config.h +++ b/Config.h @@ -99,7 +99,8 @@ namespace Config // Config for Hit and BinInfoUtils constexpr int nPhiPart = 1260; constexpr float fPhiFactor = nPhiPart / TwoPI; - constexpr int nEtaPart = 11; + //constexpr int nEtaPart = 11; + constexpr int nEtaPart = 1; constexpr int nEtaBin = 2 * nEtaPart - 1; constexpr float fEtaFull = 2 * Config::fEtaDet; diff --git a/Math/MatrixRepresentationsStatic.h b/Math/MatrixRepresentationsStatic.h index 9554c37db9137..90ef3ab2ab6a3 100644 --- a/Math/MatrixRepresentationsStatic.h +++ b/Math/MatrixRepresentationsStatic.h @@ -241,9 +241,9 @@ namespace Math { inline T* Array() { return fArray; } inline const T* Array() const { return fArray; } -//#ifdef USE_CUDA +#ifdef __CUDACC__ T* ArrayCU(); -//#endif +#endif /** assignment : only symmetric to symmetric allowed diff --git a/Math/SMatrix.h b/Math/SMatrix.h index 7714195350738..82126403ef951 100644 --- a/Math/SMatrix.h +++ b/Math/SMatrix.h @@ -272,9 +272,9 @@ class SMatrix { const T* Array() const; /// return pointer to internal array T* Array(); -//#ifdef USE_CUDA +#ifdef __CUDACC__ T* ArrayCU(); -//#endif +#endif /** @name --- STL-like interface --- The iterators access the matrix element in the order how they are diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu index cfc8e3657e009..389b4c51d8734 100644 --- a/mkFit/BuilderCU.cu +++ b/mkFit/BuilderCU.cu @@ -1,5 +1,6 @@ #include "BuilderCU.h" +#include "gpu_utils.h" #include "HitStructures.h" #include "HitStructuresCU.h" #include "GeometryCU.h" @@ -7,10 +8,36 @@ #include "Event.h" +BuilderCU::BuilderCU() +{ +} + + BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, const EventOfCandidates& event_of_cands) { - int gplex_size = 1 << 12; + setUp(event_of_hits, event, event_of_cands); +} + + +BuilderCU::~BuilderCU() { + /*event_of_cands_cu.deallocGPU();*/ + + /*geom_cu.deallocate();*/ + /*event_of_hits_cu.deallocGPU();*/ + + /*cuFitter->destroyStream();*/ + /*cuFitter->free_extra_addBestHit();*/ + /*cuFitter->freeDevice();*/ + /*delete cuFitter;*/ + tearDown(); +} + + +void BuilderCU::setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + int gplex_size = 1 << 14; cuFitter = new FitterCU (gplex_size); cuFitter->allocateDevice(); 
cuFitter->allocate_extra_addBestHit(); @@ -31,7 +58,7 @@ BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, } -BuilderCU::~BuilderCU() { +void BuilderCU::tearDown() { event_of_cands_cu.deallocGPU(); geom_cu.deallocate(); @@ -51,4 +78,10 @@ void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); + cudaStreamSynchronize(cuFitter->get_stream()); + cudaCheckError(); + + /*size_t free_mem, total_mem;*/ + /*cudaMemGetInfo(&free_mem, &total_mem);*/ + /*fprintf(stderr, "Free: %d\n", free_mem);*/ } diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h index f3a2421949f5a..87439d6be5d5b 100644 --- a/mkFit/BuilderCU.h +++ b/mkFit/BuilderCU.h @@ -9,13 +9,22 @@ #include "Event.h" +// FIXME: Design Issue +// What to do, allocation in ctor, free in dtor? +// not exception-safe +// but manage mem +// or in separate function? class BuilderCU { public: + BuilderCU(); BuilderCU(const EventOfHits& event_of_hits, const Event* event, const EventOfCandidates& event_of_cands); ~BuilderCU(); + void setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + void tearDown(); void FindTracksBestHit(EventOfCandidates& event_of_cands); private: FitterCU *cuFitter; diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 0eaffc2df01f7..29917f43219a7 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -32,8 +32,6 @@ constexpr int LH = 18; using idx_t = Matriplex::idx_t; -void separate_first_call_for_meaningful_profiling_numbers(); - template class FitterCU { public: diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h index 2269f79c6186c..16cda75eaee87 100644 --- a/mkFit/GeometryCU.h +++ b/mkFit/GeometryCU.h @@ -1,17 +1,22 @@ #ifndef GEOMETRY_CU_H #define GEOMETRY_CU_H +#include "gpu_utils.h" + struct GeometryCU { float *radii; void allocate() { cudaMalloc((void**)&radii, Config::nLayers * sizeof(float)); + cudaCheckError(); } void deallocate() { cudaFree(radii); + cudaCheckError(); } void getRadiiFromCPU(const float *h_radii) { cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckError(); } }; diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h index 5db3002f11f92..3842452d1d66b 100644 --- a/mkFit/KalmanUtilsMPlex.h +++ b/mkFit/KalmanUtilsMPlex.h @@ -4,7 +4,9 @@ #include "Track.h" #include "Matrix.h" +#ifdef USE_CUDA #include "FitterCU.h" +#endif void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index eaeeaa03753d2..6246427ea9513 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -3,8 +3,10 @@ #include +#ifdef USE_CUDA #include "HitStructures.h" #include "FitterCU.h" +#endif //------------------------------------------------------------------------------ diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index c6fa30123a62b..36da1355ccd74 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -8,8 +8,10 @@ #include "HitStructures.h" #include "BinInfoUtils.h" +#if USE_CUDA #include "FitterCU.h" #include "HitStructuresCU.h" +#endif //#define DEBUG 1 diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu index 1ef5b797acadd..06f62f438b8e9 100644 --- a/mkFit/best_hit_kernels.cu +++ b/mkFit/best_hit_kernels.cu @@ -143,9 +143,9 @@ __device__ void bestHit_fn( const GPlexLS &propErr, GPlexHS &msErr, 
GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, GPlexQF &Chi2, GPlexQI &HitsIdx, - const int maxSize, const int N) { + const int maxSize, const int itrack, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; + /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ int bestHit_reg = -1; float minChi2_reg = 15.f; @@ -154,7 +154,7 @@ __device__ void bestHit_fn( for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) { - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); #if 0 // TODO: add CMSGeom if (Config::useCMSGeom) { @@ -162,7 +162,7 @@ __device__ void bestHit_fn( throw std::runtime_error("useCMSGeom not implemented yet for GPU"); } else {} #endif - computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, N); + computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); } updateTracksWithBestHit_fn @@ -180,11 +180,12 @@ __global__ void bestHit_kernel( const GPlexLV propPar, GPlexQF outChi2, GPlexQF Chi2, GPlexQI HitsIdx, const int maxSize, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; bestHit_fn(hits, XHitSize, XHitArr, propErr, msErr, msPar, propPar, outChi2, Chi2, HitsIdx, - maxSize, N); + maxSize, itrack, N); } @@ -223,14 +224,17 @@ __global__ void findBestHit_kernel(LayerOfHitsCU *layers, GPlexQI inChg, GPlexQI Label, GeometryCU geom, int *maxSize, int gplex_size) { for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) { - for (int itrack = 0; itrack < etabin_of_cands[ebin].m_fill_index; itrack += gplex_size) { - int end = min(itrack + gplex_size, etabin_of_cands[ebin].m_fill_index); - int N = end - itrack; + for (int beg = 0; beg < etabin_of_cands[ebin].m_fill_index; beg += gplex_size) { + int end = min(beg + gplex_size, etabin_of_cands[ebin].m_fill_index); + int N = end - beg; + + int tidx = threadIdx.x + blockDim.x*blockIdx.x; + int itrack = beg + tidx; - if (threadIdx.x + blockDim.x * blockIdx.x < N) { + if (itrack < end) { InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP, - inChg, Chi2, Label, HitsIdx_arr, itrack, end, N); + inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { @@ -244,23 +248,21 @@ __global__ void findBestHit_kernel(LayerOfHitsCU *layers, LayerOfHitsCU &layer = layers[ilay]; int maxSize_block; - selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, N); + selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, tidx, N); // FIXME: Is reduction over block enough, or do we need device-wise reduction reduceMax_fn (XHitSize.ptr, XHitSize.N, &maxSize_block); bestHit_fn(layer.m_hits, XHitSize, XHitArr, - Err_iP, msErr, msPar, - Par_iP, outChi2, - Chi2, HitsIdx, - maxSize_block, N); - kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, N); + Err_iP, msErr, msPar, Par_iP, outChi2, + Chi2, HitsIdx, maxSize_block, tidx, N); + kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, tidx, N); if (ilay+1 < Config::nLayers) { float radius = radii[ilay+1]; - propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, N); + propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, tidx, N); } } OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates, - Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, itrack, end, N); + Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); } } } 
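The tidx/itrack split in findBestHit_kernel above is the indexing fix called out in this patch's message: when an eta bin holds more candidates than one GPlex (ntracks > gplex_size), tracks are processed in GPlex-sized batches, so the thread's GPlex slot tidx is no longer the same as the global track index itrack = beg + tidx. A condensed, self-contained sketch of that decomposition (hypothetical kernel, not part of the patch):

__global__ void chunked_kernel(const float* __restrict__ in, float *out,
                               int n_tracks, int gplex_size)
{
  // Every thread of the grid walks all GPlex-sized batches.
  for (int beg = 0; beg < n_tracks; beg += gplex_size) {
    const int end    = min(beg + gplex_size, n_tracks);
    const int tidx   = threadIdx.x + blockDim.x * blockIdx.x; // GPlex slot
    const int itrack = beg + tidx;                            // global track
    if (itrack < end) {
      // In the real kernel: InputTracksCU_fn loads track itrack into slot
      // tidx, the per-layer device functions operate on slot tidx, and
      // OutputTracksCU_fn writes slot tidx back to track itrack.
      out[itrack] = in[itrack];
    }
  }
}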
diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index f801c791317ae..96b7e1de09183 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -7,8 +7,10 @@ #include "BinInfoUtils.h" #include "MkBuilder.h" +#ifdef USE_CUDA #include "FitterCU.h" #include "BuilderCU.h" +#endif #include diff --git a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu index dabc27681c98f..dc8d95d870640 100644 --- a/mkFit/computeChi2_kernels.cu +++ b/mkFit/computeChi2_kernels.cu @@ -15,9 +15,10 @@ __device__ void chi2Similarity_fn( const GPlexReg2V &a, const GPlexReg2S &c, // in registers - float *d, const size_t dN) { + float *d, const size_t dN, + const int n) { - int n = threadIdx.x + blockIdx.x * blockDim.x; + //int n = threadIdx.x + blockIdx.x * blockDim.x; // manually subrtact into local vars -- 3 of them /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/ @@ -78,47 +79,41 @@ __device__ void ProjectResErrTransp_fn(const float a00, __device__ void computeChi2_fn( const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, - const GPlexLV &propPar, GPlexQF &outChi2, const int N) { - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; + const GPlexLV &propPar, GPlexQF &outChi2, const int n, const int N) { + //int n = threadIdx.x + blockIdx.x * blockDim.x; /*float resErr_reg[HS]; // ~ resErr_glo*/ GPlexRegHS resErr_reg; - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - - if (n < N) { - - // coordinate change - float rotT00; - float rotT01; - const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); - rotT00 = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); - rotT01 = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); - - /*float res_glo[HV];*/ - GPlexRegHV res_glo; - subtractFirst3_fn(msPar, propPar, res_glo, N, n); - - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } - addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); - - GPlexReg2V res_loc; //position residual in local coordinates - RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); - /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ - /*MPlexHH tempHH;*/ - GPlexReg2S resErr_loc; // 2x2 sym - GPlexRegHH tempHH; // 3*3 sym - ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); - ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); - - /*invertCramerSym_fn(resErr_reg);*/ - invertCramerSym2x2_fn(resErr_loc); - - chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride); + if (n < N) { + // coordinate change + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); + + /*float res_glo[HV];*/ + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, propPar, res_glo, N, n); + + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; } + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ + /*MPlexHH tempHH;*/ + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); + + chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, 
outChi2.stride, n); } } @@ -126,11 +121,14 @@ __device__ void computeChi2_fn( __global__ void computeChi2_kernel( const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, const GPlexLV propPar, GPlexQF outChi2, const int N) { + int grid_width = blockDim.x * gridDim.x; int itrack = threadIdx.x + blockDim.x*blockIdx.x; - if (itrack < N) { - computeChi2_fn - (propErr, msErr, msPar, propPar, - outChi2, N); + for (int z = 0; z < (N-1)/grid_width +1; z++) { + itrack += z*grid_width; + + if (itrack < N) { + computeChi2_fn (propErr, msErr, msPar, propPar, outChi2, itrack, N); + } } } diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h index c2c1cf1fb4eef..72f0e1f06976e 100644 --- a/mkFit/computeChi2_kernels.h +++ b/mkFit/computeChi2_kernels.h @@ -7,7 +7,8 @@ __device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr, - const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, const int N); + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, + const int itrack, const int N); void computeChi2_wrapper(cudaStream_t &stream, diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 41bffb26d21bc..9bd437bbeb3f5 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -245,7 +245,7 @@ void runAllEventsFittingTestPlexGPU(std::vector& events) } cuFitter.freeDevice(); } - std::cerr << "###### Total GPU time: " << dtime() - total_gpu_time << " ######\n"; + std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n"; } diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu new file mode 100644 index 0000000000000..178f808355817 --- /dev/null +++ b/mkFit/gpu_utils.cu @@ -0,0 +1,5 @@ +#include "gpu_utils.h" + +void sync_gpu() { + cudaCheckErrorSync(); +} diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h index 38190cfb372bf..feaad0517c0dc 100644 --- a/mkFit/gpu_utils.h +++ b/mkFit/gpu_utils.h @@ -19,4 +19,9 @@ // Maximum number of blocks in the X direction of the thread grid. constexpr int max_blocks_x = 1 << 15; +// The first call to a CUDA API function takes the initialization hit. 
+void separate_first_call_for_meaningful_profiling_numbers();
+
+void sync_gpu();
+
 #endif /* ifndef GPU_UTILS_H */
diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu
index 9e3700d8f96a3..22a2b2692c1d0 100644
--- a/mkFit/index_selection_kernels.cu
+++ b/mkFit/index_selection_kernels.cu
@@ -11,8 +11,8 @@ constexpr bool tmp_useCMSGeom = false;
 __device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits,
     const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize,
-    GPlexHitIdx &XHitArr, const int N) {
-  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
+    GPlexHitIdx &XHitArr, const int itrack, const int N) {
+  //int itrack = threadIdx.x + blockDim.x*blockIdx.x;
   if (itrack < N) {
     bool dump = false;
@@ -159,7 +159,8 @@ __device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits,
 __global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits,
     const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) {
-  selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, N);
+  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
+  selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, itrack, N);
 }
 void selectHitIndices_wrapper(const cudaStream_t& stream,
diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h
index ff0cc0ef0ca5c..801b5a4b068f9 100644
--- a/mkFit/index_selection_kernels.h
+++ b/mkFit/index_selection_kernels.h
@@ -10,6 +10,6 @@ void selectHitIndices_wrapper(const cudaStream_t& stream,
 __device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits,
     const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize,
-    GPlexHitIdx &XHitArr, const int N);
+    GPlexHitIdx &XHitArr, const int itrack, const int N);
 #endif // _INDEX_SELECTION_KERNELS_H_
diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu
index f4e694e37da8c..ed099158726f4 100644
--- a/mkFit/kalmanUpdater_kernels.cu
+++ b/mkFit/kalmanUpdater_kernels.cu
@@ -530,54 +530,49 @@ __device__ void kalmanGain_x_propErr_fn(
 __device__ void kalmanUpdate_fn(
     GPlexLS &propErr, const GPlexHS __restrict__ &msErr,
     const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar,
-    GPlexLV &par_iC, GPlexLS &outErr, const int N) {
-  int grid_width = blockDim.x * gridDim.x;
+    GPlexLV &par_iC, GPlexLS &outErr, const int n, const int N) {
   // Note: similar results with propErr kept in registers.
   // It is read-only so using the read-only cache yields more flexibility
   // wrt block size without increasing the pressure on registers too much.
-  int n = threadIdx.x + blockIdx.x * blockDim.x;
   // There is no need to keep resErr and kalmanGain as global memory arrays.
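  // (Sketch, not part of this patch: the per-track loop that used to live
  //  here moves up into kalmanUpdate_kernel below. For reference, the
  //  standard CUDA grid-stride idiom it is modeled on; the patch writes the
  //  stepping slightly differently:)
  //
  //      for (int i = blockIdx.x * blockDim.x + threadIdx.x;
  //           i < N;
  //           i += blockDim.x * gridDim.x)
  //        kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, i, N);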
/*float resErr_reg[HS];*/ GPlexRegHS resErr_reg; /*float kalmanGain_reg[LH];*/ // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - for (int z = 0; z < (N-1)/grid_width +1; z++) { - /*n += z*gridDim.x;*/ - n += z*grid_width; - if (n < N) { - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } + if (n < N) { + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } - // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex + // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex #if 0 - if (Config::useCMSGeom) { - propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - } else { - propErr = psErr; - propPar = psPar; - } + if (Config::useCMSGeom) { + propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + } else { + propErr = psErr; + propPar = psPar; + } #endif - float rotT00; - float rotT01; - const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); - rotT00 = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r); - rotT01 = (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r); - - GPlexRegHV res_glo; - subtractFirst3_fn(msPar, par_iP, res_glo, N, n); - - addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); - GPlexReg2V res_loc; //position residual in local coordinates - RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); - GPlexReg2S resErr_loc; // 2x2 sym - GPlexRegHH tempHH; // 3*3 sym - ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); - ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); - - /*invertCramerSym_fn(resErr_reg);*/ - invertCramerSym2x2_fn(resErr_loc); + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r); + + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, par_iP, res_glo, N, n); + + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); #ifndef POLCOORD // Move to "polar" coordinates: (x,y,z,1/pT,phi,theta) [can we find a better name?] 
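Aside on the (x,y,z,1/pT,phi,theta) parameterization that these POLCOORD blocks convert to and from: only the momentum is reparameterized, and the mapping is invertible wherever pT > 0. A minimal standalone sketch, not code from the patch (the names and plain-struct interface are illustrative):

#include <cmath>

struct MomCartesian { float px, py, pz; };
struct MomPolar     { float invpt, phi, theta; };  // (1/pT, phi, theta)

// pT = sqrt(px^2 + py^2); theta is the polar angle measured from the z axis.
MomPolar toPolar(const MomCartesian &m)
{
  const float pt = std::sqrt(m.px*m.px + m.py*m.py);
  return { 1.f/pt, std::atan2(m.py, m.px), std::atan2(pt, m.pz) };
}

MomCartesian toCartesian(const MomPolar &m)
{
  const float pt = 1.f/m.invpt;
  return { pt*std::cos(m.phi), pt*std::sin(m.phi), pt/std::tan(m.theta) };
}

This matches the debug dump later in the series, which reconstructs the momentum as (cos(phi)/ipt, sin(phi)/ipt, 1/(ipt*tan(theta))).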
@@ -613,7 +608,7 @@ __device__ void kalmanUpdate_fn(
     /*outErr.Subtract(propErr, outErr);// outErr is in "polar" coordinates now*/
     subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride,
                     /*propErr.ptr, propErr.stride, LS, n);*/
-                     outErr.ptr, outErr.stride, LS, n);
+                    outErr.ptr, outErr.stride, LS, n);
 #ifndef POLCOORD
   // Go back to cartesian coordinates
@@ -625,16 +620,15 @@ __device__ void kalmanUpdate_fn(
   CartesianErrTransp_fn(jac_pol, tempLL, outErr, n);// outErr is in cartesian coordinates now
 #endif
 #if 0
-    upParam_MultKalmanGain_fn(propErr, propErr_stride,
-        resErr_reg, kalmanGain_reg, N, n);
-    multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride,
-        msPar, msPar_stride, par_iC, par_iC_stride, N, n);
-
-    kalmanGain_x_propErr_fn(kalmanGain_reg,
-        propErr, propErr_stride,
-        outErr, outErr_stride, N, n);
+  upParam_MultKalmanGain_fn(propErr, propErr_stride,
+      resErr_reg, kalmanGain_reg, N, n);
+  multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride,
+      msPar, msPar_stride, par_iC, par_iC_stride, N, n);
+
+  kalmanGain_x_propErr_fn(kalmanGain_reg,
+      propErr, propErr_stride,
+      outErr, outErr_stride, N, n);
 #endif
-  }
 }
 }
@@ -642,7 +636,13 @@ __global__ void kalmanUpdate_kernel(
     GPlexLS propErr, const GPlexHS __restrict__ msErr,
     const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar,
     GPlexLV par_iC, GPlexLS outErr, const int N) {
-  kalmanUpdate_fn( propErr, msErr, par_iP, msPar, par_iC, outErr, N);
+  int grid_width = blockDim.x * gridDim.x;
+  int n = threadIdx.x + blockIdx.x * blockDim.x;
+
+  for (int z = 0; z < (N-1)/grid_width +1; z++) {
+    n += z*grid_width;
+    kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, n, N);
+  }
 }
 void kalmanUpdate_wrapper(const cudaStream_t& stream,
@@ -658,8 +658,3 @@ void kalmanUpdate_wrapper(const cudaStream_t& stream,
       (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N);
 }
-// Should probably not be in this file, but creating a file for
-// this oneliner seems overkill.
-void separate_first_call_for_meaningful_profiling_numbers() {
-  cudaDeviceSynchronize();
-}
diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h
index e93f7e1f360df..112462eb4c8ba 100644
--- a/mkFit/kalmanUpdater_kernels.h
+++ b/mkFit/kalmanUpdater_kernels.h
@@ -22,7 +22,7 @@ __global__ void kalmanUpdate_kernel(
 __device__ void kalmanUpdate_fn(
     GPlexLS &propErr, const GPlexHS __restrict__ &msErr,
     const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar,
-    GPlexLV &par_iC, GPlexLS &outErr, const int N);
+    GPlexLV &par_iC, GPlexLS &outErr, const int itrack, const int N);
 __device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A,
     const GPlexHS __restrict__ &B,
diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc
index de066ca17ec05..718f200f01c59 100644
--- a/mkFit/mkFit.cc
+++ b/mkFit/mkFit.cc
@@ -19,6 +19,7 @@
 #ifdef USE_CUDA
 #include "FitterCU.h"
+#include "gpu_utils.h"
 #endif
 #include
diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu
index af5670533523f..d80b87f11a7d2 100644
--- a/mkFit/propagation_kernels.cu
+++ b/mkFit/propagation_kernels.cu
@@ -48,7 +48,8 @@ void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c
 // Registers are thread-private. Thus this function has no notion of
 // parallelism. It is run serially by each calling thread.
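// (Sketch, not part of this patch, illustrating the comment above: a
//  fixed-size array declared inside a __device__ function is private to the
//  calling thread -- typically held in registers -- so it needs no stride
//  and no track index:)
//
//      __device__ void per_thread_scratch_demo()
//      {
//        float jac[36];                      // one thread's private scratch
//        for (int i = 0; i < 36; ++i) jac[i] = 0.f;
//        // ... fill and consume jac serially; no synchronization needed ...
//      }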
__device__ void computeJacobianSimple(float *errorProp, - const float s, const float k, const float p, const float pxin, const float pyin, const float pzin, + const float s, const float k, const float p, + const float pxin, const float pyin, const float pzin, const float TP, const float cosTP, const float sinTP, const int N) { // std::cout << "total path s=" << s << std::endl; @@ -304,64 +305,58 @@ __device__ void propagationForBuilding_fn( const GPlexLS &inErr, const GPlexLV &inPar, const GPlexQI &inChg, const float radius, GPlexLS &outErr, GPlexLV &outPar, - const int N) { + const int n, const int N) { #if 1 - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; - GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. GPlexRegLL errorProp_reg; // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - /*for (int z = 0; z < (N-1)/grid_width +1; z++) {*/ - /*n += z*grid_width;*/ - if (n < N) { - - for (int i = 0; i < inErr.kSize; ++i) { - outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; - } - for (int i = 0; i < inPar.kSize; ++i) { - outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; - } - for (int i = 0; i < 36; ++i) { - errorProp_reg[i] = 0.0; - } + if (n < N) { + + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; + } + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; } - /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ - msRad_reg(n, 0, 0) = radius; - /*if (n == 0) printf("gpu r = %f\n", radius);*/ + /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ + msRad_reg(n, 0, 0) = radius; + /*if (n == 0) printf("gpu r = %f\n", radius);*/ #ifdef POLCOORD - // TODO: port me - helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); + // TODO: port me + helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); #else - helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #endif - // TODO: port me - /*if (Config::useCMSGeom) {*/ - /*MPlexQF hitsRl;*/ - /*MPlexQF hitsXi;*/ - /*for (int n = 0; n < NN; ++n) {*/ - /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ - /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ - /*}*/ - /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ - /*}*/ - /*similarity_fn(errorProp_reg, outErr, N, n);*/ - - // Matriplex version of: - // result.errors = ROOT::Math::Similarity(errorProp, outErr); - - //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl - /*MPlexLL temp;*/ - /*MultHelixProp (errorProp, outErr, temp);*/ - /*MultHelixPropTransp(errorProp, temp, outErr);*/ - GPlexRegLL temp; - MultHelixProp_fn (errorProp_reg, outErr, temp, n); - MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); - - /*}*/ + // TODO: port me + /*if (Config::useCMSGeom) {*/ + /*MPlexQF hitsRl;*/ + /*MPlexQF hitsXi;*/ + /*for (int n = 0; n < NN; ++n) {*/ + /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ + /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ + /*}*/ + /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ + /*}*/ + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + + // Matriplex version of: + // result.errors = ROOT::Math::Similarity(errorProp, outErr); + + //MultHelixProp can be optimized for polar 
coordinates, see GenMPlexOps.pl + /*MPlexLL temp;*/ + /*MultHelixProp (errorProp, outErr, temp);*/ + /*MultHelixPropTransp(errorProp, temp, outErr);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + + } #endif } @@ -370,7 +365,13 @@ __global__ void propagationForBuilding_kernel( const GPlexQI inChg, const float radius, GPlexLS outErr, GPlexLV outPar, const int N) { - propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, N); + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, n, N); + } } void propagationForBuilding_wrapper(const cudaStream_t& stream, diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index d64c069cdcabe..12ee16d38c38c 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -20,6 +20,6 @@ __device__ void propagationForBuilding_fn( const GPlexLS &inErr, const GPlexLV &inPar, const GPlexQI &inChg, const float radius, GPlexLS &outErr, GPlexLV &outPar, - const int N); + const int n, const int N); #endif // _PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu index 1e0f83671f1f7..8690076f405b0 100644 --- a/mkFit/reorganize_gplex.cu +++ b/mkFit/reorganize_gplex.cu @@ -49,9 +49,10 @@ __device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int __device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - GPlexQI &HitsIdx, const int hit_cnt, const int N) { + GPlexQI &HitsIdx, const int hit_cnt, + const int itrack, const int N) { /*int j = threadIdx.x + blockDim.x*blockIdx.x;*/ - int itrack = threadIdx.x + blockDim.x * blockIdx.x; + //int itrack = threadIdx.x + blockDim.x * blockIdx.x; if (itrack < N) { const char *varr = (char*) hits; @@ -69,8 +70,8 @@ __device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, __global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, GPlexQI HitsIdx, const int hit_cnt, const int N) { - - HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); + int itrack = threadIdx.x + blockDim.x * blockIdx.x; + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); } void HitToMs_wrapper(const cudaStream_t& stream, @@ -91,8 +92,9 @@ __device__ void InputTracksCU_fn (Track *tracks, GPlexLS &Err_iP, GPlexLV &Par_iP, GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, GPlexQI *HitsIdx, - const int beg, const int end, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; if (itrack < (end-beg) && itrack < N) { Track &trk = tracks[beg]; @@ -120,7 +122,8 @@ __global__ void InputTracksCU_kernel(Track *tracks, GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, GPlexQI *HitsIdx, int beg, int end, int N) { - InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); } @@ -145,8 +148,9 @@ __device__ void OutputTracksCU_fn(Track *tracks, const GPlexLS &Err_iP, const GPlexLV &Par_iP, const GPlexQI &Chg, const GPlexQF &Chi2, const GPlexQI &Label, 
const GPlexQI *HitsIdx, - const int beg, const int end, const int N) { - int itrack = threadIdx.x + blockDim.x*blockIdx.x; + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; if (itrack < (end-beg) && itrack < N) { Track &trk = tracks[beg]; @@ -164,14 +168,11 @@ __device__ void OutputTracksCU_fn(Track *tracks, tracks[i].setChi2(Chi2(itrack, 0, 0)); tracks[i].setLabel(Label(itrack, 0, 0)); - // FIXME: Config::nLayers -> NHits - // Needs to find a way to get the NHits - // either store it as a class member, or pass it as an argument tracks[i].resetHits(); /*int nGoodItIdx = 0;*/ for (int hi = 0; hi < Config::nLayers; ++hi) { tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.); - // We probably use registers instead of going for class members: + // FIXME: We probably want to use registers instead of going for class members: /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/ /*tracks[i].setHitIdx(hi, hit_idx);*/ /*if (hit_idx >= 0) {*/ @@ -188,7 +189,8 @@ __global__ void OutputTracksCU_kernel(Track *tracks, GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, GPlexQI *HitsIdx, int beg, int end, int N) { - OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); } diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h index e9b536f379c08..405daa9ab76a2 100644 --- a/mkFit/reorganize_gplex.h +++ b/mkFit/reorganize_gplex.h @@ -8,7 +8,8 @@ __device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, - GPlexQI &HitsIdx, const int hit_cnt, const int N); + GPlexQI &HitsIdx, const int hit_cnt, + const int itrack, const int N); __global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, @@ -23,13 +24,15 @@ __device__ void InputTracksCU_fn(Track *tracks, GPlexLS &Err_iP, GPlexLV &Par_iP, GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, GPlexQI *HitsIdx, - const int beg, const int end, const int N); + const int beg, const int end, + const int itrack, const int N); __device__ void OutputTracksCU_fn(Track *tracks, const GPlexLS &Err_iP, const GPlexLV &Par_iP, const GPlexQI &Chg, const GPlexQF &Chi2, const GPlexQI &Label, const GPlexQI *HitsIdx, - const int beg, const int end, const int N); + const int beg, const int end, + const int itrack, const int N); void InputTracksCU_wrapper(const cudaStream_t &stream, const EtaBinOfCandidatesCU &etaBin, From 3ac0904416020dd6bb409bb923e9aec445eb37d7 Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Fri, 19 Aug 2016 09:56:54 -0400 Subject: [PATCH 09/13] Adds check to see if gpu structures are correctly filled --- mkFit/buildtestMPlex.cc | 2 + mkFit/check_gpu_hit_structures.cu | 76 +++++++++++++++++++++++++++++++ mkFit/check_gpu_hit_structures.h | 8 ++++ mkFit/reorganize_gplex.cu | 6 +++ mkFit/reorganize_gplex.h | 3 ++ 5 files changed, 95 insertions(+) create mode 100644 mkFit/check_gpu_hit_structures.cu create mode 100644 mkFit/check_gpu_hit_structures.h diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index 96b7e1de09183..fa73fe318c33b 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -10,6 +10,7 @@ #ifdef USE_CUDA #include "FitterCU.h" #include "BuilderCU.h" +#include "check_gpu_hit_structures.h" #endif #include @@ -90,6 +91,7 @@ double runBuildingTestPlexBestHit(Event& ev) #endif 
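// (Editor's sketch, not part of this patch: check_event_of_hits_gpu, called
//  below and defined in the new check_gpu_hit_structures.cu, copies one hit
//  back from the device and compares it against the host copy. Its
//  compare_carrays helper still uses an absolute tolerance; the "should be
//  relative comparison" TODO could be resolved along these lines, using
//  std::abs/std::max from <cmath> and <algorithm>:)
//
//      bool nearly_equal(float a, float b,
//                        float rel_tol = 1e-3f, float abs_tol = 1e-6f)
//      {
//        // abs_tol guards the divide-by-(near-)zero case the TODO worries about
//        return std::abs(a - b)
//               <= std::max(abs_tol, rel_tol * std::max(std::abs(a), std::abs(b)));
//      }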
#if USE_CUDA + check_event_of_hits_gpu(builder.get_event_of_hits()); BuilderCU builder_cu(builder.get_event_of_hits(), builder.get_event(), event_of_cands); #endif diff --git a/mkFit/check_gpu_hit_structures.cu b/mkFit/check_gpu_hit_structures.cu new file mode 100644 index 0000000000000..7498444cfd567 --- /dev/null +++ b/mkFit/check_gpu_hit_structures.cu @@ -0,0 +1,76 @@ +#include "check_gpu_hit_structures.h" + +/*#include "reorganize_gplex.cu"*/ +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "reorganize_gplex.h" + +#include + + +__global__ void get_hit_pos_and_err(LayerOfHitsCU *layers, + int ilay, int hit_idx, float *pos, float *err, int pos_size, int err_size) { + if (threadIdx.x + blockDim.x * blockIdx.x == 0) { + LayerOfHitsCU &layer = layers[ilay]; + Hit &hit = layer.m_hits[hit_idx]; + float *posArray = get_posArray(hit); + float *errArray = get_errArray(hit); + for (int i = 0; i < pos_size; ++i) { + pos[i] = posArray[i]; + } + for (int i = 0; i < err_size; ++i) { + err[i] = errArray[i]; + } + } +} + + +void compare_carrays(const float *h_a, const float *d_a, + const float prec, const int n) +{ + for (int i = 0; i < n; ++i) { + // should be relative comparison, verify if div by 0 will happen + if (std::abs(h_a[i] - d_a[i]) > prec) { + std::cerr << i << " : " << h_a[i] << " / " << d_a[i] << std::endl; + } + } +} + + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits) +{ + EventOfHitsCU event_of_hits_cu; + event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + constexpr int pos_size = 3; + constexpr int err_size = 6; + + float *d_pos, *d_err; + float pos[pos_size], err[err_size]; + + cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); + cudaMalloc((void**)&d_err, err_size*sizeof(float)); + + dim3 grid(1, 1, 1); + dim3 block(1, 1, 1); + + int ilay = 2; + int hit_idx = 3; + + get_hit_pos_and_err <<< grid, block >>> + (event_of_hits_cu.m_layers_of_hits, ilay, hit_idx, d_pos, d_err, pos_size, err_size); + + cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); + + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].posArray(), + pos, 1e-3, pos_size); + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].errArray(), + err, 1e-3, err_size); + + cudaFree(d_pos); + cudaFree(d_err); + + event_of_hits_cu.deallocGPU(); +} diff --git a/mkFit/check_gpu_hit_structures.h b/mkFit/check_gpu_hit_structures.h new file mode 100644 index 0000000000000..b44d4edb7aa0d --- /dev/null +++ b/mkFit/check_gpu_hit_structures.h @@ -0,0 +1,8 @@ +#ifndef CHECK_GPU_HIT_STRUCTURE_H +#define CHECK_GPU_HIT_STRUCTURE_H + +#include "HitStructures.h" + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits); + +#endif /* ifndef CHECK_GPU_HIT_STRUCTURE_H */ diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu index 8690076f405b0..b9d7beaafe0f3 100644 --- a/mkFit/reorganize_gplex.cu +++ b/mkFit/reorganize_gplex.cu @@ -6,6 +6,12 @@ #include "Track.h" #include "gpu_utils.h" +__device__ float *get_posArray(Hit &hit) { + return hit.posArrayCU(); +} +__device__ float *get_errArray(Hit &hit) { + return hit.errArrayCU(); +} template __device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h index 405daa9ab76a2..4b89b9aee28dc 100644 --- a/mkFit/reorganize_gplex.h +++ b/mkFit/reorganize_gplex.h @@ -5,6 +5,9 @@ #include "Hit.h" #include 
"HitStructuresCU.h" +__device__ float *get_posArray(Hit &hit); +__device__ float *get_errArray(Hit &hit); + __device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, From 77e516eef33b11fd54695f08554127f23ca8e209 Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Mon, 22 Aug 2016 14:19:50 -0400 Subject: [PATCH 10/13] GPU bestHit: shift to polar coordinates --- Config.h | 2 +- mkFit/kalmanUpdater_kernels.cu | 27 +++-- mkFit/propagation_kernels.cu | 214 ++++++++++++++++++++++++++++++++- 3 files changed, 229 insertions(+), 14 deletions(-) diff --git a/Config.h b/Config.h index b1bc303ae596e..771c7ef4f5b9e 100644 --- a/Config.h +++ b/Config.h @@ -5,7 +5,7 @@ #include // won't compile on clang gcc for mac OS w/o this! //#define PRINTOUTS_FOR_PLOTS -//#define POLCOORD +#define POLCOORD namespace Config { diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index ed099158726f4..1e63e2714de68 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -418,23 +418,25 @@ __device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, /// MultResidualsAdd ////////////////////////////////////////////////////////// __device__ void multResidualsAdd_fn( - const float* reg_a, - const float* __restrict__ b, const size_t bN, - const float* __restrict__ c, const size_t cN, - float *d, const size_t dN, + const GPlexRegLH ®_a, + const GPlexLV __restrict__ &B, + const GPlexReg2V &c, + GPlexLV &D, const int N, const int n) { // a -> kalmanGain - /*int i = threadIdx.x;*/ - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + using T = float; + const T *b = B.ptr; int bN = B.stride; + T *d = D.ptr; int dN = D.stride; - /*for (int z = 0; z < (N-1)/gridDim.x +1; z++) {*/ - /*n += z*gridDim.x;*/ if (n < N) { + // TODO: This has changed with the introduction of polar coordiantes + // (x0, x1, x2) are not used anymore (see commented values) + // Clean this function's code, once it is better understood. // manually substract into local vars -- 3 of them - const float x0 = c[0 * cN + n] - b[0 * bN + n]; - const float x1 = c[1 * cN + n] - b[1 * bN + n]; - const float x2 = c[2 * cN + n] - b[2 * bN + n]; + const float x0 = c[0]; // - b[0 * bN + n]; + const float x1 = c[1]; //- b[1 * bN + n]; + const float x2 = 0; //c[2] - b[2 * bN + n]; // generate loop (can also write it manually this time, it's not much) // WARNING: highly numerically sensitive expressions. 
@@ -596,7 +598,8 @@ __device__ void kalmanUpdate_fn( #ifdef POLCOORD // FIXME: assuming no polcoord for now - MultResidualsAdd(K.arr, propPar, res_loc, outPar);// propPar_pol is now the updated parameters in "polar" coordinates + //MultResidualsAdd(K.arr, propPar, res_loc, outPar);// propPar_pol is now the updated parameters in "polar" coordinates + multResidualsAdd_fn(K, par_iP, res_loc, par_iC, N, n);// propPar_pol is now the updated parameters in "polar" coordinates GPlexRegLL tempLL; #else /*MultResidualsAdd(K, propPar_pol, res_loc, propPar_pol);// propPar_pol is now the updated parameters in "polar" coordinates*/ diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index d80b87f11a7d2..64438ade75374 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -127,6 +127,216 @@ __device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, } } +__device__ void helixAtRFromIterativePolar_fn(const GPlexLV& inPar, + const GPlexQI& inChg, GPlexLV& outPar, const GPlexReg& msRad, + GPlexReg& errorProp, const int N, const int n) +{ + errorProp.SetVal(0); + +#pragma simd + //for (int n = 0; n < NN; ++n) + if (n < N) + { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float r = msRad(n, 0, 0); + float r0 = hipo(inPar(n, 0, 0), inPar(n, 1, 0)); + + // if (std::abs(r-r0)<0.0001f) { + // dprint("distance less than 1mum, skip"); + // continue; + // } + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float zin = inPar(n, 2, 0); + const float ipt = inPar(n, 3, 0); + const float phiin = inPar(n, 4, 0); + const float theta = inPar(n, 5, 0); + + dprint_np(n, std::endl << "input parameters" + << " inPar(n, 0, 0)=" << std::setprecision(9) << inPar(n, 0, 0) + << " inPar(n, 1, 0)=" << std::setprecision(9) << inPar(n, 1, 0) + << " inPar(n, 2, 0)=" << std::setprecision(9) << inPar(n, 2, 0) + << " inPar(n, 3, 0)=" << std::setprecision(9) << inPar(n, 3, 0) + << " inPar(n, 4, 0)=" << std::setprecision(9) << inPar(n, 4, 0) + << " inPar(n, 5, 0)=" << std::setprecision(9) << inPar(n, 5, 0) + ); + + const float kinv = 1.f/k; + const float pt = 1.f/ipt; + + float D = 0., cosa = 0., sina = 0., id = 0.; + //no trig approx here, phi can be large + float cosPorT = std::cos(phiin), sinPorT = std::sin(phiin); + float pxin = cosPorT*pt; + float pyin = sinPorT*pt; + + dprint_np(n, std::endl << "k=" << std::setprecision(9) << k << " pxin=" << std::setprecision(9) << pxin << " pyin=" + << std::setprecision(9) << pyin << " cosPorT=" << std::setprecision(9) << cosPorT + << " sinPorT=" << std::setprecision(9) << sinPorT << " pt=" << std::setprecision(9) << pt); + + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dDdx = r0 > 0.f ? -xin/r0 : 0.f; + float dDdy = r0 > 0.f ? 
-yin/r0 : 0.f; + float dDdipt = 0.; + float dDdphi = 0.; + + for (int i = 0; i < Config::Niter; ++i) + { + dprint_np(n, std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pt*std::tan(theta) << " q=" << inChg(n, 0, 0)); + + //compute distance and path for the current iteration + r0 = hipo(outPar(n, 0, 0), outPar(n, 1, 0)); + id = (r-r0); + D+=id; + if (Config::useTrigApprox) { + sincos4(id*ipt*kinv, sina, cosa); + } else { + cosa=std::cos(id*ipt*kinv); + sina=std::sin(id*ipt*kinv); + } + + dprint_np(n, std::endl << "r=" << std::setprecision(9) << r << " r0=" << std::setprecision(9) << r0 + << " id=" << std::setprecision(9) << id << " cosa=" << cosa << " sina=" << sina); + + //update derivatives on total distance + if (i+1 != Config::Niter) { + + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float oor0 = (r0>0.f && std::abs(r-r0)<0.0001f) ? 1.f/r0 : 0.f; + + const float dadipt = id*kinv; + + const float dadx = -x*ipt*kinv*oor0; + const float dady = -y*ipt*kinv*oor0; + + const float pxca = pxin*cosa; + const float pxsa = pxin*sina; + const float pyca = pyin*cosa; + const float pysa = pyin*sina; + + float tmp = k*dadx; + dDdx -= ( x*(1.f + tmp*(pxca - pysa)) + y*tmp*(pyca + pxsa) )*oor0; + tmp = k*dady; + dDdy -= ( x*tmp*(pxca - pysa) + y*(1.f + tmp*(pyca + pxsa)) )*oor0; + //now r0 depends on ipt and phi as well + tmp = dadipt*ipt; + dDdipt -= k*( x*(pxca*tmp - pysa*tmp - pyca - pxsa + pyin) + + y*(pyca*tmp + pxsa*tmp - pysa + pxca - pxin))*pt*oor0; + dDdphi += k*( x*(pysa - pxin + pxca) - y*(pxsa - pyin + pyca))*oor0; + } + + //update parameters + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(pxin*sina - pyin*(1.f-cosa)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(pyin*sina + pxin*(1.f-cosa)); + const float pxinold = pxin;//copy before overwriting + pxin = pxin*cosa - pyin*sina; + pyin = pyin*cosa + pxinold*sina; + + dprint_np(n, std::endl << "outPar(n, 0, 0)=" << outPar(n, 0, 0) << " outPar(n, 1, 0)=" << outPar(n, 1, 0) + << " pxin=" << pxin << " pyin=" << pyin); + } + + const float alpha = D*ipt*kinv; + const float dadx = dDdx*ipt*kinv; + const float dady = dDdy*ipt*kinv; + const float dadipt = (ipt*dDdipt + D)*kinv; + const float dadphi = dDdphi*ipt*kinv; + + if (Config::useTrigApprox) { + sincos4(alpha, sina, cosa); + } else { + cosa=std::cos(alpha); + sina=std::sin(alpha); + } + + errorProp(n,0,0) = 1.f+k*dadx*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,1) = k*dady*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,2) = 0.f; + errorProp(n,0,3) = k*(cosPorT*(ipt*dadipt*cosa-sina)+sinPorT*((1.f-cosa)-ipt*dadipt*sina))*pt*pt; + errorProp(n,0,4) = k*(cosPorT*dadphi*cosa - sinPorT*dadphi*sina - sinPorT*sina + cosPorT*cosa - cosPorT)*pt; + errorProp(n,0,5) = 0.f; + + errorProp(n,1,0) = k*dadx*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,1) = 1.f+k*dady*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,2) = 0.f; + errorProp(n,1,3) = k*(sinPorT*(ipt*dadipt*cosa-sina)+cosPorT*(ipt*dadipt*sina-(1.f-cosa)))*pt*pt; + errorProp(n,1,4) = k*(sinPorT*dadphi*cosa + cosPorT*dadphi*sina + sinPorT*cosa + cosPorT*sina - sinPorT)*pt; + errorProp(n,1,5) = 0.f; + + //no trig approx here, theta can be large + cosPorT=std::cos(theta); + sinPorT=std::sin(theta); + //redefine sinPorT as 1./sinPorT to reduce the number of temporaries + sinPorT = 1.f/sinPorT; + + outPar(n, 2, 0) = inPar(n, 2, 0) + k*alpha*cosPorT*pt*sinPorT; + + errorProp(n,2,0) = 
k*cosPorT*dadx*pt*sinPorT; + errorProp(n,2,1) = k*cosPorT*dady*pt*sinPorT; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*cosPorT*(ipt*dadipt-alpha)*pt*pt*sinPorT; + errorProp(n,2,4) = k*dadphi*cosPorT*pt*sinPorT; + errorProp(n,2,5) =-k*alpha*pt*sinPorT*sinPorT; + + outPar(n, 3, 0) = ipt; + + errorProp(n,3,0) = 0.f; + errorProp(n,3,1) = 0.f; + errorProp(n,3,2) = 0.f; + errorProp(n,3,3) = 1.f; + errorProp(n,3,4) = 0.f; + errorProp(n,3,5) = 0.f; + + outPar(n, 4, 0) = inPar(n, 4, 0)+alpha; + + errorProp(n,4,0) = dadx; + errorProp(n,4,1) = dady; + errorProp(n,4,2) = 0.f; + errorProp(n,4,3) = dadipt; + errorProp(n,4,4) = 1.f+dadphi; + errorProp(n,4,5) = 0.f; + + outPar(n, 5, 0) = theta; + + errorProp(n,5,0) = 0.f; + errorProp(n,5,1) = 0.f; + errorProp(n,5,2) = 0.f; + errorProp(n,5,3) = 0.f; + errorProp(n,5,4) = 0.f; + errorProp(n,5,5) = 1.f; + + dprint_np(n, "propagation end, dump parameters" << std::endl + << "pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << std::endl + << "mom = " << std::cos(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << std::sin(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << 1./(outPar(n, 3, 0)*tan(outPar(n, 5, 0))) + << " r=" << std::sqrt( outPar(n, 0, 0)*outPar(n, 0, 0) + outPar(n, 1, 0)*outPar(n, 1, 0) ) << " pT=" << 1./std::abs(outPar(n, 3, 0)) << std::endl); + +#ifdef DEBUG + if (n < N_proc) { + dmutex_guard; + std::cout << n << ": jacobian" << std::endl; + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,0,0),errorProp(n,0,1),errorProp(n,0,2),errorProp(n,0,3),errorProp(n,0,4),errorProp(n,0,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,1,0),errorProp(n,1,1),errorProp(n,1,2),errorProp(n,1,3),errorProp(n,1,4),errorProp(n,1,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,2,0),errorProp(n,2,1),errorProp(n,2,2),errorProp(n,2,3),errorProp(n,2,4),errorProp(n,2,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,3,0),errorProp(n,3,1),errorProp(n,3,2),errorProp(n,3,3),errorProp(n,3,4),errorProp(n,3,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,4,0),errorProp(n,4,1),errorProp(n,4,2),errorProp(n,4,3),errorProp(n,4,4),errorProp(n,4,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,5,0),errorProp(n,5,1),errorProp(n,5,2),errorProp(n,5,3),errorProp(n,5,4),errorProp(n,5,5)); + } +#endif + } +} + + #include "PropagationMPlex.icc" __device__ @@ -276,6 +486,7 @@ __global__ void propagation_kernel( #ifdef POLCOORD // FIXME: port me // helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp); + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #else helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #endif @@ -329,7 +540,8 @@ __device__ void propagationForBuilding_fn( #ifdef POLCOORD // TODO: port me - helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); + //helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #else helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #endif From 2589d8f873677353297c5e97a947bad61157c8e7 Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Tue, 23 Aug 2016 17:06:26 -0400 Subject: [PATCH 11/13] Starts remodeling gpu fitter to reorganize on the gpu --- mkFit/BuilderCU.cu | 2 +- mkFit/FitterCU-imp.h | 13 +++++--- mkFit/fittestMPlex.cc | 2 +- mkFit/fittracks_kernels.cu | 45 ++++++++++++++++++++++++++ mkFit/fittracks_kernels.h | 13 ++++++++ mkFit/mkFit.cc | 7 ++++ 
mkFit/propagation_kernels.cu | 63 ++++++++++++++++++++---------------- mkFit/propagation_kernels.h | 12 +++++++ 8 files changed, 122 insertions(+), 35 deletions(-) create mode 100644 mkFit/fittracks_kernels.cu create mode 100644 mkFit/fittracks_kernels.h diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu index 389b4c51d8734..0be1399129d4a 100644 --- a/mkFit/BuilderCU.cu +++ b/mkFit/BuilderCU.cu @@ -37,7 +37,7 @@ BuilderCU::~BuilderCU() { void BuilderCU::setUp(const EventOfHits& event_of_hits, const Event* event, const EventOfCandidates& event_of_cands) { - int gplex_size = 1 << 14; + int gplex_size = 1 << 15; cuFitter = new FitterCU (gplex_size); cuFitter->allocateDevice(); cuFitter->allocate_extra_addBestHit(); diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index 1b2e5f3150adf..e235c644bfdbb 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -2,6 +2,7 @@ #include "Config.h" #include "GeometryCU.h" #include "reorganize_gplex.h" +#include "fittracks_kernels.h" template void FitterCU::setNumberTracks(const idx_t Ntracks) { @@ -94,7 +95,6 @@ void FitterCU::kalmanUpdate_standalone( template void FitterCU::propagationMerged(const int hit_idx) { propagation_wrapper(stream, d_msPar[hit_idx], d_par_iC, d_inChg, - //d_par_iP, d_Err_iC, d_Err_iP, N); // TODO: Check outErr/errorProp d_par_iP, d_errorProp, d_Err_iP, N); } @@ -238,15 +238,18 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); - propagationMerged(hi); - kalmanUpdateMerged(hi); + //propagationMerged(hi); + //kalmanUpdateMerged(hi); + fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, + d_Err_iC, d_par_iC, d_errorProp, d_inChg, + hi, N); } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&etime, start, stop); - std::cerr << "CUDA etime: " << etime << " ms.\n"; - std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + //std::cerr << "CUDA etime: " << etime << " ms.\n"; + //std::cerr << "Total reorg: " << total_reorg << " ms.\n"; d_par_iC.copyAsyncToHost(stream, par_iC); d_Err_iC.copyAsyncToHost(stream, err_iC); diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 8044d4d3d2a19..066fe87f50167 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -296,7 +296,7 @@ double runFittingTestPlexGPU(FitterCU &cuFitter, double time_output = dtime(); mkfp->OutputFittedTracks(rectracks, itrack, end); - std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl; + //std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl; } time = dtime() - time; diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu new file mode 100644 index 0000000000000..d77a281cc5200 --- /dev/null +++ b/mkFit/fittracks_kernels.cu @@ -0,0 +1,45 @@ +#include "fittracks_kernels.h" + +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +constexpr int BLOCK_SIZE_X = 256; + +__global__ void fittracks_kernel( + GPlexLV par_iP, GPlexLS Err_iP, + GPlexHV msPar, GPlexHS msErr, + GPlexLV par_iC, GPlexLS Err_iC, + GPlexLL errorProp, GPlexQI inChg, + int N) +{ + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + + propagation_fn(msPar, par_iC, inChg, par_iP, errorProp, Err_iP, n, N); + kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N); + } +} + +void fittracks_wrapper(cudaStream_t &stream, + 
GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int hit_idx, const int N) +{ + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + fittracks_kernel <<< grid, block, 0, stream >>> + (par_iP, Err_iP, + msPar[hit_idx], msErr[hit_idx], + par_iC, Err_iC, + errorProp, inChg, + N); + /*kalmanUpdate_wrapper(stream, Err_iP, msErr[hit_idx],*/ + /*par_iP, msPar[hit_idx], par_iC, Err_iC, N);*/ +} diff --git a/mkFit/fittracks_kernels.h b/mkFit/fittracks_kernels.h new file mode 100644 index 0000000000000..2486b80a51e3b --- /dev/null +++ b/mkFit/fittracks_kernels.h @@ -0,0 +1,13 @@ +#ifndef FITTRACKS_KERNELS_H_G3FDJYTX +#define FITTRACKS_KERNELS_H_G3FDJYTX + +#include "GPlex.h" + +void fittracks_wrapper(cudaStream_t &stream, + GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int hit_idx, const int N); + +#endif /* end of include guard: FITTRACKS_KERNELS_H_G3FDJYTX */ diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index dc5bda99f548f..88883d3ad521c 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -274,7 +274,14 @@ void test_standard() for (int b = 0; b < Config::finderReportBestOutOfN; ++b) { +#ifndef USE_CUDA t_cur[0] = (g_run_fit_std) ? runFittingTestPlex(ev, plex_tracks) : 0; +#else + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + t_cur[0] = (g_run_fit_std) ? runFittingTestPlexGPU(cuFitter, ev, plex_tracks) : 0; + cuFitter.freeDevice(); +#endif t_cur[1] = (g_run_build_all || g_run_build_bh) ? runBuildingTestPlexBestHit(ev) : 0; t_cur[2] = (g_run_build_all || g_run_build_std) ? runBuildingTestPlex(ev, ev_tmp) : 0; t_cur[3] = (g_run_build_all || g_run_build_ce) ? runBuildingTestPlexCloneEngine(ev, ev_tmp) : 0; diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 5b1ce07e694ca..a9debeb83308d 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -454,44 +454,53 @@ __device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) { } } + // PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments -__global__ void propagation_kernel( - GPlexHV msPar, - GPlexLV inPar, GPlexQI inChg, - GPlexLV outPar, GPlexLL errorProp, - GPlexLS outErr, int N) { +__device__ void propagation_fn( + GPlexHV &msPar, + GPlexLV &inPar, GPlexQI &inChg, + GPlexLV &outPar, GPlexLL &errorProp, + GPlexLS &outErr, int n, int N) { - int grid_width = blockDim.x * gridDim.x; - int n = threadIdx.x + blockIdx.x * blockDim.x; GPlexRegQF msRad_reg; // Using registers instead of shared memory is ~ 30% faster. GPlexRegLL errorProp_reg; // If there is more matrices than max_blocks_x * BLOCK_SIZE_X - for (int z = 0; z < (N-1)/grid_width +1; z++) { - n += z*grid_width; - if (n < N) { + if (n < N) { #if 0 - computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); - if (Config::doIterative) { - helixAtRFromIterative_fn(inPar, inPar_stride, - inChg, outPar, outPar_stride, msRad_reg, - errorProp_reg, N, n); - } else { - // TODO: not ported for now. 
Assuming Config::doIterative - // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); - } - similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); + computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); + if (Config::doIterative) { + helixAtRFromIterative_fn(inPar, inPar_stride, + inChg, outPar, outPar_stride, msRad_reg, + errorProp_reg, N, n); + } else { + // TODO: not ported for now. Assuming Config::doIterative + // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); + } + similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); #endif - computeMsRad_fn(msPar, msRad_reg, N, n); + computeMsRad_fn(msPar, msRad_reg, N, n); #ifdef CCSCOORD - // FIXME: port me - // helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp); helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #else - helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #endif - similarity_fn(errorProp_reg, outErr, N, n); - } + similarity_fn(errorProp_reg, outErr, N, n); + } +} + + +__global__ void propagation_kernel( + GPlexHV msPar, + GPlexLV inPar, GPlexQI inChg, + GPlexLV outPar, GPlexLL errorProp, + GPlexLS outErr, int N) +{ + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + propagation_fn(msPar, inPar, inChg, outPar, errorProp, outErr, n, N); } } @@ -539,8 +548,6 @@ __device__ void propagationForBuilding_fn( /*if (n == 0) printf("gpu r = %f\n", radius);*/ #ifdef CCSCOORD - // TODO: port me - //helixAtRFromIterativePolar(inPar, inChg, outPar, msRad, errorProp, N_proc); helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); #else helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index 12ee16d38c38c..bbf4a80c07196 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -3,6 +3,12 @@ #include "GPlex.h" +__device__ void propagation_fn( + GPlexHV &msPar, + GPlexLV &inPar, GPlexQI &inChg, + GPlexLV &outPar, GPlexLL &errorProp, + GPlexLS &outErr, int n, int N); + void propagation_wrapper(const cudaStream_t& stream, GPlexHV& msPar, GPlexLV& inPar, GPlexQI& inChg, @@ -16,6 +22,12 @@ void propagationForBuilding_wrapper(const cudaStream_t& stream, GPlexLS& outErr, GPlexLV& outPar, const int N); +__device__ void propagation_fn( + GPlexHV &msPar, + GPlexLV &inPar, GPlexQI &inChg, + GPlexLV &outPar, GPlexLL &errorProp, + GPlexLS &outErr, int n, int N); + __device__ void propagationForBuilding_fn( const GPlexLS &inErr, const GPlexLV &inPar, const GPlexQI &inChg, const float radius, From c0f5284e1ef477bb425003306ea230fc04ba95db Mon Sep 17 00:00:00 2001 From: Matthieu Lefebvre Date: Mon, 29 Aug 2016 09:03:26 -0400 Subject: [PATCH 12/13] Streams for gpu best_hit + gpu reorg for fitting Cleans references to FitterCU in MkBuilder Adds tbb parallel loop for besthit on the gpu Reorganizes fittracks on the GPU Slow down the process on a K20, in comparison to overlapped CPU reorg / GPU computations. However, if the tracks come from building, they are already reorganized to GPlex. 
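A sketch of the pattern this message describes, not the patch's exact code: a TBB loop over chunks of tracks, each chunk owning its own FitterCU and therefore its own CUDA stream, so transfers and kernels from different chunks can overlap. The method names follow the FitterCU interface used elsewhere in the series; the chunk size and the float template argument are assumptions:

#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

void fit_tracks_gpu_tbb(Track *tracks_cu, int num_tracks,
                        EventOfHitsCU &hits_cu, int n_hits)
{
  tbb::parallel_for(tbb::blocked_range<int>(0, num_tracks, 16 * 1024),
    [&](const tbb::blocked_range<int> &r) {
      // One fitter (hence one cudaStream_t) per chunk: copies and kernels
      // issued by different chunks can overlap on the device.
      FitterCU<float> fitter(r.end() - r.begin());
      fitter.allocateDevice();
      fitter.createStream();
      fitter.FitTracks(tracks_cu + r.begin(), r.end() - r.begin(),
                       hits_cu, n_hits);
      fitter.destroyStream();
      fitter.freeDevice();
    });
}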
--- Config.h | 4 +- Math/SVector.h | 4 +- mkFit/BuilderCU.cu | 13 +-- mkFit/FitterCU-imp.h | 219 ++++++++++++----------------------- mkFit/FitterCU.h | 8 +- mkFit/HitStructuresCU.cu | 37 ++++++ mkFit/HitStructuresCU.h | 4 + mkFit/MkBuilder.cc | 77 ------------ mkFit/MkBuilder.h | 10 -- mkFit/buildtestMPlex.cc | 74 +++++++++++- mkFit/buildtestMPlex.h | 4 + mkFit/fittestMPlex.cc | 66 ++++------- mkFit/fittracks_kernels.cu | 27 +++-- mkFit/gpu_utils.cu | 4 + mkFit/gpu_utils.h | 4 +- mkFit/mkFit.cc | 25 ++-- mkFit/propagation_kernels.cu | 34 ++++-- mkFit/propagation_kernels.h | 18 +-- mkFit/reorganize_gplex.cu | 123 +++++++++++++++++--- mkFit/reorganize_gplex.h | 22 +++- 20 files changed, 414 insertions(+), 363 deletions(-) diff --git a/Config.h b/Config.h index fe0f7d75a3a80..a9620e3bab2b0 100644 --- a/Config.h +++ b/Config.h @@ -205,9 +205,9 @@ namespace Config #ifdef __MIC__ #define MPT_SIZE 16 #elif defined USE_CUDA - #define MPT_SIZE 8 + #define MPT_SIZE 8 #else - #define MPT_SIZE 8 + #define MPT_SIZE 8 #endif #endif diff --git a/Math/SVector.h b/Math/SVector.h index a81c019499f8b..1186867a7679c 100644 --- a/Math/SVector.h +++ b/Math/SVector.h @@ -185,9 +185,9 @@ class SVector { const T* Array() const; /// return non-const pointer to internal array T* Array(); -//#ifdef USE_CUDA +#ifdef __CUDACC__ T* ArrayCU(); -//#endif +#endif /** @name --- STL-like interface --- */ diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu index 0be1399129d4a..6d6ee60863365 100644 --- a/mkFit/BuilderCU.cu +++ b/mkFit/BuilderCU.cu @@ -21,15 +21,6 @@ BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, BuilderCU::~BuilderCU() { - /*event_of_cands_cu.deallocGPU();*/ - - /*geom_cu.deallocate();*/ - /*event_of_hits_cu.deallocGPU();*/ - - /*cuFitter->destroyStream();*/ - /*cuFitter->free_extra_addBestHit();*/ - /*cuFitter->freeDevice();*/ - /*delete cuFitter;*/ tearDown(); } @@ -37,7 +28,7 @@ BuilderCU::~BuilderCU() { void BuilderCU::setUp(const EventOfHits& event_of_hits, const Event* event, const EventOfCandidates& event_of_cands) { - int gplex_size = 1 << 15; + int gplex_size = 1 << 14; cuFitter = new FitterCU (gplex_size); cuFitter->allocateDevice(); cuFitter->allocate_extra_addBestHit(); @@ -79,7 +70,7 @@ void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); cudaStreamSynchronize(cuFitter->get_stream()); - cudaCheckError(); + //cudaCheckError(); /*size_t free_mem, total_mem;*/ /*cudaMemGetInfo(&free_mem, &total_mem);*/ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index e235c644bfdbb..cae975636a6df 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -4,6 +4,8 @@ #include "reorganize_gplex.h" #include "fittracks_kernels.h" +#include "Track.h" + template void FitterCU::setNumberTracks(const idx_t Ntracks) { N = Ntracks; @@ -94,8 +96,8 @@ void FitterCU::kalmanUpdate_standalone( template void FitterCU::propagationMerged(const int hit_idx) { - propagation_wrapper(stream, d_msPar[hit_idx], d_par_iC, d_inChg, - d_par_iP, d_errorProp, d_Err_iP, N); + propagation_wrapper(stream, d_msPar[hit_idx], d_Err_iC, d_par_iC, d_inChg, + d_par_iP, d_Err_iP, N); } // FIXME: Temporary. 
Separate allocations / transfers @@ -168,14 +170,14 @@ void FitterCU::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, } -#if 1 template void FitterCU::propagateTracksToR(const float radius, const int N) { propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, radius, d_Err_iP, d_par_iP, N); } -#endif + +#if 1 template void FitterCU::propagateTracksToR_standalone(const float radius, const int N, const MPlexLS& Err_iC, const MPlexLV& par_iC, const MPlexQI& inChg, @@ -188,6 +190,48 @@ void FitterCU::propagateTracksToR_standalone(const float radius, const int N, d_par_iP.copyAsyncToHost(stream, Par_iP); } +template +void FitterCU::FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int Nhits) +{ + float etime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start, 0); + + for (int itrack = 0; itrack < num_tracks; itrack += Nalloc) + { + int beg = itrack; + int end = std::min(itrack + Nalloc, num_tracks); + setNumberTracks(end-beg); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, events_of_hits_cu, + d_Err_iC, d_par_iC, + d_msErr_arr, d_msPar_arr, + d_inChg, d_Chi2, d_Label, + d_HitsIdx_arr, beg, end, false, N); + fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_errorProp, d_inChg, + Nhits, N); + OutputFittedTracksCU_wrapper(stream, tracks_cu, + d_Err_iC, d_par_iC, + d_inChg, d_Chi2, d_Label, + beg, end, N); + } + + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + + cudaEventElapsedTime(&etime, start, stop); + std::cerr << "CUDA etime: " << etime << " ms.\n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); +} +#else template void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, MPlexHV* msPar, MPlexHS* msErr, int Nhits, @@ -205,9 +249,18 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, setNumberTracks(end-beg); - d_inChg.copyAsyncFromHost(stream, Chg); - d_par_iC.copyAsyncFromHost(stream, par_iC); - d_Err_iC.copyAsyncFromHost(stream, err_iC); + Track *tracks_cu; + cudaMalloc((void**)&tracks_cu, tracks.size()*sizeof(Track)); + cudaMemcpy(tracks_cu, &tracks[0], tracks.size()*sizeof(Track), cudaMemcpyHostToDevice); + allocate_extra_addBestHit(); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, d_Err_iC, d_par_iC, d_inChg, + d_Chi2, d_Label, d_HitsIdx_arr, beg, end, false, N); + + + //d_inChg.copyAsyncFromHost(stream, Chg); + //d_par_iC.copyAsyncFromHost(stream, par_iC); + //d_Err_iC.copyAsyncFromHost(stream, err_iC); cudaEventRecord(start, 0); @@ -219,6 +272,7 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, d_par_iP.copyAsyncFromDevice(stream, d_par_iC); d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); +#if 0 double time_input = dtime(); int itrack; omp_set_num_threads(Config::numThreadsReorg); @@ -234,15 +288,23 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, msPar[hi].CopyIn(itrack, hit.posArray()); } total_reorg += (dtime() - time_input)*1e3; +#endif d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); - //propagationMerged(hi); - //kalmanUpdateMerged(hi); - fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, - d_Err_iC, d_par_iC, d_errorProp, d_inChg, - hi, N); + propagationMerged(hi); + //MPlexLS err_iP; + //MPlexLV par_iP; + //d_par_iC.copyAsyncToHost(stream, par_iC); + //d_par_iP.copyAsyncToHost(stream, par_iP); + //d_Err_iP.copyAsyncToHost(stream, err_iP); + 
//propagation_wrapper(stream, d_msPar[hi], d_Err_iC, d_par_iC, d_inChg, + //d_par_iP, d_errorProp, d_Err_iP, N); + kalmanUpdateMerged(hi); + //fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, + //d_Err_iC, d_par_iC, d_errorProp, d_inChg, + //hi, N); } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); @@ -250,6 +312,9 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, cudaEventElapsedTime(&etime, start, stop); //std::cerr << "CUDA etime: " << etime << " ms.\n"; //std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + + free_extra_addBestHit(); + cudaFree(tracks_cu); d_par_iC.copyAsyncToHost(stream, par_iC); d_Err_iC.copyAsyncToHost(stream, err_iC); @@ -261,134 +326,4 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, cudaEventDestroy(start); cudaEventDestroy(stop); } - -/////////////////////////////////////////////////////////////////////////////// -// Backup function: temps that have been deactivated -/////////////////////////////////////////////////////////////////////////////// - -#if 0 -template -void FitterCU::computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, - const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, - float *minChi2, int *bestHit, - LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF &outChi2, int maxSize2, int hit_idx, int NN) { - - float *d_minChi2; - int *d_bestHit; - cudaMalloc((void**)&d_minChi2, NN*sizeof(float)); - cudaMalloc((void**)&d_bestHit, NN*sizeof(int)); - - cudaMemcpyAsync(d_minChi2, minChi2, NN*sizeof(float), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_bestHit, bestHit, NN*sizeof(int), cudaMemcpyHostToDevice, stream); - - cudaMemset(d_bestHit, -1, NN*sizeof(int)); - fill_array_cu(d_minChi2, NN, 15.f); - - d_Err_iP.copyAsyncFromHost(stream, psErr); - d_par_iP.copyAsyncFromHost(stream, propPar); - d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); - d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); - //d_XHitPos.copyAsyncFromHost(stream, XHitPos); - //d_XHitSize.copyAsyncFromHost(stream, XHitSize); - //d_XHitArr.copyAsyncFromHost(stream, XHitArr); - - //cudaMemcpy2DAsync(d_Chi2, NN*sizeof(float), Chi2.fArray, NN*sizeof(float), - //NN*sizeof(float), 1, cudaMemcpyHostToDevice, stream); - //cudaMemcpy2DAsync(d_HitsIdx, NN*sizeof(int), HitsIdx.fArray, NN*sizeof(int), - //NN*sizeof(int), 1, cudaMemcpyHostToDevice, stream); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - - //selectHitRanges_wrapper(stream, d_bunch, d_XHitPos, d_XHitSize, - //d_Err_iP, d_par_iP, N); - - int maxSize = getMaxNumHits_wrapper(d_XHitSize, N); - //bestHit_wrapper(stream, d_bunch, d_XHitPos, - //d_Err_iP, d_msErr, d_msPar, d_par_iP, d_outChi2, - //d_Chi2, d_HitsIdx, - //maxSize2, N); - for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) - { - //// TODO: add CMSGeom - //if (Config::useCMSGeom) { - ////propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); - //throw std::runtime_error("useCMSGeom not implemented yet for GPU"); - //} else {} - HitToMs_wrapper(stream, d_msErr[hit_idx], d_msPar[hit_idx], d_layer, - d_XHitSize, d_XHitArr, d_HitsIdx[hit_idx], hit_cnt, NN); - - computeChi2_wrapper(stream, d_Err_iP, d_msErr[hit_idx], //d_resErr, - d_msPar[hit_idx], d_par_iP, d_outChi2, NN); - - getNewBestHitChi2_wrapper(stream, d_XHitSize, d_XHitArr, d_outChi2, d_minChi2, d_bestHit, hit_cnt, NN); - - //cudaStreamSynchronize(stream); - //cudaCheckError(); - } - updateTracksWithBestHit_wrapper(stream, d_layer, 
d_minChi2, d_bestHit, - d_msErr[hit_idx], d_msPar[hit_idx], d_par_iP, d_Chi2, d_HitsIdx[hit_idx], N); - - //d_outChi2.copyAsyncToHost(stream, outChi2); - //cudaMemcpyAsync(minChi2, d_minChi2, NN*sizeof(float), cudaMemcpyDeviceToHost, stream); - //cudaMemcpyAsync(bestHit, d_bestHit, NN*sizeof(int), cudaMemcpyDeviceToHost, stream); - - //cudaMemcpy2DAsync(Chi2.fArray, NN*sizeof(float), d_Chi2, NN*sizeof(float), - // NN*sizeof(float), 1, cudaMemcpyDeviceToHost, stream); - //cudaMemcpy2DAsync(HitsIdx.fArray, NN*sizeof(int), d_HitsIdx, NN*sizeof(int), - // NN*sizeof(int), 1, cudaMemcpyDeviceToHost, stream); - d_Chi2.copyAsyncToHost(stream, Chi2); - d_HitsIdx[hit_idx].copyAsyncToHost(stream, HitsIdx); - d_msErr[hit_idx].copyAsyncToHost(stream, msErr); - d_msPar[hit_idx].copyAsyncToHost(stream, msPar); - - - cudaStreamSynchronize(stream); - cudaCheckError(); - //for (int itrack = 0; itrack < NN; ++itrack) - //{ - ////printf("CPU [%d] -- %d : %f\n", itrack, HitsIdx(itrack, 0, 0), Chi2[itrack]); - //} - - cudaFree(d_minChi2); - cudaFree(d_bestHit); -} - -// FIXME: Temporary. Separate allocations / transfers -template -void FitterCU::prepare_addBestHit() { - //const MPlexLS &psErr, const MPlexLV& propPar, - //const MPlexQI &inChg, - //MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, - //size_t num_tracks) { - //setNumberTracks(num_tracks); // temporary: should be end - beg - - //createStream(); - //cudaCheckError(); - // psErr -> d_Err_iP - //d_Err_iP.copyAsyncFromHost(stream, psErr); - //d_par_iP.copyAsyncFromHost(stream, propPar); - //d_inChg.copyAsyncFromHost(stream, inChg); -} - -// TODO: Temporary. Separate allocations / transfers -template -void FitterCU::finalize_addBestHit( - MPlexHS *msErr, MPlexHV* msPar, - MPlexLS& Err_iP, MPlexLV& Par_iP, - MPlexQI *HitsIdx, - MPlexQI &Label, - int start_idx, int end_idx) { - d_par_iP.copyAsyncToHost(stream, Par_iP); - d_Err_iP.copyAsyncToHost(stream, Err_iP); - d_Label.copyAsyncToHost(stream, Label); - - // Get msPar, msErr, chi2 and HitIdx out from the GPU to the CPU - for (int hit_idx = start_idx; hit_idx < end_idx; ++hit_idx) { - d_msPar[hit_idx].copyAsyncToHost(stream, msPar[hit_idx]); - d_msErr[hit_idx].copyAsyncToHost(stream, msErr[hit_idx]); - d_HitsIdx[hit_idx].copyAsyncToHost(stream, HitsIdx[hit_idx]); - } -} #endif diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 178a954fe1843..41715a7745f9b 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -45,6 +45,7 @@ class FitterCU { void destroyStream(); cudaStream_t& get_stream() { return stream; } + int get_Nalloc() const { return Nalloc; } void setNumberTracks(const idx_t Ntracks); void propagationMerged(const int hit_idx); @@ -94,10 +95,9 @@ class FitterCU { MPlexLS& Err_iP, MPlexLV& Par_iP); // fitting higher order methods - void FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, - MPlexHV* msPar, MPlexHS* msErr, int Nhits, - std::vector &tracks, int beg, int end, - std::vector &layerHits); + void FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int NHits); void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, const int beg, const int end, const bool inputProp); void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu index 97eac804a51c6..3fc3fb797312a 100644 --- a/mkFit/HitStructuresCU.cu +++ b/mkFit/HitStructuresCU.cu @@ -41,6 +41,12 @@ void LayerOfHitsCU::copyLayerOfHitsFromCPU(const LayerOfHits &layer, } } +void LayerOfHitsCU::copyFromCPU(const HitVec hits, const 
diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu
index 97eac804a51c6..3fc3fb797312a 100644
--- a/mkFit/HitStructuresCU.cu
+++ b/mkFit/HitStructuresCU.cu
@@ -41,6 +41,12 @@ void LayerOfHitsCU::copyLayerOfHitsFromCPU(const LayerOfHits &layer,
   }
 }

+void LayerOfHitsCU::copyFromCPU(const HitVec &hits, const cudaStream_t &stream)
+{
+  cudaMemcpyAsync(m_hits, &hits[0], sizeof(Hit)*hits.size(),
+                  cudaMemcpyHostToDevice, stream);
+}
+
 void EventOfHitsCU::allocGPU(const EventOfHits &event_of_hits) {
   m_n_layers = event_of_hits.m_n_layers;
   // Allocate GPU array.
@@ -60,6 +66,25 @@ void EventOfHitsCU::allocGPU(const EventOfHits &event_of_hits) {
   /*cudaCheckError();*/
 }

+void EventOfHitsCU::allocGPU(const std::vector<HitVec> &layerHits)
+{
+  m_n_layers = layerHits.size();
+  // Allocate the GPU array.
+  // The pointer members of its elements hold device-space addresses.
+  cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU));
+  cudaCheckError();
+  // Allocate the CPU mirror array.
+  // Its elements also hold device-space addresses, but keeping the array
+  // itself on the host allows calling allocate for each element.
+  m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers];
+  for (int i = 0; i < m_n_layers; ++i) {
+    m_layers_of_hits_alloc[i].alloc_hits(layerHits[i].size());
+    // no phi_bin_infos -- freed later
+    m_layers_of_hits_alloc[i].alloc_phi_bin_infos(1, 1);
+  }
+  /*cudaCheckError();*/
+}
+
 void EventOfHitsCU::deallocGPU() {
   for (int i = 0; i < m_n_layers; ++i) {
     /*cudaCheckError();*/
@@ -84,6 +109,18 @@ void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits,
   /*cudaCheckError();*/
 }

+
+void EventOfHitsCU::copyFromCPU(const std::vector<HitVec> &layerHits,
+                                const cudaStream_t &stream) {
+  for (int i = 0; i < layerHits.size(); i++) {
+    // forward the stream so the per-layer copies queue behind the same work
+    m_layers_of_hits_alloc[i].copyFromCPU(layerHits[i], stream);
+  }
+  cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc,
+                  m_n_layers*sizeof(LayerOfHitsCU),
+                  cudaMemcpyHostToDevice, stream);
+}
+
+
 // ============================================================================

 void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) {
diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h
index c7750500d1009..11146b3f930e0 100644
--- a/mkFit/HitStructuresCU.h
+++ b/mkFit/HitStructuresCU.h
@@ -48,6 +48,7 @@ class LayerOfHitsCU {

   void copyLayerOfHitsFromCPU(const LayerOfHits &layer,
                               const cudaStream_t &stream=0);
+  void copyFromCPU(const HitVec &hits, const cudaStream_t &stream=0);

 #ifdef __CUDACC__
   __device__
@@ -90,9 +91,12 @@ class EventOfHitsCU
   EventOfHitsCU() : m_n_layers{} {};

   void allocGPU(const EventOfHits &event_of_hits);
+  void allocGPU(const std::vector<HitVec> &layerHits);
   void deallocGPU();
   void copyFromCPU(const EventOfHits& event_of_hits,
                    const cudaStream_t &stream=0);
+  void copyFromCPU(const std::vector<HitVec> &layerHits,
+                   const cudaStream_t &stream=0);
 };

 // ============================================================================
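A note on the two-step allocation in EventOfHitsCU::allocGPU above: LayerOfHitsCU holds raw device pointers, so a host-side mirror array is filled first (letting each layer's cudaMalloc run from the CPU) and is then shipped to the GPU as a single array of structs. The pattern in isolation, as a sketch using the member names from this patch:

    // host mirror: structs live on the host, their pointer members on the device
    LayerOfHitsCU *h_layers = new LayerOfHitsCU[n_layers];
    for (int i = 0; i < n_layers; ++i)
      h_layers[i].alloc_hits(layerHits[i].size());   // cudaMalloc inside
    // one copy of the struct array; kernels then index d_layers[i].m_hits
    LayerOfHitsCU *d_layers;
    cudaMalloc((void**)&d_layers, n_layers * sizeof(LayerOfHitsCU));
    cudaMemcpy(d_layers, h_layers, n_layers * sizeof(LayerOfHitsCU),
               cudaMemcpyHostToDevice);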
diff --git a/mkFit/MkBuilder.cc b/mkFit/MkBuilder.cc
index 5f6ba55a8cee6..244b6fb28af22 100644
--- a/mkFit/MkBuilder.cc
+++ b/mkFit/MkBuilder.cc
@@ -6,11 +6,6 @@

 #include "MkFitter.h"

-#ifdef USE_CUDA
-#include "FitterCU.h"
-#include "GeometryCU.h"
-#endif
-
 #include "OmpThreadData.h"

 //#define DEBUG
@@ -108,17 +103,10 @@ MkBuilder::MkBuilder() :
   m_event_of_hits(Config::nLayers)
 {
   m_mkfp_arr.resize(Config::numThreadsFinder);
-#ifdef USE_CUDA
-  m_cuFitter_arr.resize(Config::numThreadsFinder);
-#endif

   for (int i = 0; i < Config::numThreadsFinder; ++i)
   {
     m_mkfp_arr[i] = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(0);
-#ifdef USE_CUDA
-    m_cuFitter_arr[i] = new FitterCU<float>(NN);
-    m_cuFitter_arr[i]->allocateDevice();
-#endif
   }
 }

@@ -127,9 +115,6 @@ MkBuilder::~MkBuilder()
   for (int i = 0; i < Config::numThreadsFinder; ++i)
   {
     _mm_free(m_mkfp_arr[i]);
-#ifdef USE_CUDA
-    m_cuFitter_arr[i]->freeDevice();
-#endif
   }
 }

@@ -665,68 +650,6 @@ void MkBuilder::FindTracksBestHit(EventOfCandidates& event_of_cands)
   }); //end of parallel section over seeds
 }

-#ifdef USE_CUDA
-void MkBuilder::FindTracksBestHit_GPU(EventOfCandidates& event_of_cands)
-{
-  EventOfHitsCU event_of_hits_cu;
-  event_of_hits_cu.allocGPU(m_event_of_hits);
-  event_of_hits_cu.copyFromCPU(m_event_of_hits);
-
-  LayerOfHits& l = m_event_of_hits.m_layers_of_hits[Config::nlayers_per_seed];
-
-  MkFitter* mkfp = m_mkfp_arr[0];
-
-  int gplex_size = 1 << 12;
-  FitterCU<float> cuFitter(gplex_size);
-  cuFitter.allocateDevice();
-  cuFitter.allocate_extra_addBestHit();
-  cuFitter.createStream();
-  cuFitter.setNumberTracks(gplex_size);
-
-  std::vector<float> radii (Config::nLayers);
-  for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) {
-    radii[ilay] = m_event->geom_.Radius(ilay);
-  }
-  GeometryCU geom_cu;
-  geom_cu.allocate();
-  geom_cu.getRadiiFromCPU(&radii[0]);
-
-  EventOfCandidatesCU event_of_cands_cu;
-  event_of_cands_cu.allocGPU(event_of_cands);
-  event_of_cands_cu.copyFromCPU(event_of_cands);
-
-  //for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) {
-  //  EtaBinOfCandidates& etabin_of_candidates = event_of_cands.m_etabins_of_candidates[ebin];
-
-  //  EtaBinOfCandidatesCU &etabin_of_cand_cu = event_of_cands_cu.m_etabins_of_candidates_alloc[ebin];
-
-  //  FIXME: Do we actually need this loop, if FitterCU is as wide as etabin
-  //  for (int itrack = 0; itrack < etabin_of_candidates.m_fill_index; itrack += NN) {
-  //    int end = std::min(itrack + NN, etabin_of_candidates.m_fill_index);
-
-  //    cuFitter.setNumberTracks(end-itrack);
-  //    cuFitter.InputTracksAndHitIdx(etabin_of_cand_cu, itrack, end, true);
-
-  cuFitter.addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu);
-
-  //    cuFitter.OutputTracksAndHitIdx(etabin_of_cand_cu, itrack, end, true);
-  //  }
-  //}
-
-  event_of_cands_cu.copyToCPU(event_of_cands);
-  event_of_cands_cu.deallocGPU();
-
-  geom_cu.deallocate();
-  cuFitter.destroyStream();
-  cuFitter.free_extra_addBestHit();
-  cuFitter.freeDevice();
-  event_of_hits_cu.deallocGPU();
-
-  mkfp->SetNhits(Config::nLayers);
-}
-#endif
-
 //------------------------------------------------------------------------------
 // FindTracks & FindTracksCloneEngine common functions

diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h
index 28b82b80c3086..e903d2dce6977 100644
--- a/mkFit/MkBuilder.h
+++ b/mkFit/MkBuilder.h
@@ -3,12 +3,6 @@

 #include <vector>

-#ifdef USE_CUDA
-#include "HitStructures.h"
-#include "FitterCU.h"
-#endif
-
-
 //------------------------------------------------------------------------------

 #include "MkFitter.h"
@@ -52,9 +46,6 @@ class MkBuilder
   EventOfHits m_event_of_hits;

   std::vector<MkFitter*> m_mkfp_arr;
-#ifdef USE_CUDA
-  std::vector<FitterCU<float>*> m_cuFitter_arr;
-#endif

   int m_cnt=0, m_cnt1=0, m_cnt2=0, m_cnt_8=0, m_cnt1_8=0, m_cnt2_8=0, m_cnt_nomc=0;

@@ -110,7 +101,6 @@ class MkBuilder
   virtual void FindTracksCloneEngine();
   virtual void FindTracksCloneEngineTbb();
 #ifdef USE_CUDA
-  void FindTracksBestHit_GPU(EventOfCandidates& event_of_cands);
   const Event* get_event() const { return m_event; }
   const EventOfHits& get_event_of_hits() const { return m_event_of_hits; }
 #endif
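MkBuilder keeps constructing MkFitter with placement-new into _mm_malloc'd storage so the MPlex members get 64-byte alignment. For reference, the complete lifecycle of that pattern is sketched below; tearing it down with a bare _mm_free, as the destructor above does, skips ~MkFitter() and is safe only while that destructor stays trivial (a sketch, not part of the patch):

    void *buf = _mm_malloc(sizeof(MkFitter), 64);  // 64-byte-aligned raw storage
    MkFitter *mkfp = new (buf) MkFitter(0);        // placement-new: construct in place
    /* ... use mkfp ... */
    mkfp->~MkFitter();                             // placement-new requires explicit destroy
    _mm_free(buf);                                 // then release the raw storage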
diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc
index 1652c1b25dcf6..2b83225218a14 100644
--- a/mkFit/buildtestMPlex.cc
+++ b/mkFit/buildtestMPlex.cc
@@ -92,7 +92,6 @@ double runBuildingTestPlexBestHit(Event& ev)
   std::unique_ptr<MkBuilder> builder_ptr(make_builder());
   MkBuilder &builder = * builder_ptr.get();

-  std::cerr << "Building event...\n";
   builder.begin_event(&ev, 0, __func__);

   if (Config::findSeeds) {builder.find_seeds();}
@@ -102,15 +101,14 @@ double runBuildingTestPlexBestHit(Event& ev)

   EventOfCandidates event_of_cands;
   builder.find_tracks_load_seeds(event_of_cands);
-  builder.quality_output_besthit(event_of_cands);

 #ifdef USE_VTUNE_PAUSE
   __itt_resume();
 #endif

 #if USE_CUDA
-  check_event_of_hits_gpu(builder.get_event_of_hits());
-  check_event_of_cands_gpu(event_of_cands);
+  //check_event_of_hits_gpu(builder.get_event_of_hits());
+  //check_event_of_cands_gpu(event_of_cands);
   BuilderCU builder_cu(builder.get_event_of_hits(), builder.get_event(),
                        event_of_cands);
 #endif
@@ -118,9 +116,7 @@ double runBuildingTestPlexBestHit(Event& ev)
   double time = dtime();

 #if USE_CUDA
-  std::cout << "Finding best hits...\n";
   builder_cu.FindTracksBestHit(event_of_cands);
-  //builder.FindTracksBestHit_GPU(event_of_cands);
 #else
   builder.FindTracksBestHit(event_of_cands);
 #endif
@@ -273,3 +269,69 @@ double runBuildingTestPlexTbb(Event& ev, EventTmp& ev_tmp)

   return time;
 }
+
+
+//==============================================================================
+// runAllBuildingTestPlexBestHitGPU
+//==============================================================================
+
+#if USE_CUDA
+double runAllBuildingTestPlexBestHitGPU(std::vector<Event> &events)
+{
+  int num_builders = events.size();
+  std::vector<std::unique_ptr<MkBuilder>> builder_ptrs(num_builders);
+  std::vector<EventOfCandidates> event_of_cands_vec(num_builders);
+  std::vector<BuilderCU> builder_cu_vec(num_builders);
+
+  for (int i = 0; i < builder_ptrs.size(); ++i) {
+    Event &ev = events[i];
+    builder_ptrs[i] = std::unique_ptr<MkBuilder>(make_builder());
+
+    MkBuilder &builder = * builder_ptrs[i].get();
+
+    builder.begin_event(&ev, 0, __func__);
+
+    if (Config::findSeeds) {builder.find_seeds();}
+    else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit
+
+    builder.fit_seeds_tbb();
+
+    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
+    builder.find_tracks_load_seeds(event_of_cands);
+
+    BuilderCU &builder_cu = builder_cu_vec[i];
+    builder_cu.setUp(builder.get_event_of_hits(), builder.get_event(),
+                     event_of_cands);
+  }
+
+  //omp_set_num_threads(Config::numThreadsEvents);
+  //std::cerr << "num threads "<< omp_get_num_threads() << std::endl;
+//#pragma omp parallel for reduction(+:total_time)
+  //for (int i = 0; i < builder_ptrs.size(); ++i) {
+  double time = dtime();
+  tbb::parallel_for(size_t(0), builder_ptrs.size(), [&](size_t i) {
+    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
+    BuilderCU &builder_cu = builder_cu_vec[i];
+
+    builder_cu.FindTracksBestHit(event_of_cands);
+  });
+  time = dtime() - time;
+
+  for (int i = 0; i < builder_ptrs.size(); ++i) {
+    EventOfCandidates &event_of_cands = event_of_cands_vec[i];
+    MkBuilder &builder = * builder_ptrs[i].get();
+    if (!Config::normal_val) {
+      builder.quality_output_besthit(event_of_cands);
+    } else {
+      builder.root_val_besthit(event_of_cands);
+    }
+
+    builder.end_event();
+  }
+
+  return time;
+}
+#endif
diff --git a/mkFit/buildtestMPlex.h b/mkFit/buildtestMPlex.h
index 25b636014e108..bb082ad4a07c6 100644
--- a/mkFit/buildtestMPlex.h
+++ b/mkFit/buildtestMPlex.h
@@ -13,4 +13,8 @@ double runBuildingTestPlexCloneEngine(Event& ev, EventTmp& evtmp);

 double runBuildingTestPlexTbb(Event& ev, EventTmp& evtmp);

+#if USE_CUDA
+double runAllBuildingTestPlexBestHitGPU(std::vector<Event> &events);
+#endif
+
 #endif
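The split inside runAllBuildingTestPlexBestHitGPU is deliberate: all GPU setup (allocations, host-to-device copies) runs serially, and only the kernel-heavy FindTracksBestHit calls run under tbb::parallel_for. Assuming each BuilderCU owns its own CUDA stream, that is what lets per-event copies and kernels overlap on the device, schematically:

    // serial: deterministic setup, one BuilderCU per event
    for (int i = 0; i < num_builders; ++i)
      builder_cu_vec[i].setUp(/* event-of-hits, event, candidates */);
    // parallel: each host thread enqueues work on its event's own stream
    tbb::parallel_for(size_t(0), builder_cu_vec.size(), [&](size_t i) {
      builder_cu_vec[i].FindTracksBestHit(event_of_cands_vec[i]);
    });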
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc
index 066fe87f50167..b5082932d3d7d 100644
--- a/mkFit/fittestMPlex.cc
+++ b/mkFit/fittestMPlex.cc
@@ -203,6 +203,7 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
     cuFitter.freeDevice();
   }
 #endif
+  separate_first_call_for_meaningful_profiling_numbers();

   // Reorganization (copyIn) can eventually be multithreaded.
   omp_set_nested(1);
@@ -216,8 +217,10 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)

     // FitterCU is declared here to share allocations and deallocations
     // between the multiple events processed by a single thread.
-    FitterCU<float> cuFitter(NN);
+    int gplex_size = 10000;
+    FitterCU<float> cuFitter(gplex_size);
     cuFitter.allocateDevice();
+    cuFitter.allocate_extra_addBestHit();

     for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) {
       int idx = thr_idx;
@@ -238,13 +241,14 @@ void runAllEventsFittingTestPlexGPU(std::vector<Event>& events)
       // Validation crashes for multiple threads.
       // It is something related to ROOT. Not sure what.
       if (omp_get_num_threads() <= 1) {
-        if (g_run_fit_std) {
+        //if (g_run_fit_std) {
           std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root";
           make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev);
-        }
+        //}
       }
 #endif
     }
+    cuFitter.free_extra_addBestHit();
     cuFitter.freeDevice();
   }
   std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n";
@@ -254,55 +258,33 @@ double runFittingTestPlexGPU(FitterCU<float> &cuFitter, Event& ev, std::vector<Track>& rectracks)
 {
+  std::vector<Track>& simtracks = ev.simTracks_;
-  std::vector<Track>& simtracks = ev.simTracks_;
+  cuFitter.createStream();
-  const int Nhits = Config::nLayers;
-  // XXX What if there's a missing / double layer?
-  // Eventually, should sort track vector by number of hits!
-  // And pass the number in on each "setup" call.
-  // Reserves should be made for maximum possible number (but this is just
-  // measurement errors, params).
+  Track *tracks_cu;
+  cudaMalloc((void**)&tracks_cu, simtracks.size()*sizeof(Track));
+  cudaMemcpyAsync(tracks_cu, &simtracks[0], simtracks.size()*sizeof(Track),
+                  cudaMemcpyHostToDevice, cuFitter.get_stream());
-  // NOTE: MkFitter *MUST* be on heap, not on stack!
-  // Standard operator new screws up alignment of ALL MPlex members of MkFitter,
-  // even if one adds attr(aligned(64)) thingy to every possible place.
+  EventOfHitsCU events_of_hits_cu;
+  events_of_hits_cu.allocGPU(ev.layerHits_);
+  events_of_hits_cu.copyFromCPU(ev.layerHits_, cuFitter.get_stream());
-  // MkFitter *mkfp = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits);
+  double time = dtime();
-  MkFitter* mkfp_arr = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits);
+  cuFitter.FitTracks(tracks_cu, simtracks.size(), events_of_hits_cu, Config::nLayers);
-  int theEnd = simtracks.size();
-  double time = dtime();
-  int Nstride = NN;
+  cudaMemcpy(&rectracks[0], tracks_cu, simtracks.size()*sizeof(Track), cudaMemcpyDeviceToHost);
-  for (int itrack = 0; itrack < theEnd; itrack += Nstride)
-  {
-    int end = std::min(itrack + Nstride, theEnd);
-
-    MkFitter *mkfp = mkfp_arr;
+  time = dtime() - time;
-    //double time_input = dtime();
-    mkfp->InputTracksAndHits(simtracks, ev.layerHits_, itrack, end);
-    //std::cerr << "Input time: " << (dtime() - time_input)*1e3 << std::endl;
-    cuFitter.FitTracks(mkfp->Chg,
-                       mkfp->GetPar0(),
-                       mkfp->GetErr0(),
-                       mkfp->msPar,
-                       mkfp->msErr,
-                       Nhits,
-                       simtracks, itrack, end, ev.layerHits_);
+  events_of_hits_cu.deallocGPU();
+  cudaFree(tracks_cu);
-    double time_output = dtime();
-    mkfp->OutputFittedTracks(rectracks, itrack, end);
-    //std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl;
-  }
+  cuFitter.destroyStream();
-  time = dtime() - time;
-
-  _mm_free(mkfp_arr);
-
-  return time;
+  return time;
 }
 #endif
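One caveat on the cudaMemcpyAsync calls above: asynchronous copies only overlap with other work when the host buffer is pinned; from pageable memory (a plain std::vector) the runtime falls back to staged, effectively synchronous transfers. A hedged sketch of staging the tracks through pinned memory (h_tracks is a hypothetical scratch buffer, not part of this patch):

    Track *h_tracks;
    cudaMallocHost((void**)&h_tracks, n * sizeof(Track));  // pinned host memory
    std::copy(simtracks.begin(), simtracks.end(), h_tracks);
    cudaMemcpyAsync(tracks_cu, h_tracks, n * sizeof(Track),
                    cudaMemcpyHostToDevice, stream);       // now truly asynchronous
    /* ... launch kernels on the same stream ... */
    cudaFreeHost(h_tracks);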
diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu
index d77a281cc5200..8833d3c3809c4 100644
--- a/mkFit/fittracks_kernels.cu
+++ b/mkFit/fittracks_kernels.cu
@@ -7,27 +7,32 @@ constexpr int BLOCK_SIZE_X = 256;

 __global__ void fittracks_kernel(
     GPlexLV par_iP, GPlexLS Err_iP,
-    GPlexHV msPar, GPlexHS msErr,
+    GPlexHV *msPar_arr, GPlexHS *msErr_arr,
     GPlexLV par_iC, GPlexLS Err_iC,
     GPlexLL errorProp, GPlexQI inChg,
-    int N)
+    const int Nhits, int N)
 {
   int grid_width = blockDim.x * gridDim.x;
   int n = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int z = 0; z < (N-1)/grid_width +1; z++) {
-    n += z*grid_width;
+  for (int hi = 0; hi < Nhits; ++hi) {
+    GPlexHV &msPar = msPar_arr[hi];
+    GPlexHS &msErr = msErr_arr[hi];

-    propagation_fn(msPar, par_iC, inChg, par_iP, errorProp, Err_iP, n, N);
-    kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N);
+    for (int z = 0; z < (N-1)/grid_width +1; z++) {
+      // grid-stride index: derive it from z instead of accumulating into n,
+      // so it is correct on every pass and does not leak across hits
+      int nz = n + z*grid_width;
+
+      propagation_fn(Err_iC, par_iC, inChg, msPar, Err_iP, par_iP, nz, N);
+      kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, nz, N);
+    }
   }
 }

 void fittracks_wrapper(cudaStream_t &stream,
                        GPlexLS &Err_iP, GPlexLV &par_iP,
-                       GPlexHS *msErr, GPlexHV *msPar,
+                       GPlexHS *msErr_arr, GPlexHV *msPar_arr,
                        GPlexLS &Err_iC, GPlexLV &par_iC,
                        GPlexLL &errorProp, GPlexQI &inChg,
-                       const int hit_idx, const int N)
+                       const int Nhits, const int N)
 {
   int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
                        max_blocks_x);
@@ -36,10 +41,8 @@ void fittracks_wrapper(cudaStream_t &stream,
   dim3 grid(gridx, 1, 1);
   dim3 block(BLOCK_SIZE_X, 1, 1);
   fittracks_kernel <<< grid, block, 0, stream >>>
       (par_iP, Err_iP,
-       msPar[hit_idx], msErr[hit_idx],
+       msPar_arr, msErr_arr,
        par_iC, Err_iC,
        errorProp, inChg,
-       N);
-  /*kalmanUpdate_wrapper(stream, Err_iP, msErr[hit_idx],*/
-  /*                     par_iP, msPar[hit_idx], par_iC, Err_iC, N);*/
+       Nhits, N);
 }
diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu
index 178f808355817..2800aa39bd4b8 100644
--- a/mkFit/gpu_utils.cu
+++ b/mkFit/gpu_utils.cu
@@ -3,3 +3,7 @@
 void sync_gpu() {
   cudaCheckErrorSync();
 }
+
+void separate_first_call_for_meaningful_profiling_numbers() {
+  sync_gpu();
+}
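The z-loops in fittracks_kernel and the other kernels in this series are grid-stride loops: when max_blocks_x caps the grid, each thread walks the track array in steps of the total grid width. The canonical form, for comparison (a generic sketch, not mkFit code):

    __global__ void scale_kernel(float *x, int N)
    {
      int stride = blockDim.x * gridDim.x;
      for (int n = threadIdx.x + blockIdx.x * blockDim.x; n < N; n += stride)
        x[n] *= 2.f;  // per-element work goes here
    }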
diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h
index feaad0517c0dc..3995866b9370a 100644
--- a/mkFit/gpu_utils.h
+++ b/mkFit/gpu_utils.h
@@ -3,6 +3,8 @@

 #include <cuda_runtime.h>

+#include <cstdint>
+
 #define cudaCheckError()               \
   do {                                 \
     cudaError_t e=cudaGetLastError();  \
@@ -17,7 +19,7 @@

 // CUDA specific:
 // Maximum number of blocks in the X direction of the thread grid.
-constexpr int max_blocks_x = 1 << 15;
+constexpr int max_blocks_x = INT32_MAX;

 // The first call to a CUDA API function takes the initialization hit.
 void separate_first_call_for_meaningful_profiling_numbers();
diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc
index 88883d3ad521c..e1c11e6e38f19 100644
--- a/mkFit/mkFit.cc
+++ b/mkFit/mkFit.cc
@@ -188,7 +188,11 @@ void test_standard()

   EventTmp ev_tmp;

-#if 0 //USE_CUDA
+#if USE_CUDA
+  tbb::task_scheduler_init tbb_init(Config::numThreadsFinder);
+  //tbb::task_scheduler_init tbb_init(tbb::task_scheduler_init::automatic);
+
+  //omp_set_num_threads(Config::numThreadsFinder);
   // fittest time. Sum of all events. In case of multiple events
   // being run simultaneously in different streams this time will
   // be larger than the elapsed time.
@@ -219,21 +223,10 @@ void test_standard()

   if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events);

-  for (int evt = 1; evt <= Config::nEvents; ++evt)
-  {
-    printf("\n");
-    printf("Processing event %d\n", evt);
-
-    Event& ev = events[evt-1];
-
-    //plex_tracks.resize(ev.simTracks_.size());
-    omp_set_num_threads(Config::numThreadsFinder);
-
-    if (g_run_build_bh) {
-      double my_time = runBuildingTestPlexBestHit(ev);
-      std::cout << "BestHit -- GPU: " << my_time << std::endl;
-    }
-    std::exit(0);
+  if (g_run_build_all || g_run_build_bh) {
+    double total_best_hit_time = 0.;
+    total_best_hit_time = runAllBuildingTestPlexBestHitGPU(events);
+    std::cout << "Total best hit time (GPU): " << total_best_hit_time << std::endl;
   }
 #else
   // MT: task_scheduler_init::automatic doesn't really work (segv!) + we don't
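In the propagation_kernels.cu diff below, the dedicated similarity_fn is dropped in favor of two matrix multiplications: the covariance update C' = P C P^T is computed as temp = P*C followed by C' = temp*P^T, with the 6x6 temp kept in registers. Schematically, with the names used in the patch:

    GPlexRegLL temp;                                        // per-thread 6x6 scratch
    MultHelixProp_fn      (errorProp_reg, outErr, temp, n); // temp   = P * C
    MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); // outErr = temp * P^T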
diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu
index a9debeb83308d..70f4957177a44 100644
--- a/mkFit/propagation_kernels.cu
+++ b/mkFit/propagation_kernels.cu
@@ -457,16 +457,25 @@ __device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) {

 // PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments
 __device__ void propagation_fn(
-    GPlexHV &msPar,
-    GPlexLV &inPar, GPlexQI &inChg,
-    GPlexLV &outPar, GPlexLL &errorProp,
-    GPlexLS &outErr, int n, int N) {
+    GPlexLS &inErr, GPlexLV &inPar,
+    GPlexQI &inChg, GPlexHV &msPar,
+    GPlexLS &outErr, GPlexLV &outPar,
+    int n, int N) {

   GPlexRegQF msRad_reg;
   // Using registers instead of shared memory is ~ 30% faster.
   GPlexRegLL errorProp_reg;
   // If there are more matrices than max_blocks_x * BLOCK_SIZE_X
   if (n < N) {
+    for (int i = 0; i < inErr.kSize; ++i) {
+      outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride];
+    }
+    for (int i = 0; i < inPar.kSize; ++i) {
+      outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride];
+    }
+    for (int i = 0; i < 36; ++i) {
+      errorProp_reg[i] = 0.f;
+    }
 #if 0
     computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n);
     if (Config::doIterative) {
@@ -485,37 +494,42 @@ __device__ void propagation_fn(
 #else
     helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n);
 #endif
-    similarity_fn(errorProp_reg, outErr, N, n);
+    /*similarity_fn(errorProp_reg, outErr, N, n);*/
+    GPlexRegLL temp;
+    MultHelixProp_fn      (errorProp_reg, outErr, temp, n);
+    MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n);
   }
 }

 __global__ void propagation_kernel(
+    GPlexLS inErr,
     GPlexHV msPar,
     GPlexLV inPar, GPlexQI inChg,
-    GPlexLV outPar, GPlexLL errorProp,
+    GPlexLV outPar,
     GPlexLS outErr,
     int N)
 {
   int grid_width = blockDim.x * gridDim.x;
   int n = threadIdx.x + blockIdx.x * blockDim.x;
   for (int z = 0; z < (N-1)/grid_width +1; z++) {
-    n += z*grid_width;
-    propagation_fn(msPar, inPar, inChg, outPar, errorProp, outErr, n, N);
+    int nz = n + z*grid_width;  // grid-stride index, derived from z each pass
+    propagation_fn(inErr, inPar, inChg, msPar, outErr, outPar, nz, N);
   }
 }

 void propagation_wrapper(const cudaStream_t& stream,
-                         GPlexHV& msPar,
+                         GPlexHV& msPar, GPlexLS& inErr,
                          GPlexLV& inPar, GPlexQI& inChg,
-                         GPlexLV& outPar, GPlexLL& errorProp,
+                         GPlexLV& outPar,
                          GPlexLS& outErr,
                          const int N)
 {
   int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
                        max_blocks_x);
   dim3 grid(gridx, 1, 1);
   dim3 block(BLOCK_SIZE_X, 1, 1);
-  propagation_kernel <<< grid, block, 0, stream >>>(msPar, inPar, inChg, outPar, errorProp, outErr, N);
+  propagation_kernel <<< grid, block, 0, stream >>>
+      (inErr, msPar, inPar, inChg, outPar, outErr, N);
 }
diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h
index bbf4a80c07196..aec2e256bf918 100644
--- a/mkFit/propagation_kernels.h
+++ b/mkFit/propagation_kernels.h
@@ -3,16 +3,10 @@

 #include "GPlex.h"

-__device__ void propagation_fn(
-    GPlexHV &msPar,
-    GPlexLV &inPar, GPlexQI &inChg,
-    GPlexLV &outPar, GPlexLL &errorProp,
-    GPlexLS &outErr, int n, int N);
-
 void propagation_wrapper(const cudaStream_t& stream,
-                         GPlexHV& msPar,
+                         GPlexHV& msPar, GPlexLS& inErr,
                          GPlexLV& inPar, GPlexQI& inChg,
-                         GPlexLV& outPar, GPlexLL& errorProp,
+                         GPlexLV& outPar,
                          GPlexLS& outErr,
                          const int N);

@@ -23,10 +17,10 @@ void propagationForBuilding_wrapper(const cudaStream_t& stream,
                                     const int N);

 __device__ void propagation_fn(
-    GPlexHV &msPar,
-    GPlexLV &inPar, GPlexQI &inChg,
-    GPlexLV &outPar, GPlexLL &errorProp,
-    GPlexLS &outErr, int n, int N);
+    GPlexLS &inErr, GPlexLV &inPar,
+    GPlexQI &inChg, GPlexHV &msPar,
+    GPlexLS &outErr, GPlexLV &outPar,
+    int n, int N);

 __device__ void propagationForBuilding_fn(
     const GPlexLS &inErr, const GPlexLV &inPar,
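The element-wise copies added to propagation_fn above spell out the GPlex addressing scheme: matrices are stored structure-of-arrays, so flattened element i of matrix n lives at ptr[n + i*stride]. Consecutive threads (consecutive n) therefore touch consecutive addresses, and global-memory accesses coalesce. As a standalone sketch:

    // flattened element i (row*ncols + col) of matrix n in a GPlex-like layout
    __device__ float gplex_elem(const float *ptr, size_t stride, int n, int i)
    {
      return ptr[n + i * stride];  // n varies fastest across threads -> coalesced
    }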
diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu
index bd2b88d094442..305ab59ac277f 100644
--- a/mkFit/reorganize_gplex.cu
+++ b/mkFit/reorganize_gplex.cu
@@ -130,6 +130,7 @@ __device__ void InputTracksCU_fn (Track *tracks,
   }
 }

+
 __global__ void InputTracksCU_kernel(Track *tracks,
                                      GPlexLS Err_iP, GPlexLV Par_iP,
                                      GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
@@ -157,12 +158,87 @@ void InputTracksCU_wrapper(const cudaStream_t &stream,
 }


+__device__ void InputTracksAndHitsCU_fn (Track *tracks, LayerOfHitsCU *layerHits,
+                                         GPlexLS &Err_iP, GPlexLV &Par_iP,
+                                         GPlexHS *msErr_arr, GPlexHV *msPar_arr,
+                                         GPlexQI &Chg, GPlexQF &Chi2,
+                                         GPlexQI &Label, GPlexQI *HitsIdx,
+                                         const int beg, const int end,
+                                         const int itrack, const int N) {
+  //int itrack = threadIdx.x + blockDim.x*blockIdx.x;
+
+  if (itrack < (end-beg) && itrack < N) {
+    Track &trk = tracks[beg];
+    const char *varr = (char*) &trk;
+    int off_error = (char*) trk.errArrayCU() - varr;
+    int off_param = (char*) trk.posArrayCU() - varr;
+
+    int i = itrack + beg;
+    const Track &trk_i = tracks[i];
+    int idx = (char*) &trk_i - varr;
+
+    Label(itrack, 0, 0) = tracks[i].label();
+    Chg(itrack, 0, 0) = tracks[i].charge();
+    Chi2(itrack, 0, 0) = tracks[i].chi2();
+    SlurpInIdx_fn(Err_iP, varr + off_error, idx, N);
+    SlurpInIdx_fn(Par_iP, varr + off_param, idx, N);
+
+    // Note Config::nLayers -- not suitable for building
+    for (int hi = 0; hi < Config::nLayers; ++hi) {
+      int hidx = tracks[i].getHitIdx(hi);
+      HitsIdx[hi](itrack, 0, 0) = hidx;  // store the hit index, not the track's byte offset
+      if (hidx < 0) continue;            // check before dereferencing m_hits
+      Hit &hit = layerHits[hi].m_hits[hidx];
+
+      SlurpInIdx_fn(msErr_arr[hi], (char *)hit.errArrayCU(), 0, N);
+      SlurpInIdx_fn(msPar_arr[hi], (char *)hit.posArrayCU(), 0, N);
+    }
+  }
+}
+
+
+__global__ void InputTracksAndHitsCU_kernel(Track *tracks, LayerOfHitsCU *layers,
+                                            GPlexLS Err_iP, GPlexLV Par_iP,
+                                            GPlexHS *msErr_arr, GPlexHV *msPar_arr,
+                                            GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
+                                            GPlexQI *HitsIdx,
+                                            int beg, int end, int N) {
+  int itrack = threadIdx.x + blockDim.x*blockIdx.x;
+  InputTracksAndHitsCU_fn(tracks, layers, Err_iP, Par_iP, msErr_arr, msPar_arr,
+                          Chg, Chi2, Label, HitsIdx, beg, end, itrack, N);
+}
+
+
+void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream,
+                                  Track *tracks, EventOfHitsCU &event_of_hits,
+                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
+                                  GPlexHS *msErr_arr, GPlexHV *msPar_arr,
+                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
+                                  GPlexQI *HitsIdx,
+                                  const int beg, const int end,
+                                  const bool inputProp, int N) {
+  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
+                       max_blocks_x);
+  dim3 grid(gridx, 1, 1);
+  dim3 block(BLOCK_SIZE_X, 1, 1);
+
+  InputTracksAndHitsCU_kernel <<< grid, block, 0, stream >>>
+      (tracks, event_of_hits.m_layers_of_hits,
+       Err_iP, Par_iP,
+       msErr_arr, msPar_arr,
+       Chg, Chi2, Label, HitsIdx,
+       beg, end, N);
+}
+
+
 __device__ void OutputTracksCU_fn(Track *tracks,
                                   const GPlexLS &Err_iP, const GPlexLV &Par_iP,
                                   const GPlexQI &Chg, const GPlexQF &Chi2,
                                   const GPlexQI &Label, const GPlexQI *HitsIdx,
                                   const int beg, const int end,
-                                  const int itrack, const int N) {
+                                  const int itrack, const int N,
+                                  const bool update_hit_idx) {
   //int itrack = threadIdx.x + blockDim.x*blockIdx.x;

   if (itrack < (end-beg) && itrack < N) {
@@ -181,19 +257,21 @@ __device__ void OutputTracksCU_fn(Track *tracks,
     tracks[i].setChi2(Chi2(itrack, 0, 0));
     tracks[i].setLabel(Label(itrack, 0, 0));

-    tracks[i].resetHits();
-    /*int nGoodItIdx = 0;*/
-    for (int hi = 0; hi < Config::nLayers; ++hi) {
-      tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.);
-      // FIXME: We probably want to use registers instead of going for class members:
-      /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/
-      /*tracks[i].setHitIdx(hi, hit_idx);*/
-      /*if (hit_idx >= 0) {*/
-      /*  nGoodItIdx++; */
-      /*}*/
+    if (update_hit_idx) {
+      tracks[i].resetHits();
+      /*int nGoodItIdx = 0;*/
+      for (int hi = 0; hi < Config::nLayers; ++hi) {
+        tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.);
+        // FIXME: We probably want to use registers instead of going for class members:
+        /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/
+        /*tracks[i].setHitIdx(hi, hit_idx);*/
+        /*if (hit_idx >= 0) {*/
+        /*  nGoodItIdx++; */
+        /*}*/
+      }
+      /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/
+      /*tracks[i].setChi2(0.);*/
     }
-    /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/
-    /*tracks[i].setChi2(0.);*/
   }
 }

@@ -201,9 +279,11 @@ __global__ void OutputTracksCU_kernel(Track *tracks,
                                       GPlexLS Err_iP, GPlexLV Par_iP,
                                       GPlexQI Chg, GPlexQF Chi2, GPlexQI Label,
                                       GPlexQI *HitsIdx,
-                                      int beg, int end, int N) {
+                                      int beg, int end, int N,
+                                      const bool update_hit_idx=true) {
   int itrack = threadIdx.x + blockDim.x*blockIdx.x;
-  OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N);
+  OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx,
+                    beg, end, itrack, N, update_hit_idx);
 }


@@ -221,3 +301,18 @@ void OutputTracksCU_wrapper(const cudaStream_t &stream,
   OutputTracksCU_kernel <<< grid, block, 0, stream >>>
       (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N);
 }
+
+
+void OutputFittedTracksCU_wrapper(const cudaStream_t &stream,
+                                  Track *tracks_cu,
+                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
+                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
+                                  const int beg, const int end, int N) {
+  int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,
+                       max_blocks_x);
+  dim3 grid(gridx, 1, 1);
+  dim3 block(BLOCK_SIZE_X, 1, 1);
+
+  // HitsIdx may be nullptr here only because update_hit_idx=false guards
+  // every HitsIdx access in OutputTracksCU_fn.
+  OutputTracksCU_kernel <<< grid, block, 0, stream >>>
+      (tracks_cu, Err_iP, Par_iP, Chg, Chi2, Label, nullptr, beg, end, N, false);
+}
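InputTracksAndHitsCU_fn above gathers Matriplex columns straight out of an array of Track objects by byte offset: a member's offset inside Track is computed once from a reference element and is then valid for every element of the device array. Reduced to its core (sketch; k indexes a float within the parameter array):

    const char *varr = (const char*) &tracks[beg];
    int off_param = (const char*) tracks[beg].posArrayCU() - varr; // member offset
    int idx       = (const char*) &tracks[beg + itrack]    - varr; // element offset
    // parameter k of this lane's track:
    float pk = *(const float*)(varr + idx + off_param + k * sizeof(float));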
diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h
index 2cda2f4c438d0..f59e2b6c0c464 100644
--- a/mkFit/reorganize_gplex.h
+++ b/mkFit/reorganize_gplex.h
@@ -4,6 +4,7 @@
 #include "GPlex.h"
 #include "Hit.h"
 #include "HitStructuresCU.h"
+#include "Track.h"

 __device__ float *get_posArray(Hit &hit);
 __device__ float *get_errArray(Hit &hit);
@@ -37,7 +38,8 @@ __device__ void OutputTracksCU_fn(Track *tracks,
                                   const GPlexQI &Chg, const GPlexQF &Chi2,
                                   const GPlexQI &Label, const GPlexQI *HitsIdx,
                                   const int beg, const int end,
-                                  const int itrack, const int N);
+                                  const int itrack, const int N,
+                                  const bool update_hit_idx=true);

 void InputTracksCU_wrapper(const cudaStream_t &stream,
                            const EtaBinOfCandidatesCU &etaBin,
@@ -46,11 +48,27 @@ void InputTracksCU_wrapper(const cudaStream_t &stream,
                            GPlexQI *HitsIdx,
                            const int beg, const int end,
                            const bool inputProp, int N);
+void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream,
+                                  Track *tracks, EventOfHitsCU &event_of_hits,
+                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
+                                  GPlexHS *msErr_arr, GPlexHV *msPar_arr,
+                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
+                                  GPlexQI *HitsIdx,
+                                  const int beg, const int end,
+                                  const bool inputProp, int N);
+
 void OutputTracksCU_wrapper(const cudaStream_t &stream,
                             EtaBinOfCandidatesCU &etaBin,
                             GPlexLS &Err_iP, GPlexLV &Par_iP,
                             GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
                             GPlexQI *HitsIdx,
-                            const int beg, const int end, const bool outputProp, int N);
+                            const int beg, const int end, bool outputProp, int N);
+
+
+void OutputFittedTracksCU_wrapper(const cudaStream_t &stream,
+                                  Track *tracks_cu,
+                                  GPlexLS &Err_iP, GPlexLV &Par_iP,
+                                  GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label,
+                                  const int beg, const int end, int N);

 #endif  // REORGANIZE_GPLEX_H

From 53cf9c03803bad16cd3c0efa0beea376e9f4f1ce Mon Sep 17 00:00:00 2001
From: Matthieu Lefebvre
Date: Wed, 14 Sep 2016 16:54:06 -0400
Subject: [PATCH 13/13] Reset config values to default ones

---
 Config.h        | 3 +--
 Makefile.config | 8 ++++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/Config.h b/Config.h
index a9620e3bab2b0..cc8b1774c81f7 100644
--- a/Config.h
+++ b/Config.h
@@ -121,8 +121,7 @@ namespace Config
   // Config for Hit and BinInfoUtils
   constexpr int nPhiPart = 1260;
   constexpr float fPhiFactor = nPhiPart / TwoPI;
-  //constexpr int nEtaPart = 11;
-  constexpr int nEtaPart = 1;
+  constexpr int nEtaPart = 11;  // 1 is better for GPU best_hit
   constexpr int nEtaBin = 2 * nEtaPart - 1;

   constexpr float fEtaFull = 2 * Config::fEtaDet;
diff --git a/Makefile.config b/Makefile.config
index 8a2966700c5a2..15231cdb21ed5 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -19,7 +19,7 @@
 # OSXGCC5 := yes
 # To keep Dan's version working
 # TBB_PREFIX := tbb
-#TBB_PREFIX := ${TBBROOT}
+# TBB_PREFIX := ${TBBROOT}

 # 1. Use ROOT or not (never used on MIC)
 # Comment out to disable root ("yes" is not relevant)
@@ -28,7 +28,7 @@
 # 2. Use gcc (clang by default on mac) or icc
 # Comment out to force using standard c++. For mic only icc can be used.
 ifdef INTEL_LICENSE_FILE
-CXX := icpc
+CXX := icc
 else ifdef OSXGCC5
 CXX := c++-mp-5
 endif
@@ -39,7 +39,7 @@ CUBROOT=/home/ml15/tools/cub
 NV := nvcc -prec-sqrt=true -I${CUBROOT} #-g -G -lineinfo
 # Comment out to compile for CPU
-USE_CUDA := yes
+#USE_CUDA := yes

 # 3. Optimization
 # -O3 implies vectorization and simd (but not AVX)
@@ -58,7 +58,7 @@ USE_INTRINSICS := -DMPLEX_USE_INTRINSICS
 # To enforce given vector size (does not work with intrinsics!)
 # USE_INTRINSICS := -DMPT_SIZE=1

-# USE_VTUNE_NOTIFY := yes
+USE_VTUNE_NOTIFY := yes

 # 6. MIC stuff is built when icc is the chosen compiler.
 # MIC build is always done without root.