diff --git a/BinInfoUtils.h b/BinInfoUtils.h index 908eca6534c53..7c9481caa3caa 100644 --- a/BinInfoUtils.h +++ b/BinInfoUtils.h @@ -12,18 +12,27 @@ typedef std::pair BinInfo; typedef std::vector> BinInfoLayerMap; typedef std::vector BinInfoMap; +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float downPhi(float phi) { while (phi >= Config::PI) {phi-=Config::TwoPI;} return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float upPhi(float phi) { while (phi <= -Config::PI) {phi+=Config::TwoPI;} return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float normalizedPhi(float phi) { // return std::fmod(phi, (float) Config::PI); // return phi +pi out of phase for |phi| beyond boundary! @@ -31,6 +40,9 @@ inline float normalizedPhi(float phi) return phi; } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline int getPhiPartition(float phi) { //assume phi is between -PI and PI diff --git a/Config.h b/Config.h index d4db239c5ac2f..cc8b1774c81f7 100644 --- a/Config.h +++ b/Config.h @@ -121,7 +121,7 @@ namespace Config // Config for Hit and BinInfoUtils constexpr int nPhiPart = 1260; constexpr float fPhiFactor = nPhiPart / TwoPI; - constexpr int nEtaPart = 11; + constexpr int nEtaPart = 11; // 1 is better for GPU best_hit constexpr int nEtaBin = 2 * nEtaPart - 1; constexpr float fEtaFull = 2 * Config::fEtaDet; @@ -204,7 +204,7 @@ namespace Config #ifdef __MIC__ #define MPT_SIZE 16 #elif defined USE_CUDA - #define MPT_SIZE 10000 + #define MPT_SIZE 8 #else #define MPT_SIZE 8 #endif diff --git a/Hit.h b/Hit.h index c44ba1d8ea594..a95f3d9ed38bb 100644 --- a/Hit.h +++ b/Hit.h @@ -85,11 +85,17 @@ inline float getInvRad2(float x, float y){ return 1.0f/(x*x + y*y); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float getPhi(float x, float y) { return std::atan2(y,x); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float getTheta(float r, float z){ return std::atan2(r,z); } @@ -217,6 +223,10 @@ class Hit const float* posArray() const {return state_.pos_.Array();} const float* errArray() const {return state_.err_.Array();} +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. SVector3& parameters_nc() {return state_.pos_;} diff --git a/Makefile.config b/Makefile.config index badc9594ca741..15231cdb21ed5 100644 --- a/Makefile.config +++ b/Makefile.config @@ -19,6 +19,7 @@ # OSXGCC5 := yes # To keep Dan's version working # TBB_PREFIX := tbb +# TBB_PREFIX := ${TBBROOT} # 1. Use ROOT or not (never used on MIC) # Comment out to disable root ("yes" is not relevant) @@ -34,9 +35,11 @@ endif # 2.1 Use nvcc to compile cuda code # CUDA compiler -NV := nvcc +CUBROOT=/home/ml15/tools/cub +NV := nvcc -prec-sqrt=true -I${CUBROOT} +#-g -G -lineinfo # Comment out to compile for CPU -# USE_CUDA := -DUSE_CUDA +#USE_CUDA := yes # 3. Optimization # -O3 implies vectorization and simd (but not AVX) @@ -113,12 +116,19 @@ INWARD_FIT := -DINWARDFIT # Derived settings ################################################################ -CPPFLAGS := -I. ${USE_MATRIPLEX} ${USE_INTRINSICS} ${USE_CUDA} -std=c++11 +CPPFLAGS := -I. 
${USE_MATRIPLEX} ${USE_INTRINSICS} -std=c++11 CXXFLAGS := ${OPT} ${OSX_CXXFLAGS} -LDFLAGS_HOST := +LDFLAGS_HOST := LDFLAGS_MIC := +ifdef USE_CUDA +CPPFLAGS += -DUSE_CUDA -I/nfs/opt/cuda/include +#CPPFLAGS += -I/home/ml15/tools/cub +CPPFLAGS += -I${CUBROOT} +LDFLAGS_HOST += -L${CUDALIBDIR} +endif + CPPFLAGS += ${USE_STATE_VALIDITY_CHECKS} ${USE_SCATTERING} ${USE_LINEAR_INTERPOLATION} ${ENDTOEND} ${USE_ETA_SEGMENTATION} ${INWARD_FIT} ${GEN_FLAT_ETA} ifdef USE_VTUNE_NOTIFY @@ -130,7 +140,8 @@ endif endif ifneq ($(CXX),icc) - #CXXFLAGS += -Wall -Wno-unknown-pragmas + CPPFLAGS += -I/opt/rh/python27/root/usr/include + LDFLAGS_HOST += -L/opt/rh/python27/root/usr/lib64 endif ifeq ($(CXX),icc) diff --git a/Math/MatrixRepresentationsStatic.h b/Math/MatrixRepresentationsStatic.h index 21cf5b9c59c46..90ef3ab2ab6a3 100644 --- a/Math/MatrixRepresentationsStatic.h +++ b/Math/MatrixRepresentationsStatic.h @@ -241,6 +241,9 @@ namespace Math { inline T* Array() { return fArray; } inline const T* Array() const { return fArray; } +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** assignment : only symmetric to symmetric allowed diff --git a/Math/SMatrix.h b/Math/SMatrix.h index b14edfccc5d56..82126403ef951 100644 --- a/Math/SMatrix.h +++ b/Math/SMatrix.h @@ -272,6 +272,9 @@ class SMatrix { const T* Array() const; /// return pointer to internal array T* Array(); +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** @name --- STL-like interface --- The iterators access the matrix element in the order how they are diff --git a/Math/SVector.h b/Math/SVector.h index 7de0c830a9538..1186867a7679c 100644 --- a/Math/SVector.h +++ b/Math/SVector.h @@ -185,6 +185,9 @@ class SVector { const T* Array() const; /// return non-const pointer to internal array T* Array(); +#ifdef __CUDACC__ + T* ArrayCU(); +#endif /** @name --- STL-like interface --- */ diff --git a/Matriplex/GenMul.pm b/Matriplex/GenMul.pm index 45df5311c487a..186029e53bc0d 100644 --- a/Matriplex/GenMul.pm +++ b/Matriplex/GenMul.pm @@ -532,6 +532,75 @@ sub multiply_standard # ---------------------------------------------------------------------- +sub generate_addend_gpu +{ + my ($S, $x, $y) = @_; + + return undef if $S->{$x}{pat} eq '0' or $S->{$y}{pat} eq '0'; + return "1" if $S->{$x}{pat} eq '1' and $S->{$y}{pat} eq '1'; + + my $xstr = sprintf "$S->{$x}{mat}{name}\[%2d*$S->{$x}{mat}{name}N+$S->{$x}{mat}{name}n]", $S->{$x}{idx}; + my $ystr = sprintf "$S->{$y}{mat}{name}\[%2d*$S->{$y}{mat}{name}N+$S->{$y}{mat}{name}n]", $S->{$y}{idx}; + + return $xstr if $S->{$y}{pat} eq '1'; + return $ystr if $S->{$x}{pat} eq '1'; + + return "${xstr}*${ystr}"; +} + +sub multiply_gpu +{ + # Standard mutiplication - outputs unrolled C code, one line + # per target matrix element. + # Arguments: a, b, c -- all GenMul::MBase with right dimensions. + # Does: c = a * b + + check_multiply_arguments(@_); + + my ($S, $a, $b, $c) = @_; + + my $is_c_symmetric = $c->isa("GenMul::MatrixSym"); + + # With no_size_check matrices do not have to be compatible. + my $k_max = $a->{N} <= $b->{M} ? $a->{N} : $b->{M}; + + for (my $i = 0; $i < $c->{M}; ++$i) + { + my $j_max = $is_c_symmetric ? 
$i + 1 : $c->{N}; + + for (my $j = 0; $j < $j_max; ++$j) + { + my $x = $c->idx($i, $j); + + printf "$S->{prefix}$c->{name}\[%2d*$c->{name}N+$c->{name}n\] = ", $x; + + my @sum; + + for (my $k = 0; $k < $k_max; ++$k) + { + $S->generate_indices_and_patterns_for_multiplication($i, $j, $k); + + my $addend = $S->generate_addend_gpu('a', 'b'); + + push @sum, $addend if defined $addend; + } + if (@sum) + { + print join(" + ", @sum), ";"; + } + else + { + print "0;" + } + print "\n"; + } + } + + $S->delete_temporaries(); +} + +# ---------------------------------------------------------------------- + sub load_if_needed { my ($S, $x) = @_; @@ -709,6 +778,7 @@ sub dump_multiply_std_and_intrinsic } print <<"FNORD"; +#ifndef __CUDACC__ #ifdef MPLEX_INTRINSICS for (int n = 0; n < N; n += MPLEX_INTRINSICS_WIDTH_BYTES / sizeof(T)) @@ -732,6 +802,11 @@ FNORD print <<"FNORD"; } #endif +#else // __CUDACC__ +FNORD + $S->multiply_gpu($a, $b, $c); + print <<"FNORD"; +#endif // __CUDACC__ FNORD diff --git a/Matriplex/Matriplex.h b/Matriplex/Matriplex.h index 0853e75618677..eb26e4ae64827 100644 --- a/Matriplex/Matriplex.h +++ b/Matriplex/Matriplex.h @@ -131,6 +131,9 @@ class Matriplex for (int j = 0; j < N; ++j) { fArray[i*N + j] = * (const T*) (arr + i*sizeof(T) + vi[j]); + //if(j==2) { + //printf("cpu -- %d : %d, %f\n", i, vi[j], fArray[i*N+j]); + //} } } } diff --git a/Matrix.h b/Matrix.h index efe0782c8e87f..fa622e80fc165 100644 --- a/Matrix.h +++ b/Matrix.h @@ -50,11 +50,17 @@ inline double dtime() return( tseconds ); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline float hipo(float x, float y) { return std::sqrt(x*x + y*y); } +#ifdef __CUDACC__ +__host__ __device__ +#endif inline void sincos4(const float x, float& sin, float& cos) { // Had this writen with explicit division by factorial. @@ -74,7 +80,7 @@ inline void sincos4(const float x, float& sin, float& cos) #ifdef __INTEL_COMPILER #define ASSUME_ALIGNED(a, b) __assume_aligned(a, b) #else - #define ASSUME_ALIGNED(a, b) __builtin_assume_aligned(a, b) + #define ASSUME_ALIGNED(a, b) a = static_cast(__builtin_assume_aligned(a, b)) #endif #include "Matriplex/MatriplexSym.h" diff --git a/Track.h b/Track.h index b7d485f1e6a64..334c48bd359f8 100644 --- a/Track.h +++ b/Track.h @@ -140,6 +140,11 @@ class Track const float* posArray() const {return state_.parameters.Array();} const float* errArray() const {return state_.errors.Array();} +//#ifdef USE_CUDA +#if __CUDACC__ + __device__ float* posArrayCU(); + __device__ float* errArrayCU(); +#endif // Non-const versions needed for CopyOut of Matriplex. 
SVector6& parameters_nc() {return state_.parameters;} @@ -149,8 +154,17 @@ class Track SVector3 position() const {return SVector3(state_.parameters[0],state_.parameters[1],state_.parameters[2]);} SVector3 momentum() const {return SVector3(state_.parameters[3],state_.parameters[4],state_.parameters[5]);} +#if __CUDACC__ + __host__ __device__ +#endif int charge() const {return state_.charge;} +#if __CUDACC__ + __host__ __device__ +#endif float chi2() const {return chi2_;} +#if __CUDACC__ + __host__ __device__ +#endif int label() const {return label_;} float x() const { return state_.parameters[0];} @@ -200,12 +214,18 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif void addHitIdx(int hitIdx,float chi2) { hitIdxArr_[++hitIdxPos_] = hitIdx; if (hitIdx >= 0) { ++nGoodHitIdx_; chi2_+=chi2; } } +#if __CUDACC__ + __host__ __device__ +#endif int getHitIdx(int posHitIdx) const { return hitIdxArr_[posHitIdx]; @@ -222,6 +242,9 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif void setHitIdx(int posHitIdx, int newIdx) { hitIdxArr_[posHitIdx] = newIdx; } @@ -233,6 +256,16 @@ class Track } } +#if __CUDACC__ + __host__ __device__ +#endif + void setNGoodHitIdx(int nHits) { + nGoodHitIdx_ = nHits; + } + +#if __CUDACC__ + __host__ __device__ +#endif void resetHits() { hitIdxPos_ = -1; @@ -250,8 +283,18 @@ class Track } return layers; } + +#if __CUDACC__ + __host__ __device__ +#endif void setCharge(int chg) {state_.charge=chg;} +#if __CUDACC__ + __host__ __device__ +#endif void setChi2(float chi2) {chi2_=chi2;} +#if __CUDACC__ + __host__ __device__ +#endif void setLabel(int lbl) {label_=lbl;} void setState(const TrackState& newState) {state_=newState;} diff --git a/mkFit/BuilderCU.cu b/mkFit/BuilderCU.cu new file mode 100644 index 0000000000000..6d6ee60863365 --- /dev/null +++ b/mkFit/BuilderCU.cu @@ -0,0 +1,78 @@ +#include "BuilderCU.h" + +#include "gpu_utils.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" +#include "FitterCU.h" +#include "Event.h" + + +BuilderCU::BuilderCU() +{ +} + + +BuilderCU::BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + setUp(event_of_hits, event, event_of_cands); +} + + +BuilderCU::~BuilderCU() { + tearDown(); +} + + +void BuilderCU::setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands) +{ + int gplex_size = 1 << 14; + cuFitter = new FitterCU (gplex_size); + cuFitter->allocateDevice(); + cuFitter->allocate_extra_addBestHit(); + cuFitter->createStream(); + cuFitter->setNumberTracks(gplex_size); + + event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + std::vector radii (Config::nLayers); + for (int ilay = Config::nlayers_per_seed; ilay < Config::nLayers; ++ilay) { + radii[ilay] = event->geom_.Radius(ilay); + } + geom_cu.allocate(); + geom_cu.getRadiiFromCPU(&radii[0]); + + event_of_cands_cu.allocGPU(event_of_cands); +} + + +void BuilderCU::tearDown() { + event_of_cands_cu.deallocGPU(); + + geom_cu.deallocate(); + event_of_hits_cu.deallocGPU(); + + cuFitter->destroyStream(); + cuFitter->free_extra_addBestHit(); + cuFitter->freeDevice(); + delete cuFitter; +} + + +void BuilderCU::FindTracksBestHit(EventOfCandidates& event_of_cands) +{ + event_of_cands_cu.copyFromCPU(event_of_cands, cuFitter->get_stream()); + + cuFitter->addBestHit(event_of_hits_cu, geom_cu, event_of_cands_cu); + + event_of_cands_cu.copyToCPU(event_of_cands, cuFitter->get_stream()); + 
cudaStreamSynchronize(cuFitter->get_stream()); + //cudaCheckError(); + + /*size_t free_mem, total_mem;*/ + /*cudaMemGetInfo(&free_mem, &total_mem);*/ + /*fprintf(stderr, "Free: %d\n", free_mem);*/ +} diff --git a/mkFit/BuilderCU.h b/mkFit/BuilderCU.h new file mode 100644 index 0000000000000..87439d6be5d5b --- /dev/null +++ b/mkFit/BuilderCU.h @@ -0,0 +1,37 @@ +#ifndef BUILDER_CU_H +#define BUILDER_CU_H + +#include "FitterCU.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" +#include "Geometry.h" +#include "Event.h" + + +// FIXME: Design Issue +// What to do, allocation in ctor, free in dtor? +// not exception-safe +// but manage mem +// or in separate function? +class BuilderCU +{ +public: + BuilderCU(); + BuilderCU(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + ~BuilderCU(); + + void setUp(const EventOfHits& event_of_hits, const Event* event, + const EventOfCandidates& event_of_cands); + void tearDown(); + void FindTracksBestHit(EventOfCandidates& event_of_cands); +private: + FitterCU *cuFitter; + EventOfHitsCU event_of_hits_cu; + EventOfCandidatesCU event_of_cands_cu; + GeometryCU geom_cu; +}; + + +#endif /* ifndef BUILDER_CU_H */ diff --git a/mkFit/FitterCU-imp.h b/mkFit/FitterCU-imp.h index 8a94f3d5f4aaa..cae975636a6df 100644 --- a/mkFit/FitterCU-imp.h +++ b/mkFit/FitterCU-imp.h @@ -1,5 +1,13 @@ +#include +#include "Config.h" +#include "GeometryCU.h" +#include "reorganize_gplex.h" +#include "fittracks_kernels.h" + +#include "Track.h" + template -void FitterCU::setNumberTracks(idx_t Ntracks) { +void FitterCU::setNumberTracks(const idx_t Ntracks) { N = Ntracks; // Raise an exceptioin when the FitterCU instance is too small @@ -22,16 +30,25 @@ void FitterCU::destroyStream() { template void FitterCU::allocateDevice() { + d_par_iP.allocate(Nalloc, LV); d_par_iC.allocate(Nalloc, LV); + + d_Err_iP.allocate(Nalloc, LS); + d_Err_iC.allocate(Nalloc, LS); + d_inChg.allocate(Nalloc, QI); - d_par_iP.allocate(Nalloc, LV); d_errorProp.allocate(Nalloc, LL); - d_Err_iP.allocate(Nalloc, LS); - d_msPar.allocate(Nalloc, HV); - d_outErr.allocate(Nalloc, LS); - d_msErr.allocate(Nalloc, HS); - cudaCheckError() + cudaMalloc((void**)&d_msPar_arr, Config::nLayers * sizeof(GPlexHV)); + cudaMalloc((void**)&d_msErr_arr, Config::nLayers * sizeof(GPlexHS)); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_msPar[hi].allocate(Nalloc, HV); + d_msErr[hi].allocate(Nalloc, HS); + } + cudaMemcpy(d_msPar_arr, d_msPar, Config::nLayers*sizeof(GPlexHV), cudaMemcpyHostToDevice); + cudaMemcpy(d_msErr_arr, d_msErr, Config::nLayers*sizeof(GPlexHS), cudaMemcpyHostToDevice); + cudaMalloc((void**)&d_maxSize, sizeof(int)); // global maximum + cudaCheckError(); } template @@ -41,125 +58,180 @@ void FitterCU::freeDevice() { d_par_iP.free(); d_errorProp.free(); d_Err_iP.free(); - d_msPar.free(); - d_outErr.free(); - d_msErr.free(); - - cudaCheckError() -} - -template -void FitterCU::sendInParToDevice(const MPlexLV& inPar) { - cudaMemcpy2DAsync(d_par_iC.ptr, d_par_iC.pitch, inPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() + d_Err_iC.free(); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_msPar[hi].free(); + d_msErr[hi].free(); + } + cudaFree(d_msPar_arr); + cudaFree(d_msErr_arr); + cudaFree(d_maxSize); + cudaCheckError(); } template -void FitterCU::sendInErrToDevice(const MPlexLS& inErr) { - cudaMemcpy2DAsync(d_outErr.ptr, d_outErr.pitch, inErr.fArray, N*sizeof(T), - N*sizeof(T), LS, 
cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::kalmanUpdateMerged(const int hit_idx) { + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N); } template -void FitterCU::sendInChgToDevice(const MPlexQI& inChg) { - cudaMemcpy2DAsync(d_inChg.ptr, d_inChg.pitch, inChg.fArray, N*sizeof(T), - N*sizeof(T), QI, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::kalmanUpdate_standalone( + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc) +{ + d_Err_iP.copyAsyncFromHost(stream, psErr); + d_msErr[hit_idx].copyAsyncFromHost(stream, msErr); + d_par_iP.copyAsyncFromHost(stream, psPar); + d_msPar[hit_idx].copyAsyncFromHost(stream, msPar); + + kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr[hit_idx], + d_par_iP, d_msPar[hit_idx], d_par_iC, d_Err_iC, N_proc); + + d_par_iC.copyAsyncToHost(stream, outPar); + d_Err_iC.copyAsyncToHost(stream, outErr); } template -void FitterCU::sendMsRadToDevice(const MPlexQF& msRad) { - cudaMemcpy2DAsync(d_msRad.ptr, d_msRad.pitch, msRad.fArray, N*sizeof(T), - N*sizeof(T), QF, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::propagationMerged(const int hit_idx) { + propagation_wrapper(stream, d_msPar[hit_idx], d_Err_iC, d_par_iC, d_inChg, + d_par_iP, d_Err_iP, N); } +// FIXME: Temporary. Separate allocations / transfers template -void FitterCU::sendOutParToDevice(const MPlexLV& outPar) { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, outPar.fArray, N*sizeof(T), - N*sizeof(T), LV, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::allocate_extra_addBestHit() { + d_outChi2.allocate(Nalloc, QF); + d_XHitPos.allocate(Nalloc, QI); + d_XHitSize.allocate(Nalloc, QI); + d_XHitArr.allocate(Nalloc, GPlexHitIdxMax); + cudaMalloc((void**)&d_HitsIdx_arr, Config::nLayers * sizeof(GPlexQI)); + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_HitsIdx[hi].allocate(Nalloc, QI); + } + cudaMemcpy(d_HitsIdx_arr, d_HitsIdx, Config::nLayers*sizeof(GPlexQI), cudaMemcpyHostToDevice); + d_Chi2.allocate(Nalloc, QF); + d_Label.allocate(Nalloc, QI); + cudaCheckError(); } template -void FitterCU::sendOutErrToDevice(const MPlexLS& outErr) { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, outErr.fArray, N*sizeof(T), - N*sizeof(T), LS, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::free_extra_addBestHit() { + for (int hi = 0; hi < Config::nLayers; ++hi) { + d_HitsIdx[hi].free(); cudaCheckError(); + } + cudaFree(d_HitsIdx_arr); + d_Label.free(); cudaCheckError(); + d_Chi2.free(); cudaCheckError(); + + d_XHitArr.free(); cudaCheckError(); + d_XHitSize.free(); cudaCheckError(); + d_XHitPos.free(); cudaCheckError(); + d_outChi2.free(); cudaCheckError(); } template -void FitterCU::sendMsParToDevice(const MPlexHV& msPar) { - cudaMemcpy2DAsync(d_msPar.ptr, d_msPar.pitch, msPar.fArray, N*sizeof(T), - N*sizeof(T), HV, cudaMemcpyHostToDevice, stream); - cudaCheckError() +void FitterCU::setHitsIdxToZero(const int hit_idx) { + cudaMemset(d_HitsIdx[hit_idx].ptr, 0, Nalloc*sizeof(int)); } template -void FitterCU::sendMsErrToDevice(const MPlexHS& msErr) { - cudaMemcpy2DAsync(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), - N*sizeof(T), HS, cudaMemcpyHostToDevice, stream); - cudaCheckError() -} +void FitterCU::addBestHit(EventOfHitsCU &event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu) { + 
findBestHit_wrapper(stream, event.m_layers_of_hits, + event_of_cands_cu, + d_XHitSize, d_XHitArr, + d_Err_iP, d_par_iP, + d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_outChi2, + d_Chi2, d_HitsIdx_arr, + d_inChg, d_Label, geom_cu, + d_maxSize, N); +} template -void FitterCU::getOutParFromDevice(MPlexLV& outPar) { - cudaMemcpy2DAsync(outPar.fArray, N*sizeof(T), d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp) { + InputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, inputProp, N); } template -void FitterCU::getErrorPropFromDevice(MPlexLL& errorProp) { - cudaMemcpy2DAsync(errorProp.fArray, N*sizeof(T), - d_errorProp.ptr, d_errorProp.pitch, - N*sizeof(T), LL, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp) { + OutputTracksCU_wrapper(stream, etaBin, d_Err_iP, d_par_iP, + d_inChg, d_Chi2, d_Label, d_HitsIdx_arr, + beg, end, outputProp, N); + cudaStreamSynchronize(stream); + cudaCheckError(); } -template -void FitterCU::getOutErrFromDevice(MPlexLS& outErr) { - cudaMemcpy2DAsync(outErr.fArray, N*sizeof(T), d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToHost, stream); - cudaCheckError() -} template -void FitterCU::getMsRadFromDevice(MPlexQF& msRad) { - cudaMemcpy2DAsync(msRad.fArray, N*sizeof(T), d_msRad.ptr, d_msRad.pitch, - N*sizeof(T), QF, cudaMemcpyDeviceToHost, stream); - cudaCheckError() +void FitterCU::propagateTracksToR(const float radius, const int N) { + propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + radius, d_Err_iP, d_par_iP, N); } -template -void FitterCU::setOutParFromInPar() { - cudaMemcpy2DAsync(d_par_iP.ptr, d_par_iP.pitch, d_par_iC.ptr, d_par_iC.pitch, - N*sizeof(T), LV, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() -} +#if 1 template -void FitterCU::setOutErrFromInErr() { - cudaMemcpy2DAsync(d_Err_iP.ptr, d_Err_iP.pitch, d_outErr.ptr, d_outErr.pitch, - N*sizeof(T), LS, cudaMemcpyDeviceToDevice, stream); - cudaCheckError() +void FitterCU::propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, const MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP) { + d_Err_iC.copyAsyncFromHost(stream, Err_iC); + d_par_iC.copyAsyncFromHost(stream, par_iC); + //propagationForBuilding_wrapper(stream, d_Err_iC, d_par_iC, d_inChg, + //radius, d_Err_iP, d_par_iP, N); + d_Err_iP.copyAsyncToHost(stream, Err_iP); + d_par_iP.copyAsyncToHost(stream, Par_iP); } template -void FitterCU::kalmanUpdateMerged() { - kalmanUpdate_wrapper(stream, d_Err_iP, d_msErr, - d_par_iP, d_msPar, d_par_iC, d_outErr, N); -} +void FitterCU::FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int Nhits) +{ + float etime; + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); -template -void FitterCU::propagationMerged() { - propagation_wrapper(stream, d_msPar, d_par_iC, d_inChg, - d_par_iP, d_errorProp, d_Err_iP, N); -} + cudaEventRecord(start, 0); + + for (int itrack = 0; itrack < num_tracks; itrack += Nalloc) + { + int beg = itrack; + int end = std::min(itrack + Nalloc, num_tracks); + setNumberTracks(end-beg); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, events_of_hits_cu, + d_Err_iC, d_par_iC, + 
d_msErr_arr, d_msPar_arr, + d_inChg, d_Chi2, d_Label, + d_HitsIdx_arr, beg, end, false, N); + fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr_arr, d_msPar_arr, + d_Err_iC, d_par_iC, d_errorProp, d_inChg, + Nhits, N); + OutputFittedTracksCU_wrapper(stream, tracks_cu, + d_Err_iC, d_par_iC, + d_inChg, d_Chi2, d_Label, + beg, end, N); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&etime, start, stop); + std::cerr << "CUDA etime: " << etime << " ms.\n"; + + cudaEventDestroy(start); + cudaEventDestroy(stop); +} +#else template void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, MPlexHV* msPar, MPlexHS* msErr, int Nhits, @@ -177,9 +249,18 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, setNumberTracks(end-beg); - sendInChgToDevice(Chg); - sendInParToDevice(par_iC); - sendInErrToDevice(err_iC); + Track *tracks_cu; + cudaMalloc((void**)&tracks_cu, tracks.size()*sizeof(Track)); + cudaMemcpy(tracks_cu, &tracks[0], tracks.size()*sizeof(Track), cudaMemcpyHostToDevice); + allocate_extra_addBestHit(); + + InputTracksAndHitsCU_wrapper(stream, tracks_cu, d_Err_iC, d_par_iC, d_inChg, + d_Chi2, d_Label, d_HitsIdx_arr, beg, end, false, N); + + + //d_inChg.copyAsyncFromHost(stream, Chg); + //d_par_iC.copyAsyncFromHost(stream, par_iC); + //d_Err_iC.copyAsyncFromHost(stream, err_iC); cudaEventRecord(start, 0); @@ -188,9 +269,10 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, { // Switch outPut and inPut parameters and errors // similar to iC <-> iP in the CPU code. - setOutParFromInPar(); - setOutErrFromInErr(); // d_Err_iP + d_par_iP.copyAsyncFromDevice(stream, d_par_iC); + d_Err_iP.copyAsyncFromDevice(stream, d_Err_iC); +#if 0 double time_input = dtime(); int itrack; omp_set_num_threads(Config::numThreadsReorg); @@ -206,22 +288,36 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, msPar[hi].CopyIn(itrack, hit.posArray()); } total_reorg += (dtime() - time_input)*1e3; - - sendMsParToDevice(msPar[hi]); - sendMsErrToDevice(msErr[hi]); - - propagationMerged(); - kalmanUpdateMerged(); +#endif + + d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]); + d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]); + + propagationMerged(hi); + //MPlexLS err_iP; + //MPlexLV par_iP; + //d_par_iC.copyAsyncToHost(stream, par_iC); + //d_par_iP.copyAsyncToHost(stream, par_iP); + //d_Err_iP.copyAsyncToHost(stream, err_iP); + //propagation_wrapper(stream, d_msPar[hi], d_Err_iC, d_par_iC, d_inChg, + //d_par_iP, d_errorProp, d_Err_iP, N); + kalmanUpdateMerged(hi); + //fittracks_wrapper(stream, d_Err_iP, d_par_iP, d_msErr, d_msPar, + //d_Err_iC, d_par_iC, d_errorProp, d_inChg, + //hi, N); } cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&etime, start, stop); - std::cerr << "CUDA etime: " << etime << " ms.\n"; - std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + //std::cerr << "CUDA etime: " << etime << " ms.\n"; + //std::cerr << "Total reorg: " << total_reorg << " ms.\n"; + + free_extra_addBestHit(); + cudaFree(tracks_cu); - getOutParFromDevice(par_iC); - getOutErrFromDevice(err_iC); + d_par_iC.copyAsyncToHost(stream, par_iC); + d_Err_iC.copyAsyncToHost(stream, err_iC); cudaStreamSynchronize(stream); // freeDevice(); -> moved to mkFit/mkFit.cc @@ -230,4 +326,4 @@ void FitterCU::FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, cudaEventDestroy(start); cudaEventDestroy(stop); } - +#endif diff --git a/mkFit/FitterCU.h b/mkFit/FitterCU.h index 
a56a912559ea3..41715a7745f9b 100644 --- a/mkFit/FitterCU.h +++ b/mkFit/FitterCU.h @@ -2,48 +2,36 @@ #define _PROPAGATOR_CU_H_ #include -#include -#include #include "Matrix.h" + +#include "HitStructuresCU.h" +#include "GPlex.h" +#include "GeometryCU.h" +#include "gpu_utils.h" + #include "propagation_kernels.h" #include "kalmanUpdater_kernels.h" -#include "GPlex.h" +#include "computeChi2_kernels.h" +#include "index_selection_kernels.h" +#include "best_hit_kernels.h" + +#include +#include -#define LV 6 -#define QI 1 -#define QF 1 +constexpr int LV = 6; +constexpr int QI = 1; +constexpr int QF = 1; #define LL 36 -#define LS 21 -#define HV 3 -#define HS 6 -#define LH 18 +constexpr int LS = 21; +constexpr int HV = 3; +constexpr int HS = 6; +constexpr int LH = 18; -#define BLOCK_SIZE_X 16 -#define MAX_BLOCKS_X 65535 // CUDA constraint +#define BLOCK_SIZE_X 256 using idx_t = Matriplex::idx_t; -// Macro for checking cuda errors following a cuda launch or api call -// This comes from Jeff Larkins (NVIDIA) -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - printf("Cuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e)); \ - exit(0); \ - } \ -} -#if 0 -#define cudaCheckErrorSync() { \ - cudaDeviceSynchronize(); \ - cudaCheckError(); \ -} -#else -#define cudaCheckErrorSync() {} -#endif - -void separate_first_call_for_meaningful_profiling_numbers(); - template class FitterCU { public: @@ -55,54 +43,100 @@ class FitterCU { void createStream(); void destroyStream(); + cudaStream_t& get_stream() { return stream; } - void setNumberTracks(idx_t Ntracks); + int get_Nalloc() const { return Nalloc; } + void setNumberTracks(const idx_t Ntracks); - void sendInParToDevice(const MPlexLV& inPar); - void sendInErrToDevice(const MPlexLS& inErr); - void sendInChgToDevice(const MPlexQI& inChg); - void sendMsRadToDevice(const MPlexQF& msRad); - void sendOutParToDevice(const MPlexLV& outPar); - void sendOutErrToDevice(const MPlexLS& outErr); - void sendMsParToDevice(const MPlexHV& msPar); - - void getErrorPropFromDevice(MPlexLL& errorProp); - void getMsRadFromDevice(MPlexQF& msRad); + void propagationMerged(const int hit_idx); + void kalmanUpdateMerged(const int hit_idx); + void kalmanUpdate_standalone( + const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexLS &outErr, MPlexLV& outPar, + const int hit_idx, const int N_proc); - void setOutParFromInPar(); - void setOutErrFromInErr(); +#if 0 + void computeChi2gpu(const MPlexLS &psErr, const MPlexLV& propPar, + const MPlexQI &inChg, MPlexHS &msErr, MPlexHV& msPar, + float *minChi2, int *bestHit, + LayerOfHitsCU &d_layer, MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + MPlexQF &Chi2, MPlexQI &HitsIdx, MPlexQF&outChi2, int maxSize, int hit_idx, + int NN); +#endif - // updater specfic transfers. 
- void sendMsErrToDevice(const MPlexHS& msErr); - void getOutParFromDevice(MPlexLV& outPar); - void getOutErrFromDevice(MPlexLS& outErr); + void allocate_extra_addBestHit(); + void free_extra_addBestHit(); - void propagationMerged(); - void kalmanUpdateMerged(); +#if 0 + void prepare_addBestHit(); + //const MPlexLS &psErr, const MPlexLV& propPar, + //const MPlexQI &inChg, + //MPlexQI &XHitSize, Matriplex::Matriplex &XHitArr, + //size_t NN); + void finalize_addBestHit( + MPlexHS *msErr, MPlexHV* msPar, + MPlexLS& Err_iP, MPlexLV& Par_iP, + MPlexQI *HitsIdx, + MPlexQI &Label, + int start_idx, int end_idx); +#endif + void setHitsIdxToZero(const int hit_idx); + +#if 1 + //void addBestHit(EventOfHitsCU& event, const int ilay, const float *radii, int hit_idx); + void addBestHit(EventOfHitsCU& event, GeometryCU &geom_cu, + EventOfCandidatesCU &event_of_cands_cu); +#endif + void propagateTracksToR(const float radius, const int N); + void propagateTracksToR_standalone(const float radius, const int N, + const MPlexLS& Err_iC, const MPlexLV& par_iC, + const MPlexQI& inChg, + MPlexLS& Err_iP, MPlexLV& Par_iP); // fitting higher order methods - void FitTracks(MPlexQI &Chg, MPlexLV& par_iC, MPlexLS& err_iC, - MPlexHV* msPar, MPlexHS* msErr, int Nhits, - std::vector &tracks, int beg, int end, - std::vector &layerHits); + void FitTracks(Track *tracks_cu, int num_tracks, + EventOfHitsCU &events_of_hits_cu, + int NHits); + void InputTracksAndHitIdx(const EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool inputProp); + void OutputTracksAndHitIdx(EtaBinOfCandidatesCU &etaBin, + const int beg, const int end, const bool outputProp); private: // N is the actual size, Nalloc should be >= N, as it is intended // to allocated arrays that can be used for several sets of tracks. idx_t Nalloc; idx_t N; + /* data */ - GPlex d_par_iC; // LV - GPlex d_inChg; // QI - GPlex d_par_iP; // LV - GPlex d_msRad; // QF - GPlex d_errorProp; // LL - GPlex d_Err_iP; - GPlex d_msPar; - - GPlex d_outErr; - GPlex d_msErr; + GPlexLV d_par_iP; // LV + GPlexLV d_par_iC; // LV + + GPlexLS d_Err_iP; // LS + GPlexLS d_Err_iC; // LS + + GPlexQI d_inChg; // QI + GPlexQF d_msRad; // QF + GPlexLL d_errorProp; // LL + + GPlexHV *d_msPar_arr; // completely on the GPU + GPlexHV d_msPar[Config::nLayers]; // on the CPU, with arrays on the GPU + GPlexHS *d_msErr_arr; + GPlexHS d_msErr[Config::nLayers]; + GPlexQI d_XHitPos; // QI : 1D arrary following itracks + GPlexQI d_XHitSize; // QI : " " + GPlexHitIdx d_XHitArr; + + GPlexQF d_outChi2; + GPlexQI *d_HitsIdx_arr; + GPlexQI d_HitsIdx[Config::nLayers]; + GPlexQF d_Chi2; + GPlexQI d_Label; + + int *d_maxSize; // max number of tracks for AddBestHit + // everything run in a stream so multiple instance of FitterCU can // run concurrently on the GPU. cudaStream_t stream; diff --git a/mkFit/GPlex.h b/mkFit/GPlex.h index 9dfe3b86e6e02..e5c2c48024b8c 100644 --- a/mkFit/GPlex.h +++ b/mkFit/GPlex.h @@ -4,6 +4,9 @@ #include #include +#include "gpu_utils.h" +#include "Matrix.h" + // GPU implementation of a Matriplex-like structure // The number of tracks is the fast dimension and is padded in order to have // consecutive and aligned memory accesses. 
For cached reads, this result in a @@ -11,23 +14,91 @@ // See: // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 // In practice, The number of tracks (ntracks) is set to be MPT_SIZE -template +template struct GPlex { + using value_type = T; T* ptr; - size_t pitch, stride, x, y; + size_t pitch, stride, N, kSize; + + __device__ T operator[](int xx) const { return ptr[xx]; } + __device__ T& operator[](int xx) { return ptr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return ptr[n + i*stride]; } + __device__ T operator()(int n, int i, int j) const { return ptr[n + i*stride]; } - void allocate(size_t ntracks, size_t plex_size) { - x = ntracks; - y = plex_size; - cudaMallocPitch((void**)&ptr, &pitch, x*sizeof(T), y); + void allocate(size_t ntracks, size_t aSize) { + N = ntracks; + kSize = aSize; + cudaMallocPitch((void**)&ptr, &pitch, N*sizeof(T), kSize); stride = pitch/sizeof(T); // Number of elements } void free() { cudaFree(ptr); - x = 0; y = 0; pitch = 0; stride = 0; + N = 0; kSize = 0; pitch = 0; stride = 0; } //cudaMemcpy2D(d_msErr.ptr, d_msErr.pitch, msErr.fArray, N*sizeof(T), //N*sizeof(T), HS, cudaMemcpyHostToDevice); + + void copyAsyncFromHost(cudaStream_t& stream, const M& mplex) { + cudaMemcpy2DAsync(ptr, pitch, mplex.fArray, N*sizeof(T), + N*sizeof(T), kSize, cudaMemcpyHostToDevice, stream); + cudaCheckError(); + } + void copyAsyncToHost(cudaStream_t& stream, M& mplex) { + cudaMemcpy2DAsync(mplex.fArray, N*sizeof(T), ptr, pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToHost, stream); + cudaCheckError(); + } + void copyAsyncFromDevice(cudaStream_t& stream, GPlex& gplex) { + cudaMemcpy2DAsync(ptr, pitch, gplex.ptr, gplex.pitch, + N*sizeof(T), kSize, cudaMemcpyDeviceToDevice, stream); + cudaCheckError(); + } }; +using GPlexLL = GPlex; +using GPlexLV = GPlex; +using GPlexLS = GPlex; + +using GPlexHH = GPlex; +using GPlexHV = GPlex; +using GPlexHS = GPlex; + +using GPlexLH = GPlex; + +using GPlexQF = GPlex; +using GPlexQI = GPlex; + +const int GPlexHitIdxMax = 16; // FIXME: copied and past from MkFitter.h +using GPlexHitIdx = GPlex>; + +template +struct GPlexReg { + __device__ T operator[](int xx) const { return arr[xx]; } + __device__ T& operator[](int xx) { return arr[xx]; } + + __device__ T& operator()(int n, int i, int j) { return arr[i*D2 + j]; } + __device__ T operator()(int n, int i, int j) const { return arr[i*D2 + j]; } + + __device__ void SetVal(T v) + { + for (int i = 0; i < D1; ++i) + { + arr[i] = v; + } + } + + T arr[D1]; +}; + +using GPlexRegLL = GPlexReg; +using GPlexRegLH = GPlexReg; +using GPlexRegHH = GPlexReg; +using GPlexRegLV = GPlexReg; +using GPlexRegHS = GPlexReg; +using GPlexRegHV = GPlexReg; +using GPlexReg2V = GPlexReg; +using GPlexReg2S = GPlexReg; +using GPlexRegQF = GPlexReg; + #endif // _GPLEX_H_ diff --git a/mkFit/GeometryCU.h b/mkFit/GeometryCU.h new file mode 100644 index 0000000000000..16cda75eaee87 --- /dev/null +++ b/mkFit/GeometryCU.h @@ -0,0 +1,23 @@ +#ifndef GEOMETRY_CU_H +#define GEOMETRY_CU_H + +#include "gpu_utils.h" + +struct GeometryCU { + float *radii; + + void allocate() { + cudaMalloc((void**)&radii, Config::nLayers * sizeof(float)); + cudaCheckError(); + } + void deallocate() { + cudaFree(radii); + cudaCheckError(); + } + void getRadiiFromCPU(const float *h_radii) { + cudaMemcpy(radii, h_radii, Config::nLayers * sizeof(float), cudaMemcpyHostToDevice); + cudaCheckError(); + } +}; + +#endif /* ifndef GEOMETRY_CU_H */ diff --git a/mkFit/HitStructures.cc b/mkFit/HitStructures.cc index 
9d5703555fe59..427ca92690b1f 100644 --- a/mkFit/HitStructures.cc +++ b/mkFit/HitStructures.cc @@ -189,7 +189,7 @@ void LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, std // zb1 -= 2; if (zb < 0) zb = 0; // zb2 += 2; if (zb >= m_nz) zb = m_nz; - if (dump) + //if (dump) printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); @@ -202,6 +202,7 @@ void LayerOfHits::SelectHitIndices(float z, float phi, float dz, float dphi, std for (int hi = m_phi_bin_infos[zi][pb].first; hi < m_phi_bin_infos[zi][pb].second; ++hi) { + printf("hi : %d\n", hi); // Here could enforce some furhter selection on hits #ifdef LOH_USE_PHI_Z_ARRAYS float ddz = std::abs(z - m_hit_zs[hi]); diff --git a/mkFit/HitStructuresCU.cu b/mkFit/HitStructuresCU.cu new file mode 100644 index 0000000000000..3fc3fb797312a --- /dev/null +++ b/mkFit/HitStructuresCU.cu @@ -0,0 +1,215 @@ + +#include +#include + +#include "HitStructuresCU.h" + +void LayerOfHitsCU::alloc_hits(const int size) { + cudaMalloc((void**)&m_hits, sizeof(Hit)*size); + m_capacity = size; +} + +void LayerOfHitsCU::free_hits() { + cudaFree(m_hits); + m_capacity = 0; +} + +void LayerOfHitsCU::alloc_phi_bin_infos(const int nz, const int nphi) { + cudaMalloc((void**)&m_phi_bin_infos, sizeof(PairIntsCU)*nz*nphi); + m_nz = nz; +} + +void LayerOfHitsCU::free_phi_bin_infos() { + cudaFree(m_phi_bin_infos); + m_nz = 0; +} + +void LayerOfHitsCU::copyLayerOfHitsFromCPU(const LayerOfHits &layer, + const cudaStream_t &stream) { + cudaMemcpyAsync(m_hits, layer.m_hits, sizeof(Hit)*m_capacity, + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ + m_zmin = layer.m_zmin; + m_zmax = layer.m_zmax; + m_fz = layer.m_fz; + // FIXME: copy other values + // TODO: probably quite inefficient: + for (int i = 0; i < m_nz; ++i) { + cudaMemcpyAsync(m_phi_bin_infos + i*Config::m_nphi, &(layer.m_phi_bin_infos[i][0]), + sizeof(PairIntsCU)*Config::m_nphi, cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ + } +} + +void LayerOfHitsCU::copyFromCPU(const HitVec hits, const cudaStream_t &stream) +{ + cudaMemcpyAsync(m_hits, &hits[0], sizeof(Hit)*hits.size(), + cudaMemcpyHostToDevice, stream); +} + +void EventOfHitsCU::allocGPU(const EventOfHits &event_of_hits) { + m_n_layers = event_of_hits.m_n_layers; + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU)); + cudaCheckError(); + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. + m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers]; + for (int i = 0; i < m_n_layers; ++i) { + m_layers_of_hits_alloc[i].alloc_hits(event_of_hits.m_layers_of_hits[i].m_capacity); + m_layers_of_hits_alloc[i].alloc_phi_bin_infos( + event_of_hits.m_layers_of_hits[i].m_nz, + Config::m_nphi); + } + /*cudaCheckError();*/ +} + +void EventOfHitsCU::allocGPU(const std::vector &layerHits) +{ + m_n_layers = layerHits.size(); + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_layers_of_hits, m_n_layers*sizeof(LayerOfHitsCU)); + cudaCheckError(); + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. 
+ m_layers_of_hits_alloc = new LayerOfHitsCU[m_n_layers]; + for (int i = 0; i < m_n_layers; ++i) { + m_layers_of_hits_alloc[i].alloc_hits(layerHits[i].size()); + // no phi_bin_infos -- free-d later + m_layers_of_hits_alloc[i].alloc_phi_bin_infos(1, 1); + } + /*cudaCheckError();*/ +} + +void EventOfHitsCU::deallocGPU() { + for (int i = 0; i < m_n_layers; ++i) { + /*cudaCheckError();*/ + m_layers_of_hits_alloc[i].free_hits(); + m_layers_of_hits_alloc[i].free_phi_bin_infos(); + /*cudaCheckError();*/ + } + cudaFree(m_layers_of_hits); + /*cudaCheckError();*/ + delete[] m_layers_of_hits_alloc; +} + +void EventOfHitsCU::copyFromCPU(const EventOfHits& event_of_hits, + const cudaStream_t &stream) { + for (int i = 0; i < event_of_hits.m_n_layers; i++) { + m_layers_of_hits_alloc[i].copyLayerOfHitsFromCPU(event_of_hits.m_layers_of_hits[i]); + } + /*cudaCheckError();*/ + cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc, + event_of_hits.m_n_layers*sizeof(LayerOfHitsCU), + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EventOfHitsCU::copyFromCPU(const std::vector &layerHits, + const cudaStream_t &stream) { + for (int i = 0; i < layerHits.size(); i++) { + m_layers_of_hits_alloc[i].copyFromCPU(layerHits[i]); + } + cudaMemcpyAsync(m_layers_of_hits, m_layers_of_hits_alloc, + m_n_layers*sizeof(LayerOfHitsCU), + cudaMemcpyHostToDevice, stream); +} + + +// ============================================================================ + +void EtaBinOfCandidatesCU::alloc_tracks(const int ntracks) { + m_real_size = ntracks; + m_fill_index = 0; + + cudaMalloc((void**)&m_candidates, sizeof(Track)*m_real_size); + /*cudaCheckError();*/ +} + + +void EtaBinOfCandidatesCU::free_tracks() { + cudaFree(m_candidates); + /*cudaCheckError();*/ + m_real_size = 0; + m_fill_index = 0; +} + + +void EtaBinOfCandidatesCU::copyFromCPU(const EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream) { + assert (eta_bin.m_fill_index < m_real_size); // or something + m_fill_index = eta_bin.m_fill_index; + + cudaMemcpyAsync(m_candidates, &eta_bin.m_candidates[0], + sizeof(Track)*m_fill_index, cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EtaBinOfCandidatesCU::copyToCPU(EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream) const { + assert (eta_bin.m_fill_index < m_real_size); // or something + + cudaMemcpyAsync(&eta_bin.m_candidates[0], m_candidates, + sizeof(Track)*m_fill_index, cudaMemcpyDeviceToHost, stream); + /*cudaCheckError();*/ +} + +// ============================================================================ + +void EventOfCandidatesCU::allocGPU(const EventOfCandidates &event_of_cands) { + m_n_etabins = Config::nEtaBin; + // Allocate GPU array. + // Members's address of array's elements are in the GPU space + cudaMalloc((void**)&m_etabins_of_candidates, m_n_etabins*sizeof(EtaBinOfCandidatesCU)); + /*cudaCheckError();*/ + // Allocate CPU array. + // Members's address of array's elements are in the CPU space + // This allows to call allocate for each array's element. 
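// The same two-level allocation pattern is used for every *CU container in this
// file. A minimal sketch with a hypothetical element type (ElemCU, n_elems and
// sizes[] are placeholders, not names from this codebase):
//
//   struct ElemCU { float *d_buf; void alloc(int n) { cudaMalloc((void**)&d_buf, n*sizeof(float)); } };
//
//   ElemCU *d_elems;                         // device array of structs
//   ElemCU *h_mirror = new ElemCU[n_elems];  // host mirror; its members point to device memory
//   cudaMalloc((void**)&d_elems, n_elems*sizeof(ElemCU));
//   for (int i = 0; i < n_elems; ++i) h_mirror[i].alloc(sizes[i]);   // per-element device buffers
//   cudaMemcpy(d_elems, h_mirror, n_elems*sizeof(ElemCU), cudaMemcpyHostToDevice);
//
// In the real containers the copy of the host mirror into the device array is
// done in the copyFromCPU methods below, once the per-element buffers are filled.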
+ m_etabins_of_candidates_alloc = new EtaBinOfCandidatesCU[m_n_etabins]; + for (int i = 0; i < m_n_etabins; ++i) { + const EtaBinOfCandidates& h_etabin = event_of_cands.m_etabins_of_candidates[i]; + m_etabins_of_candidates_alloc[i].alloc_tracks(h_etabin.m_real_size); + } + /*cudaCheckError();*/ +} + + +void EventOfCandidatesCU::deallocGPU() { + for (int i = 0; i < m_n_etabins; ++i) { + /*cudaCheckError();*/ + m_etabins_of_candidates_alloc[i].free_tracks(); + /*cudaCheckError();*/ + } + cudaFree(m_etabins_of_candidates); + /*cudaCheckError();*/ + delete[] m_etabins_of_candidates_alloc; +} + + +void EventOfCandidatesCU::copyFromCPU(const EventOfCandidates& event_of_cands, + const cudaStream_t &stream) { + for (int i = 0; i < m_n_etabins; i++) { + m_etabins_of_candidates_alloc[i].copyFromCPU(event_of_cands.m_etabins_of_candidates[i]); + } + /*cudaCheckError();*/ + cudaMemcpyAsync(m_etabins_of_candidates, m_etabins_of_candidates_alloc, + m_n_etabins*sizeof(EtaBinOfCandidatesCU), + cudaMemcpyHostToDevice, stream); + /*cudaCheckError();*/ +} + + +void EventOfCandidatesCU::copyToCPU(EventOfCandidates& event_of_cands, + const cudaStream_t &stream) const { + for (int i = 0; i < m_n_etabins; i++) { + m_etabins_of_candidates_alloc[i].copyToCPU(event_of_cands.m_etabins_of_candidates[i]); + } + /*cudaCheckError();*/ + // We do not need to copy the array of pointers to EventOfCandidatesCU back +} diff --git a/mkFit/HitStructuresCU.h b/mkFit/HitStructuresCU.h new file mode 100644 index 0000000000000..11146b3f930e0 --- /dev/null +++ b/mkFit/HitStructuresCU.h @@ -0,0 +1,142 @@ +#ifndef _HIT_STRUCTURES_H_ +#define _HIT_STRUCTURES_H_ + +#include "HitStructures.h" +#include "Config.h" +#include "gpu_utils.h" + +template +struct PairCU { + T1 first; + T2 second; +}; + +using PairIntsCU = PairCU; + +class LayerOfHitsCU { + public: + Hit *m_hits; + PairIntsCU *m_phi_bin_infos; + + float m_zmin, m_zmax, m_fz; + int m_nz = 0; + int m_capacity = 0; + + //fixme, these are copies of the ones above, need to merge with a more generic name + float m_rmin, m_rmax, m_fr; + int m_nr = 0; + + // This could be a parameter, layer dependent. 
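// As a quick worked example of the binning below, assuming Config::m_nphi is
// 1024 (which is what the 0x3ff mask implies, i.e. m_nphi - 1):
//   m_fphi          = 1024 / (2 * pi)                      ~ 162.97
//   GetPhiBin(1.0f) = floorf(162.97f * (1.0f + 3.14159f))  ~ floorf(674.97f) = 674
// Phi values outside (-pi, pi] fall outside [0, m_nphi) and are wrapped by
// masking with m_phi_mask, as the comment above GetPhiBin notes.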
+ //static constexpr int m_nphi = 1024; + // Testing bin filling + // static constexpr int m_nphi = 16; + static constexpr float m_fphi = Config::m_nphi / Config::TwoPI; + static constexpr int m_phi_mask = 0x3ff; + + // As above + //static constexpr float m_max_dz = 1; + //static constexpr float m_max_dphi = 0.02; + + LayerOfHitsCU() {}; + ~LayerOfHitsCU() {}; + + void alloc_hits(const int size); + void free_hits(); + + void alloc_phi_bin_infos(const int nz, const int nphi); + void free_phi_bin_infos(); + + void copyLayerOfHitsFromCPU(const LayerOfHits &layer, + const cudaStream_t &stream=0); + void copyFromCPU(const HitVec hits, const cudaStream_t &stream=0); + +#ifdef __CUDACC__ + __device__ +#endif + int GetZBin(const float z) const { return (z - m_zmin) * m_fz; } + +#ifdef __CUDACC__ + __device__ +#endif + int GetZBinChecked(float z) const { + int zb = GetZBin(z); + if (zb < 0) { + zb = 0; + } else { + if (zb >= m_nz) zb = m_nz - 1; + } + return zb; + } + + // if you don't pass phi in (-pi, +pi), mask away the upper bits using m_phi_mask +#ifdef __CUDACC__ + __device__ +#endif + int GetPhiBin(float phi) const { + return floorf(m_fphi * (phi + Config::PI)); + } +}; + + +class EventOfHitsCU +{ +public: + LayerOfHitsCU *m_layers_of_hits; // the real stuff: on GPU + int m_n_layers; + + // The following array is to be able to allocate GPU arrays from + // the CPU and then copy the address of the GPU ptr to the GPU structure + LayerOfHitsCU *m_layers_of_hits_alloc; + + EventOfHitsCU() : m_n_layers{} {}; + + void allocGPU(const EventOfHits &event_of_hits); + void allocGPU(const std::vector &layerHits); + void deallocGPU(); + void copyFromCPU(const EventOfHits& event_of_hits, + const cudaStream_t &stream=0); + void copyFromCPU(const std::vector &layerHits, + const cudaStream_t &stream=0); +}; + +// ============================================================================ + +class EtaBinOfCandidatesCU +{ +public: + Track *m_candidates; + + int m_real_size; + int m_fill_index; + + void alloc_tracks(const int ntracks); + void free_tracks(); + + void copyFromCPU(const EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0); + void copyToCPU(EtaBinOfCandidates &eta_bin, + const cudaStream_t &stream=0) const; +}; + + +class EventOfCandidatesCU +{ +public: + EtaBinOfCandidatesCU *m_etabins_of_candidates; // device array + int m_n_etabins; + + // Host array. 
For allocation and transfer purposes + EtaBinOfCandidatesCU *m_etabins_of_candidates_alloc; + + EventOfCandidatesCU() : m_n_etabins{} {}; + + void allocGPU(const EventOfCandidates &event_of_cands); + void deallocGPU(); + void copyFromCPU(const EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0); + void copyToCPU(EventOfCandidates &event_of_cands, + const cudaStream_t &stream=0) const; +}; + +#endif // _HIT_STRUCTURES_H_ + diff --git a/mkFit/KalmanUtilsMPlex.cc b/mkFit/KalmanUtilsMPlex.cc index 71d49be47b250..4ab6c6d18836c 100644 --- a/mkFit/KalmanUtilsMPlex.cc +++ b/mkFit/KalmanUtilsMPlex.cc @@ -855,7 +855,6 @@ void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MP #endif } - void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2, diff --git a/mkFit/KalmanUtilsMPlex.h b/mkFit/KalmanUtilsMPlex.h index 678f417d26c69..53234074e2178 100644 --- a/mkFit/KalmanUtilsMPlex.h +++ b/mkFit/KalmanUtilsMPlex.h @@ -4,11 +4,21 @@ #include "Track.h" #include "Matrix.h" +#ifdef USE_CUDA +#include "FitterCU.h" +#endif + void updateParametersMPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexLS &outErr, MPlexLV& outPar, const int N_proc); +#ifdef USE_CUDA // FIXME: temporary; move to FitterCU +void computeChi2MPlex_tmp(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, + const MPlexHS &msErr, const MPlexHV& msPar, + MPlexQF& outChi2, + FitterCU& cuFitter); +#endif void computeChi2MPlex(const MPlexLS &psErr, const MPlexLV& psPar, const MPlexQI &inChg, const MPlexHS &msErr, const MPlexHV& msPar, MPlexQF& outChi2, diff --git a/mkFit/Makefile b/mkFit/Makefile index eb0c38dea566c..3cfd1cbe24120 100644 --- a/mkFit/Makefile +++ b/mkFit/Makefile @@ -74,12 +74,21 @@ CU_OBJS := $(CU_SRCS:.cu=.o) LDFLAGS_CU := -lcudart +# To share __device__ function across several source files +# 1) compile with --device-c (works as -c should be) +# 2) Create some kind of dictionary with all the .cu.o +# 3) The dictionary AND the original .o files should be used +# -- Works only for CUDA_VERSION >= 5 # TODO: Clean the "-I.. -std=c++11" ${CU_OBJS}: %.o: %.cu - ${NV} -c -o $@ $< -I.. -std=c++11 -DUSE_MATRIPLEX + ${NV} --device-c -o $@ $< -I.. 
-std=c++11 -DUSE_MATRIPLEX + +CU_LINK := cu_link.o +${CU_LINK}: ${CU_OBJS} + ${NV} --device-link $^ --output-file $@ endif -ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} +ALLOBJS := ${MKFOBJS} ${ABOVE_OBJS} ${CU_OBJS} ${CU_LINK} ${MKFDEPS}: auto-genmplex @@ -100,6 +109,11 @@ TTree.h: echo "Using dummy rule for TTree.h" endif +ifdef CUBROOT +cub/util_debug.cuh: + echo "Using dummy rule for cub/util_debug.cuh" +endif + ${MKFOBJS}: %.o: %.cc %.d ${CXX} ${CPPFLAGS} ${CXXFLAGS} ${VEC_HOST} -c -o $@ $< diff --git a/mkFit/MkBuilder.h b/mkFit/MkBuilder.h index a7ed903c32f85..e903d2dce6977 100644 --- a/mkFit/MkBuilder.h +++ b/mkFit/MkBuilder.h @@ -3,9 +3,6 @@ #include -#include "HitStructures.h" - - //------------------------------------------------------------------------------ #include "MkFitter.h" @@ -103,6 +100,10 @@ class MkBuilder virtual void FindTracks(); virtual void FindTracksCloneEngine(); virtual void FindTracksCloneEngineTbb(); +#ifdef USE_CUDA + const Event* get_event() const { return m_event; } + const EventOfHits& get_event_of_hits() const { return m_event_of_hits; } +#endif }; #endif diff --git a/mkFit/MkFitter.cc b/mkFit/MkFitter.cc index 28170716f054b..3aee5ab2c26ec 100644 --- a/mkFit/MkFitter.cc +++ b/mkFit/MkFitter.cc @@ -5,7 +5,7 @@ #include "KalmanUtilsMPlex.h" #include "ConformalUtilsMPlex.h" #ifdef USE_CUDA -#include "FitterCU.h" +//#include "FitterCU.h" #endif //#define DEBUG @@ -65,7 +65,10 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA + +// FIXME: uncomment when track building is ported to GPU. +#if 0 +//#ifdef USE_CUDA // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -88,7 +91,9 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +// FIXME: uncomment when track building is ported to GPU. +#if 1 +//#ifndef USE_CUDA for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -114,7 +119,8 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -137,7 +143,8 @@ void MkFitter::InputTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -168,7 +175,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, int idx[NN] __attribute__((aligned(64))); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -200,7 +208,8 @@ void MkFitter::SlurpInTracksAndHits(const std::vector& tracks, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. 
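// A minimal sketch of the intended overlap, assuming the per-layer loop and the
// asynchronous GPlex copies from FitterCU::FitTracks (names such as stream,
// d_msPar, d_msErr, beg and end are taken from that context): while the GPU
// propagates and updates layer hi, the host performs the CopyIn for layer hi+1.
//
//   for (int hi = 0; hi < Nhits; ++hi) {
//     d_msPar[hi].copyAsyncFromHost(stream, msPar[hi]);   // enqueue H2D copies
//     d_msErr[hi].copyAsyncFromHost(stream, msErr[hi]);
//     propagationMerged(hi);                              // kernels queued on the same stream
//     kalmanUpdateMerged(hi);
//     if (hi + 1 < Nhits) {
//       for (int i = beg; i < end; ++i) {                 // CPU work overlaps the GPU kernels
//         const Hit &hit = layerHits[hi + 1][tracks[i].getHitIdx(hi + 1)];
//         msErr[hi + 1].CopyIn(i - beg, hit.errArray());
//         msPar[hi + 1].CopyIn(i - beg, hit.posArray());
//       }
//     }
//   }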
-#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = tracks[beg].getHitIdx(hi); @@ -257,7 +266,9 @@ void MkFitter::InputTracksAndHitIdx(const std::vector& tracks, for (int hi = 0; hi < Nhits; ++hi) { - + // MPBL: It does not seem that these values are that dummies + // Not transfering them to the GPU reduces the number of + // nFoundHits in the printouts. HitsIdx[hi](itrack, 0, 0) = trk.getHitIdx(hi);//dummy value for now } @@ -351,7 +362,8 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, // assert(end - beg == NN); int itrack; -#ifdef USE_CUDA +//#ifdef USE_CUDA +#if 0 // This openmp loop brings some performances when using // a single thread to fit all events. // However, it is more advantageous to use the threads to @@ -378,7 +390,8 @@ void MkFitter::InputSeedsTracksAndHits(const std::vector& seeds, // CopyIn seems fast enough, but indirections are quite slow. // For GPU computations, it has been moved in between kernels // in an attempt to overlap CPU and GPU computations. -#ifndef USE_CUDA +//#ifndef USE_CUDA +#if 1 for (int hi = 0; hi < Nhits; ++hi) { const int hidx = trk.getHitIdx(hi); @@ -661,7 +674,7 @@ void MkFitter::SelectHitIndices(const LayerOfHits &layer_of_hits, const int N_pr // const int pb2 = L.GetPhiBin(phi + dphi) + 2; if (dump) - printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.4f %7.5f %3d %3d %4d %4d\n", + printf("LayerOfHits::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", z, phi, dz, dphi, zb1, zb2, pb1, pb2); // MT: One could iterate in "spiral" order, to pick hits close to the center. diff --git a/mkFit/MkFitter.h b/mkFit/MkFitter.h index 611de6f0affe9..d90c42a9db457 100644 --- a/mkFit/MkFitter.h +++ b/mkFit/MkFitter.h @@ -8,6 +8,11 @@ #include "HitStructures.h" #include "BinInfoUtils.h" +#if USE_CUDA +#include "FitterCU.h" +#include "HitStructuresCU.h" +#endif + //#define DEBUG 1 //#define USE_BOHS diff --git a/mkFit/PropagationMPlex.cc b/mkFit/PropagationMPlex.cc index 733a21c091842..d0ce07b3cbc8a 100644 --- a/mkFit/PropagationMPlex.cc +++ b/mkFit/PropagationMPlex.cc @@ -583,6 +583,18 @@ void helixAtRFromIterativeCCS(const MPlexLV& inPar, const MPlexQI& inChg, MPlexL } } +#if 0 +// FIXME: Do not destroy improvements from the devel branch. +// Dan's shared .icc proof of concept should be bring back once everything is cleanly merged. 
+//#pragma omp declare simd simdlen(NN) notinbranch linear(n) +#include "PropagationMPlex.icc" + +void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp) { + errorProp.SetVal(0); + + //#pragma ivdep + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, 0, NN); +#endif void helixAtRFromIterative(const MPlexLV& inPar, const MPlexQI& inChg, MPlexLV& outPar, const MPlexQF &msRad, MPlexLL& errorProp, const int N_proc) @@ -921,7 +933,7 @@ void applyMaterialEffects(const MPlexQF &hitsRl, const MPlexQF& hitsXi, void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, const MPlexQI &inChg, const MPlexHV& msPar, - MPlexLS &outErr, MPlexLV& outPar, + MPlexLS &outErr, MPlexLV& outPar, const int N_proc) { const idx_t N = NN; @@ -1003,7 +1015,7 @@ void propagateHelixToRMPlex(const MPlexLS &inErr, const MPlexLV& inPar, void propagateHelixToRMPlex(const MPlexLS& inErr, const MPlexLV& inPar, const MPlexQI& inChg, const float r, - MPlexLS& outErr, MPlexLV& outPar, + MPlexLS& outErr, MPlexLV& outPar, const int N_proc) { outErr = inErr; diff --git a/mkFit/PropagationMPlex.icc b/mkFit/PropagationMPlex.icc new file mode 100644 index 0000000000000..a9bfd217d063a --- /dev/null +++ b/mkFit/PropagationMPlex.icc @@ -0,0 +1,200 @@ +template +#ifdef __CUDACC__ +__device__ +#endif +static inline void helixAtRFromIterative_impl(const Tf& __restrict__ inPar, const Ti& __restrict__ inChg, + TfLL1& __restrict__ outPar, const Tf11& __restrict__ msRad, + TfLLL& __restrict__ errorProp, int nmin, int nmax) +{ +#pragma simd + for (int n = nmin; n < nmax; ++n) { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float pxin = inPar(n, 3, 0); + const float pyin = inPar(n, 4, 0); + const float pzin = inPar(n, 5, 0); + const float r = msRad(n, 0, 0); + float r0 = hipo(xin, yin); + + dprint(std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar.ConstAt(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pzin << " q=" << inChg.ConstAt(n, 0, 0)); + + if (std::abs(r-r0)<0.0001f) { + dprint("distance less than 1mum, skip"); + continue; + } + const float pt2 = pxin*pxin+pyin*pyin; + const float pt = std::sqrt(pt2); + const float ptinv = 1.f/pt; + const float pt2inv = ptinv*ptinv; + //p=0.3Br => r=p/(0.3*B) + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float invcurvature = 1.f/(pt*k);//in 1./cm + const float ctgTheta=pzin*ptinv; + + //variables to be updated at each iterations + float totalDistance = 0; + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dTDdx = r0>0.0f ? -xin/r0 : 0.0f; + float dTDdy = r0>0.0f ? 
-yin/r0 : 0.0f; + float dTDdpx = 0.f; + float dTDdpy = 0.f; + //5 iterations is a good starting point + //const unsigned int Niter = 10; + // const unsigned int Niter = 5+std::round(r-r0)/2; +#pragma ivdep + for (unsigned int iter=0; iter < Config::Niter; ++iter) { + + dprint("propagation iteration #" << i); + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float px = outPar(n, 3, 0); + const float py = outPar(n, 4, 0); + r0 = hipo(x, y); + + dprint("r0=" << r0 << " pt=" << pt); + + totalDistance += (r-r0); + dprint("distance=" << (r-r0) << " angPath=" << (r-r0)*invcurvature); + + float cosAP, sinAP; + if (Config::useTrigApprox) { // TODO: uncomment + sincos4((r-r0)*invcurvature, sinAP, cosAP); + } else { + cosAP=std::cos((r-r0)*invcurvature); + sinAP=std::sin((r-r0)*invcurvature); + } + + //helix propagation formulas + //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(px*sinAP-py*(1-cosAP)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(py*sinAP+px*(1-cosAP)); + outPar(n, 2, 0) = outPar(n, 2, 0) + (r-r0)*ctgTheta; + outPar(n, 3, 0) = px*cosAP-py*sinAP; + outPar(n, 4, 0) = py*cosAP+px*sinAP; + //outPar(n, 5, 0) = pz; //take this out as it is redundant + + if (iter+1 != Config::Niter && r0 > 0 && std::abs((r-r0)*invcurvature)>0.000000001f) + { + //update derivatives on total distance for next step, where totalDistance+=r-r0 + //now r0 depends on px and py + + dprint("r0=" << 1.f/r0 << " r0inv=" << r0 << " pt=" << pt); + + //update derivative on D + const float dAPdpx = -(r-r0)*invcurvature*px*pt2inv;//r0 is now 1./r0 (this could go above the redefinition of r0!) + const float dAPdpy = -(r-r0)*invcurvature*py*pt2inv; + r0 = 1.f/r0;//WARNING, now r0 is r0inv (one less temporary) + const float dAPdx = -x*r0*invcurvature; + const float dAPdy = -y*r0*invcurvature; + //reduce temporary variables + //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); + //dydx = k*dAPdx*(py*cosAP + px*sinAP); + //dTDdx -= r0*(x*dxdx + y*dydx); + dTDdx -= r0*(x*(1.f + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); + //reuse same temporary variables + //dxdy = k*dAPdy*(px*cosAP - py*sinAP); + //dydy = 1 + k*dAPdy*(py*cosAP + px*sinAP); + //dTDdy -= r0*(x*dxdy + y*dydy); + dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1.f + k*dAPdy*(py*cosAP + px*sinAP))); + //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); + //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); + //dTDdpx -= r0*(x*dxdpx + y*dydpx); + dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1.f - cosAP + px*sinAP*dAPdpx))); + //dxdpy = k*(px*cosAP*dAPdpy - 1. 
+ cosAP - py*sinAP*dAPdpy); + //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); + //dTDdpy -= r0*(x*dxdpy + y*(dydpy); + dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1.f + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + + } + dprint("iteration end, dump parameters" << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0) << std::endl + << "r=" << std::sqrt( outPar.At(n, 0, 0)*outPar.At(n, 0, 0) + outPar.At(n, 1, 0)*outPar.At(n, 1, 0) ) + << " pT=" << std::sqrt( outPar.At(n, 3, 0)*outPar.At(n, 3, 0) + outPar.At(n, 4, 0)*outPar.At(n, 4, 0) )); + const float TD=totalDistance; + const float TP=TD*invcurvature;//totalAngPath + + dprint("TD=" << TD << " TP=" << TP << " arrived at r=" << std::sqrt(outPar.At(n, 0, 0)*outPar.At(n, 0, 0)+outPar.At(n, 1, 0)*outPar.At(n, 1, 0)) + << std::endl + << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl + << "mom = " << outPar.At(n, 3, 0) << " " << outPar.At(n, 4, 0) << " " << outPar.At(n, 5, 0)); + + const float iC=invcurvature; + const float dCdpx = k*pxin*ptinv; + const float dCdpy = k*pyin*ptinv; + const float dTPdx = dTDdx*iC; + const float dTPdy = dTDdy*iC; + const float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division + const float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division + + float cosTP, sinTP; + if (Config::useTrigApprox) { + sincos4(TP, sinTP, cosTP); + } else { + cosTP = std::cos(TP); + sinTP = std::sin(TP); + } + + //now try to make full jacobian + //derive these to compute jacobian + //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); + //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); + //z = zin + k*TP*pzin; + //px = pxin*cosTP-pyin*sinTP; + //py = pyin*cosTP+pxin*sinTP; + //pz = pzin; + //jacobian + + errorProp(n,0,0) = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; + errorProp(n,0,1) = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; + errorProp(n,0,2) = 0.; + errorProp(n,0,3) = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; + errorProp(n,0,4) = k*(pxin*cosTP*dTPdpy - 1.f + cosTP - pyin*sinTP*dTPdpy);//dxdpy; + errorProp(n,0,5) = 0.; + + errorProp(n,1,0) = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; + errorProp(n,1,1) = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; + errorProp(n,1,2) = 0.; + errorProp(n,1,3) = k*(pyin*cosTP*dTPdpx + 1.f - cosTP + pxin*sinTP*dTPdpx);//dydpx; + errorProp(n,1,4) = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; + errorProp(n,1,5) = 0.; + + errorProp(n,2,0) = k*pzin*dTPdx; //dzdx; + errorProp(n,2,1) = k*pzin*dTPdy; //dzdy; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*pzin*dTPdpx;//dzdpx; + errorProp(n,2,4) = k*pzin*dTPdpy;//dzdpy; + errorProp(n,2,5) = k*TP; //dzdpz; + + errorProp(n,3,0) = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; + errorProp(n,3,1) = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; + errorProp(n,3,2) = 0.; + errorProp(n,3,3) = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; + errorProp(n,3,4) = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; + errorProp(n,3,5) = 0.; + + errorProp(n,4,0) = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; + errorProp(n,4,1) = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; + errorProp(n,4,2) = 0.; + errorProp(n,4,3) = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; + errorProp(n,4,4) = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; + errorProp(n,4,5) = 0.; + + errorProp(n,5,0) = 
0.; + errorProp(n,5,1) = 0.; + errorProp(n,5,2) = 0.; + errorProp(n,5,3) = 0.; + errorProp(n,5,4) = 0.; + errorProp(n,5,5) = 1.f; + } + } +} diff --git a/mkFit/accessors_cu.h b/mkFit/accessors_cu.h new file mode 100644 index 0000000000000..f2bb9d9b07c24 --- /dev/null +++ b/mkFit/accessors_cu.h @@ -0,0 +1,40 @@ +#ifndef ACCESSORS_CU_H +#define ACCESSORS_CU_H + +template <> +__device__ float* SVector3::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SVector6::ArrayCU() { + return fArray; +} + +template <> +__device__ float* ROOT::Math::MatRepSym::ArrayCU() { + return fArray; +} + +template <> +__device__ float* SMatrixSym66::ArrayCU() { + return fRep.ArrayCU(); +} + +__device__ float *Hit::posArrayCU() { + return state_.pos_.ArrayCU(); +} + +__device__ float *Hit::errArrayCU() { + return state_.err_.ArrayCU(); +} + +__device__ float *Track::posArrayCU() { + return state_.parameters.ArrayCU(); +} + +__device__ float *Track::errArrayCU() { + return state_.errors.ArrayCU(); +} + +#endif /* ifndef ACCESSORS_CU_H */ diff --git a/mkFit/array_algorithms_cu.h b/mkFit/array_algorithms_cu.h new file mode 100644 index 0000000000000..439a0a10c6710 --- /dev/null +++ b/mkFit/array_algorithms_cu.h @@ -0,0 +1,59 @@ +#ifndef ARRAY_ALGORITHMS_CU_H +#define ARRAY_ALGORITHMS_CU_H + +#include +#include "atomic_utils.h" +#include "gpu_utils.h" + +template +__device__ void reduceMax_fn(const T *d_in, const int in_size, T *d_max) { + // Specialize BlockReduce type for our thread block + typedef cub::BlockReduce BlockReduceT; + // Shared memory + __shared__ typename BlockReduceT::TempStorage temp_storage; + + for (int i = threadIdx.x + blockIdx.x*blockDim.x; + i < in_size; + i += blockDim.x * gridDim.x) { + // Per-thread tile data + T data[ITEMS_PER_THREAD]; + int block_offset = i - threadIdx.x; + cub::LoadDirectStriped(threadIdx.x, + d_in + block_offset, //blockIdx.x*blockDim.x, + data); + // Compute sum for a single thread block + T aggregate = BlockReduceT(temp_storage).Reduce(data, cub::Max()); + + // FIXME: Is reduction over block enough, or do we need device-wise reduction + // CPU code reduces on NN (typically 8 or 16) values. So block-wise should + // be good enough. + // Device-wise reductions with atomics are performance killers + //if (threadIdx.x == 0) { + //AtomicMax(d_max, aggregate); + //} + *d_max = aggregate; + } +} + +template +__global__ void reduceMax_kernel(T *d_in, int in_size, T *d_max) { + reduceMax_fn(d_in, in_size, d_max); +} + +template +void max_element_wrapper(T *d_v, int num_elems, T *d_max) { + dim3 block (BLOCK_THREADS, 1, 1); + dim3 grid (std::min(max_blocks_x, (num_elems-1)/BLOCK_THREADS + 1), 1, 1) ; + reduceMax_kernel + <<< grid, block >>> + (d_v, num_elems, d_max); +} +#endif /* ifndef ARRAY_ALGORITHMS_CU_H */ diff --git a/mkFit/atomic_utils.h b/mkFit/atomic_utils.h new file mode 100644 index 0000000000000..c5f82d99bc50d --- /dev/null +++ b/mkFit/atomic_utils.h @@ -0,0 +1,102 @@ +#ifndef ATOMIC_UTILS_H +#define ATOMIC_UTILS_H + +/* Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// AtomicMAX code from: +// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu + +#define uint64 unsigned long long + + template +__device__ void AtomicMax(T * const address, const T value) +{ + atomicMax(address, value); +} + +/** + * @brief Compute the maximum of 2 single-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(float * const address, const float value) +{ + if (* address >= value) + { + return; + } + + int * const address_as_i = (int *)address; + int old = * address_as_i, assumed; + + do + { + assumed = old; + if (__int_as_float(assumed) >= value) + { + break; + } + // The value stored at address_as_i might have changed since it has been loaded + // into old and into assumed. + old = atomicCAS(address_as_i, assumed, __float_as_int(value)); + } while (assumed != old); // the other threads did not change address_as_i, so the + // atomicCAS return action makes sense. 
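    // The loop above is the usual compare-and-swap retry pattern: atomicCAS only
    // installs __float_as_int(value) if the word still holds `assumed`; if some
    // other thread updated it first, the fresh value is re-read and the loop
    // either stops (the stored maximum already beats `value`) or retries. The
    // early return at the top merely skips the CAS when the stored value wins.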
+} + +/** + * @brief Compute the maximum of 2 double-precision floating point values using an atomic operation + * + * @param[in]addressThe address of the reference value which might get updated with the maximum + * @param[in]valueThe value that is compared to the reference in order to determine the maximum + */ + template <> +__device__ void AtomicMax(double * const address, const double value) +{ + if (* address >= value) + { + return; + } + + uint64 * const address_as_i = (uint64 *)address; + uint64 old = * address_as_i, assumed; + + do + { + assumed = old; + if (__longlong_as_double(assumed) >= value) + { + break; + } + + old = atomicCAS(address_as_i, assumed, __double_as_longlong(value)); + } while (assumed != old); +} + +#endif /* ifndef ATOMIC_UTILS_H */ diff --git a/mkFit/best_hit_kernels.cu b/mkFit/best_hit_kernels.cu new file mode 100644 index 0000000000000..931aba62d18a9 --- /dev/null +++ b/mkFit/best_hit_kernels.cu @@ -0,0 +1,305 @@ +#include "best_hit_kernels.h" + +#include "computeChi2_kernels.h" +#include "index_selection_kernels.h" +#include "reorganize_gplex.h" +#include "array_algorithms_cu.h" +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +#include +#include +#include +#include + +#define BLOCK_SIZE_X 256 + + +__device__ void getNewBestHitChi2_fn( + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const float *outChi2, float &minChi2, + int &bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + if (hit_cnt < XHitSize[itrack]) { + float chi2 = fabs(outChi2[itrack]); + if (chi2 < minChi2) { + minChi2 = chi2; + bestHit = XHitArr(itrack, hit_cnt, 0); + } + } + } +} + + +__global__ void getNewBestHitChi2_kernel( + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const float *outChi2, float *minChi2, + int *bestHit, const int hit_cnt, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2, minChi2[itrack], bestHit[itrack], hit_cnt, N); + } +} + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + getNewBestHitChi2_kernel <<< grid, block, 0, stream >>> + (XHitSize, XHitArr, outChi2.ptr, minChi2, bestHit, hit_cnt, N); +} + + +void fill_array_cu(float *array, const int size, const float value) { + thrust::device_ptr d_ptr(array); + thrust::fill(d_ptr, d_ptr + size, value); +} + + +__device__ void updateTracksWithBestHit_fn(Hit *hits, + const float minChi2, const int bestHit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + if (bestHit >= 0) + { + Hit &hit = hits[ bestHit ]; + const float &chi2_local = minChi2; + + for (int i = 0; i < msErr.kSize; ++i) { + msErr(itrack, i, 0) = hit.errArrayCU()[i]; + } + for (int i = 0; i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = hit.posArrayCU()[i]; + } + Chi2[itrack] += chi2_local; + HitsIdx[itrack] = bestHit; + } + else + { + /*msErr[Nhits].SetDiagonal3x3(itrack, 666);*/ + msErr(itrack, 0, 0) = 666; + msErr(itrack, 1, 0) = 0; + msErr(itrack, 2, 0) = 666; + msErr(itrack, 3, 0) = 0; + msErr(itrack, 4, 0) = 0; + msErr(itrack, 5, 0) = 666; + + for (int i = 0; 
i < msPar.kSize; ++i) { + msPar(itrack, i, 0) = propPar(itrack, i, 0); + } + HitsIdx[itrack] = -1; + // Don't update chi2 + } + } +} + + +__global__ void updateTracksWithBestHit_kernel(Hit *hits, + const float *minChi2, const int *bestHit, + GPlexHS msErr, GPlexHV msPar, const GPlexLV propPar, + GPlexQF Chi2, GPlexQI HitsIdx, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + if (itrack < N) { + updateTracksWithBestHit_fn + (hits, minChi2[itrack], bestHit[itrack], + msErr, msPar, propPar, Chi2, HitsIdx, N); + } +} + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI &HitsIdx, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + updateTracksWithBestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, minChi2, best_hit, msErr, msPar, propPar, Chi2, HitsIdx, N); +} + + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N) { + thrust::device_ptr d_ptr(d_XHitSize.ptr); + int maxSize= thrust::reduce(d_ptr, d_ptr + N, -1, thrust::maximum()); + maxSize = std::min(maxSize, Config::maxHitsConsidered); + + return maxSize; +} + + +__device__ void bestHit_fn( + Hit *hits, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int itrack, const int N) { + + /*int itrack = threadIdx.x + blockDim.x*blockIdx.x;*/ + int bestHit_reg = -1; + float minChi2_reg = Config::chi2Cut; + + if (itrack < N) + HitsIdx[itrack] = 0; + + for (int hit_cnt = 0; hit_cnt < maxSize; ++hit_cnt) + { + HitToMs_fn(msErr, msPar, hits, XHitSize, XHitArr, HitsIdx, hit_cnt, itrack, N); +#if 0 + // TODO: add CMSGeom + if (Config::useCMSGeom) { + //propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + throw std::runtime_error("useCMSGeom not implemented yet for GPU"); + } else {} +#endif + computeChi2_fn(propErr, msErr, msPar, propPar, outChi2, itrack, N); + getNewBestHitChi2_fn(XHitSize, XHitArr, outChi2.ptr, minChi2_reg, bestHit_reg, hit_cnt, N); + } + updateTracksWithBestHit_fn + (hits, + minChi2_reg, bestHit_reg, + msErr, msPar, propPar, + Chi2, HitsIdx, + N); +} + + +__global__ void bestHit_kernel( + Hit *hits, const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + const GPlexLS propErr, GPlexHS msErr, GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, + GPlexQF Chi2, GPlexQI HitsIdx, + const int maxSize, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + bestHit_fn(hits, XHitSize, XHitArr, + propErr, msErr, msPar, + propPar, outChi2, + Chi2, HitsIdx, + maxSize, itrack, N); +} + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI &HitsIdx, + const int maxSize, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + bestHit_kernel <<< grid, block, 0, stream >>> + (layer.m_hits, XHitSize, XHitArr, + propErr, msErr, msPar, propPar, outChi2, + /*propErr.ptr, propErr.stride,*/ + /*msErr.ptr, msErr.stride, msErr.kSize,*/ + /*msPar.ptr, msPar.stride, msPar.kSize,*/ + 
/*outChi2.ptr, outChi2.stride,*/ + Chi2, HitsIdx, + maxSize, N); +} + + +template +__global__ void findBestHit_kernel(LayerOfHitsCU *layers, + EtaBinOfCandidatesCU *etabin_of_cands, + GPlexQI XHitSize, GPlexHitIdx XHitArr, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexLS Err_iC, GPlexLV Par_iC, + GPlexQF outChi2, + GPlexQF Chi2, GPlexQI *HitsIdx_arr, + GPlexQI inChg, GPlexQI Label, GeometryCU geom, + int *maxSize, int gplex_size, + const int nlayers_per_seed) { + for (int ebin = 0; ebin != Config::nEtaBin; ++ebin) { + for (int beg = 0; beg < etabin_of_cands[ebin].m_fill_index; beg += gplex_size) { + int end = min(beg + gplex_size, etabin_of_cands[ebin].m_fill_index); + int N = end - beg; + + int tidx = threadIdx.x + blockDim.x*blockIdx.x; + int itrack = beg + tidx; + + if (itrack < end) { + + InputTracksCU_fn(etabin_of_cands[ebin].m_candidates, Err_iP, Par_iP, + inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); + + for (int ilay = nlayers_per_seed; ilay < Config::nLayers; ++ilay) + { + int hit_idx = ilay; + GPlexHS &msErr = msErr_arr[hit_idx]; + GPlexHV &msPar = msPar_arr[hit_idx]; + GPlexQI &HitsIdx = HitsIdx_arr[hit_idx]; + + float *radii = geom.radii; + + LayerOfHitsCU &layer = layers[ilay]; + + int maxSize_block; + selectHitIndices_fn(layer, Err_iP, Par_iP, XHitSize, XHitArr, tidx, N); + // FIXME: Is reduction over block enough, or do we need device-wise reduction + reduceMax_fn + (XHitSize.ptr, XHitSize.N, &maxSize_block); + bestHit_fn(layer.m_hits, XHitSize, XHitArr, + Err_iP, msErr, msPar, Par_iP, outChi2, + Chi2, HitsIdx, maxSize_block, tidx, N); + kalmanUpdate_fn( Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, tidx, N); + if (ilay+1 < Config::nLayers) { + float radius = radii[ilay+1]; + propagationForBuilding_fn(Err_iC, Par_iC, inChg, radius, Err_iP, Par_iP, tidx, N); + } + } + OutputTracksCU_fn(etabin_of_cands[ebin].m_candidates, + Err_iP, Par_iP, inChg, Chi2, Label, HitsIdx_arr, beg, end, tidx, N); + } + } + } +} + + +void findBestHit_wrapper(cudaStream_t &stream, + LayerOfHitsCU *layers, + EventOfCandidatesCU &event_of_cands_cu, + GPlexQI &XHitSize, GPlexHitIdx &XHitArr, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &Par_iC, + GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI *HitsIdx, + GPlexQI &inChg, GPlexQI &Label, + GeometryCU &geom, + int *maxSize, int N) { + /*int gridx = std::min((N-1)/BLOCK_SIZE_X + 1,*/ + /*max_blocks_x);*/ + int gridx = (N-1)/BLOCK_SIZE_X + 1; + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + if (gridx > max_blocks_x) { + throw std::runtime_error("The matriplex size should be chosen such " + "that gplex_size*BLOCK_SIZE_X <= max_blocks_x"); + } + // The loop over tracks is taking care of the case where there is more tracks + // than available threads. + // We should actually not throw an exception, GPlex should be allocated with + // a smaller size in MkFitter. 
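  // Spelled out, the launch-size arithmetic guarded above is (a sketch, not
  // part of the patch):
  //   int gridx = (N - 1) / BLOCK_SIZE_X + 1;   // blocks needed to cover N tracks
  //   assert(gridx <= max_blocks_x);            // otherwise allocate a smaller GPlex
  // i.e. the constraint is N <= max_blocks_x * BLOCK_SIZE_X; the exception
  // message above states the inequality the other way around.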
+ + findBestHit_kernel <<< grid, block, 0, stream >>> + (layers, event_of_cands_cu.m_etabins_of_candidates, + XHitSize, XHitArr, Err_iP, Par_iP, msErr, msPar, + Err_iC, Par_iC, outChi2, Chi2, HitsIdx, inChg, Label, geom, maxSize, N, + Config::nlayers_per_seed); +} diff --git a/mkFit/best_hit_kernels.h b/mkFit/best_hit_kernels.h new file mode 100644 index 0000000000000..8cf6a421820fe --- /dev/null +++ b/mkFit/best_hit_kernels.h @@ -0,0 +1,49 @@ +#ifndef BEST_HIT_KERNELS_H +#define BEST_HIT_KERNELS_H + + +#include "GPlex.h" +#include "HitStructuresCU.h" +#include "GeometryCU.h" + + +void getNewBestHitChi2_wrapper(const cudaStream_t &stream, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexQF &outChi2, float *minChi2, int *bestHit, + const int hit_cnt, const int N); + + +void fill_array_cu(float *array, const int size, const float value); + + +void updateTracksWithBestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &, const float *minChi2, const int *best_hit, + GPlexHS &msErr, GPlexHV &msPar, const GPlexLV &propPar, + GPlexQF &Chi2, GPlexQI& HitsIdx, const int N); + +int getMaxNumHits_wrapper(const GPlexQI d_XHitSize, const int N); + + +void bestHit_wrapper(const cudaStream_t &stream, + LayerOfHitsCU &layer, const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + const GPlexLS &propErr, GPlexHS &msErr, GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI& HitsIdx, + const int maxSize, const int N); + + +void findBestHit_wrapper(cudaStream_t &stream, + LayerOfHitsCU *layers, + EventOfCandidatesCU &event_of_cands_cu, + GPlexQI &XHitSize, GPlexHitIdx &XHitArr, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &Par_iC, + GPlexQF &outChi2, + GPlexQF &Chi2, GPlexQI *HitsIdx, + GPlexQI &inChg, GPlexQI &Label, + GeometryCU &geom, + int *maxSize, int N); + + +#endif /* ifndef BEST_HIT_KERNELS_H */ diff --git a/mkFit/buildtestMPlex.cc b/mkFit/buildtestMPlex.cc index 22f066fd8c15f..2b83225218a14 100644 --- a/mkFit/buildtestMPlex.cc +++ b/mkFit/buildtestMPlex.cc @@ -7,6 +7,13 @@ #include "BinInfoUtils.h" #include "MkBuilder.h" + +#ifdef USE_CUDA +#include "FitterCU.h" +#include "BuilderCU.h" +#include "check_gpu_hit_structures.h" +#endif + #include "MkBuilderEndcap.h" #include @@ -99,9 +106,20 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_resume(); #endif +#if USE_CUDA + //check_event_of_hits_gpu(builder.get_event_of_hits()); + //check_event_of_cands_gpu(event_of_cands); + BuilderCU builder_cu(builder.get_event_of_hits(), builder.get_event(), + event_of_cands); +#endif + double time = dtime(); +#if USE_CUDA + builder_cu.FindTracksBestHit(event_of_cands); +#else builder.FindTracksBestHit(event_of_cands); +#endif time = dtime() - time; @@ -109,8 +127,11 @@ double runBuildingTestPlexBestHit(Event& ev) __itt_pause(); #endif - if (!Config::normal_val) {builder.quality_output_besthit(event_of_cands);} - else {builder.root_val_besthit(event_of_cands);} + if (!Config::normal_val) { + builder.quality_output_besthit(event_of_cands); + } else { + builder.root_val_besthit(event_of_cands); + } builder.end_event(); @@ -248,3 +269,69 @@ double runBuildingTestPlexTbb(Event& ev, EventTmp& ev_tmp) return time; } + + +//============================================================================== +// runAllBuildTestPlexBestHitGPU +//============================================================================== + +#if USE_CUDA +double runAllBuildingTestPlexBestHitGPU(std::vector &events) +{ + + int num_builders 
= events.size(); + std::vector> builder_ptrs(num_builders); + std::vector event_of_cands_vec(num_builders); + std::vector builder_cu_vec(num_builders); + + for (int i = 0; i < builder_ptrs.size(); ++i) { + Event &ev = events[i]; + builder_ptrs[i] = std::unique_ptr (make_builder()); + + MkBuilder &builder = * builder_ptrs[i].get(); + + builder.begin_event(&ev, 0, __func__); + + if (Config::findSeeds) {builder.find_seeds();} + else {builder.map_seed_hits();} // all other simulated seeds need to have hit indices line up in LOH for seed fit + + builder.fit_seeds_tbb(); + + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + builder.find_tracks_load_seeds(event_of_cands); + + BuilderCU &builder_cu = builder_cu_vec[i]; + builder_cu.setUp(builder.get_event_of_hits(), builder.get_event(), + event_of_cands); + } + + //omp_set_num_threads(Config::numThreadsEvents); + //std::cerr << "num threads "<< omp_get_num_threads() << std::endl; +//#pragma omp parallel for reduction(+:total_time) + //for (int i = 0; i < builder_ptrs.size(); ++i) { + double time = dtime(); + tbb::parallel_for(size_t(0), builder_ptrs.size(), [&](size_t i) { + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + BuilderCU &builder_cu = builder_cu_vec[i]; + MkBuilder &builder = * builder_ptrs[i].get(); + + builder_cu.FindTracksBestHit(event_of_cands); + }); + time = dtime() - time; + + for (int i = 0; i < builder_ptrs.size(); ++i) { + EventOfCandidates &event_of_cands = event_of_cands_vec[i]; + BuilderCU &builder_cu = builder_cu_vec[i]; + MkBuilder &builder = * builder_ptrs[i].get(); + if (!Config::normal_val) { + builder.quality_output_besthit(event_of_cands); + } else { + builder.root_val_besthit(event_of_cands); + } + + builder.end_event(); + } + + return time; +} +#endif diff --git a/mkFit/buildtestMPlex.h b/mkFit/buildtestMPlex.h index 25b636014e108..bb082ad4a07c6 100644 --- a/mkFit/buildtestMPlex.h +++ b/mkFit/buildtestMPlex.h @@ -13,4 +13,8 @@ double runBuildingTestPlexCloneEngine(Event& ev, EventTmp& evtmp); double runBuildingTestPlexTbb(Event& ev, EventTmp& evtmp); +#if USE_CUDA +double runAllBuildingTestPlexBestHitGPU(std::vector &events); +#endif + #endif diff --git a/mkFit/check_gpu_hit_structures.cu b/mkFit/check_gpu_hit_structures.cu new file mode 100644 index 0000000000000..5918fa740518d --- /dev/null +++ b/mkFit/check_gpu_hit_structures.cu @@ -0,0 +1,147 @@ +#include "check_gpu_hit_structures.h" + +/*#include "reorganize_gplex.cu"*/ +#include "Hit.h" +#include "HitStructures.h" +#include "HitStructuresCU.h" +#include "reorganize_gplex.h" +#include "gpu_utils.h" + +#include + + +__global__ void get_hit_pos_and_err(LayerOfHitsCU *layers, + int ilay, int hit_idx, float *pos, float *err, int pos_size, int err_size) { + if (threadIdx.x + blockDim.x * blockIdx.x == 0) { + LayerOfHitsCU &layer = layers[ilay]; + Hit &hit = layer.m_hits[hit_idx]; + float *posArray = get_posArray(hit); + float *errArray = get_errArray(hit); + for (int i = 0; i < pos_size; ++i) { + pos[i] = posArray[i]; + } + for (int i = 0; i < err_size; ++i) { + err[i] = errArray[i]; + } + } +} + + +void compare_carrays(const float *h_a, const float *d_a, + const float prec, const int n) +{ + for (int i = 0; i < n; ++i) { + // should be relative comparison, verify if div by 0 will happen + if (std::abs(h_a[i] - d_a[i]) > prec) { + std::cerr << i << " : " << h_a[i] << " / " << d_a[i] << std::endl; + } + } +} + + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits) +{ + EventOfHitsCU event_of_hits_cu; + 
event_of_hits_cu.allocGPU(event_of_hits); + event_of_hits_cu.copyFromCPU(event_of_hits); + + constexpr int pos_size = 3; + constexpr int err_size = 6; + + float *d_pos, *d_err; + float pos[pos_size], err[err_size]; + + cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); + cudaMalloc((void**)&d_err, err_size*sizeof(float)); + + dim3 grid(1, 1, 1); + dim3 block(1, 1, 1); + + int ilay = 2; + int hit_idx = 3; + + get_hit_pos_and_err <<< grid, block >>> + (event_of_hits_cu.m_layers_of_hits, ilay, hit_idx, d_pos, d_err, pos_size, err_size); + + cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); + + //std::cerr << "pos ......................\n"; + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].posArray(), + pos, 1e-3, pos_size); + //std::cerr << "err ......................\n"; + compare_carrays(event_of_hits.m_layers_of_hits[ilay].m_hits[hit_idx].errArray(), + err, 1e-3, err_size); + + cudaFree(d_pos); + cudaFree(d_err); + + event_of_hits_cu.deallocGPU(); +} + + +__global__ void get_cand_pos_and_err(EtaBinOfCandidatesCU *etabin_of_cands, + const int ebin, const int itrack, float *pos, float *err, + const int pos_size, const int err_size) +{ + if (threadIdx.x + blockDim.x * blockIdx.x == 0) { + Track &track = etabin_of_cands[ebin].m_candidates[itrack]; + float *posArray = get_posArray(track); + float *errArray = get_errArray(track); + + for (int i = 0; i < pos_size; ++i) { + pos[i] = posArray[i]; + } + for (int i = 0; i < err_size; ++i) { + err[i] = errArray[i]; + } + } +} + + +void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands) +{ + EventOfCandidatesCU event_of_cands_cu; + event_of_cands_cu.allocGPU(event_of_cands); + event_of_cands_cu.copyFromCPU(event_of_cands); + + constexpr int pos_size = 6; + constexpr int err_size = 21; + + float *d_pos, *d_err; + float pos[pos_size], err[err_size]; + + cudaMalloc((void**)&d_pos, pos_size*sizeof(float)); + cudaMalloc((void**)&d_err, err_size*sizeof(float)); + + dim3 grid(1, 1, 1); + dim3 block(1, 1, 1); + + int etabin = std::min(2, Config::nEtaBin-1); + int itrack = 3; + + get_cand_pos_and_err <<< grid, block >>> + (event_of_cands_cu.m_etabins_of_candidates, + etabin, itrack, d_pos, d_err, pos_size, err_size); + cudaCheckErrorSync(); + + cudaMemcpy(pos, d_pos, pos_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrorSync(); + cudaMemcpy(err, d_err, err_size*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrorSync(); + + /*std::cerr << "pos ......................\n";*/ + compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].posArray(), + pos, 1e-3, pos_size); + /*std::cerr << "err ......................\n";*/ + compare_carrays(event_of_cands.m_etabins_of_candidates[etabin].m_candidates[itrack].errArray(), + err, 1e-3, err_size); + + cudaFree(d_pos); + cudaFree(d_err); + + //event_of_cands_cu.copyToCPU(event_of_cands); + event_of_cands_cu.deallocGPU(); + + cudaCheckErrorSync(); +} diff --git a/mkFit/check_gpu_hit_structures.h b/mkFit/check_gpu_hit_structures.h new file mode 100644 index 0000000000000..29e6cdfcce39b --- /dev/null +++ b/mkFit/check_gpu_hit_structures.h @@ -0,0 +1,9 @@ +#ifndef CHECK_GPU_HIT_STRUCTURE_H +#define CHECK_GPU_HIT_STRUCTURE_H + +#include "HitStructures.h" + +void check_event_of_hits_gpu(const EventOfHits& event_of_hits); +void check_event_of_cands_gpu(const EventOfCandidates& event_of_cands); + +#endif /* ifndef CHECK_GPU_HIT_STRUCTURE_H */ diff --git 
a/mkFit/computeChi2_kernels.cu b/mkFit/computeChi2_kernels.cu new file mode 100644 index 0000000000000..dc8d95d870640 --- /dev/null +++ b/mkFit/computeChi2_kernels.cu @@ -0,0 +1,146 @@ +#include "computeChi2_kernels.h" + +#include "GPlex.h" +#include "kalmanUpdater_kernels.h" +#include "gpu_utils.h" + +#include + +#define L 6 +#define HS 6 +#define HV 3 +#define BLOCK_SIZE_X 256 + + +__device__ void chi2Similarity_fn( + const GPlexReg2V &a, + const GPlexReg2S &c, // in registers + float *d, const size_t dN, + const int n) { + + //int n = threadIdx.x + blockIdx.x * blockDim.x; + + // manually subrtact into local vars -- 3 of them + /*float x0 = a[0 * aN + n] - b[0 * aN + n];*/ + /*float x1 = a[1 * aN + n] - b[1 * aN + n];*/ + /*float x2 = a[2 * aN + n] - b[2 * aN + n];*/ + /*d[0 * dN + n] = c[0]*x0*x0 + c[2]*x1*x1 + c[5]*x2*x2 +*/ + /*2*( c[1]*x1*x0 + c[3]*x2*x0 + c[4]*x1*x2);*/ + d[0 * dN + n] = c[0]*a[0]*a[0] + + c[2]*a[1]*a[1] + + 2*( c[1]*a[1]*a[0]); +} + + +__device__ void RotateResidulsOnTangentPlane_fn(const float r00,//r00 + const float r01,//r01 + const GPlexRegHV &a ,//res_glo + GPlexReg2V &b )//res_loc +{ + + // res_loc = rotT * res_glo + // B = R * A + b[0] = r00*a[0] + r01*a[1]; + b[1] = a[2]; +} + + +__device__ void ProjectResErr_fn(const float a00, + const float a01, + const GPlexRegHS &b, + GPlexRegHH &c) +{ + // C = A * B, C is 3x3, A is 3x3 , B is 3x3 sym + + // Based on script generation and adapted to custom sizes. + c[ 0] = a00*b[ 0] + a01*b[ 1]; + c[ 1] = a00*b[ 1] + a01*b[ 2]; + c[ 2] = a00*b[ 3] + a01*b[ 4]; + c[ 3] = b[ 3]; + c[ 4] = b[ 4]; + c[ 5] = b[ 5]; + c[ 6] = a01*b[ 0] - a00*b[ 1]; + c[ 7] = a01*b[ 1] - a00*b[ 2]; + c[ 8] = a01*b[ 3] - a00*b[ 4]; +} + + +__device__ void ProjectResErrTransp_fn(const float a00, + const float a01, const GPlexRegHH &b, GPlexReg2S &c) +{ + // C = A * B, C is 3x3 sym, A is 3x3 , B is 3x3 + + // Based on script generation and adapted to custom sizes. 
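  // Net effect of ProjectResErr_fn followed by this transpose step: the 2x2
  // local covariance resErr_loc = rotT * resErr_glo * rotT^T, with the 2x3
  // projection rotT = [[a00, a01, 0], [0, 0, 1]] (the caller passes rotT00,
  // rotT01). Only the (0,0), (0,1) and (1,1) elements are kept in the
  // symmetric GPlexReg2S.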
+ c[ 0] = b[ 0]*a00 + b[ 1]*a01; + c[ 1] = b[ 3]*a00 + b[ 4]*a01; + c[ 2] = b[ 5]; +} + + +__device__ void computeChi2_fn( + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int n, const int N) { + //int n = threadIdx.x + blockIdx.x * blockDim.x; + /*float resErr_reg[HS]; // ~ resErr_glo*/ + GPlexRegHS resErr_reg; + + if (n < N) { + // coordinate change + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + propPar(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + propPar(n, 0, 0))/(2*r); + + /*float res_glo[HV];*/ + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, propPar, res_glo, N, n); + + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + /*MPlex2S resErr_loc;//covariance sum in local position coordinates*/ + /*MPlexHH tempHH;*/ + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); + + chi2Similarity_fn(res_loc, resErr_loc, outChi2.ptr, outChi2.stride, n); + } +} + + +__global__ void computeChi2_kernel( + const GPlexLS propErr, const GPlexHS msErr, const GPlexHV msPar, + const GPlexLV propPar, GPlexQF outChi2, const int N) { + int grid_width = blockDim.x * gridDim.x; + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + for (int z = 0; z < (N-1)/grid_width +1; z++) { + itrack += z*grid_width; + + if (itrack < N) { + computeChi2_fn (propErr, msErr, msPar, propPar, outChi2, itrack, N); + } + } +} + + +void computeChi2_wrapper(cudaStream_t &stream, + const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + computeChi2_kernel <<< grid, block, 0, stream >>> + (propErr, msErr, msPar, propPar, outChi2, N); + } diff --git a/mkFit/computeChi2_kernels.h b/mkFit/computeChi2_kernels.h new file mode 100644 index 0000000000000..72f0e1f06976e --- /dev/null +++ b/mkFit/computeChi2_kernels.h @@ -0,0 +1,33 @@ +#ifndef _COMPUTE_CHI2_KERNELS_H_ +#define _COMPUTE_CHI2_KERNELS_H_ + +#include "HitStructuresCU.h" +#include "GPlex.h" +#include "GeometryCU.h" + + +__device__ void computeChi2_fn(const GPlexLS &propErr, const GPlexHS &msErr, + const GPlexHV &msPar, const GPlexLV &propPar, GPlexQF &outChi2, + const int itrack, const int N); + + +void computeChi2_wrapper(cudaStream_t &stream, + const GPlexLS &propErr, const GPlexHS &msErr, const GPlexHV &msPar, + const GPlexLV &propPar, GPlexQF &outChi2, const int N); + +#if 1 +void HitToMs_wrapper(cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, GPlexQI &HitsIdx, int hit_cnt, int N); +#endif + +__device__ void RotateResidulsOnTangentPlane_fn(const float r00, + const float r01, const GPlexRegHV &a, GPlexReg2V &b); + +__device__ void ProjectResErr_fn(const float a00, const float a01, + const GPlexRegHS &b, GPlexRegHH &c); + +__device__ void ProjectResErrTransp_fn(const float a00, const float a01, + const GPlexRegHH &b, GPlexReg2S &c); + +#endif 
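Taken together, these kernels implement the standard local-coordinate chi2: rotate the global residual onto the tangent plane, project the summed covariance the same way, invert the resulting 2x2 symmetric matrix with Cramer's rule, and form r^T * C^-1 * r. A minimal host-side sketch of that final step, for illustration only (the GPU code fuses it in invertCramerSym2x2_fn and chi2Similarity_fn; the helper name here is hypothetical):

// Chi2 of a 2-component residual r against a packed 2x2 symmetric covariance
// C = {C00, C01, C11}, inverted in place with Cramer's rule as the kernel does.
inline float chi2_local_2x2(const float r[2], const float C[3])
{
  const float s   = 1.f / (C[0] * C[2] - C[1] * C[1]);  // 1 / det(C)
  const float i00 =  s * C[2];
  const float i01 = -s * C[1];
  const float i11 =  s * C[0];
  return i00 * r[0] * r[0] + 2.f * i01 * r[0] * r[1] + i11 * r[1] * r[1];
}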
diff --git a/mkFit/fittestMPlex.cc b/mkFit/fittestMPlex.cc index 9293835b3d909..b5082932d3d7d 100644 --- a/mkFit/fittestMPlex.cc +++ b/mkFit/fittestMPlex.cc @@ -10,6 +10,7 @@ #include "MkFitter.h" #if USE_CUDA +#include "fittestMPlex.h" #include "FitterCU.h" #endif @@ -24,6 +25,7 @@ #include #include +#include #if defined(USE_VTUNE_PAUSE) #include "ittnotify.h" @@ -182,58 +184,107 @@ double runFittingTestPlex(Event& ev, std::vector& rectracks) } #ifdef USE_CUDA -double runFittingTestPlexGPU(FitterCU &cuFitter, - Event& ev, std::vector& rectracks) +void runAllEventsFittingTestPlexGPU(std::vector& events) { + double s_tmp = 0.0; +#if 0 + In principle, the warmup loop should not be required. + The separate_first_call_for_meaningful_profiling_numbers() function + should be enough. + // Warmup loop + for (int i = 0; i < 1; ++i) { + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + Event &ev = events[0]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + + if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + cuFitter.freeDevice(); + } +#endif + separate_first_call_for_meaningful_profiling_numbers(); + + // Reorgnanization (copyIn) can eventually be multithreaded. + omp_set_nested(1); + + omp_set_num_threads(Config::numThreadsEvents); + double total_gpu_time = dtime(); +#pragma omp parallel reduction(+:s_tmp) + { + int numThreadsEvents = omp_get_num_threads(); + int thr_idx = omp_get_thread_num(); + + // FitterCU is declared here to share allocations and deallocations + // between the multiple events processed by a single thread. + int gplex_size = 10000; + FitterCU cuFitter(gplex_size); + cuFitter.allocateDevice(); + cuFitter.allocate_extra_addBestHit(); + + for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { + int idx = thr_idx; + printf("==============================================================\n"); + printf("Processing event %d with thread %d\n", evt, idx); + Event &ev = events[evt-1]; + std::vector plex_tracks_ev; + plex_tracks_ev.resize(ev.simTracks_.size()); + double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; + + //if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); + + printf("Matriplex fit = %.5f -------------------------------------", tmp); + printf("\n"); + s_tmp += tmp; +#if 0 // 0 for timing, 1 for validation + // Validation crashes for multiple threads. + // It is something in relation to ROOT. Not sure what. + if (omp_get_num_threads() <= 1) { + //if (g_run_fit_std) { + std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; + make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); + //} + } +#endif + } + cuFitter.free_extra_addBestHit(); + cuFitter.freeDevice(); + } + std::cerr << "###### [Fitting] Total GPU time: " << dtime() - total_gpu_time << " ######\n"; +} - std::vector& simtracks = ev.simTracks_; - - const int Nhits = MAX_HITS; - // XXX What if there's a missing / double layer? - // Eventually, should sort track vector by number of hits! - // And pass the number in on each "setup" call. - // Reserves should be made for maximum possible number (but this is just - // measurments errors, params). - // NOTE: MkFitter *MUST* be on heap, not on stack! - // Standard operator new screws up alignment of ALL MPlex memebrs of MkFitter, - // even if one adds attr(aligned(64)) thingy to every possible place. 
+double runFittingTestPlexGPU(FitterCU &cuFitter, + Event& ev, std::vector& rectracks) +{ + std::vector& simtracks = ev.simTracks_; - // MkFitter *mkfp = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits); + cuFitter.createStream(); - MkFitter* mkfp_arr = new (_mm_malloc(sizeof(MkFitter), 64)) MkFitter(Nhits); + Track *tracks_cu; + cudaMalloc((void**)&tracks_cu, simtracks.size()*sizeof(Track)); + cudaMemcpyAsync(tracks_cu, &simtracks[0], simtracks.size()*sizeof(Track), + cudaMemcpyHostToDevice, cuFitter.get_stream()); - int theEnd = simtracks.size(); - double time = dtime(); - int Nstride = NN; + EventOfHitsCU events_of_hits_cu; + events_of_hits_cu.allocGPU(ev.layerHits_); + events_of_hits_cu.copyFromCPU(ev.layerHits_, cuFitter.get_stream()); - for (int itrack = 0; itrack < theEnd; itrack += Nstride) - { - int end = std::min(itrack + Nstride, theEnd); + double time = dtime(); - MkFitter *mkfp = mkfp_arr; + cuFitter.FitTracks(tracks_cu, simtracks.size(), events_of_hits_cu, Config::nLayers); - //double time_input = dtime(); - mkfp->InputTracksAndHits(simtracks, ev.layerHits_, itrack, end); - //std::cerr << "Input time: " << (dtime() - time_input)*1e3 << std::endl; + cudaMemcpy(&rectracks[0], tracks_cu, simtracks.size()*sizeof(Track), cudaMemcpyDeviceToHost); - cuFitter.FitTracks(mkfp->Chg, - mkfp->GetPar0(), - mkfp->GetErr0(), - mkfp->msPar, - mkfp->msErr, - Nhits, - simtracks, itrack, end, ev.layerHits_); + time = dtime() - time; - double time_output = dtime(); - mkfp->OutputFittedTracks(rectracks, itrack, end); - std::cerr << "Output time: " << (dtime() - time_output)*1e3 << std::endl; - } - time = dtime() - time; + events_of_hits_cu.deallocGPU(); + cudaFree(tracks_cu); - _mm_free(mkfp_arr); + cuFitter.destroyStream(); - return time; + return time; } #endif diff --git a/mkFit/fittestMPlex.h b/mkFit/fittestMPlex.h index 5fba69308fb23..2b88d56a17680 100644 --- a/mkFit/fittestMPlex.h +++ b/mkFit/fittestMPlex.h @@ -15,6 +15,7 @@ void make_validation_tree(const char *fname, double runFittingTestPlex(Event& ev, std::vector& rectracks); #ifdef USE_CUDA +void runAllEventsFittingTestPlexGPU(std::vector& events); double runFittingTestPlexGPU(FitterCU &cuFitter, Event& ev, std::vector& rectracks); #endif diff --git a/mkFit/fittracks_kernels.cu b/mkFit/fittracks_kernels.cu new file mode 100644 index 0000000000000..8833d3c3809c4 --- /dev/null +++ b/mkFit/fittracks_kernels.cu @@ -0,0 +1,48 @@ +#include "fittracks_kernels.h" + +#include "kalmanUpdater_kernels.h" +#include "propagation_kernels.h" + +constexpr int BLOCK_SIZE_X = 256; + +__global__ void fittracks_kernel( + GPlexLV par_iP, GPlexLS Err_iP, + GPlexHV *msPar_arr, GPlexHS *msErr_arr, + GPlexLV par_iC, GPlexLS Err_iC, + GPlexLL errorProp, GPlexQI inChg, + const int Nhits, int N) +{ + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + for (int hi = 0; hi < Nhits; ++hi) { + GPlexHV &msPar = msPar_arr[hi]; + GPlexHS &msErr = msErr_arr[hi]; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + + propagation_fn(Err_iC, par_iC, inChg, msPar, Err_iP, par_iP, n, N); + kalmanUpdate_fn(Err_iP, msErr, par_iP, msPar, par_iC, Err_iC, n, N); + } + } +} + +void fittracks_wrapper(cudaStream_t &stream, + GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int Nhits, const int N) +{ + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 
block(BLOCK_SIZE_X, 1, 1); + + fittracks_kernel <<< grid, block, 0, stream >>> + (par_iP, Err_iP, + msPar_arr, msErr_arr, + par_iC, Err_iC, + errorProp, inChg, + Nhits, N); +} diff --git a/mkFit/fittracks_kernels.h b/mkFit/fittracks_kernels.h new file mode 100644 index 0000000000000..2486b80a51e3b --- /dev/null +++ b/mkFit/fittracks_kernels.h @@ -0,0 +1,13 @@ +#ifndef FITTRACKS_KERNELS_H_G3FDJYTX +#define FITTRACKS_KERNELS_H_G3FDJYTX + +#include "GPlex.h" + +void fittracks_wrapper(cudaStream_t &stream, + GPlexLS &Err_iP, GPlexLV &par_iP, + GPlexHS *msErr, GPlexHV *msPar, + GPlexLS &Err_iC, GPlexLV &par_iC, + GPlexLL &errorProp, GPlexQI &inChg, + const int hit_idx, const int N); + +#endif /* end of include guard: FITTRACKS_KERNELS_H_G3FDJYTX */ diff --git a/mkFit/gpu_utils.cu b/mkFit/gpu_utils.cu new file mode 100644 index 0000000000000..2800aa39bd4b8 --- /dev/null +++ b/mkFit/gpu_utils.cu @@ -0,0 +1,9 @@ +#include "gpu_utils.h" + +void sync_gpu() { + cudaCheckErrorSync(); +} + +void separate_first_call_for_meaningful_profiling_numbers() { + sync_gpu(); +} diff --git a/mkFit/gpu_utils.h b/mkFit/gpu_utils.h new file mode 100644 index 0000000000000..3995866b9370a --- /dev/null +++ b/mkFit/gpu_utils.h @@ -0,0 +1,29 @@ +#ifndef GPU_UTILS_H +#define GPU_UTILS_H + +#include + +#include + +#define cudaCheckError() \ + do { \ + cudaError_t e=cudaGetLastError(); \ + CubDebugExit(e) \ + } while(0) + +#define cudaCheckErrorSync() \ + do { \ + cudaDeviceSynchronize(); \ + cudaCheckError(); \ + } while(0) + +// CUDA specific: +// Maximum number of blocks in the X direction of the thread grid. +constexpr int max_blocks_x = INT32_MAX; + +// The first call to a CUDA API function takes the initialization hit. +void separate_first_call_for_meaningful_profiling_numbers(); + +void sync_gpu(); + +#endif /* ifndef GPU_UTILS_H */ diff --git a/mkFit/index_selection_kernels.cu b/mkFit/index_selection_kernels.cu new file mode 100644 index 0000000000000..00fa9fb987dce --- /dev/null +++ b/mkFit/index_selection_kernels.cu @@ -0,0 +1,176 @@ +#include "index_selection_kernels.h" +#include "Config.h" +#include "HitStructures.h" +#include "gpu_utils.h" + +#include "stdio.h" + +#define BLOCK_SIZE_X 32 + +constexpr bool tmp_useCMSGeom = false; + +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < N) { + bool dump = false; + const float nSigmaPhi = 3; + const float nSigmaZ = 3; + + int xhitsize_tmp = XHitSize[itrack]; + XHitSize[itrack] = 0; + + float z, phi, dz, dphi; + { + const float x = Par(itrack, 0, 0); + const float y = Par(itrack, 1, 0); + + const float r2 = x*x + y*y; + + z = Par(itrack, 2, 0); + phi = getPhi(x, y); + /*dz = nSigmaZ * sqrtf(Err(itrack, 2, 2));*/ + dz = nSigmaZ * sqrtf(Err[5*Err.stride + itrack]); + + const float dphidx = -y/r2, dphidy = x/r2; + /*const float dphi2 = dphidx * dphidx * Err(itrack, 0, 0) +*/ + /*dphidy * dphidy * Err(itrack, 1, 1) +*/ + /*2 * dphidx * dphidy * Err(itrack, 0, 0);*/ + const float dphi2 = dphidx * dphidx * Err[0*Err.stride + itrack] + + dphidy * dphidy * Err[2*Err.stride + itrack] + + 2 * dphidx * dphidy * Err[1*Err.stride + itrack]; + +#ifdef HARD_CHECK + assert(dphi2 >= 0); +#endif + + dphi = nSigmaPhi * sqrtf(fabs(dphi2)); + + if (tmp_useCMSGeom) + { + //now correct for bending and for layer thickness unsing linear approximation + const float deltaR = 
Config::cmsDeltaRad; //fixme! using constant value, to be taken from layer properties + const float r = sqrt(r2); +#ifdef CCSCOORD + //here alpha is the difference between posPhi and momPhi + const float alpha = phi - Par(itrack, 4, 0); + float cosA, sinA; + if (Config::useTrigApprox) { + sincos4(alpha, sinA, cosA); + } else { + cosA = cos(alpha); + sinA = sin(alpha); + } +#else + const float px = Par(itrack, 3, 0); + const float py = Par(itrack, 4, 0); + const float pt = ::sqrt(px*px + py*py); + //here alpha is the difference between posPhi and momPhi + //const float cosA = ( x*px + dy*py ) / (pt*r); + //const float sinA = ( y*px - dx*py ) / (pt*r); + // FIXME dx, dy do not exist: + // does not matter yet for gpu as cms geom is not implemented + const float cosA = ( x*px + y*py ) / (pt*r); + const float sinA = ( y*px - x*py ) / (pt*r); +#endif + //take abs so that we always inflate the window + const float dist = fabs(deltaR*sinA/cosA); + + dphi += dist / r; + } + } + + const LayerOfHitsCU &L = layer_of_hits; + + if (fabs(dz) > Config::m_max_dz) dz = Config::m_max_dz; + if (fabs(dphi) > Config::m_max_dphi) dphi = Config::m_max_dphi; + + const int zb1 = L.GetZBinChecked(z - dz); + const int zb2 = L.GetZBinChecked(z + dz) + 1; + const int pb1 = L.GetPhiBin(phi - dphi); + const int pb2 = L.GetPhiBin(phi + dphi) + 1; + // MT: The extra phi bins give us ~1.5% more good tracks at expense of 10% runtime. + // const int pb1 = L.GetPhiBin(phi - dphi) - 1; + // const int pb2 = L.GetPhiBin(phi + dphi) + 2; + + if (dump) + printf("LayerOfHitsCU::SelectHitIndices %6.3f %6.3f %6.6f %7.5f %3d %3d %4d %4d\n", + z, phi, dz, dphi, zb1, zb2, pb1, pb2); + + // MT: One could iterate in "spiral" order, to pick hits close to the center. + // http://stackoverflow.com/questions/398299/looping-in-a-spiral + // This would then work best with relatively small bin sizes. + // Or, set them up so I can always take 3x3 array around the intersection. + for (int zi = zb1; zi < zb2; ++zi) + { + for (int pi = pb1; pi < pb2; ++pi) + { + const int pb = pi & L.m_phi_mask; + + // MT: The following line is the biggest hog (4% total run time). + // This comes from cache misses, I presume. + // It might make sense to make first loop to extract bin indices + // and issue prefetches at the same time. + // Then enter vectorized loop to actually collect the hits in proper order. + +#if 1 + /*for (int hi = L.m_phi_bin_infos[zi][pb].first; hi < L.m_phi_bin_infos[zi][pb].second; ++hi)*/ + for (int hi = L.m_phi_bin_infos[zi*Config::m_nphi + pb].first; + hi < L.m_phi_bin_infos[zi*Config::m_nphi + pb].second; ++hi) + { + // MT: Access into m_hit_zs and m_hit_phis is 1% run-time each. + +#ifdef LOH_USE_PHI_Z_ARRAYS + float ddz = fabs(z - L.m_hit_zs[hi]); + float ddphi = fabs(phi - L.m_hit_phis[hi]); + if (ddphi > Config::PI) ddphi = Config::TwoPI - ddphi; + + if (dump) + printf(" SHI %3d %4d %4d %5d %6.3f %6.3f %6.4f %7.5f %s\n", + zi, pi, pb, hi, + L.m_hit_zs[hi], L.m_hit_phis[hi], ddz, ddphi, + (ddz < dz && ddphi < dphi) ? "PASS" : "FAIL"); + + // MT: Commenting this check out gives full efficiency ... + // and means our error estimations are wrong! + // Avi says we should have *minimal* search windows per layer. + // Also ... if bins are sufficiently small, we do not need the extra + // checks, see above. + // if (ddz < dz && ddphi < dphi && XHitSize[itrack] < MPlexHitIdxMax) +#endif + // MT: The following check also makes more sense with spiral traversal, + // we'd be taking in closest hits first. 
+ if (XHitSize[itrack] < GPlexHitIdxMax) + { + /*if (itrack == 0)*/ + /*if (XHitArr(itrack, XHitSize[itrack], 0) != hi) {*/ + /*printf("before %d, after %d\n", XHitArr(itrack, XHitSize[itrack], 0), hi);*/ + /*printf("--- %d, after %d\n", XHitSize[itrack], hi);*/ + /*}*/ + XHitArr(itrack, XHitSize[itrack]++, 0) = hi; + } + } +#endif // 0 + } + } + } +} + +__global__ void selectHitIndices_kernel(const LayerOfHitsCU layer_of_hits, + const GPlexLS Err, const GPlexLV Par, GPlexQI XHitSize, GPlexHitIdx XHitArr, const int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + selectHitIndices_fn(layer_of_hits, Err, Par, XHitSize, XHitArr, itrack, N); +} + +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + selectHitIndices_kernel <<< grid, block, 0, stream >>> + (layer_of_hits, Err, Par, XHitSize, XHitArr, N); +} diff --git a/mkFit/index_selection_kernels.h b/mkFit/index_selection_kernels.h new file mode 100644 index 0000000000000..801b5a4b068f9 --- /dev/null +++ b/mkFit/index_selection_kernels.h @@ -0,0 +1,15 @@ +#ifndef _INDEX_SELECTION_KERNELS_H_ +#define _INDEX_SELECTION_KERNELS_H_ + +#include "HitStructuresCU.h" +#include "GPlex.h" + +void selectHitIndices_wrapper(const cudaStream_t& stream, + const LayerOfHitsCU& layer_of_hits, const GPlexLS& Err, const GPlexLV& Par, + GPlexQI& XHitSize, GPlexHitIdx& XHitArr, const int N); + +__device__ void selectHitIndices_fn(const LayerOfHitsCU &layer_of_hits, + const GPlexLS &Err, const GPlexLV &Par, GPlexQI &XHitSize, + GPlexHitIdx &XHitArr, const int itrack, const int N); + +#endif // _INDEX_SELECTION_KERNELS_H_ diff --git a/mkFit/kalmanUpdater_kernels.cu b/mkFit/kalmanUpdater_kernels.cu index cfd3ea7587608..0c5a44d2db37f 100644 --- a/mkFit/kalmanUpdater_kernels.cu +++ b/mkFit/kalmanUpdater_kernels.cu @@ -1,20 +1,289 @@ #include "Config.h" +#include "Hit.h" #include "kalmanUpdater_kernels.h" +#include "computeChi2_kernels.h" +#include "gpu_utils.h" // TODO: Clean all the hard-coded #define #define LS 21 #define HS 6 #define LH 18 +#define HV 3 #define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint + +/*__device__ float getPhi_fn2(float x, float y)*/ +/*{*/ + /*return atan2(y,x); */ +/*}*/ + +/*__device__ float getTheta_fn(float r, float z){*/ + /*return atan2(r,z);*/ +/*}*/ + +__device__ void subtract_matrix(const float *a, const int aN, + const float *b, const int bN, + float *c, const int cN, + const int size, const int n) { + for (int i = 0; i < size; ++i) { + c[i*cN + n] = a[i*aN + n] - b[i*bN + n]; + + } +} + +__device__ float getHypot_fn(const float x, const float y) +{ + return sqrt(x*x + y*y); +} + +__device__ +void KalmanHTG_fn(const float a00, const float a01, + const GPlexReg2S &b, GPlexRegHH &c) +{ + + // HTG = rot * res_loc + // C = A * B + + // Based on script generation and adapted to custom sizes. 
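  // In matrix form: C = rotT^T * B, with rotT^T = [[a00, 0], [a01, 0], [0, 1]]
  // (3x2) and B the inverted 2x2 local covariance; the product is stored as a
  // 3x3 GPlexRegHH whose third column is explicitly zeroed.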
+ c[ 0] = a00*b[ 0]; + c[ 1] = a00*b[ 1]; + c[ 2] = 0.; + c[ 3] = a01*b[ 0]; + c[ 4] = a01*b[ 1]; + c[ 5] = 0.; + c[ 6] = b[ 1]; + c[ 7] = b[ 2]; + c[ 8] = 0.; +} + +__device__ +void KalmanGain_fn(const GPlexLS &A, const GPlexRegHH &b, GPlexRegLH &c, const int n) +{ + // C = A * B, C is 6x3, A is 6x6 sym , B is 6x3 + using T = float; + float *a = A.ptr; + int aN = A.stride; int an = n; // Global array + int bN = 1; int bn = 0; // Register array + int cN = 1; int cn = 0; + +#include "KalmanGain.ah" +} + +__device__ +void KHMult_fn(const GPlexRegLH &a, + const float b00, + const float b01, + GPlexRegLL &c) +{ + c[ 0] = a[ 0]*b00; + c[ 1] = a[ 0]*b01; + c[ 2] = a[ 1]; + c[ 3] = 0; + c[ 4] = 0; + c[ 5] = 0; + c[ 6] = a[ 3]*b00; + c[ 7] = a[ 3]*b01; + c[ 8] = a[ 4]; + c[ 9] = 0; + c[10] = 0; + c[11] = 0; + c[12] = a[ 6]*b00; + c[13] = a[ 6]*b01; + c[14] = a[ 7]; + c[15] = 0; + c[16] = 0; + c[17] = 0; + c[18] = a[ 9]*b00; + c[19] = a[ 9]*b01; + c[20] = a[10]; + c[21] = 0; + c[22] = 0; + c[23] = 0; + c[24] = a[12]*b00; + c[25] = a[12]*b01; + c[26] = a[13]; + c[27] = 0; + c[28] = 0; + c[29] = 0; + c[30] = a[15]*b00; + c[31] = a[15]*b01; + c[32] = a[16]; + c[33] = 0; + c[34] = 0; + c[35] = 0; +} + +__device__ +void KHC_fn(const GPlexRegLL &a, const GPlexLS &B, GPlexLS &C, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + using T = float; + int aN = 1; int an = 0; // Register array + T *b = B.ptr; int bN = B.stride; int bn = n; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "KHC.ah" +} + +// +__device__ +void ConvertToPolar_fn(const GPlexLV &a, GPlexRegLV &b, GPlexRegLL &c, const int n) +{ + int aN = a.stride; + typedef float T; + const float pt = getHypot_fn(a[ 3*aN+n], a[ 4*aN+n]); + const float p2 = pt*pt + a[ 5*aN+n]*a[ 5*aN+n]; + // + b[ 0] = a[ 0*aN+n]; + b[ 1] = a[ 1*aN+n]; + b[ 2] = a[ 2*aN+n]; + b[ 3] = 1.0f/pt; + b[ 4] = getPhi(a[ 3*aN+n], a[ 4*aN+n]); //fixme: use trig approx + b[ 5] = getTheta(pt, a[ 5*aN+n]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -a[ 3*aN+n]/(pt*pt*pt); + c[22] = -a[ 4*aN+n]/(pt*pt*pt); + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -a[ 4*aN+n]/(pt*pt); + c[28] = a[ 3*aN+n]/(pt*pt); + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = a[ 3*aN+n]*a[ 5*aN+n]/(pt*p2); + c[34] = a[ 4*aN+n]*a[ 5*aN+n]/(pt*p2); + c[35] = -pt/p2; +} + +__device__ +void PolarErr_fn(const GPlexRegLL &a, const float *b, int bN, GPlexRegLL &c, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + + // Generated code access arrays with variables cN, cn + // c[i*cN+cn] + int aN = 1; int an = 0; // Register array + int bn = n; // Global array + int cN = 1; int cn = 0; +#include "CCSErr.ah" +} + +__device__ +void PolarErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; +#include "CCSErrTransp.ah" +} + +__device__ +void ConvertToCartesian_fn(const GPlexRegLV &a, float *b, int bN, GPlexRegLL &c, const int n) +{ + const float cosP = std::cos(a[ 4]); //fixme: use trig approx + const float sinP = std::sin(a[ 4]); + const float cosT = std::cos(a[ 5]); + const float sinT = 
std::sin(a[ 5]); + // + b[ 0*bN+n] = a[ 0]; + b[ 1*bN+n] = a[ 1]; + b[ 2*bN+n] = a[ 2]; + b[ 3*bN+n] = cosP/a[ 3]; + b[ 4*bN+n] = sinP/a[ 3]; + b[ 5*bN+n] = cosT/(sinT*a[ 3]); + // + c[ 0] = 1.; + c[ 1] = 0.; + c[ 2] = 0.; + c[ 3] = 0.; + c[ 4] = 0.; + c[ 5] = 0.; + c[ 6] = 0.; + c[ 7] = 1.; + c[ 8] = 0.; + c[ 9] = 0.; + c[10] = 0.; + c[11] = 0.; + c[12] = 0.; + c[13] = 0.; + c[14] = 1.; + c[15] = 0.; + c[16] = 0.; + c[17] = 0.; + c[18] = 0.; + c[19] = 0.; + c[20] = 0.; + c[21] = -cosP/(a[ 3]*a[ 3]); + c[22] = -sinP/a[ 3]; + c[23] = 0.; + c[24] = 0.; + c[25] = 0.; + c[26] = 0.; + c[27] = -sinP/(a[ 3]*a[ 3]); + c[28] = cosP/a[ 3]; + c[29] = 0.; + c[30] = 0.; + c[31] = 0.; + c[32] = 0.; + c[33] = -cosT/(sinT*a[ 3]*a[ 3]); + c[34] = 0.; + c[35] = -1.0f/(sinT*sinT*a[ 3]); +} + +__device__ +void CartesianErr_fn(const GPlexRegLL &a, const float *b, const int bN, GPlexRegLL &c, const int n) +{ + // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym + int aN = 1; int an = 0; + int bn = n; + int cN = 1; int cn = 0; + +#include "CartesianErr.ah" +} + +__device__ +void CartesianErrTransp_fn(const GPlexRegLL &a, const GPlexRegLL &b, GPlexLS &C, const int n) +{ + // C = A * B, C is sym, A is 6x6 , B is 6x6 + using T = float; + int aN = 1; int an = 0; + int bN = 1; int bn = 0; + T *c = C.ptr; int cN = C.stride; int cn = n; + +#include "CartesianErrTransp.ah" +} /// MultKalmanGain //////////////////////////////////////////////////////////// __device__ void upParam_MultKalmanGain_fn( - const float* __restrict__ a, size_t aN, - float* b_reg, float *c, int N, int n) { + const float* __restrict__ a, const size_t aN, + const float* b_reg, float *c, const int N, const int n) { // const T* __restrict__ tells the compiler that it can uses the read-only // cache, without worrying about coherency. 
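+ // (On recent hardware this typically maps to __ldg-style loads through the
+ // read-only/texture data cache; it is a hint to the compiler, not a guarantee.)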
// c -> kalmanGain, in register @@ -101,10 +370,39 @@ __device__ void invertCramerSym_fn(float *a) { a[5] = s*c22; } +__device__ void invertCramerSym2x2_fn(GPlexReg2S &a) { + float det = a[0] * a[2] - a[1] * a[1]; + const float s = float(1) / det; + const float tmp = s * a[2]; + a[1] *= -s; + a[2] = s * a[0]; + a[0] = tmp; +} + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; + /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + + if(n < N) { + c[0] = a[0*aN+n] - b[0*bN+n]; + c[1] = a[1*aN+n] - b[1*bN+n]; + c[2] = a[2*aN+n] - b[2*bN+n]; + } +} + /// AddIntoUpperLeft3x3 ////////////////////////////////////////////////////// -__device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, - const float* __restrict__ b, size_t bN, - float *c, const int N, int n) { +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &C, const int N, const int n) { + using T = float; + const T *a = A.ptr; int aN = A.stride; + const T *b = B.ptr; int bN = B.stride; + T *c = C.arr; /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if(n < N) { @@ -120,24 +418,25 @@ __device__ void addIntoUpperLeft3x3_fn(const float* __restrict__ a, size_t aN, /// MultResidualsAdd ////////////////////////////////////////////////////////// __device__ void multResidualsAdd_fn( - float* reg_a, - const float* __restrict__ b, size_t bN, - const float* __restrict__ c, size_t cN, - float *d, size_t dN, int N, int n) { + const GPlexRegLH ®_a, + const GPlexLV __restrict__ &B, + const GPlexReg2V &c, + GPlexLV &D, + const int N, const int n) { // a -> kalmanGain - /*int i = threadIdx.x;*/ - /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ + using T = float; + const T *b = B.ptr; int bN = B.stride; + T *d = D.ptr; int dN = D.stride; - float x0, x1, x2; - - /*for (int z = 0; z < (N-1)/gridDim.x +1; z++) {*/ - /*n += z*gridDim.x;*/ if (n < N) { + // TODO: This has changed with the introduction of polar coordiantes + // (x0, x1, x2) are not used anymore (see commented values) + // Clean this function's code, once it is better understood. // manually substract into local vars -- 3 of them - x0 = c[0 * cN + n] - b[0 * bN + n]; - x1 = c[1 * cN + n] - b[1 * bN + n]; - x2 = c[2 * cN + n] - b[2 * bN + n]; + const float x0 = c[0]; // - b[0 * bN + n]; + const float x1 = c[1]; //- b[1 * bN + n]; + const float x2 = 0; //c[2] - b[2 * bN + n]; // generate loop (can also write it manually this time, it's not much) // WARNING: highly numerically sensitive expressions. @@ -157,20 +456,40 @@ __device__ void multResidualsAdd_fn( /*}*/ } +__device__ +void MultResidualsAdd_all_reg(const GPlexRegLH &a, + const GPlexRegLV &b, + const GPlexReg2V &c, + GPlexRegLV &d) +{ + // outPar = psPar + kalmanGain*(dPar) + // D = B A C + // where right half of kalman gain is 0 + + // XXX Regenerate with a script. 
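+ // Shapes, as used below: b and d are 6-parameter vectors, c is the
+ // 2-component local residual, and a is the 6x3 Kalman gain laid out
+ // row-major with stride 3; only its first two columns enter the sums,
+ // consistent with the zero right part of the gain noted above.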
+ // generate loop (can also write it manually this time, it's not much) + d[0] = b[0] + a[ 0] * c[0] + a[ 1] * c[1]; + d[1] = b[1] + a[ 3] * c[0] + a[ 4] * c[1]; + d[2] = b[2] + a[ 6] * c[0] + a[ 7] * c[1]; + d[3] = b[3] + a[ 9] * c[0] + a[10] * c[1]; + d[4] = b[4] + a[12] * c[0] + a[13] * c[1]; + d[5] = b[5] + a[15] * c[0] + a[16] * c[1]; +} + /// KalmanGain_x_propErr ////////////////////////////////////////////////////// __device__ void kalmanGain_x_propErr_fn( - float* d_kalmanGain, - const float* __restrict__ d_propErr, size_t stride_propErr, - float *d_outErr, size_t stride_outErr, const int N, int n) { + const float* d_kalmanGain, + const float* __restrict__ d_propErr, const size_t stride_propErr, + float *d_outErr, const size_t stride_outErr, const int N, const int n) { // a = d_kalmanGain, b = d_propErr, c = outErrTemp // c = b - a*b - float *a = d_kalmanGain; + const float *a = d_kalmanGain; const float *b = d_propErr; float *c = d_outErr; /*size_t aN = stride_kalmanGain;*/ - size_t bN = stride_propErr; - size_t cN = stride_outErr; + const size_t bN = stride_propErr; + const size_t cN = stride_outErr; register float reg_c[LS]; @@ -210,68 +529,135 @@ __device__ void kalmanGain_x_propErr_fn( } } -__global__ void kalmanUpdate_kernel( - const float* __restrict__ propErr, size_t propErr_stride, - const float* __restrict__ msErr, size_t msErr_stride, - const float* __restrict__ par_iP, size_t par_iP_stride, - const float* __restrict__ msPar, size_t msPar_stride, - float *par_iC, size_t par_iC_stride, - float *outErr, size_t outErr_stride, - const int N) { - int grid_width = blockDim.x * gridDim.x; +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int n, const int N) { // Note: similar results with propErr kept in registers. // It is read-only so using the read-only cache yields more flexibility // wrt block size without increasing the pressure on registers to much. - int n = threadIdx.x + blockIdx.x * blockDim.x; // There is no need to keep resErr and kalmanGain as global memory arrays. - float resErr_reg[HS]; - float kalmanGain_reg[LH]; + /*float resErr_reg[HS];*/ + GPlexRegHS resErr_reg; + /*float kalmanGain_reg[LH];*/ + + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + for (int j = 0; j < HS; ++j) { + resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; + } + + // FIXME: Add useCMSGeom -> port propagateHelixToRMPlex +#if 0 + if (Config::useCMSGeom) { + propagateHelixToRMPlex(psErr, psPar, inChg, msPar, propErr, propPar); + } else { + propErr = psErr; + propPar = psPar; + } +#endif + float rotT00; + float rotT01; + const float r = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + rotT00 = -(msPar(n, 1, 0) + par_iP(n, 1, 0))/(2*r); + rotT01 = (msPar(n, 0, 0) + par_iP(n, 0, 0))/(2*r); + + GPlexRegHV res_glo; + subtractFirst3_fn(msPar, par_iP, res_glo, N, n); + + addIntoUpperLeft3x3_fn(propErr, msErr, resErr_reg, N, n); + GPlexReg2V res_loc; //position residual in local coordinates + RotateResidulsOnTangentPlane_fn(rotT00,rotT01,res_glo,res_loc); + GPlexReg2S resErr_loc; // 2x2 sym + GPlexRegHH tempHH; // 3*3 sym + ProjectResErr_fn (rotT00, rotT01, resErr_reg, tempHH); + ProjectResErrTransp_fn(rotT00, rotT01, tempHH, resErr_loc); + + /*invertCramerSym_fn(resErr_reg);*/ + invertCramerSym2x2_fn(resErr_loc); +#ifndef CCSCOORD + // Move to "polar" coordinates: (x,y,z,1/pT,phi,theta) [can we find a better name?] 
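+ // The "polar" (CCS) parameters produced by ConvertToPolar_fn are, per track,
+ //   (x, y, z, 1/pT, phi, theta) with pT = sqrt(px^2 + py^2),
+ //   phi = atan2(py, px) and theta = atan2(pT, pz);
+ // jac_pol below is the 6x6 Jacobian of that change of variables.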
+ + /*MPlexLV propPar_pol;// propagated parameters in "polar" coordinates*/ + /*MPlexLL jac_pol; // jacobian from cartesian to "polar"*/ + /*ConvertToPolar_fn(propPar,propPar_pol,jac_pol);*/ + /*float propPar_pol[6];*/ + GPlexRegLV propPar_pol; + GPlexRegLL jac_pol; + ConvertToPolar_fn(par_iP, propPar_pol, jac_pol, n); + + GPlexRegLL tempLL; + PolarErr_fn(jac_pol, propErr.ptr, propErr.stride, tempLL, n); + PolarErrTransp_fn(jac_pol, tempLL, propErr, n);// propErr is now propagated errors in "polar" coordinates +#endif + + // Kalman update in "polar" coordinates + GPlexRegLH K; + KalmanHTG_fn(rotT00, rotT01, resErr_loc, tempHH); + KalmanGain_fn(propErr, tempHH, K, n); + +#ifdef CCSCOORD + // FIXME: assuming no polcoord for now + //MultResidualsAdd(K.arr, propPar, res_loc, outPar);// propPar_pol is now the updated parameters in "polar" coordinates + multResidualsAdd_fn(K, par_iP, res_loc, par_iC, N, n);// propPar_pol is now the updated parameters in "polar" coordinates + GPlexRegLL tempLL; +#else + /*MultResidualsAdd(K, propPar_pol, res_loc, propPar_pol);// propPar_pol is now the updated parameters in "polar" coordinates*/ + MultResidualsAdd_all_reg(K, propPar_pol, res_loc, propPar_pol); +#endif + + KHMult_fn(K, rotT00, rotT01, tempLL); + KHC_fn(tempLL, propErr, outErr, n); + /*outErr.Subtract(propErr, outErr);// outErr is in "polar" coordinates now*/ + subtract_matrix(propErr.ptr, propErr.stride, outErr.ptr, outErr.stride, + /*propErr.ptr, propErr.stride, LS, n);*/ + outErr.ptr, outErr.stride, LS, n); + +#ifndef CCSCOORD + // Go back to cartesian coordinates + + // jac_pol is now the jacobian from "polar" to cartesian + // outPar -> par_iC + ConvertToCartesian_fn(propPar_pol, par_iC.ptr, par_iC.stride, jac_pol, n); + CartesianErr_fn (jac_pol, outErr.ptr, outErr.stride, tempLL, n); + CartesianErrTransp_fn(jac_pol, tempLL, outErr, n);// outErr is in cartesian coordinates now +#endif +#if 0 + upParam_MultKalmanGain_fn(propErr, propErr_stride, + resErr_reg, kalmanGain_reg, N, n); + multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride, + msPar, msPar_stride, par_iC, par_iC_stride, N, n); + + kalmanGain_x_propErr_fn(kalmanGain_reg, + propErr, propErr_stride, + outErr, outErr_stride, N, n); +#endif + } +} + +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N) { + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { - /*n += z*gridDim.x;*/ n += z*grid_width; - if (n < N) { - for (int j = 0; j < HS; ++j) { - resErr_reg[j] = 0; //resErr[j*resErr_stride + n]; - } - addIntoUpperLeft3x3_fn(propErr, propErr_stride, - msErr, msErr_stride, resErr_reg, N, n); - invertCramerSym_fn(resErr_reg); - - upParam_MultKalmanGain_fn(propErr, propErr_stride, - resErr_reg, kalmanGain_reg, N, n); - multResidualsAdd_fn(kalmanGain_reg, par_iP, par_iP_stride, - msPar, msPar_stride, par_iC, par_iC_stride, N, n); - - kalmanGain_x_propErr_fn(kalmanGain_reg, - propErr, propErr_stride, - outErr, outErr_stride, N, n); - } + kalmanUpdate_fn(propErr, msErr, par_iP, msPar, par_iC, outErr, n, N); } } -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, 
const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); kalmanUpdate_kernel <<>> - (d_propErr.ptr, d_propErr.stride, - d_msErr.ptr, d_msErr.stride, - d_par_iP.ptr, d_par_iP.stride, - d_msPar.ptr, d_msPar.stride, - d_par_iC.ptr, d_par_iC.stride, - d_outErr.ptr, d_outErr.stride, - N); + (d_propErr, d_msErr, d_par_iP, d_msPar, d_par_iC, d_outErr, N); } -// Should probably not be in this file, but creating a file for -// this oneliner seems overkill. -void separate_first_call_for_meaningful_profiling_numbers() { - cudaDeviceSynchronize(); -} diff --git a/mkFit/kalmanUpdater_kernels.h b/mkFit/kalmanUpdater_kernels.h index 3c00760050582..112462eb4c8ba 100644 --- a/mkFit/kalmanUpdater_kernels.h +++ b/mkFit/kalmanUpdater_kernels.h @@ -3,14 +3,36 @@ #include "GPlex.h" -void kalmanUpdate_wrapper(cudaStream_t& stream, - GPlex& d_propErr, GPlex& d_msErr, - GPlex& d_par_iP, GPlex& d_msPar, - GPlex& d_par_iC, GPlex& d_outErr, +void kalmanUpdate_wrapper(const cudaStream_t& stream, + GPlexLS& d_propErr, const GPlexHS& d_msErr, + GPlexLV& d_par_iP, const GPlexHV& d_msPar, + GPlexLV& d_par_iC, GPlexLS& d_outErr, const int N); -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits); +//void reorganizeMs_wrapper(cudaStream_t& stream, GPlexQF& msPar, +// float *full_posArray, GPlexHS& msErr, +// float *full_errArray, int *full_hitIdx, int hi, int maxHits, +// int N, int hs, int hv, int Nhits); + +__global__ void kalmanUpdate_kernel( + GPlexLS propErr, const GPlexHS __restrict__ msErr, + const GPlexLV __restrict__ par_iP, const GPlexHV __restrict__ msPar, + GPlexLV par_iC, GPlexLS outErr, const int N); + +__device__ void kalmanUpdate_fn( + GPlexLS &propErr, const GPlexHS __restrict__ &msErr, + const GPlexLV __restrict__ &par_iP, const GPlexHV __restrict__ &msPar, + GPlexLV &par_iC, GPlexLS &outErr, const int itrack, const int N); + +__device__ void addIntoUpperLeft3x3_fn(const GPlexLS __restrict__ &A, + const GPlexHS __restrict__ &B, + GPlexRegHS &c, const int N, const int n); + +__device__ void subtractFirst3_fn(const GPlexHV __restrict__ &A, + const GPlexLV __restrict__ &B, + GPlexRegHV &C, const int N, const int n); + +__device__ void invertCramerSym_fn(float *a); +__device__ void invertCramerSym2x2_fn(GPlexReg2S &a); #endif // _KALMAN_UPDATER_KERNELS_H_ diff --git a/mkFit/mkFit.cc b/mkFit/mkFit.cc index 5afe3fd38b530..e1c11e6e38f19 100644 --- a/mkFit/mkFit.cc +++ b/mkFit/mkFit.cc @@ -20,8 +20,13 @@ #ifdef USE_CUDA #include "FitterCU.h" +#include "gpu_utils.h" #endif +#include +//#define DEBUG +#include "Debug.h" + #include #include @@ -184,14 +189,18 @@ void test_standard() EventTmp ev_tmp; #if USE_CUDA + tbb::task_scheduler_init tbb_init(Config::numThreadsFinder); + //tbb::task_scheduler_init tbb_init(tbb::task_scheduler_init::automatic); + + //omp_set_num_threads(Config::numThreadsFinder); // fittest time. Sum of all events. In case of multiple events // being run simultaneously in different streams this time will // be larger than the elapsed time. - double s_tmp = 0.0; std::vector events; std::vector validations(Config::nEvents); + events.reserve(Config::nEvents); // simulations are all performed before the fitting loop. 
// It is mandatory in order to see the benefits of running // multiple streams. @@ -200,6 +209,7 @@ void test_standard() events.emplace_back(geom, val, evt); events.back().Simulate(); events.back().resetLayerHitMap(true); + dprint("Event #" << events.back().evtID() << " simtracks " << events.back().simTracks_.size() << " layerhits " << events.back().layerHits_.size()); } // The first call to a GPU function always take a very long time. @@ -210,67 +220,14 @@ void test_standard() // tell you how much time is spend running cudaDeviceSynchronize(), // use another function). separate_first_call_for_meaningful_profiling_numbers(); -#if 0 - In principle, the warmup loop should not be required. - The separate_first_call_for_meaningful_profiling_numbers() function - should be enough. - // Warmup loop - for (int i = 0; i < 1; ++i) { - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - Event &ev = events[0]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - - if (g_run_fit_std) runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - cuFitter.freeDevice(); - } -#endif - // Reorgnanization (copyIn) can eventually be multithreaded. - omp_set_nested(1); - - omp_set_num_threads(Config::numThreadsEvents); - double total_gpu_time = dtime(); -#pragma omp parallel reduction(+:s_tmp) - { - int numThreadsEvents = omp_get_num_threads(); - int thr_idx = omp_get_thread_num(); - - // FitterCU is declared here to share allocations and deallocations - // between the multiple events processed by a single thread. - FitterCU cuFitter(NN); - cuFitter.allocateDevice(); - - for (int evt = thr_idx+1; evt <= Config::nEvents; evt+= numThreadsEvents) { - int idx = thr_idx; - printf("==============================================================\n"); - printf("Processing event %d with thread %d\n", evt, idx); - Event &ev = events[evt-1]; - std::vector plex_tracks_ev; - plex_tracks_ev.resize(ev.simTracks_.size()); - double tmp = 0, tmp2bh = 0, tmp2 = 0, tmp2ce = 0; - - if (g_run_fit_std) tmp = runFittingTestPlexGPU(cuFitter, ev, plex_tracks_ev); - - printf("Matriplex fit = %.5f -------------------------------------", tmp); - printf("\n"); - s_tmp += tmp; -#if 1 // 0 for timing, 1 for validation - // Validation crashes for multiple threads. - // It is something in relation to ROOT. Not sure what. - if (omp_get_num_threads() <= 1) { - if (g_run_fit_std) { - std::string tree_name = "validation-plex-" + std::to_string(evt) + ".root"; - make_validation_tree(tree_name.c_str(), ev.simTracks_, plex_tracks_ev); - } - } -#endif - } - cuFitter.freeDevice(); - } - std::cerr << "###### Total GPU time: " << dtime() - total_gpu_time << " ######\n"; + if (g_run_fit_std) runAllEventsFittingTestPlexGPU(events); + if (g_run_build_all || g_run_build_bh) { + double total_best_hit_time = 0.; + total_best_hit_time = runAllBuildingTestPlexBestHitGPU(events); + std::cout << "Total best hit time (GPU): " << total_best_hit_time << std::endl; + } #else // MT: task_scheduler_init::automatic doesn't really work (segv!) + we don't // know what to do for non-tbb cases. @@ -310,7 +267,14 @@ void test_standard() for (int b = 0; b < Config::finderReportBestOutOfN; ++b) { +#ifndef USE_CUDA t_cur[0] = (g_run_fit_std) ? runFittingTestPlex(ev, plex_tracks) : 0; +#else + FitterCU cuFitter(NN); + cuFitter.allocateDevice(); + t_cur[0] = (g_run_fit_std) ? runFittingTestPlexGPU(cuFitter, ev, plex_tracks) : 0; + cuFitter.freeDevice(); +#endif t_cur[1] = (g_run_build_all || g_run_build_bh) ? 
runBuildingTestPlexBestHit(ev) : 0; t_cur[2] = (g_run_build_all || g_run_build_std) ? runBuildingTestPlex(ev, ev_tmp) : 0; t_cur[3] = (g_run_build_all || g_run_build_ce) ? runBuildingTestPlexCloneEngine(ev, ev_tmp) : 0; @@ -346,6 +310,7 @@ void test_standard() t_sum[0], t_sum[1], t_sum[2], t_sum[3], t_sum[4]); printf("Total event > 1 fit = %.5f --- Build BHMX = %.5f MX = %.5f CEMX = %.5f TBBMX = %.5f\n", t_skip[0], t_skip[1], t_skip[2], t_skip[3], t_skip[4]); + //fflush(stdout); if (g_operation == "read") { diff --git a/mkFit/propagation_kernels.cu b/mkFit/propagation_kernels.cu index 6b6f3f8b2d9ec..70f4957177a44 100644 --- a/mkFit/propagation_kernels.cu +++ b/mkFit/propagation_kernels.cu @@ -1,35 +1,56 @@ #include "Config.h" +#include "Debug.h" #include "propagation_kernels.h" #include +#include "gpu_utils.h" -#define L 6 -#define LL 36 -#define LS 21 +constexpr int L = 6; +constexpr int LL2 = 36; +constexpr int LS = 21; // values from 32 to 512 give good results. // 32 gives slightly better results (on a K40) -#define BLOCK_SIZE_X 32 -#define MAX_BLOCKS_X 65535 // CUDA constraint +constexpr int BLOCK_SIZE_X = 32; -__device__ float hipo(float x, float y) { - return sqrt(x*x + y*y); +__device__ +void MultHelixProp_fn(const GPlexRegLL& a, const GPlexLS& b, GPlexRegLL& c, const int n) +{ + // C = A * B + + typedef float T; + /*const idx_t N = NN;*/ + + /*const T *a = A.fArray; ASSUME_ALIGNED(a, 64);*/ + /*const T *b = B.fArray; ASSUME_ALIGNED(b, 64);*/ + /*T *c = C.fArray; ASSUME_ALIGNED(c, 64);*/ + /*float *a = A.ptr;*/ + int aN = 1 ; int an = 0; // Register array + int bN = b.stride; int bn = n; // Global array + int cN = 1; int cn = 0; + +#include "MultHelixProp.ah" } -__device__ void sincos4(float x, float& sin, float& cos) { - // Had this writen with explicit division by factorial. - // The *whole* fitting test ran like 2.5% slower on MIC, sigh. - cos = 1; - sin = x; x *= x * 0.5f; - cos -= x; x *= x * 0.33333333f; - sin -= x; x *= x * 0.25f; - cos += x; + +__device__ +void MultHelixPropTransp_fn(const GPlexRegLL& a, const GPlexRegLL& b, GPlexLS& c, const int n) +{ + // C = B * AT; + + typedef float T; + int aN = 1 ; int an = 0; // Register array + int bN = 1 ; int bn = 0; // Global array + int cN = c.stride; int cn = n; + +#include "MultHelixPropTransp.ah" } // computeJacobianSimple works on values that are in registers. // Registers are thread-private. Thus this function has no notion of // parallelism. It is ran serially by each calling thread. 
__device__ void computeJacobianSimple(float *errorProp, - float s, float k, float p, float pxin, float pyin, float pzin, - float TP, float cosTP, float sinTP, int N) { + const float s, const float k, const float p, + const float pxin, const float pyin, const float pzin, + const float TP, const float cosTP, const float sinTP, const int N) { // std::cout << "total path s=" << s << std::endl; // TD = s*pt/p; @@ -90,225 +111,263 @@ __device__ void computeJacobianSimple(float *errorProp, } /// Compute MsRad ///////////////////////////////////////////////////////////// -// Not passing msRad.stride, as QF == 1 (second dim f msRad) -__device__ void computeMsRad_fn(const float* __restrict__ msPar, - size_t stride_msPar, float* msRad, int N, int n) { +__device__ void assignMsRad_fn(const float r, float* msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ if (n < N) { - *msRad = hipo(msPar[n], msPar[n + stride_msPar]); + *msRad = r; } } -__device__ -void helixAtRFromIterative_fn(float *inPar, size_t inPar_stride, - int *inChg, float *outPar, size_t outPar_stride, float msRad, - float *errorProp_reg, int N, int n) { - - size_t opN = outPar_stride; - size_t ipN = inPar_stride; - +// Not passing msRad.stride, as QF == 1 (second dim f msRad) +__device__ void computeMsRad_fn(const GPlexHV& __restrict__ msPar, + GPlexRegQF &msRad, const int N, const int n) { /*int n = threadIdx.x + blockIdx.x * blockDim.x;*/ - - float outPar_reg[5]; - if (n < N) { - for (int j = 0; j < 5; ++j) { - outPar_reg[j] = outPar[n+j*opN]; - } - const float& xin = inPar[n + 0*ipN]; - const float& yin = inPar[n + 1*ipN]; - const float& pxin = inPar[n + 3*ipN]; - const float& pyin = inPar[n + 4*ipN]; - const float& pzin = inPar[n + 5*ipN]; - const float& r = msRad; - float r0 = hipo(xin, yin); - - if (fabs(r-r0)<0.0001) { - // get an identity matrix - computeJacobianSimple(errorProp_reg, 0, 1, 1, 1, 1, 1, 0, 1, 0, N); - return; // continue; - } - float pt2 = pxin*pxin+pyin*pyin; - float pt = sqrt(pt2); - float ptinv = 1./pt; - float pt2inv = ptinv*ptinv; - //p=0.3Br => r=p/(0.3*B) - float k = inChg[n] * 100. / (-0.299792458*Config::Bfield); - float invcurvature = 1./(pt*k);//in 1./cm - float ctgTheta=pzin*ptinv; - - //variables to be updated at each iterations - float totalDistance = 0; - //derivatives initialized to value for first iteration, i.e. distance = r-r0in - float dTDdx = r0>0. ? -xin/r0 : 0.; - float dTDdy = r0>0. ? 
-yin/r0 : 0.; - float dTDdpx = 0.; - float dTDdpy = 0.; - //temporaries used within the loop (declare here to reduce memory operations) - float x = 0.; - float y = 0.; - float px = 0.; - float py = 0.; - float cosAP=0.; - float sinAP=0.; - float dAPdx = 0.; - float dAPdy = 0.; - float dAPdpx = 0.; - float dAPdpy = 0.; - // float dxdvar = 0.; - // float dydvar = 0.; - //5 iterations is a good starting point - //const unsigned int Niter = 10; - // const unsigned int Niter = 5+std::round(r-r0)/2; - for (unsigned int iter=0; iter < Config::Niter; ++iter) { - x = outPar_reg[0]; - y = outPar_reg[1]; - px = outPar_reg[3]; - py = outPar_reg[4]; - r0 = hipo(outPar_reg[0], outPar_reg[1]); - - totalDistance += (r-r0); - if (Config::useTrigApprox) { // TODO: uncomment - sincos4((r-r0)*invcurvature, sinAP, cosAP); - } else { - cosAP=cos((r-r0)*invcurvature); - sinAP=sin((r-r0)*invcurvature); + msRad(n, 0, 0) = hipo(msPar(n, 0, 0), msPar(n, 1, 0)); + } +} + +__device__ void helixAtRFromIterativePolar_fn(const GPlexLV& inPar, + const GPlexQI& inChg, GPlexLV& outPar, const GPlexReg& msRad, + GPlexReg& errorProp, const int N, const int n) +{ + errorProp.SetVal(0); + +#pragma simd + //for (int n = 0; n < NN; ++n) + if (n < N) + { + //initialize erroProp to identity matrix + errorProp(n,0,0) = 1.f; + errorProp(n,1,1) = 1.f; + errorProp(n,2,2) = 1.f; + errorProp(n,3,3) = 1.f; + errorProp(n,4,4) = 1.f; + errorProp(n,5,5) = 1.f; + + const float k = inChg(n, 0, 0) * 100.f / (-Config::sol*Config::Bfield); + const float r = msRad(n, 0, 0); + float r0 = hipo(inPar(n, 0, 0), inPar(n, 1, 0)); + + // if (std::abs(r-r0)<0.0001f) { + // dprint("distance less than 1mum, skip"); + // continue; + // } + + const float xin = inPar(n, 0, 0); + const float yin = inPar(n, 1, 0); + const float zin = inPar(n, 2, 0); + const float ipt = inPar(n, 3, 0); + const float phiin = inPar(n, 4, 0); + const float theta = inPar(n, 5, 0); + + dprint_np(n, std::endl << "input parameters" + << " inPar(n, 0, 0)=" << std::setprecision(9) << inPar(n, 0, 0) + << " inPar(n, 1, 0)=" << std::setprecision(9) << inPar(n, 1, 0) + << " inPar(n, 2, 0)=" << std::setprecision(9) << inPar(n, 2, 0) + << " inPar(n, 3, 0)=" << std::setprecision(9) << inPar(n, 3, 0) + << " inPar(n, 4, 0)=" << std::setprecision(9) << inPar(n, 4, 0) + << " inPar(n, 5, 0)=" << std::setprecision(9) << inPar(n, 5, 0) + ); + + const float kinv = 1.f/k; + const float pt = 1.f/ipt; + + float D = 0., cosa = 0., sina = 0., id = 0.; + //no trig approx here, phi can be large + float cosPorT = std::cos(phiin), sinPorT = std::sin(phiin); + float pxin = cosPorT*pt; + float pyin = sinPorT*pt; + + dprint_np(n, std::endl << "k=" << std::setprecision(9) << k << " pxin=" << std::setprecision(9) << pxin << " pyin=" + << std::setprecision(9) << pyin << " cosPorT=" << std::setprecision(9) << cosPorT + << " sinPorT=" << std::setprecision(9) << sinPorT << " pt=" << std::setprecision(9) << pt); + + //derivatives initialized to value for first iteration, i.e. distance = r-r0in + float dDdx = r0 > 0.f ? -xin/r0 : 0.f; + float dDdy = r0 > 0.f ? 
-yin/r0 : 0.f; + float dDdipt = 0.; + float dDdphi = 0.; + + for (int i = 0; i < Config::Niter; ++i) + { + dprint_np(n, std::endl << "attempt propagation from r=" << r0 << " to r=" << r << std::endl + << "x=" << xin << " y=" << yin << " z=" << inPar(n, 2, 0) << " px=" << pxin << " py=" << pyin << " pz=" << pt*std::tan(theta) << " q=" << inChg(n, 0, 0)); + + //compute distance and path for the current iteration + r0 = hipo(outPar(n, 0, 0), outPar(n, 1, 0)); + id = (r-r0); + D+=id; + if (Config::useTrigApprox) { + sincos4(id*ipt*kinv, sina, cosa); + } else { + cosa=std::cos(id*ipt*kinv); + sina=std::sin(id*ipt*kinv); + } + + dprint_np(n, std::endl << "r=" << std::setprecision(9) << r << " r0=" << std::setprecision(9) << r0 + << " id=" << std::setprecision(9) << id << " cosa=" << cosa << " sina=" << sina); + + //update derivatives on total distance + if (i+1 != Config::Niter) { + + const float x = outPar(n, 0, 0); + const float y = outPar(n, 1, 0); + const float oor0 = (r0>0.f && std::abs(r-r0)<0.0001f) ? 1.f/r0 : 0.f; + + const float dadipt = id*kinv; + + const float dadx = -x*ipt*kinv*oor0; + const float dady = -y*ipt*kinv*oor0; + + const float pxca = pxin*cosa; + const float pxsa = pxin*sina; + const float pyca = pyin*cosa; + const float pysa = pyin*sina; + + float tmp = k*dadx; + dDdx -= ( x*(1.f + tmp*(pxca - pysa)) + y*tmp*(pyca + pxsa) )*oor0; + tmp = k*dady; + dDdy -= ( x*tmp*(pxca - pysa) + y*(1.f + tmp*(pyca + pxsa)) )*oor0; + //now r0 depends on ipt and phi as well + tmp = dadipt*ipt; + dDdipt -= k*( x*(pxca*tmp - pysa*tmp - pyca - pxsa + pyin) + + y*(pyca*tmp + pxsa*tmp - pysa + pxca - pxin))*pt*oor0; + dDdphi += k*( x*(pysa - pxin + pxca) - y*(pxsa - pyin + pyca))*oor0; + } + + //update parameters + outPar(n, 0, 0) = outPar(n, 0, 0) + k*(pxin*sina - pyin*(1.f-cosa)); + outPar(n, 1, 0) = outPar(n, 1, 0) + k*(pyin*sina + pxin*(1.f-cosa)); + const float pxinold = pxin;//copy before overwriting + pxin = pxin*cosa - pyin*sina; + pyin = pyin*cosa + pxinold*sina; + + dprint_np(n, std::endl << "outPar(n, 0, 0)=" << outPar(n, 0, 0) << " outPar(n, 1, 0)=" << outPar(n, 1, 0) + << " pxin=" << pxin << " pyin=" << pyin); } - //helix propagation formulas - //http://www.phys.ufl.edu/~avery/fitting/fitting4.pdf - outPar_reg[0] = outPar_reg[0] + k*(px*sinAP-py*(1-cosAP)); - outPar_reg[1] = outPar_reg[1] + k*(py*sinAP+px*(1-cosAP)); - outPar_reg[2] = outPar_reg[2] + (r-r0)*ctgTheta; - outPar_reg[3] = px*cosAP-py*sinAP; - outPar_reg[4] = py*cosAP+px*sinAP; - //outPar.At(n, 5, 0) = pz; //take this out as it is redundant - - if (Config::useSimpleJac==0 && - iter +1 != Config::Niter && - r0 > 0 && fabs((r-r0)*invcurvature)>0.000000001) { - //update derivatives on total distance for next step, where totalDistance+=r-r0 - //now r0 depends on px and py - r0 = 1./r0;//WARNING, now r0 is r0inv (one less temporary) - - //update derivative on D - dAPdx = -x*r0*invcurvature; - dAPdy = -y*r0*invcurvature; - dAPdpx = -(r-1./r0)*invcurvature*px*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - dAPdpy = -(r-1./r0)*invcurvature*py*pt2inv;//weird, using r0 instead of 1./r0 improves things but it should be wrong since r0 in now r0inv - //reduce temporary variables - //dxdx = 1 + k*dAPdx*(px*cosAP - py*sinAP); - //dydx = k*dAPdx*(py*cosAP + px*sinAP); - //dTDdx -= r0*(x*dxdx + y*dydx); - dTDdx -= r0*(x*(1 + k*dAPdx*(px*cosAP - py*sinAP)) + y*(k*dAPdx*(py*cosAP + px*sinAP))); - //reuse same temporary variables - //dxdy = k*dAPdy*(px*cosAP - py*sinAP); - //dydy = 1 
+ k*dAPdy*(py*cosAP + px*sinAP); - //dTDdy -= r0*(x*dxdy + y*dydy); - dTDdy -= r0*(x*(k*dAPdy*(px*cosAP - py*sinAP)) + y*(1 + k*dAPdy*(py*cosAP + px*sinAP))); - //dxdpx = k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx); - //dydpx = k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx); - //dTDdpx -= r0*(x*dxdpx + y*dydpx); - dTDdpx -= r0*(x*(k*(sinAP + px*cosAP*dAPdpx - py*sinAP*dAPdpx)) + y*(k*(py*cosAP*dAPdpx + 1. - cosAP + px*sinAP*dAPdpx))); - //dxdpy = k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy); - //dydpy = k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy); - //dTDdpy -= r0*(x*dxdpy + y*(k*dydpy); - dTDdpy -= r0*(x*(k*(px*cosAP*dAPdpy - 1. + cosAP - py*sinAP*dAPdpy)) + y*(k*(sinAP + py*cosAP*dAPdpy + px*sinAP*dAPdpy))); + const float alpha = D*ipt*kinv; + const float dadx = dDdx*ipt*kinv; + const float dady = dDdy*ipt*kinv; + const float dadipt = (ipt*dDdipt + D)*kinv; + const float dadphi = dDdphi*ipt*kinv; - } - float& TD=totalDistance; - float TP=TD*invcurvature;//totalAngPath - - float& iC=invcurvature; - float dCdpx = k*pxin*ptinv; - float dCdpy = k*pyin*ptinv; - float dTPdx = dTDdx*iC; - float dTPdy = dTDdy*iC; - float dTPdpx = (dTDdpx - TD*dCdpx*iC)*iC; // MT change: avoid division - float dTPdpy = (dTDdpy - TD*dCdpy*iC)*iC; // MT change: avoid division - - float cosTP, sinTP; if (Config::useTrigApprox) { - sincos4(TP, sinTP, cosTP); + sincos4(alpha, sina, cosa); } else { - cosTP = cos(TP); - sinTP = sin(TP); + cosa=std::cos(alpha); + sina=std::sin(alpha); } - if (Config::useSimpleJac) { - //assume total path length s as given and with no uncertainty - float p = pt2 + pzin*pzin; - p = sqrt(p); - float s = TD*p*ptinv; - computeJacobianSimple(errorProp_reg, s, k, p, pxin, pyin, pzin, TP, cosTP, sinTP, N); - } else { - //now try to make full jacobian - //derive these to compute jacobian - //x = xin + k*(pxin*sinTP-pyin*(1-cosTP)); - //y = yin + k*(pyin*sinTP+pxin*(1-cosTP)); - //z = zin + k*TP*pzin; - //px = pxin*cosTP-pyin*sinTP; - //py = pyin*cosTP+pxin*sinTP; - //pz = pzin; - //jacobian - - errorProp_reg[(0*L + 0)] = 1 + k*dTPdx*(pxin*cosTP - pyin*sinTP); //dxdx; - errorProp_reg[(0*L + 1)] = k*dTPdy*(pxin*cosTP - pyin*sinTP); //dxdy; - errorProp_reg[(0*L + 2)] = 0.; - errorProp_reg[(0*L + 3)] = k*(sinTP + pxin*cosTP*dTPdpx - pyin*sinTP*dTPdpx); //dxdpx; - errorProp_reg[(0*L + 4)] = k*(pxin*cosTP*dTPdpy - 1. + cosTP - pyin*sinTP*dTPdpy);//dxdpy; - errorProp_reg[(0*L + 5)] = 0.; - - errorProp_reg[(1*L + 0)] = k*dTPdx*(pyin*cosTP + pxin*sinTP); //dydx; - errorProp_reg[(1*L + 1)] = 1 + k*dTPdy*(pyin*cosTP + pxin*sinTP); //dydy; - errorProp_reg[(1*L + 2)] = 0.; - errorProp_reg[(1*L + 3)] = k*(pyin*cosTP*dTPdpx + 1. 
- cosTP + pxin*sinTP*dTPdpx);//dydpx; - errorProp_reg[(1*L + 4)] = k*(sinTP + pyin*cosTP*dTPdpy + pxin*sinTP*dTPdpy); //dydpy; - errorProp_reg[(1*L + 5)] = 0.; - - errorProp_reg[(2*L + 0)] = k*pzin*dTPdx; //dzdx; - errorProp_reg[(2*L + 1)] = k*pzin*dTPdy; //dzdy; - errorProp_reg[(2*L + 2)] = 1.; - errorProp_reg[(2*L + 3)] = k*pzin*dTPdpx;//dzdpx; - errorProp_reg[(2*L + 4)] = k*pzin*dTPdpy;//dzdpy; - errorProp_reg[(2*L + 5)] = k*TP; //dzdpz; - - errorProp_reg[(3*L + 0)] = -dTPdx*(pxin*sinTP + pyin*cosTP); //dpxdx; - errorProp_reg[(3*L + 1)] = -dTPdy*(pxin*sinTP + pyin*cosTP); //dpxdy; - errorProp_reg[(3*L + 2)] = 0.; - errorProp_reg[(3*L + 3)] = cosTP - dTPdpx*(pxin*sinTP + pyin*cosTP); //dpxdpx; - errorProp_reg[(3*L + 4)] = -sinTP - dTPdpy*(pxin*sinTP + pyin*cosTP);//dpxdpy; - errorProp_reg[(3*L + 5)] = 0.; - - errorProp_reg[(4*L + 0)] = -dTPdx*(pyin*sinTP - pxin*cosTP); //dpydx; - errorProp_reg[(4*L + 1)] = -dTPdy*(pyin*sinTP - pxin*cosTP); //dpydy; - errorProp_reg[(4*L + 2)] = 0.; - errorProp_reg[(4*L + 3)] = +sinTP - dTPdpx*(pyin*sinTP - pxin*cosTP);//dpydpx; - errorProp_reg[(4*L + 4)] = +cosTP - dTPdpy*(pyin*sinTP - pxin*cosTP);//dpydpy; - errorProp_reg[(4*L + 5)] = 0.; - - errorProp_reg[(5*L + 0)] = 0.; - errorProp_reg[(5*L + 1)] = 0.; - errorProp_reg[(5*L + 2)] = 0.; - errorProp_reg[(5*L + 3)] = 0.; - errorProp_reg[(5*L + 4)] = 0.; - errorProp_reg[(5*L + 5)] = 1.; + errorProp(n,0,0) = 1.f+k*dadx*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,1) = k*dady*(cosPorT*cosa-sinPorT*sina)*pt; + errorProp(n,0,2) = 0.f; + errorProp(n,0,3) = k*(cosPorT*(ipt*dadipt*cosa-sina)+sinPorT*((1.f-cosa)-ipt*dadipt*sina))*pt*pt; + errorProp(n,0,4) = k*(cosPorT*dadphi*cosa - sinPorT*dadphi*sina - sinPorT*sina + cosPorT*cosa - cosPorT)*pt; + errorProp(n,0,5) = 0.f; + + errorProp(n,1,0) = k*dadx*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,1) = 1.f+k*dady*(sinPorT*cosa+cosPorT*sina)*pt; + errorProp(n,1,2) = 0.f; + errorProp(n,1,3) = k*(sinPorT*(ipt*dadipt*cosa-sina)+cosPorT*(ipt*dadipt*sina-(1.f-cosa)))*pt*pt; + errorProp(n,1,4) = k*(sinPorT*dadphi*cosa + cosPorT*dadphi*sina + sinPorT*cosa + cosPorT*sina - sinPorT)*pt; + errorProp(n,1,5) = 0.f; + + //no trig approx here, theta can be large + cosPorT=std::cos(theta); + sinPorT=std::sin(theta); + //redefine sinPorT as 1./sinPorT to reduce the number of temporaries + sinPorT = 1.f/sinPorT; + + outPar(n, 2, 0) = inPar(n, 2, 0) + k*alpha*cosPorT*pt*sinPorT; + + errorProp(n,2,0) = k*cosPorT*dadx*pt*sinPorT; + errorProp(n,2,1) = k*cosPorT*dady*pt*sinPorT; + errorProp(n,2,2) = 1.f; + errorProp(n,2,3) = k*cosPorT*(ipt*dadipt-alpha)*pt*pt*sinPorT; + errorProp(n,2,4) = k*dadphi*cosPorT*pt*sinPorT; + errorProp(n,2,5) =-k*alpha*pt*sinPorT*sinPorT; + + outPar(n, 3, 0) = ipt; + + errorProp(n,3,0) = 0.f; + errorProp(n,3,1) = 0.f; + errorProp(n,3,2) = 0.f; + errorProp(n,3,3) = 1.f; + errorProp(n,3,4) = 0.f; + errorProp(n,3,5) = 0.f; + + outPar(n, 4, 0) = inPar(n, 4, 0)+alpha; + + errorProp(n,4,0) = dadx; + errorProp(n,4,1) = dady; + errorProp(n,4,2) = 0.f; + errorProp(n,4,3) = dadipt; + errorProp(n,4,4) = 1.f+dadphi; + errorProp(n,4,5) = 0.f; + + outPar(n, 5, 0) = theta; + + errorProp(n,5,0) = 0.f; + errorProp(n,5,1) = 0.f; + errorProp(n,5,2) = 0.f; + errorProp(n,5,3) = 0.f; + errorProp(n,5,4) = 0.f; + errorProp(n,5,5) = 1.f; + + dprint_np(n, "propagation end, dump parameters" << std::endl + << "pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << std::endl + << "mom = " << std::cos(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << 
std::sin(outPar(n, 4, 0))/outPar(n, 3, 0) << " " << 1./(outPar(n, 3, 0)*tan(outPar(n, 5, 0))) + << " r=" << std::sqrt( outPar(n, 0, 0)*outPar(n, 0, 0) + outPar(n, 1, 0)*outPar(n, 1, 0) ) << " pT=" << 1./std::abs(outPar(n, 3, 0)) << std::endl); + +#ifdef DEBUG + if (n < N_proc) { + dmutex_guard; + std::cout << n << ": jacobian" << std::endl; + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,0,0),errorProp(n,0,1),errorProp(n,0,2),errorProp(n,0,3),errorProp(n,0,4),errorProp(n,0,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,1,0),errorProp(n,1,1),errorProp(n,1,2),errorProp(n,1,3),errorProp(n,1,4),errorProp(n,1,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,2,0),errorProp(n,2,1),errorProp(n,2,2),errorProp(n,2,3),errorProp(n,2,4),errorProp(n,2,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,3,0),errorProp(n,3,1),errorProp(n,3,2),errorProp(n,3,3),errorProp(n,3,4),errorProp(n,3,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,4,0),errorProp(n,4,1),errorProp(n,4,2),errorProp(n,4,3),errorProp(n,4,4),errorProp(n,4,5)); + printf("%5f %5f %5f %5f %5f %5f\n", errorProp(n,5,0),errorProp(n,5,1),errorProp(n,5,2),errorProp(n,5,3),errorProp(n,5,4),errorProp(n,5,5)); } +#endif + } +} + + +#include "PropagationMPlex.icc" + +__device__ +void helixAtRFromIterative_fn(const GPlexLV& inPar, + const GPlexQI& inChg, GPlexLV& outPar_global, const GPlexReg& msRad, + GPlexReg& errorProp, const int N, const int n) { + + GPlexReg outPar; + + if (n < N) { + for (int j = 0; j < 5; ++j) { + outPar[j] = outPar_global(n, j, 0); } + errorProp.SetVal(0); + + helixAtRFromIterative_impl(inPar, inChg, outPar, msRad, errorProp, n, n+1); + // Once computations are done. Get values from registers to global memory. for (int j = 0; j < 5; ++j) { - outPar[n + j*opN] = outPar_reg[j]; + outPar_global(n, j, 0) = outPar[j]; } } } /// Similarity //////////////////////////////////////////////////////////////// -__device__ void similarity_fn(float* a, float *b, size_t stride_outErr, - int N, int n) { - size_t bN = stride_outErr; +__device__ void similarity_fn(GPlexRegLL &a, GPlexLS &b, int N, int n) { + + size_t bN = b.stride; // Keep most values in registers. - float b_reg[LL]; + float b_reg[LL2]; // To avoid using too many registers, tmp[] as a limited size and is reused. float tmp[6]; @@ -395,46 +454,169 @@ __device__ void similarity_fn(float* a, float *b, size_t stride_outErr, } } -__global__ void propagation_kernel( - const float* __restrict__ msPar, size_t stride_msPar, - float *inPar, size_t inPar_stride, int *inChg, - float *outPar, size_t outPar_stride, float *errorProp, - size_t errorProp_stride, float *outErr, size_t outErr_stride, int N) { +// PropagationMPlex.cc:propagateHelixToRMPlex, first version with 6 arguments +__device__ void propagation_fn( + GPlexLS &inErr, GPlexLV &inPar, + GPlexQI &inChg, GPlexHV &msPar, + GPlexLS &outErr, GPlexLV &outPar, + int n, int N) { + + GPlexRegQF msRad_reg; + // Using registers instead of shared memory is ~ 30% faster. 
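+ // errorProp_reg holds the full 6x6 propagation Jacobian (LL2 = 36 floats)
+ // in thread-private registers, one instance per track handled by this thread.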
+ GPlexRegLL errorProp_reg; + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; + } + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; + } +#if 0 + computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); + if (Config::doIterative) { + helixAtRFromIterative_fn(inPar, inPar_stride, + inChg, outPar, outPar_stride, msRad_reg, + errorProp_reg, N, n); + } else { + // TODO: not ported for now. Assuming Config::doIterative + // helixAtRFromIntersection(inPar, inChg, outPar, msRad, errorProp); + } + similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); +#endif + computeMsRad_fn(msPar, msRad_reg, N, n); +#ifdef CCSCOORD + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#else + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#endif + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + } +} + + +__global__ void propagation_kernel( + GPlexLS inErr, + GPlexHV msPar, + GPlexLV inPar, GPlexQI inChg, + GPlexLV outPar, + GPlexLS outErr, int N) +{ int grid_width = blockDim.x * gridDim.x; int n = threadIdx.x + blockIdx.x * blockDim.x; - float msRad_reg; - // Using registers instead of shared memory is ~ 30% faster. - float errorProp_reg[LL]; - // If there is more matrices than MAX_BLOCKS_X * BLOCK_SIZE_X for (int z = 0; z < (N-1)/grid_width +1; z++) { n += z*grid_width; - if (n < N) { - computeMsRad_fn(msPar, stride_msPar, &msRad_reg, N, n); - if (Config::doIterative) { - helixAtRFromIterative_fn(inPar, inPar_stride, - inChg, outPar, outPar_stride, msRad_reg, - errorProp_reg, N, n); - } - similarity_fn(errorProp_reg, outErr, outErr_stride, N, n); - } + propagation_fn(inErr, inPar, inChg, msPar, outErr, outPar, n, N); } } -void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, +void propagation_wrapper(const cudaStream_t& stream, + GPlexHV& msPar, GPlexLS& inErr, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, + GPlexLS& outErr, const int N) { int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, - MAX_BLOCKS_X); + max_blocks_x); dim3 grid(gridx, 1, 1); dim3 block(BLOCK_SIZE_X, 1, 1); propagation_kernel <<>> - (msPar.ptr, msPar.stride, - inPar.ptr, inPar.stride, inChg.ptr, - outPar.ptr, outPar.stride, errorProp.ptr, - errorProp.stride, outErr.ptr, outErr.stride, N); + (inErr, msPar, inPar, inChg, outPar, outErr, N); } + + +// PropagationMPlex.cc:propagateHelixToRMPlex, second version with 7 arguments +// Imposes the radius +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, + const int n, const int N) { +#if 1 + GPlexRegQF msRad_reg; + // Using registers instead of shared memory is ~ 30% faster. 
+ GPlexRegLL errorProp_reg; + // If there is more matrices than max_blocks_x * BLOCK_SIZE_X + if (n < N) { + + for (int i = 0; i < inErr.kSize; ++i) { + outErr[n + i*outErr.stride] = inErr[n + i*inErr.stride]; + } + for (int i = 0; i < inPar.kSize; ++i) { + outPar[n + i*outPar.stride] = inPar[n + i*inPar.stride]; + } + for (int i = 0; i < 36; ++i) { + errorProp_reg[i] = 0.0; + } + + /*assignMsRad_fn(radius, &msRad_reg, N, n);*/ + msRad_reg(n, 0, 0) = radius; + /*if (n == 0) printf("gpu r = %f\n", radius);*/ + +#ifdef CCSCOORD + helixAtRFromIterativePolar_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#else + helixAtRFromIterative_fn(inPar, inChg, outPar, msRad_reg, errorProp_reg, N, n); +#endif + // TODO: port me + /*if (Config::useCMSGeom) {*/ + /*MPlexQF hitsRl;*/ + /*MPlexQF hitsXi;*/ + /*for (int n = 0; n < NN; ++n) {*/ + /*hitsRl.At(n, 0, 0) = getRlVal(r, outPar.ConstAt(n, 2, 0));*/ + /*hitsXi.At(n, 0, 0) = getXiVal(r, outPar.ConstAt(n, 2, 0));*/ + /*}*/ + /*applyMaterialEffects(hitsRl, hitsXi, outErr, outPar, N_proc);*/ + /*}*/ + /*similarity_fn(errorProp_reg, outErr, N, n);*/ + + // Matriplex version of: + // result.errors = ROOT::Math::Similarity(errorProp, outErr); + + //MultHelixProp can be optimized for polar coordinates, see GenMPlexOps.pl + /*MPlexLL temp;*/ + /*MultHelixProp (errorProp, outErr, temp);*/ + /*MultHelixPropTransp(errorProp, temp, outErr);*/ + GPlexRegLL temp; + MultHelixProp_fn (errorProp_reg, outErr, temp, n); + MultHelixPropTransp_fn(errorProp_reg, temp, outErr, n); + + } +#endif +} + +__global__ void propagationForBuilding_kernel( + const GPlexLS inErr, const GPlexLV inPar, + const GPlexQI inChg, const float radius, + GPlexLS outErr, GPlexLV outPar, + const int N) { + int grid_width = blockDim.x * gridDim.x; + int n = threadIdx.x + blockIdx.x * blockDim.x; + + for (int z = 0; z < (N-1)/grid_width +1; z++) { + n += z*grid_width; + propagationForBuilding_fn( inErr, inPar, inChg, radius, outErr, outPar, n, N); + } +} + +void propagationForBuilding_wrapper(const cudaStream_t& stream, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, + const int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + propagationForBuilding_kernel<<>> + (inErr, inPar, inChg, radius, outErr, outPar, N); +} + diff --git a/mkFit/propagation_kernels.h b/mkFit/propagation_kernels.h index 2fb92f0689f39..aec2e256bf918 100644 --- a/mkFit/propagation_kernels.h +++ b/mkFit/propagation_kernels.h @@ -3,11 +3,29 @@ #include "GPlex.h" -void propagation_wrapper(cudaStream_t& stream, - GPlex& msPar, - GPlex& inPar, GPlex& inChg, - GPlex& outPar, GPlex& errorProp, - GPlex& outErr, +void propagation_wrapper(const cudaStream_t& stream, + GPlexHV& msPar, GPlexLS& inErr, + GPlexLV& inPar, GPlexQI& inChg, + GPlexLV& outPar, + GPlexLS& outErr, const int N); +void propagationForBuilding_wrapper(const cudaStream_t& stream, + const GPlexLS& inErr, const GPlexLV& inPar, + const GPlexQI& inChg, const float radius, + GPlexLS& outErr, GPlexLV& outPar, + const int N); + +__device__ void propagation_fn( + GPlexLS &inErr, GPlexLV &inPar, + GPlexQI &inChg, GPlexHV &msPar, + GPlexLS &outErr, GPlexLV &outPar, + int n, int N); + +__device__ void propagationForBuilding_fn( + const GPlexLS &inErr, const GPlexLV &inPar, + const GPlexQI &inChg, const float radius, + GPlexLS &outErr, GPlexLV &outPar, + const int n, const int N); + #endif // 
_PROPAGATION_KERNELS_H_ diff --git a/mkFit/reorganize.cu b/mkFit/reorganize.cu deleted file mode 100644 index 80d3b7912cac9..0000000000000 --- a/mkFit/reorganize.cu +++ /dev/null @@ -1,64 +0,0 @@ -#include "reorganize.h" -#include - -__global__ void toMatriplex_kernel(float *dst, int dst_stride, - const float* __restrict__ src, int src_stride, - int N, int LS) { - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N && j < LS) { - if (i==-1) { - printf(" %d, mplex[%f] / lin[%f]\n", j, dst[i+j*dst_stride], src[j+i*src_stride]); - } - dst[i + j*dst_stride] = src[j + i*src_stride]; - /*float diff = fabs((dst[i + j*dst_stride] - src[j + i*src_stride]));*/ - /*if (diff > 1e-3) printf("%f\n", diff);*/ - } -} - -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int N, int LS) { - dim3 block(16, 8, 1); - dim3 grid((N-1)/16 + 1, (LS-1)/8 +1, 1); - toMatriplex_kernel <<>> (dst.ptr, dst.stride, src.ptr, src.stride, N, LS); -} - - -__global__ void reorganizeMs(float *msPar, size_t msPar_stride, - float *full_posArray, - float *msErr, size_t msErr_stride, - float *full_errArray, - int *full_hitIdx, int hi, - int maxHits, - int N, int HS, int HV, int Nhits) { - - int i = threadIdx.x + blockIdx.x * blockDim.x; - int j = threadIdx.y + blockIdx.y * blockDim.y; - - if (i < N) { - int hidx = full_hitIdx[i + hi*N]; - if (j < HV) { - /*float tmp1 = msPar[i + msPar_stride*j];*/ - msPar[i + msPar_stride*j] = full_posArray[j + HV*(hidx + hi*maxHits)]; - /*float tmp2 = msPar[i + msPar_stride*j];*/ - - /*if (i==0 && hi == 0) {*/ - /*if (fabs(tmp1 - tmp2) > 1e-3) {*/ - /*printf("i %d, j %d, old: %f, new %f\n", i, j, tmp1, tmp2);*/ - /*}*/ - /*}*/ - } - if (j < HS) { - msErr[i + msErr_stride*j] = full_errArray[j + HS*(hidx + hi*maxHits)]; - } - } -} - -void reorganizeMs_wrapper(cudaStream_t& stream, GPlex& msPar, float *full_posArray, - GPlex& msErr, float *full_errArray, int *full_hitIdx, int hi, int maxHits, - int N, int hs, int hv, int Nhits) { - dim3 block(16, 6, 1); - dim3 grid((N-1)/16 + 1, (hs-1)/6 +1, 1); - reorganizeMs <<>> (msPar.ptr, msPar.stride, full_posArray, - msErr.ptr, msErr.stride, full_errArray, full_hitIdx, hi, maxHits, N, hs, hv, Nhits); -} diff --git a/mkFit/reorganize.h b/mkFit/reorganize.h deleted file mode 100644 index fd72eb8f91be6..0000000000000 --- a/mkFit/reorganize.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _REORGANIZE_KERNELS_H_ -#define _REORGANIZE_KERNELS_H_ - -#include "GPlex.h" - -void toMatriplex_wrapper(cudaStream_t& stream, GPlex &dst, GPlex &src, int n, int ls); - -#endif diff --git a/mkFit/reorganize_gplex.cu b/mkFit/reorganize_gplex.cu new file mode 100644 index 0000000000000..305ab59ac277f --- /dev/null +++ b/mkFit/reorganize_gplex.cu @@ -0,0 +1,318 @@ +#include "reorganize_gplex.h" +#include + +#include "FitterCU.h" +#include "accessors_cu.h" +#include "Track.h" +#include "gpu_utils.h" + +__device__ float *get_posArray(Hit &hit) { + return hit.posArrayCU(); +} +__device__ float *get_errArray(Hit &hit) { + return hit.errArrayCU(); +} + +__device__ float *get_posArray(Track &track) { + return track.posArrayCU(); +} +__device__ float *get_errArray(Track &track) { + return track.errArrayCU(); +} + +template +__device__ void SlurpIn_fn(GPlexObj to, // float *fArray, int stride, int kSize, + const char *arr, const int *vi, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j +__device__ void SlurpInIdx_fn(GPlexObj to, + const char *arr, const int idx, const int N) { + int j = threadIdx.x 
+ blockDim.x * blockIdx.x; + if (j +__device__ void SlurpOutIdx_fn(GPlexObj from, // float *fArray, int stride, int kSize, + const char *arr, const int idx, const int N) { + int j = threadIdx.x + blockDim.x * blockIdx.x; + if (j>> + (msErr, msPar, layer.m_hits, XHitSize, XHitArr, HitsIdx, hit_cnt, N); + cudaDeviceSynchronize(); +} + + +__device__ void InputTracksCU_fn (Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + Label(itrack, 0, 0) = tracks[i].label(); + Chg(itrack, 0, 0) = tracks[i].charge(); + Chi2(itrack, 0, 0) = tracks[i].chi2(); + SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); + + for (int hi = 0; hi < 3; ++hi) + HitsIdx[hi](itrack, 0, 0) = tracks[i].getHitIdx(hi);//dummy value for now + } +} + + +__global__ void InputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + InputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); +} + + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + InputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, + beg, end, N); +} + + +__device__ void InputTracksAndHitsCU_fn (Track *tracks, LayerOfHitsCU *layerHits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + Label(itrack, 0, 0) = tracks[i].label(); + Chg(itrack, 0, 0) = tracks[i].charge(); + Chi2(itrack, 0, 0) = tracks[i].chi2(); + SlurpInIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpInIdx_fn(Par_iP, varr + off_param, idx, N); + + // Note Config::nLayers -- not suitable for building + for (int hi = 0; hi < Config::nLayers; ++hi) { + int hidx = tracks[i].getHitIdx(hi); + Hit &hit = layerHits[hi].m_hits[hidx]; + + HitsIdx[hi](itrack, 0, 0) = idx; + if (hidx < 0) continue; + + SlurpInIdx_fn(msErr_arr[hi], (char *)hit.errArrayCU(), 0, N); + SlurpInIdx_fn(msPar_arr[hi], (char *)hit.posArrayCU(), 0, N); + } + } +} + + +__global__ void InputTracksAndHitsCU_kernel(Track *tracks, LayerOfHitsCU *layers, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexHS *msErr_arr, GPlexHV 
*msPar_arr, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + InputTracksAndHitsCU_fn(tracks, layers, Err_iP, Par_iP, msErr_arr, msPar_arr, + Chg, Chi2, Label, HitsIdx, beg, end, itrack, N); +} + + +void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, + Track *tracks, EventOfHitsCU &event_of_hits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, + const bool inputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + InputTracksAndHitsCU_kernel <<< grid, block, 0, stream >>> + (tracks, event_of_hits.m_layers_of_hits, + Err_iP, Par_iP, + msErr_arr, msPar_arr, + Chg, Chi2, Label, HitsIdx, + beg, end, N); +} + + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N, + const bool update_hit_idx) { + //int itrack = threadIdx.x + blockDim.x*blockIdx.x; + + if (itrack < (end-beg) && itrack < N) { + Track &trk = tracks[beg]; + const char *varr = (char*) &trk; + int off_error = (char*) trk.errArrayCU() - varr; + int off_param = (char*) trk.posArrayCU() - varr; + + int i= itrack + beg; + const Track &trk_i = tracks[i]; + int idx = (char*) &trk_i - varr; + + SlurpOutIdx_fn(Err_iP, varr + off_error, idx, N); + SlurpOutIdx_fn(Par_iP, varr + off_param, idx, N); + tracks[i].setCharge(Chg(itrack, 0, 0)); + tracks[i].setChi2(Chi2(itrack, 0, 0)); + tracks[i].setLabel(Label(itrack, 0, 0)); + + if (update_hit_idx) { + tracks[i].resetHits(); + /*int nGoodItIdx = 0;*/ + for (int hi = 0; hi < Config::nLayers; ++hi) { + tracks[i].addHitIdx(HitsIdx[hi](itrack, 0, 0),0.); + // FIXME: We probably want to use registers instead of going for class members: + /*int hit_idx = HitsIdx[hi](itrack, 0, 0);*/ + /*tracks[i].setHitIdx(hi, hit_idx);*/ + /*if (hit_idx >= 0) {*/ + /*nGoodItIdx++; */ + /*}*/ + } + /*tracks[i].setNGoodHitIdx(nGoodItIdx);*/ + /*tracks[i].setChi2(0.);*/ + } + } +} + +__global__ void OutputTracksCU_kernel(Track *tracks, + GPlexLS Err_iP, GPlexLV Par_iP, + GPlexQI Chg, GPlexQF Chi2, GPlexQI Label, + GPlexQI *HitsIdx, + int beg, int end, int N, + const bool update_hit_idx=true) { + int itrack = threadIdx.x + blockDim.x*blockIdx.x; + OutputTracksCU_fn(tracks, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, + beg, end, itrack, N, update_hit_idx); +} + + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool outputProp, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + OutputTracksCU_kernel <<< grid, block, 0, stream >>> + (etaBin.m_candidates, Err_iP, Par_iP, Chg, Chi2, Label, HitsIdx, beg, end, N); +} + + +void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, + Track *tracks_cu, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + const int beg, const int end, int N) { + int gridx = std::min((N-1)/BLOCK_SIZE_X + 1, + max_blocks_x); + dim3 grid(gridx, 1, 1); + dim3 block(BLOCK_SIZE_X, 1, 1); + + 
OutputTracksCU_kernel <<< grid, block, 0, stream >>> + (tracks_cu, Err_iP, Par_iP, Chg, Chi2, Label, nullptr, beg, end, N, false); +} diff --git a/mkFit/reorganize_gplex.h b/mkFit/reorganize_gplex.h new file mode 100644 index 0000000000000..f59e2b6c0c464 --- /dev/null +++ b/mkFit/reorganize_gplex.h @@ -0,0 +1,74 @@ +#ifndef REORGANIZE_GPLEX_H +#define REORGANIZE_GPLEX_H + +#include "GPlex.h" +#include "Hit.h" +#include "HitStructuresCU.h" +#include "Track.h" + +__device__ float *get_posArray(Hit &hit); +__device__ float *get_errArray(Hit &hit); +__device__ float *get_posArray(Track &track); +__device__ float *get_errArray(Track &track); + +__device__ void HitToMs_fn(GPlexHS &msErr, GPlexHV &msPar, + Hit *hits, const GPlexQI &XHitSize, + const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, + const int itrack, const int N); + +__global__ void HitToMs_kernel(GPlexHS msErr, GPlexHV msPar, Hit *hits, + const GPlexQI XHitSize, const GPlexHitIdx XHitArr, + GPlexQI HitsIdx, const int hit_cnt, const int N); + +void HitToMs_wrapper(const cudaStream_t& stream, + GPlexHS &msErr, GPlexHV &msPar, LayerOfHitsCU &layer, + const GPlexQI &XHitSize, const GPlexHitIdx &XHitArr, + GPlexQI &HitsIdx, const int hit_cnt, const int N); + +__device__ void InputTracksCU_fn(Track *tracks, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, + GPlexQI &Label, GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N); + +__device__ void OutputTracksCU_fn(Track *tracks, + const GPlexLS &Err_iP, const GPlexLV &Par_iP, + const GPlexQI &Chg, const GPlexQF &Chi2, + const GPlexQI &Label, const GPlexQI *HitsIdx, + const int beg, const int end, + const int itrack, const int N, + const bool update_hit_idx=true); + +void InputTracksCU_wrapper(const cudaStream_t &stream, + const EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, const bool inputProp, int N); + +void InputTracksAndHitsCU_wrapper(const cudaStream_t &stream, + Track *tracks, EventOfHitsCU &event_of_hits, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexHS *msErr_arr, GPlexHV *msPar_arr, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, + const bool inputProp, int N); + +void OutputTracksCU_wrapper(const cudaStream_t &stream, + EtaBinOfCandidatesCU &etaBin, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + GPlexQI *HitsIdx, + const int beg, const int end, bool outputProp, int N); + + +void OutputFittedTracksCU_wrapper(const cudaStream_t &stream, + Track *tracks_cu, + GPlexLS &Err_iP, GPlexLV &Par_iP, + GPlexQI &Chg, GPlexQF &Chi2, GPlexQI &Label, + const int beg, const int end, int N); + +#endif // REORGANIZE_GPLEX_H
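A minimal sketch, not part of the patch, of how the wrappers introduced above could be chained for a single best-hit layer step. All GPlex buffers, the LayerOfHitsCU and the stream are assumed to have been allocated and filled elsewhere (e.g. through FitterCU / EventOfHitsCU); the function name and the fixed hit_cnt value are placeholders for illustration only.

#include "propagation_kernels.h"
#include "index_selection_kernels.h"
#include "reorganize_gplex.h"
#include "kalmanUpdater_kernels.h"

// Sketch: propagate -> select hit windows -> load one candidate hit -> Kalman update.
void bestHitLayerStep_sketch(cudaStream_t stream, LayerOfHitsCU &layer, float radius,
                             GPlexLS &Err_iC, GPlexLV &Par_iC, GPlexQI &Chg,
                             GPlexLS &Err_iP, GPlexLV &Par_iP,
                             GPlexHS &msErr, GPlexHV &msPar,
                             GPlexQI &XHitSize, GPlexHitIdx &XHitArr,
                             GPlexQI &HitsIdx, int N)
{
  // Propagate the current candidates (Err_iC, Par_iC) to the layer radius.
  propagationForBuilding_wrapper(stream, Err_iC, Par_iC, Chg, radius, Err_iP, Par_iP, N);
  // Build per-track hit-index windows from the layer's phi/eta bins.
  selectHitIndices_wrapper(stream, layer, Err_iP, Par_iP, XHitSize, XHitArr, N);
  // Load candidate hit 0 of each window into the Matriplex-style msErr/msPar
  // (the real best-hit code loops hit_cnt over the window and keeps the best chi2).
  HitToMs_wrapper(stream, msErr, msPar, layer, XHitSize, XHitArr, HitsIdx, 0, N);
  // Kalman-update the propagated state with the loaded hit.
  kalmanUpdate_wrapper(stream, Err_iP, msErr, Par_iP, msPar, Par_iC, Err_iC, N);
}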