Revert "Merge pull request #1059 from bstatcomp/gpu_cholesky_prim"
Had some test failures on develop.

This reverts commit a933f65, reversing
changes made to 70edefd.
seantalts committed Feb 13, 2019
1 parent a933f65 commit 0caabb0
Showing 12 changed files with 37 additions and 367 deletions.
7 changes: 3 additions & 4 deletions runTests.py
@@ -40,9 +40,10 @@ def processCLIArgs():
     parser.add_argument("tests", nargs="+", type=str,
                         help=tests_help_msg)
     f_help_msg = "Only tests with file names matching these will be executed.\n"
-    f_help_msg += "Example: '-f chol', '-f gpu', '-f prim'"
-    parser.add_argument("-f", type=str, default = [], action="append",
+    f_help_msg += "Example: '-f chol', '-f gpu', '-f prim mat'"
+    parser.add_argument("-f", nargs="+", type=str, default = "",
                         help=f_help_msg)
+
     parser.add_argument("-d", "--debug", dest="debug", action="store_true",
                         help="request additional script debugging output.")
     parser.add_argument("-m", "--make-only", dest="make_only",
@@ -157,8 +158,6 @@ def main():
     tests = findTests(inputs.tests, inputs.f)
     if not tests:
         stopErr("No matching tests found.", -1)
-    if inputs.debug:
-        print("Collected the following tests:\n", tests)
 
     # pass 1: make test executables
     for batch in batched(tests):
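Note on the -f change: the reverted PR's action="append" form collected one pattern per repeated flag (-f prim -f mat), while the restored nargs="+" form lets a single -f consume every following token as one list (-f prim mat), which is exactly what the restored help-text example shows.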
101 changes: 0 additions & 101 deletions stan/math/gpu/cholesky_decompose.hpp

This file was deleted.

80 changes: 0 additions & 80 deletions stan/math/gpu/kernels/cholesky_decompose.hpp

This file was deleted.

11 changes: 3 additions & 8 deletions stan/math/gpu/kernels/inv_lower_tri_multiply.hpp
@@ -77,19 +77,14 @@ const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(
       const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
       const int local_row = thread_block_row;
       // Element above the diagonal will not be transferred.
-      if (C2_global_col <= C2_global_row && C2_global_col < A_rows
-          && C2_global_row < A_rows) {
+      if (C2_global_col <= C2_global_row) {
        C2_local[local_col][local_row]
            = A[C2_global_col * A_rows + C2_global_row];
      } else {
        C2_local[local_col][local_row] = 0;
      }
-      if (A3_global_col < A_rows && A3_global_row < A_rows) {
-        A3_local[local_col][local_row]
-            = A[A3_global_col * A_rows + A3_global_row];
-      } else {
-        A3_local[local_col][local_row] = 0.0;
-      }
+      A3_local[local_col][local_row]
+          = A[A3_global_col * A_rows + A3_global_row];
     }
     // Wait until all tile values are loaded to the local memory
     barrier(CLK_LOCAL_MEM_FENCE);
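The bounds checks removed above came in with PR #1059: these kernels run over a work size padded up to a multiple of the thread block size, so threads past the matrix edge must write zeros into the local tile instead of reading A out of bounds. The same reasoning applies to the store guard removed from neg_rect_lower_tri_multiply.hpp below. A minimal host-side C++ sketch of the guarded load (illustrative only; the names mirror the kernel, and the 32x32 tile size is an assumption):

#include <vector>

// Guarded lower-triangular tile load, mirroring the logic this revert
// removes. Threads whose global indices fall in the padded region
// (>= A_rows) write 0.0 instead of touching A.
void load_tile(const std::vector<double>& A, double tile[32][32],
               int A_rows, int global_col, int global_row,
               int local_col, int local_row) {
  if (global_col <= global_row                          // lower triangle only
      && global_col < A_rows && global_row < A_rows) {  // in bounds
    tile[local_col][local_row] = A[global_col * A_rows + global_row];
  } else {
    tile[local_col][local_row] = 0.0;  // padding or upper triangle
  }
}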
7 changes: 2 additions & 5 deletions stan/math/gpu/kernels/neg_rect_lower_tri_multiply.hpp
@@ -73,8 +73,7 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
         temp_local[local_col][local_row] = 0.0;
       }
       // Element above the diagonal will not be transferred.
-      if (C1_global_col <= C1_global_row && C1_global_col < A_rows
-          && C1_global_row < A_rows) {
+      if (C1_global_col <= C1_global_row) {
         C1_local[local_col][local_row]
             = A[C1_global_col * A_rows + C1_global_row];
       } else {
@@ -103,9 +102,7 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
     for (int w = 0; w < WORK_PER_THREAD; w++) {
       const int A_global_col
           = A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
-      if (A_global_col < A_rows && (i + rows + offset) < A_rows) {
-        A[A_global_col * A_rows + i + rows + offset] = -acc[w];
-      }
+      A[A_global_col * A_rows + i + rows + offset] = -acc[w];
     }
   }
   // \cond
4 changes: 1 addition & 3 deletions stan/math/gpu/multiply.hpp
@@ -57,10 +57,8 @@ inline auto multiply(const matrix_gpu& A, const matrix_gpu& B) {
   check_size_match("multiply (GPU)", "A.cols()", A.cols(), "B.rows()",
                    B.rows());
   matrix_gpu temp(A.rows(), B.cols());
-  if (A.size() == 0 || B.size() == 0) {
-    temp.zeros();
+  if (A.size() == 0 || B.size() == 0)
     return temp;
-  }
   int local = opencl_kernels::matrix_multiply.make_functor.get_opts().at(
       "THREAD_BLOCK_SIZE");
   int Mpad = ((A.rows() + local - 1) / local) * local;
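Two things change in this file: the zero-size early return no longer calls temp.zeros(), so the returned matrix_gpu is no longer explicitly zeroed first, and the surrounding padding arithmetic shows why the kernel guards elsewhere in this revert existed at all. A worked example of the round-up formula (sizes are illustrative; local = 32 matches the THREAD_BLOCK_SIZE default in opencl_context.hpp):

#include <iostream>

int main() {
  // Round-up padding for the GPU work size, as in the diff above:
  //   Mpad = ((A.rows() + local - 1) / local) * local
  const int local = 32;  // THREAD_BLOCK_SIZE
  const int rows = 70;   // illustrative matrix size
  const int Mpad = ((rows + local - 1) / local) * local;
  std::cout << Mpad << "\n";  // 96: 26 thread rows per column are padding
}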
24 changes: 2 additions & 22 deletions stan/math/gpu/opencl_context.hpp
@@ -3,7 +3,7 @@
 #ifdef STAN_OPENCL
 #define __CL_ENABLE_EXCEPTIONS
 
-#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
+#define DEVICE_FILTER CL_DEVICE_TYPE_GPU
 #ifndef OPENCL_DEVICE_ID
 #error OPENCL_DEVICE_ID_NOT_SET
 #endif
@@ -106,13 +106,6 @@ class opencl_context_base {
       base_opts_["THREAD_BLOCK_SIZE"] = thread_block_size_sqrt;
       base_opts_["WORK_PER_THREAD"] = 1;
     }
-    // Thread block size for the Cholesky
-    // TODO(Steve): This should be tuned in a higher part of the stan language
-    if (max_thread_block_size_ >= 256) {
-      tuning_opts_.cholesky_min_L11_size = 256;
-    } else {
-      tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
-    }
   } catch (const cl::Error& e) {
     check_opencl_error("opencl_context", e);
   }
@@ -140,12 +133,6 @@ class opencl_context_base {
       {"LOWER_TO_UPPER", static_cast<int>(TriangularMapGPU::LowerToUpper)},
      {"THREAD_BLOCK_SIZE", 32},
      {"WORK_PER_THREAD", 8}};
-  // TODO(Steve): Make these tunable during warmup
-  struct tuning_struct {
-    int cholesky_min_L11_size = 256;
-    int cholesky_partition = 4;
-    int cholesky_size_worth_transfer = 1250;
-  } tuning_opts_;
 
   static opencl_context_base& getInstance() {
     static opencl_context_base instance_;
@@ -242,7 +229,7 @@ class opencl_context {
 
   try {
     std::vector<cl::Device> all_devices;
-    platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
+    platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
 
     for (auto device_iter : all_devices) {
       cl::Device device(device_iter);
@@ -317,13 +304,6 @@ class opencl_context {
     return opencl_context_base::getInstance().max_thread_block_size_;
   }
 
-  /**
-   * Returns the thread block size for the Cholesky Decompositions L_11.
-   */
-  inline opencl_context_base::tuning_struct& tuning_opts() {
-    return opencl_context_base::getInstance().tuning_opts_;
-  }
-
   /**
    * Returns a vector containing the OpenCL device used to create the context
   */
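The device filter is the most visible change for machines without a GPU: the reverted PR enumerated with CL_DEVICE_TYPE_ALL, which also matches CPU and accelerator OpenCL devices, while the revert restores CL_DEVICE_TYPE_GPU. A minimal sketch of the restored discovery path using the same OpenCL C++ bindings this header already uses (standalone; assumes cl.hpp is available):

#include <CL/cl.hpp>
#include <vector>

// Enumerate devices the way the reverted header does: GPU devices only.
// Passing CL_DEVICE_TYPE_ALL instead would also return CPU and
// accelerator devices.
std::vector<cl::Device> list_gpu_devices() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);  // all available OpenCL platforms
  std::vector<cl::Device> devices;
  platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);
  return devices;
}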
24 changes: 1 addition & 23 deletions stan/math/prim/mat/fun/cholesky_decompose.hpp
@@ -1,17 +1,10 @@
 #ifndef STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
 #define STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
 
-#include <stan/math/gpu/opencl_context.hpp>
 #include <stan/math/prim/mat/fun/Eigen.hpp>
 #include <stan/math/prim/mat/err/check_pos_definite.hpp>
 #include <stan/math/prim/mat/err/check_square.hpp>
 #include <stan/math/prim/mat/err/check_symmetric.hpp>
-#ifdef STAN_OPENCL
-#include <stan/math/gpu/cholesky_decompose.hpp>
-#include <stan/math/gpu/copy.hpp>
-#endif
-
 #include <cmath>
-
 namespace stan {
 namespace math {
@@ -32,27 +25,12 @@ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> cholesky_decompose(
     const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& m) {
   check_square("cholesky_decompose", "m", m);
   check_symmetric("cholesky_decompose", "m", m);
-#ifdef STAN_OPENCL
-  if (m.rows() >= opencl_context.tuning_opts().cholesky_size_worth_transfer) {
-    matrix_gpu m_gpu(m);
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> m_chol(m.rows(), m.cols());
-    m_gpu = cholesky_decompose(m_gpu);
-    copy(m_chol, m_gpu);  // NOLINT
-    return m_chol;
-  } else {
-    Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
-    llt.compute(m);
-    check_pos_definite("cholesky_decompose", "m", llt);
-    return llt.matrixL();
-  }
-#else
   Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
   llt.compute(m);
   check_pos_definite("cholesky_decompose", "m", llt);
   return llt.matrixL();
-#endif
 }
-}  // namespace math
+
 }  // namespace math
 }  // namespace stan
 #endif
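With the GPU branch gone, cholesky_decompose is unconditional again: Eigen's LLT on the CPU with a positive-definiteness check, and no cholesky_size_worth_transfer threshold routing large matrices to the GPU. A small usage sketch of the restored behavior (matrix values are illustrative):

#include <stan/math/prim/mat/fun/cholesky_decompose.hpp>

int main() {
  Eigen::MatrixXd m(2, 2);
  m << 4.0, 1.0,
       1.0, 3.0;  // symmetric positive definite
  // Post-revert this always runs Eigen::LLT on the CPU.
  Eigen::MatrixXd L = stan::math::cholesky_decompose(m);
  // L is lower triangular and L * L.transpose() reproduces m.
}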
