Revert "Merge pull request #1059 from bstatcomp/gpu_cholesky_prim"
Had some test failures on develop.

This reverts commit a933f65, reversing
changes made to 70edefd.
seantalts committed Feb 13, 2019
1 parent a933f65 commit 0caabb0
Showing 12 changed files with 37 additions and 367 deletions.
7 changes: 3 additions & 4 deletions runTests.py
@@ -40,9 +40,10 @@ def processCLIArgs():
     parser.add_argument("tests", nargs="+", type=str,
                         help=tests_help_msg)
     f_help_msg = "Only tests with file names matching these will be executed.\n"
-    f_help_msg += "Example: '-f chol', '-f gpu', '-f prim'"
-    parser.add_argument("-f", type=str, default = [], action="append",
+    f_help_msg += "Example: '-f chol', '-f gpu', '-f prim mat'"
+    parser.add_argument("-f", nargs="+", type=str, default = "",
                         help=f_help_msg)
+
     parser.add_argument("-d", "--debug", dest="debug", action="store_true",
                         help="request additional script debugging output.")
     parser.add_argument("-m", "--make-only", dest="make_only",
@@ -157,8 +158,6 @@ def main():
     tests = findTests(inputs.tests, inputs.f)
     if not tests:
         stopErr("No matching tests found.", -1)
-    if inputs.debug:
-        print("Collected the following tests:\n", tests)
 
     # pass 1: make test executables
     for batch in batched(tests):
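Note on the -f change: the reverted PR's action="append" form collected one pattern per repeated flag (-f prim -f mat), while the restored nargs="+" form lets a single -f consume every following token as one list (-f prim mat), which is exactly what the restored help-text example shows.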
101 changes: 0 additions & 101 deletions stan/math/gpu/cholesky_decompose.hpp

This file was deleted.

80 changes: 0 additions & 80 deletions stan/math/gpu/kernels/cholesky_decompose.hpp

This file was deleted.

11 changes: 3 additions & 8 deletions stan/math/gpu/kernels/inv_lower_tri_multiply.hpp
@@ -77,19 +77,14 @@ const char* inv_lower_tri_multiply_kernel_code = STRINGIFY(
       const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
       const int local_row = thread_block_row;
       // Element above the diagonal will not be transferred.
-      if (C2_global_col <= C2_global_row && C2_global_col < A_rows
-          && C2_global_row < A_rows) {
+      if (C2_global_col <= C2_global_row) {
        C2_local[local_col][local_row]
            = A[C2_global_col * A_rows + C2_global_row];
      } else {
        C2_local[local_col][local_row] = 0;
      }
-      if (A3_global_col < A_rows && A3_global_row < A_rows) {
-        A3_local[local_col][local_row]
-            = A[A3_global_col * A_rows + A3_global_row];
-      } else {
-        A3_local[local_col][local_row] = 0.0;
-      }
+      A3_local[local_col][local_row]
+          = A[A3_global_col * A_rows + A3_global_row];
     }
     // Wait until all tile values are loaded to the local memory
     barrier(CLK_LOCAL_MEM_FENCE);
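The bounds checks removed above came in with PR #1059: these kernels run over a work size padded up to a multiple of the thread block size, so threads past the matrix edge must write zeros into the local tile instead of reading A out of bounds. The same reasoning applies to the store guard removed from neg_rect_lower_tri_multiply.hpp below. A minimal host-side C++ sketch of the guarded load (illustrative only; the names mirror the kernel, and the 32x32 tile size is an assumption):

#include <vector>

// Guarded lower-triangular tile load, mirroring the logic this revert
// removes. Threads whose global indices fall in the padded region
// (>= A_rows) write 0.0 instead of touching A.
void load_tile(const std::vector<double>& A, double tile[32][32],
               int A_rows, int global_col, int global_row,
               int local_col, int local_row) {
  if (global_col <= global_row                          // lower triangle only
      && global_col < A_rows && global_row < A_rows) {  // in bounds
    tile[local_col][local_row] = A[global_col * A_rows + global_row];
  } else {
    tile[local_col][local_row] = 0.0;  // padding or upper triangle
  }
}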
7 changes: 2 additions & 5 deletions stan/math/gpu/kernels/neg_rect_lower_tri_multiply.hpp
@@ -73,8 +73,7 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
         temp_local[local_col][local_row] = 0.0;
       }
       // Element above the diagonal will not be transferred.
-      if (C1_global_col <= C1_global_row && C1_global_col < A_rows
-          && C1_global_row < A_rows) {
+      if (C1_global_col <= C1_global_row) {
         C1_local[local_col][local_row]
             = A[C1_global_col * A_rows + C1_global_row];
       } else {
@@ -103,9 +102,7 @@ const char* neg_rect_lower_tri_multiply_kernel_code = STRINGIFY(
     for (int w = 0; w < WORK_PER_THREAD; w++) {
       const int A_global_col
           = A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
-      if (A_global_col < A_rows && (i + rows + offset) < A_rows) {
-        A[A_global_col * A_rows + i + rows + offset] = -acc[w];
-      }
+      A[A_global_col * A_rows + i + rows + offset] = -acc[w];
     }
   }
   // \cond
4 changes: 1 addition & 3 deletions stan/math/gpu/multiply.hpp
@@ -57,10 +57,8 @@ inline auto multiply(const matrix_gpu& A, const matrix_gpu& B) {
   check_size_match("multiply (GPU)", "A.cols()", A.cols(), "B.rows()",
                    B.rows());
   matrix_gpu temp(A.rows(), B.cols());
-  if (A.size() == 0 || B.size() == 0) {
-    temp.zeros();
+  if (A.size() == 0 || B.size() == 0)
     return temp;
-  }
   int local = opencl_kernels::matrix_multiply.make_functor.get_opts().at(
       "THREAD_BLOCK_SIZE");
   int Mpad = ((A.rows() + local - 1) / local) * local;
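Two things change in this file: the zero-size early return no longer calls temp.zeros(), so the returned matrix_gpu is no longer explicitly zeroed first, and the surrounding padding arithmetic shows why the kernel guards elsewhere in this revert existed at all. A worked example of the round-up formula (sizes are illustrative; local = 32 matches the THREAD_BLOCK_SIZE default in opencl_context.hpp):

#include <iostream>

int main() {
  // Round-up padding for the GPU work size, as in the diff above:
  //   Mpad = ((A.rows() + local - 1) / local) * local
  const int local = 32;  // THREAD_BLOCK_SIZE
  const int rows = 70;   // illustrative matrix size
  const int Mpad = ((rows + local - 1) / local) * local;
  std::cout << Mpad << "\n";  // 96: 26 thread rows per column are padding
}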
24 changes: 2 additions & 22 deletions stan/math/gpu/opencl_context.hpp
@@ -3,7 +3,7 @@
 #ifdef STAN_OPENCL
 #define __CL_ENABLE_EXCEPTIONS
 
-#define DEVICE_FILTER CL_DEVICE_TYPE_ALL
+#define DEVICE_FILTER CL_DEVICE_TYPE_GPU
 #ifndef OPENCL_DEVICE_ID
 #error OPENCL_DEVICE_ID_NOT_SET
 #endif
@@ -106,13 +106,6 @@ class opencl_context_base {
       base_opts_["THREAD_BLOCK_SIZE"] = thread_block_size_sqrt;
       base_opts_["WORK_PER_THREAD"] = 1;
     }
-    // Thread block size for the Cholesky
-    // TODO(Steve): This should be tuned in a higher part of the stan language
-    if (max_thread_block_size_ >= 256) {
-      tuning_opts_.cholesky_min_L11_size = 256;
-    } else {
-      tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
-    }
   } catch (const cl::Error& e) {
     check_opencl_error("opencl_context", e);
   }
@@ -140,12 +133,6 @@ class opencl_context_base {
       {"LOWER_TO_UPPER", static_cast<int>(TriangularMapGPU::LowerToUpper)},
      {"THREAD_BLOCK_SIZE", 32},
      {"WORK_PER_THREAD", 8}};
-  // TODO(Steve): Make these tunable during warmup
-  struct tuning_struct {
-    int cholesky_min_L11_size = 256;
-    int cholesky_partition = 4;
-    int cholesky_size_worth_transfer = 1250;
-  } tuning_opts_;
 
   static opencl_context_base& getInstance() {
     static opencl_context_base instance_;
@@ -242,7 +229,7 @@ class opencl_context {
 
   try {
     std::vector<cl::Device> all_devices;
-    platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
+    platform.getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
 
     for (auto device_iter : all_devices) {
       cl::Device device(device_iter);
@@ -317,13 +304,6 @@ class opencl_context {
     return opencl_context_base::getInstance().max_thread_block_size_;
   }
 
-  /**
-   * Returns the thread block size for the Cholesky Decompositions L_11.
-   */
-  inline opencl_context_base::tuning_struct& tuning_opts() {
-    return opencl_context_base::getInstance().tuning_opts_;
-  }
-
   /**
    * Returns a vector containing the OpenCL device used to create the context
   */
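The device filter is the most visible change for machines without a GPU: the reverted PR enumerated with CL_DEVICE_TYPE_ALL, which also matches CPU and accelerator OpenCL devices, while the revert restores CL_DEVICE_TYPE_GPU. A minimal sketch of the restored discovery path using the same OpenCL C++ bindings this header already uses (standalone; assumes cl.hpp is available):

#include <CL/cl.hpp>
#include <vector>

// Enumerate devices the way the reverted header does: GPU devices only.
// Passing CL_DEVICE_TYPE_ALL instead would also return CPU and
// accelerator devices.
std::vector<cl::Device> list_gpu_devices() {
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);  // all available OpenCL platforms
  std::vector<cl::Device> devices;
  platforms.at(0).getDevices(CL_DEVICE_TYPE_GPU, &devices);
  return devices;
}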
24 changes: 1 addition & 23 deletions stan/math/prim/mat/fun/cholesky_decompose.hpp
@@ -1,17 +1,10 @@
 #ifndef STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
 #define STAN_MATH_PRIM_MAT_FUN_CHOLESKY_DECOMPOSE_HPP
 
-#include <stan/math/gpu/opencl_context.hpp>
 #include <stan/math/prim/mat/fun/Eigen.hpp>
 #include <stan/math/prim/mat/err/check_pos_definite.hpp>
 #include <stan/math/prim/mat/err/check_square.hpp>
 #include <stan/math/prim/mat/err/check_symmetric.hpp>
-#ifdef STAN_OPENCL
-#include <stan/math/gpu/cholesky_decompose.hpp>
-#include <stan/math/gpu/copy.hpp>
-#endif
-
 #include <cmath>
-
 namespace stan {
 namespace math {
@@ -32,27 +25,12 @@ Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> cholesky_decompose(
     const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& m) {
   check_square("cholesky_decompose", "m", m);
   check_symmetric("cholesky_decompose", "m", m);
-#ifdef STAN_OPENCL
-  if (m.rows() >= opencl_context.tuning_opts().cholesky_size_worth_transfer) {
-    matrix_gpu m_gpu(m);
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> m_chol(m.rows(), m.cols());
-    m_gpu = cholesky_decompose(m_gpu);
-    copy(m_chol, m_gpu);  // NOLINT
-    return m_chol;
-  } else {
-    Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
-    llt.compute(m);
-    check_pos_definite("cholesky_decompose", "m", llt);
-    return llt.matrixL();
-  }
-#else
   Eigen::LLT<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> > llt(m.rows());
   llt.compute(m);
   check_pos_definite("cholesky_decompose", "m", llt);
   return llt.matrixL();
-#endif
 }
-}  // namespace math
+
 }  // namespace math
 }  // namespace stan
 #endif
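With the GPU branch gone, cholesky_decompose is unconditional again: Eigen's LLT on the CPU with a positive-definiteness check, and no cholesky_size_worth_transfer threshold routing large matrices to the GPU. A small usage sketch of the restored behavior (matrix values are illustrative):

#include <stan/math/prim/mat/fun/cholesky_decompose.hpp>

int main() {
  Eigen::MatrixXd m(2, 2);
  m << 4.0, 1.0,
       1.0, 3.0;  // symmetric positive definite
  // Post-revert this always runs Eigen::LLT on the CPU.
  Eigen::MatrixXd L = stan::math::cholesky_decompose(m);
  // L is lower triangular and L * L.transpose() reproduces m.
}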
