From fde7bf8993309ca4b77cdef564178fff2c48546d Mon Sep 17 00:00:00 2001
From: Shinya Morino <shin.morino@gmail.com>
Date: Sat, 10 Nov 2018 17:27:04 +0900
Subject: [PATCH] getSystemE() is implemented for CUDA-based annealers.

---
 sqaodc/cuda/CUDABipartiteGraphAnnealer.cu | 46 +++++++++++++++++++++++
 sqaodc/cuda/CUDABipartiteGraphAnnealer.h  |  6 +++
 sqaodc/cuda/CUDADenseGraphAnnealer.cu     | 36 ++++++++++++++++++
 sqaodc/cuda/CUDADenseGraphAnnealer.h      |  6 ++-
 sqaodc/cuda/DeviceBatchedDot.cuh          | 41 ++++++++++++++++++++
 5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu b/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
index 95b883a..4e12dda 100644
--- a/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
+++ b/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
@@ -18,6 +18,7 @@ template<class real>
 CUDABipartiteGraphAnnealer<real>::CUDABipartiteGraphAnnealer() {
     devStream_ = NULL;
     m_ = -1;
+    dotSpins0_ = dotSpins1_ = NULL;
     selectAlgorithm(sq::algoDefault);
 }
 
@@ -33,6 +34,12 @@ template<class real>
 CUDABipartiteGraphAnnealer<real>::~CUDABipartiteGraphAnnealer() {
     deallocate();
     d_random_.deallocate();
+
+    if (dotSpins0_ != NULL) {
+        delete dotSpins0_;
+        delete dotSpins1_;
+        dotSpins0_ = dotSpins1_ = NULL;
+    }
 }
 
@@ -57,6 +64,7 @@ void CUDABipartiteGraphAnnealer<real>::deallocateInternalObjects() {
     halloc.deallocate(h_q1_);
     halloc.deallocate(h_E_);
     E_ = HostVector();
+    halloc.deallocate(h_spinDotSum_);
 
     d_randReal_.deallocate();
 
@@ -83,6 +91,10 @@ void CUDABipartiteGraphAnnealer<real>::assignDevice(Device &device) {
     devCopy_.assignDevice(device);
     d_random_.assignDevice(device);
     d_randReal_.assignDevice(device);
+
+    /* initialize sumJq */
+    dotSpins0_ = new DotSpins<real>(device, devStream_);
+    dotSpins1_ = new DotSpins<real>(device, devStream_);
 }
 
@@ -298,6 +310,7 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
     halloc.allocate(&h_q1_, m_, N1_);
     bitsPairX_.reserve(m_);
     bitsPairQ_.reserve(m_);
+    halloc.allocate(&h_spinDotSum_);
 
     /* estimate # rand nums required per one anneal. */
     sq::SizeType N = N0_ + N1_;
@@ -306,6 +319,11 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
     sq::SizeType requiredSize = nRunsPerRandGen_ * m_ * N * sizeof(real) / sizeof(float);
     d_random_.setRequiredSize(requiredSize);
 
+    DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
+    dotSpins0.configure(N0_, m_, false);
+    DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
+    dotSpins1.configure(N1_, m_, false);
+
     setState(solPrepared);
 }
 
@@ -321,6 +339,34 @@ void CUDABipartiteGraphAnnealer<real>::makeSolution() {
 }
 
 
+template<class real>
+real CUDABipartiteGraphAnnealer<real>::getSystemE(real G, real beta) const {
+    auto _this = const_cast<CUDABipartiteGraphAnnealer<real>*>(this);
+    _this->calculate_E(); /* asynchronous */
+
+    if (isSQAAlgorithm(algo_)) {
+        DeviceVector *d_spinDot0 = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
+        dotSpins0(d_matq0_, d_spinDot0);
+        DeviceVector *d_spinDot1 = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
+        dotSpins1(d_matq1_, d_spinDot1);
+
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot0);
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot1, 1.);
+    }
+    devStream_->synchronize();
+
+    real E = E_.sum() / m_;
+    if (isSQAAlgorithm(algo_)) {
+        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
+        E -= *h_spinDotSum_.d_data * coef;
+    }
+    if (om_ == sq::optMaximize)
+        E *= real(-1.);
+    return E;
+}
+
 
 // template<class real>
 // void CUDABipartiteGraphAnnealer<real>::
diff --git a/sqaodc/cuda/CUDABipartiteGraphAnnealer.h b/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
index ff92b37..8f6e5b5 100644
--- a/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
+++ b/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
@@ -77,6 +77,8 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cuda::BipartiteGraphAnnealer<real> {
 
     void makeSolution();
 
+    real getSystemE(real G, real beta) const;
+
     void annealOneStep(real G, real beta) {
         (this->*annealMethod_)(G, beta);
     }
@@ -120,6 +122,10 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cuda::BipartiteGraphAnnealer<real> {
     DeviceMatrix d_matq0_, d_matq1_;
     DeviceBitMatrix h_q0_, h_q1_;
     sq::SizeType nRunsPerRandGen_;
+
+    sq::NullBase *dotSpins0_;
+    sq::NullBase *dotSpins1_;
+    DeviceScalar h_spinDotSum_;
 
     DeviceMatrix d_Jq0_;
     DeviceMatrix d_Jq1_;
diff --git a/sqaodc/cuda/CUDADenseGraphAnnealer.cu b/sqaodc/cuda/CUDADenseGraphAnnealer.cu
index 7d0f38a..6cbf105 100644
--- a/sqaodc/cuda/CUDADenseGraphAnnealer.cu
+++ b/sqaodc/cuda/CUDADenseGraphAnnealer.cu
@@ -19,6 +19,9 @@ template<class real>
 using DotJq = DeviceDotJq<real>;
 #endif
 
+template<class real>
+using DotSpins = DeviceDotSpins<real>;
+
 
 template<class real>
 CUDADenseGraphAnnealer<real>::CUDADenseGraphAnnealer() {
@@ -47,6 +50,8 @@ CUDADenseGraphAnnealer<real>::~CUDADenseGraphAnnealer() {
     if (dotJq_ != NULL) {
         delete dotJq_;
         dotJq_ = NULL;
+        delete dotSpins_;
+        dotSpins_ = NULL;
     }
 }
 
@@ -73,6 +78,7 @@ void CUDADenseGraphAnnealer<real>::deallocateInternalObjects() {
     halloc.deallocate(h_E_);
     halloc.deallocate(h_q_);
     E_ = HostVector();
+    halloc.deallocate(h_spinDotSum_);
 
     flipPosBuffer_.deallocate();
     realNumBuffer_.deallocate();
@@ -101,6 +107,7 @@ void CUDADenseGraphAnnealer<real>::assignDevice(Device &device) {
 
     /* initialize sumJq */
     dotJq_ = new DotJq<real>(device, devStream_);
+    dotSpins_ = new DotSpins<real>(device, devStream_);
 }
 
@@ -295,6 +302,8 @@ void CUDADenseGraphAnnealer<real>::prepare() {
     halloc.allocate(&h_E_, m_);
     E_.map(h_E_.d_data, h_E_.size);
     halloc.allocate(&h_q_, sq::Dim(m_, N_));
+    halloc.allocate(&h_spinDotSum_);
+
     xlist_.reserve(m_);
     qlist_.reserve(m_);
 
@@ -308,6 +317,9 @@ void CUDADenseGraphAnnealer<real>::prepare() {
 
     DotJq<real> &dotJq = static_cast<DotJq<real>&>(*dotJq_);
     dotJq.configure(N_, m_, false);
+
+    DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
+    dotSpins.configure(N_, m_, false);
 
     setState(solPrepared);
 }
@@ -337,6 +349,30 @@ void CUDADenseGraphAnnealer<real>::syncBits() {
     }
 }
 
+template<class real>
+real CUDADenseGraphAnnealer<real>::getSystemE(real G, real beta) const {
+    auto _this = const_cast<CUDADenseGraphAnnealer<real>*>(this);
+    _this->calculate_E(); /* asynchronous */
+
+    if (isSQAAlgorithm(algo_)) {
+        DeviceVector *d_spinDot = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
+        dotSpins(d_matq_, d_spinDot);
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot);
+    }
+    devStream_->synchronize();
+
+    real E = E_.sum() / m_;
+    if (isSQAAlgorithm(algo_)) {
+        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
+        E -= *_this->h_spinDotSum_.d_data * coef;
+    }
+
+    if (om_ == sq::optMaximize)
+        E *= real(-1.);
+    return E;
+}
+
 #if 0
 /* equivalent code */
 template<class real>
diff --git a/sqaodc/cuda/CUDADenseGraphAnnealer.h b/sqaodc/cuda/CUDADenseGraphAnnealer.h
index 17a621c..07eaacb 100644
--- a/sqaodc/cuda/CUDADenseGraphAnnealer.h
+++ b/sqaodc/cuda/CUDADenseGraphAnnealer.h
@@ -72,6 +72,8 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {
 
     void makeSolution();
 
+    real getSystemE(real G, real beta) const;
+
     void annealOneStep(real G, real beta) {
         (this->*annealMethod_)(G, beta);
     }
@@ -124,7 +126,9 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {
     sq::BitSetArray qlist_;
 
     sq::NullBase *dotJq_;
-
+    sq::NullBase *dotSpins_;
+    DeviceScalar h_spinDotSum_;
+
     DeviceStream *devStream_;
     DeviceFormulas devFormulas_;
     DeviceCopy devCopy_;
diff --git a/sqaodc/cuda/DeviceBatchedDot.cuh b/sqaodc/cuda/DeviceBatchedDot.cuh
index 256bb3c..b059a81 100644
--- a/sqaodc/cuda/DeviceBatchedDot.cuh
+++ b/sqaodc/cuda/DeviceBatchedDot.cuh
@@ -57,6 +57,23 @@ int2 operator+(const int2 &lhs, const int v) {
 }
 
 
+/* wrapped offset */
+
+struct WrappedOffset {
+    __host__
+    WrappedOffset(int _m, int _stride)
+            : m(_m), stride(_stride) { }
+
+    __device__ __forceinline__
+    int2 operator[](sq::IdxType idx) const {
+        int idxNeighbour = (idx + 1) % m;
+        return make_int2(idx * stride, idxNeighbour * stride);
+    }
+    sq::SizeType m;
+    sq::SizeType stride;
+};
+
+
 }
 
@@ -72,6 +89,9 @@ struct iterator_traits<sqaod_cuda::In2TypeDotPtr<V, Vi0, Vi1>> : sqaod_cuda::base_iterator_traits<V> { };
 template<>
 struct iterator_traits<sqaod_cuda::Linear> : sqaod_cuda::base_iterator_traits<int2> { };
 
+template<>
+struct iterator_traits<sqaod_cuda::WrappedOffset> : sqaod_cuda::base_iterator_traits<int2> { };
+
 
 }
 
@@ -120,6 +140,27 @@ public:
 };
 
 
+
+template<class Vout>
+struct DeviceDotSpins : DeviceSegmentedSumType<In2TypeDotPtr<Vout, char, char>, Vout*, WrappedOffset, 1> {
+    typedef DeviceSegmentedSumType<In2TypeDotPtr<Vout, char, char>, Vout*, WrappedOffset, 1> Base;
+public:
+
+    DeviceDotSpins(Device &device, DeviceStream *devStream)
+            : Base(device, devStream) {
+    }
+
+    DeviceDotSpins(DeviceStream *devStream) : Base(devStream) { }
+
+    void operator()(const DeviceMatrixType<char> &d_q, DeviceVectorType<Vout> *out) {
+        In2TypeDotPtr<Vout, char, char> in(d_q.d_data, d_q.d_data);
+        WrappedOffset offset(d_q.rows, d_q.stride);
+        Base::operator()(in, out->d_data, offset);
+    }
+};
+
+
+
 /* Vectorized */
 
 /* Value traits class for vector types. */