Skip to content

Commit

Permalink
getSystemE() is implemented for CUDA-based annealers.
Browse files Browse the repository at this point in the history
  • Loading branch information
shinmorino committed Nov 11, 2018
1 parent d72b261 commit fde7bf8
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 1 deletion.
46 changes: 46 additions & 0 deletions sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ template<class real>
CUDABipartiteGraphAnnealer<real>::CUDABipartiteGraphAnnealer() {
devStream_ = NULL;
m_ = -1;
dotSpins0_ = dotSpins1_ = NULL;
selectAlgorithm(sq::algoDefault);
}

Expand All @@ -33,6 +34,12 @@ template<class real>
CUDABipartiteGraphAnnealer<real>::~CUDABipartiteGraphAnnealer() {
deallocate();
d_random_.deallocate();

if (dotSpins0_ != NULL) {
delete dotSpins0_;
delete dotSpins1_;
dotSpins0_ = dotSpins1_ = NULL;
}
}

template<class real>
Expand All @@ -57,6 +64,7 @@ void CUDABipartiteGraphAnnealer<real>::deallocateInternalObjects() {
halloc.deallocate(h_q1_);
halloc.deallocate(h_E_);
E_ = HostVector();
halloc.deallocate(h_spinDotSum_);

d_randReal_.deallocate();

Expand All @@ -83,6 +91,10 @@ void CUDABipartiteGraphAnnealer<real>::assignDevice(Device &device) {
devCopy_.assignDevice(device);
d_random_.assignDevice(device);
d_randReal_.assignDevice(device);

/* initialize sumJq */
dotSpins0_ = new DotSpins<real>(device, devStream_);
dotSpins1_ = new DotSpins<real>(device, devStream_);
}

template<class real>
Expand Down Expand Up @@ -298,6 +310,7 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
halloc.allocate(&h_q1_, m_, N1_);
bitsPairX_.reserve(m_);
bitsPairQ_.reserve(m_);
halloc.allocate(&h_spinDotSum_);

/* estimate # rand nums required per one anneal. */
sq::SizeType N = N0_ + N1_;
Expand All @@ -306,6 +319,11 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
sq::SizeType requiredSize = nRunsPerRandGen_ * m_ * N * sizeof(real) / sizeof(float);
d_random_.setRequiredSize(requiredSize);

DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
dotSpins0.configure(N0_, m_, false);
DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
dotSpins1.configure(N0_, m_, false);

setState(solPrepared);
}

Expand All @@ -321,6 +339,34 @@ void CUDABipartiteGraphAnnealer<real>::makeSolution() {
}


template<class real>
real CUDABipartiteGraphAnnealer<real>::getSystemE(real G, real beta) const {
    /* Returns the system energy of the current spin configuration, averaged
     * over the m_ trotter slices.  For SQA algorithms the transverse-field
     * coupling between neighbouring trotter slices is subtracted as well;
     * otherwise only the classical energy term is returned.
     *   G    : transverse field strength
     *   beta : inverse temperature
     * calculate_E() mutates device-side state, so const is cast away; the
     * method is logically const from the caller's point of view. */
    auto _this = const_cast<CUDABipartiteGraphAnnealer<real>*>(this);
    _this->calculate_E(); /* enqueued asynchronously on devStream_ */

    if (isSQAAlgorithm(algo_)) {
        /* Dot products between neighbouring trotter slices for both bipartite
         * spin sets, then both partial vectors reduced into h_spinDotSum_. */
        DeviceVector *d_spinDot0 = devStream_->tempDeviceVector<real>(m_);
        DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
        dotSpins0(d_matq0_, d_spinDot0);
        DeviceVector *d_spinDot1 = devStream_->tempDeviceVector<real>(m_);
        DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
        dotSpins1(d_matq1_, d_spinDot1);

        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot0);
        /* Accumulate on top of the first reduction.  The factor is written as
         * real(1.) (not the bare double literal 1.) so float instantiations do
         * not silently promote, matching the file's real(...) idiom. */
        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot1, real(1.));
    }
    devStream_->synchronize(); /* h_E_ and h_spinDotSum_ are valid on host after this */

    real E = E_.sum() / m_; /* classical energy averaged over trotter slices */
    if (isSQAAlgorithm(algo_)) {
        /* Trotter coupling coefficient: (1 / 2 beta) * log tanh(G beta / m). */
        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
        E -= *h_spinDotSum_.d_data * coef;
    }
    if (om_ == sq::optMaximize)
        E *= real(-1.); /* flip sign for maximization problems */
    return E;
}


// template<class real>
// void CUDABipartiteGraphAnnealer<real>::
Expand Down
6 changes: 6 additions & 0 deletions sqaodc/cuda/CUDABipartiteGraphAnnealer.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cud

void makeSolution();

real getSystemE(real G, real beta) const;

void annealOneStep(real G, real beta) {
(this->*annealMethod_)(G, beta);
}
Expand Down Expand Up @@ -120,6 +122,10 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cud
DeviceMatrix d_matq0_, d_matq1_;
DeviceBitMatrix h_q0_, h_q1_;
sq::SizeType nRunsPerRandGen_;

sq::NullBase *dotSpins0_;
sq::NullBase *dotSpins1_;
DeviceScalar h_spinDotSum_;

DeviceMatrix d_Jq0_;
DeviceMatrix d_Jq1_;
Expand Down
36 changes: 36 additions & 0 deletions sqaodc/cuda/CUDADenseGraphAnnealer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ template<class real>
using DotJq = DeviceDotJq<real, real*>;
#endif

/* Batched slice-to-slice spin dot product over a char-valued spin matrix,
 * with products accumulated as real. */
template<class real>
using DotSpins = DeviceDotSpins<real, char>;


template<class real>
CUDADenseGraphAnnealer<real>::CUDADenseGraphAnnealer() {
Expand Down Expand Up @@ -47,6 +50,8 @@ CUDADenseGraphAnnealer<real>::~CUDADenseGraphAnnealer() {
if (dotJq_ != NULL) {
delete dotJq_;
dotJq_ = NULL;
delete dotSpins_;
dotSpins_ = NULL;
}
}

Expand All @@ -73,6 +78,7 @@ void CUDADenseGraphAnnealer<real>::deallocateInternalObjects() {
halloc.deallocate(h_E_);
halloc.deallocate(h_q_);
E_ = HostVector();
halloc.deallocate(h_spinDotSum_);

flipPosBuffer_.deallocate();
realNumBuffer_.deallocate();
Expand Down Expand Up @@ -101,6 +107,7 @@ void CUDADenseGraphAnnealer<real>::assignDevice(Device &device) {

/* initialize sumJq */
dotJq_ = new DotJq<real>(device, devStream_);
dotSpins_ = new DotSpins<real>(device, devStream_);
}

template<class real>
Expand Down Expand Up @@ -295,6 +302,8 @@ void CUDADenseGraphAnnealer<real>::prepare() {
halloc.allocate(&h_E_, m_);
E_.map(h_E_.d_data, h_E_.size);
halloc.allocate(&h_q_, sq::Dim(m_, N_));
halloc.allocate(&h_spinDotSum_);

xlist_.reserve(m_);
qlist_.reserve(m_);

Expand All @@ -308,6 +317,9 @@ void CUDADenseGraphAnnealer<real>::prepare() {

DotJq<real> &dotJq = static_cast<DotJq<real>&>(*dotJq_);
dotJq.configure(N_, m_, false);

DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
dotSpins.configure(N_, m_, false);

setState(solPrepared);
}
Expand Down Expand Up @@ -337,6 +349,30 @@ void CUDADenseGraphAnnealer<real>::syncBits() {
}
}

template<class real>
real CUDADenseGraphAnnealer<real>::getSystemE(real G, real beta) const {
    /* Returns the system energy of the current spin configuration, averaged
     * over the m_ trotter slices.  For SQA algorithms the transverse-field
     * coupling between neighbouring trotter slices is subtracted as well.
     *   G    : transverse field strength
     *   beta : inverse temperature
     * calculate_E() mutates device-side state, so const is cast away; the
     * method is logically const from the caller's point of view. */
    auto _this = const_cast<CUDADenseGraphAnnealer<real>*>(this);
    _this->calculate_E(); /* enqueued asynchronously on devStream_ */

    if (isSQAAlgorithm(algo_)) {
        /* Dot products between neighbouring trotter slices, reduced to the
         * single host-mapped scalar h_spinDotSum_. */
        DeviceVector *d_spinDot = devStream_->tempDeviceVector<real>(m_);
        DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
        dotSpins(d_matq_, d_spinDot);
        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot);
    }
    devStream_->synchronize(); /* h_E_ and h_spinDotSum_ are valid on host after this */

    real E = E_.sum() / m_; /* classical energy averaged over trotter slices */
    if (isSQAAlgorithm(algo_)) {
        /* Trotter coupling coefficient: (1 / 2 beta) * log tanh(G beta / m). */
        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
        E -= *_this->h_spinDotSum_.d_data * coef;
    }

    if (om_ == sq::optMaximize)
        E *= real(-1.); /* flip sign for maximization problems */
    return E;
}

#if 0
/* equivalent code */
template<class real>
Expand Down
6 changes: 5 additions & 1 deletion sqaodc/cuda/CUDADenseGraphAnnealer.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {

void makeSolution();

real getSystemE(real G, real beta) const;

void annealOneStep(real G, real beta) {
(this->*annealMethod_)(G, beta);
}
Expand Down Expand Up @@ -124,7 +126,9 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {
sq::BitSetArray qlist_;

sq::NullBase *dotJq_;

sq::NullBase *dotSpins_;
DeviceScalar h_spinDotSum_;

DeviceStream *devStream_;
DeviceFormulas devFormulas_;
DeviceCopy devCopy_;
Expand Down
41 changes: 41 additions & 0 deletions sqaodc/cuda/DeviceBatchedDot.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ int2 operator+(const int2 &lhs, const int v) {
}


/* wrapped offset */

/* Maps a trotter-slice index to a pair of row offsets: the slice itself and
 * its cyclic neighbour (idx + 1, wrapping back to 0 at m).  Used as the
 * offset iterator feeding neighbouring-slice spin products into the
 * segmented sum. */
struct WrappedOffset {
    __host__
    WrappedOffset(int _m, int _stride)
            : m(_m), stride(_stride) { }

    /* Returns (idx * stride, ((idx + 1) mod m) * stride).
     * The wrap uses a compare-and-select instead of the integer modulo,
     * which is expensive on the device; idx is assumed to lie in [0, m),
     * which holds for segment indices of an m-segment reduction. */
    __device__ __forceinline__
    int2 operator[](sq::IdxType idx) const {
        int idxNeighbour = idx + 1;
        if (idxNeighbour == m)
            idxNeighbour = 0;
        return make_int2(idx * stride, idxNeighbour * stride);
    }
    sq::SizeType m;      /* number of trotter slices (wrap period) */
    sq::SizeType stride; /* row stride of the spin matrix, in elements */
};


}


Expand All @@ -72,6 +89,9 @@ struct iterator_traits<sqaod_cuda::In2TypeDotPtr<Vout, Vin0, Vin1>> : sqaod_cuda
template<>
struct iterator_traits<sqaod_cuda::Offset2way> : sqaod_cuda::base_iterator_traits<int2> { };

template<>
struct iterator_traits<sqaod_cuda::WrappedOffset> : sqaod_cuda::base_iterator_traits<int2> { };

}


Expand Down Expand Up @@ -120,6 +140,27 @@ public:
};



/* Segmented sum of element-wise products between each trotter slice of a
 * spin matrix and its cyclic neighbour slice (via WrappedOffset), producing
 * one Vout value per slice.  Vin is the spin storage type, Vout the
 * accumulation type. */
template<class Vout, class Vin>
struct DeviceDotSpins : DeviceSegmentedSumType<Vout, In2TypeDotPtr<Vout, Vin, Vin>, Vout*, WrappedOffset, 1> {
    typedef DeviceSegmentedSumType<Vout, In2TypeDotPtr<Vout, Vin, Vin>, Vout*, WrappedOffset, 1> Base;
public:

    DeviceDotSpins(Device &device, DeviceStream *devStream)
            : Base(device, devStream) {
    }

    DeviceDotSpins(DeviceStream *devStream) : Base(devStream) { }

    /* Runs the reduction: out[row] accumulates products of row `row` of d_q
     * with its wrapped neighbour row.  Both inputs of the dot iterator point
     * at the same matrix; WrappedOffset supplies the two row offsets.
     * NOTE(review): out is assumed preallocated with d_q.rows elements —
     * callers pass tempDeviceVector<real>(m_); confirm for other call sites. */
    void operator()(const DeviceMatrixType<Vin> &d_q, DeviceVectorType<Vout> *out) {
        In2TypeDotPtr<Vout, Vin, Vin> in(d_q.d_data, d_q.d_data);
        WrappedOffset offset(d_q.rows, d_q.stride);
        Base::operator()(in, out->d_data, offset);
    }
};



/* Vectorized */

/* Value traits class for vector types. */
Expand Down

0 comments on commit fde7bf8

Please sign in to comment.