From fde7bf8993309ca4b77cdef564178fff2c48546d Mon Sep 17 00:00:00 2001
From: Shinya Morino <shin.morino@gmail.com>
Date: Sat, 10 Nov 2018 17:27:04 +0900
Subject: [PATCH] getSystemE() is implemented for CUDA-based annealers.

---
 sqaodc/cuda/CUDABipartiteGraphAnnealer.cu | 46 +++++++++++++++++++++++
 sqaodc/cuda/CUDABipartiteGraphAnnealer.h  |  6 +++
 sqaodc/cuda/CUDADenseGraphAnnealer.cu     | 36 ++++++++++++++++++
 sqaodc/cuda/CUDADenseGraphAnnealer.h      |  6 ++-
 sqaodc/cuda/DeviceBatchedDot.cuh          | 41 ++++++++++++++++++++
 5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu b/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
index 95b883a..4e12dda 100644
--- a/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
+++ b/sqaodc/cuda/CUDABipartiteGraphAnnealer.cu
@@ -18,6 +18,7 @@ template<class real>
 CUDABipartiteGraphAnnealer<real>::CUDABipartiteGraphAnnealer() {
     devStream_ = NULL;
     m_ = -1;
+    dotSpins0_ = dotSpins1_ = NULL;
     selectAlgorithm(sq::algoDefault);
 }
 
@@ -33,6 +34,12 @@ template<class real>
 CUDABipartiteGraphAnnealer<real>::~CUDABipartiteGraphAnnealer() {
     deallocate();
     d_random_.deallocate();
+
+    if (dotSpins0_ != NULL) {
+        delete dotSpins0_;
+        delete dotSpins1_;
+        dotSpins0_ = dotSpins1_ = NULL;
+    }
 }
 
@@ -57,6 +64,7 @@ void CUDABipartiteGraphAnnealer<real>::deallocateInternalObjects() {
     halloc.deallocate(h_q1_);
     halloc.deallocate(h_E_);
     E_ = HostVector();
+    halloc.deallocate(h_spinDotSum_);
 
     d_randReal_.deallocate();
 
@@ -83,6 +91,10 @@ void CUDABipartiteGraphAnnealer<real>::assignDevice(Device &device) {
     devCopy_.assignDevice(device);
     d_random_.assignDevice(device);
     d_randReal_.assignDevice(device);
+
+    /* initialize sumJq */
+    dotSpins0_ = new DotSpins<real>(device, devStream_);
+    dotSpins1_ = new DotSpins<real>(device, devStream_);
 }
 
@@ -298,6 +310,7 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
     halloc.allocate(&h_q1_, m_, N1_);
     bitsPairX_.reserve(m_);
     bitsPairQ_.reserve(m_);
+    halloc.allocate(&h_spinDotSum_);
 
     /* estimate # rand nums required per one anneal. */
     sq::SizeType N = N0_ + N1_;
@@ -306,6 +319,11 @@ void CUDABipartiteGraphAnnealer<real>::prepare() {
     sq::SizeType requiredSize = nRunsPerRandGen_ * m_ * N * sizeof(real) / sizeof(float);
     d_random_.setRequiredSize(requiredSize);
 
+    DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
+    dotSpins0.configure(N0_, m_, false);
+    DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
+    dotSpins1.configure(N1_, m_, false);
+
     setState(solPrepared);
 }
 
@@ -321,6 +339,34 @@ void CUDABipartiteGraphAnnealer<real>::makeSolution() {
 }
 
 
+template<class real>
+real CUDABipartiteGraphAnnealer<real>::getSystemE(real G, real beta) const {
+    auto _this = const_cast<CUDABipartiteGraphAnnealer<real>*>(this);
+    _this->calculate_E(); /* asynchronous */
+
+    if (isSQAAlgorithm(algo_)) {
+        DeviceVector *d_spinDot0 = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins0 = static_cast<DotSpins<real>&>(*dotSpins0_);
+        dotSpins0(d_matq0_, d_spinDot0);
+        DeviceVector *d_spinDot1 = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins1 = static_cast<DotSpins<real>&>(*dotSpins1_);
+        dotSpins1(d_matq1_, d_spinDot1);
+
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot0);
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot1, 1.);
+    }
+    devStream_->synchronize();
+
+    real E = E_.sum() / m_;
+    if (isSQAAlgorithm(algo_)) {
+        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
+        E -= *h_spinDotSum_.d_data * coef;
+    }
+    if (om_ == sq::optMaximize)
+        E *= real(-1.);
+    return E;
+}
+
 
 // template<class real>
 // void CUDABipartiteGraphAnnealer<real>::
diff --git a/sqaodc/cuda/CUDABipartiteGraphAnnealer.h b/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
index ff92b37..8f6e5b5 100644
--- a/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
+++ b/sqaodc/cuda/CUDABipartiteGraphAnnealer.h
@@ -77,6 +77,8 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cuda::BipartiteGraphAnnealer<real> {
 
     void makeSolution();
 
+    real getSystemE(real G, real beta) const;
+
     void annealOneStep(real G, real beta) {
         (this->*annealMethod_)(G, beta);
     }
@@ -120,6 +122,10 @@ class WAR_VC_MULTIPLE_INHERITANCE CUDABipartiteGraphAnnealer : public sqaod::cuda::BipartiteGraphAnnealer<real> {
     DeviceMatrix d_matq0_, d_matq1_;
     DeviceBitMatrix h_q0_, h_q1_;
     sq::SizeType nRunsPerRandGen_;
+
+    sq::NullBase *dotSpins0_;
+    sq::NullBase *dotSpins1_;
+    DeviceScalar h_spinDotSum_;
 
     DeviceMatrix d_Jq0_;
     DeviceMatrix d_Jq1_;
diff --git a/sqaodc/cuda/CUDADenseGraphAnnealer.cu b/sqaodc/cuda/CUDADenseGraphAnnealer.cu
index 7d0f38a..6cbf105 100644
--- a/sqaodc/cuda/CUDADenseGraphAnnealer.cu
+++ b/sqaodc/cuda/CUDADenseGraphAnnealer.cu
@@ -19,6 +19,9 @@ template<class real>
 using DotJq = DeviceDotJq<real>;
 #endif
 
+template<class real>
+using DotSpins = DeviceDotSpins<real>;
+
 
 template<class real>
 CUDADenseGraphAnnealer<real>::CUDADenseGraphAnnealer() {
@@ -47,6 +50,8 @@ CUDADenseGraphAnnealer<real>::~CUDADenseGraphAnnealer() {
     if (dotJq_ != NULL) {
         delete dotJq_;
         dotJq_ = NULL;
+        delete dotSpins_;
+        dotSpins_ = NULL;
     }
 }
 
@@ -73,6 +78,7 @@ void CUDADenseGraphAnnealer<real>::deallocateInternalObjects() {
     halloc.deallocate(h_E_);
     halloc.deallocate(h_q_);
     E_ = HostVector();
+    halloc.deallocate(h_spinDotSum_);
 
     flipPosBuffer_.deallocate();
     realNumBuffer_.deallocate();
@@ -101,6 +107,7 @@ void CUDADenseGraphAnnealer<real>::assignDevice(Device &device) {
 
     /* initialize sumJq */
     dotJq_ = new DotJq<real>(device, devStream_);
+    dotSpins_ = new DotSpins<real>(device, devStream_);
 }
 
@@ -295,6 +302,8 @@ void CUDADenseGraphAnnealer<real>::prepare() {
     halloc.allocate(&h_E_, m_);
     E_.map(h_E_.d_data, h_E_.size);
     halloc.allocate(&h_q_, sq::Dim(m_, N_));
+    halloc.allocate(&h_spinDotSum_);
+
     xlist_.reserve(m_);
     qlist_.reserve(m_);
 
@@ -308,6 +317,9 @@ void CUDADenseGraphAnnealer<real>::prepare() {
 
     DotJq<real> &dotJq = static_cast<DotJq<real>&>(*dotJq_);
     dotJq.configure(N_, m_, false);
+
+    DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
+    dotSpins.configure(N_, m_, false);
 
     setState(solPrepared);
 }
@@ -337,6 +349,30 @@ void CUDADenseGraphAnnealer<real>::syncBits() {
     }
 }
 
+template<class real>
+real CUDADenseGraphAnnealer<real>::getSystemE(real G, real beta) const {
+    auto _this = const_cast<CUDADenseGraphAnnealer<real>*>(this);
+    _this->calculate_E(); /* asynchronous */
+
+    if (isSQAAlgorithm(algo_)) {
+        DeviceVector *d_spinDot = devStream_->tempDeviceVector<real>(m_);
+        DotSpins<real> &dotSpins = static_cast<DotSpins<real>&>(*dotSpins_);
+        dotSpins(d_matq_, d_spinDot);
+        _this->devFormulas_.devMath.sum(&_this->h_spinDotSum_, real(1.), *d_spinDot);
+    }
+    devStream_->synchronize();
+
+    real E = E_.sum() / m_;
+    if (isSQAAlgorithm(algo_)) {
+        real coef = real(0.5) / beta * std::log(std::tanh(G * beta / m_));
+        E -= *_this->h_spinDotSum_.d_data * coef;
+    }
+
+    if (om_ == sq::optMaximize)
+        E *= real(-1.);
+    return E;
+}
+
 #if 0
 /* equivalent code */
 template<class real>
diff --git a/sqaodc/cuda/CUDADenseGraphAnnealer.h b/sqaodc/cuda/CUDADenseGraphAnnealer.h
index 17a621c..07eaacb 100644
--- a/sqaodc/cuda/CUDADenseGraphAnnealer.h
+++ b/sqaodc/cuda/CUDADenseGraphAnnealer.h
@@ -72,6 +72,8 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {
 
     void makeSolution();
 
+    real getSystemE(real G, real beta) const;
+
     void annealOneStep(real G, real beta) {
         (this->*annealMethod_)(G, beta);
     }
@@ -124,7 +126,9 @@ class CUDADenseGraphAnnealer : public sqaod::cuda::DenseGraphAnnealer<real> {
     sq::BitSetArray qlist_;
 
     sq::NullBase *dotJq_;
-
+    sq::NullBase *dotSpins_;
+    DeviceScalar h_spinDotSum_;
+
     DeviceStream *devStream_;
     DeviceFormulas devFormulas_;
     DeviceCopy devCopy_;
diff --git a/sqaodc/cuda/DeviceBatchedDot.cuh b/sqaodc/cuda/DeviceBatchedDot.cuh
index 256bb3c..b059a81 100644
--- a/sqaodc/cuda/DeviceBatchedDot.cuh
+++ b/sqaodc/cuda/DeviceBatchedDot.cuh
@@ -57,6 +57,23 @@ int2 operator+(const int2 &lhs, const int v) {
 }
 
 
+/* wrapped offset */
+
+struct WrappedOffset {
+    __host__
+    WrappedOffset(int _m, int _stride)
+            : m(_m), stride(_stride) { }
+
+    __device__ __forceinline__
+    int2 operator[](sq::IdxType idx) const {
+        int idxNeighbour = (idx + 1) % m;
+        return make_int2(idx * stride, idxNeighbour * stride);
+    }
+    sq::SizeType m;
+    sq::SizeType stride;
+};
+
+
 }
 
@@ -72,6 +89,9 @@ struct iterator_traits<sqaod_cuda::In2TypeDotPtr<V, Vi0, Vi1>> : sqaod_cuda::base_iterator_traits<V> { };
 template<>
 struct iterator_traits<sqaod_cuda::Linear> : sqaod_cuda::base_iterator_traits<int2> { };
 
+template<>
+struct iterator_traits<sqaod_cuda::WrappedOffset> : sqaod_cuda::base_iterator_traits<int2> { };
+
 
 }
 
@@ -120,6 +140,27 @@ public:
 };
 
 
+
+template<class Vout>
+struct DeviceDotSpins : DeviceSegmentedSumType<In2TypeDotPtr<Vout, char, char>, Vout*, WrappedOffset, 1> {
+    typedef DeviceSegmentedSumType<In2TypeDotPtr<Vout, char, char>, Vout*, WrappedOffset, 1> Base;
+public:
+
+    DeviceDotSpins(Device &device, DeviceStream *devStream)
+            : Base(device, devStream) {
+    }
+
+    DeviceDotSpins(DeviceStream *devStream) : Base(devStream) { }
+
+    void operator()(const DeviceMatrixType<char> &d_q, DeviceVectorType<Vout> *out) {
+        In2TypeDotPtr<Vout, char, char> in(d_q.d_data, d_q.d_data);
+        WrappedOffset offset(d_q.rows, d_q.stride);
+        Base::operator()(in, out->d_data, offset);
+    }
+};
+
+
+
 /* Vectorized */
 
 /* Value traits class for vector types. */