spcl · ChrisPattison · Dec 29, 2021 · Dec 13, 2021 · Dec 17, 2021 · Dec 17, 2021
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,15 +1,16 @@
 cmake_minimum_required(VERSION 3.0)
 project(apfp)
 
 set(CMAKE_CXX_STANDARD 17)
 
 # Target options 
 set(APFP_PLATFORM "xilinx_u280_xdma_201920_3" CACHE STRING "Platform string for Vitis.")
 set(APFP_BITS 1024 CACHE STRING "Number of bits to use for a floating point number, including mantissa, exponent, and sign.")
-set(APFP_MULT_BASE_BITS 16 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
+set(APFP_MULT_BASE_BITS 18 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
 set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running matrix-matrix multiplication.")
 set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.")
 set(APFP_SEMANTICS "MPFR" CACHE STRING "Which semantics to use for floating point operations [GMP/MPFR].")
+set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in the generated kernel.")
 set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR)
 
 # Validation and derived numbers
@@ -26,7 +27,7 @@ find_package(MPFR REQUIRED)
 find_package(GMP REQUIRED)
 find_package(Threads REQUIRED)
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
 include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} )
 
 configure_file(include/Config.h.in Config.h)
@@ -49,7 +50,7 @@ add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
                              m_axi_b:DDR[1]
                              m_axi_c_read:DDR[1]
                              m_axi_c_write:DDR[1])
-add_vitis_program(MatrixMultiplication ${APFP_PLATFORM})
+add_vitis_program(MatrixMultiplication ${APFP_PLATFORM} PROFILING ${APFP_PROFILING})
 
 # Internal library 
 add_library(apfp host/Random.cpp host/MatrixMultiplicationReference.cpp)

diff --git a/device/ArithmeticOperations.cpp b/device/ArithmeticOperations.cpp
@@ -54,6 +54,7 @@ PackedFloat Add(PackedFloat const &a, PackedFloat const &b) {
     // Optionally shift by 1, 2, 4, 8, 16... log2(B), such that all bits have eventually traveled to
     // their designated position.
     const int kNumStages = hlslib::ConstLog2(kBits);
+ShiftStages:
     for (int i = 0; i < kNumStages; ++i) {
 #pragma HLS UNROLL
         a_mantissa = ((shift_c & (1 << i)) == 0) ? a_mantissa : (a_mantissa >> (1 << i));

diff --git a/device/Karatsuba.cpp b/device/Karatsuba.cpp
@@ -2,6 +2,11 @@
 
 #include <type_traits>  // std::enable_if
 
+constexpr int AddLatency(int bits) {
+    // One cycle of latency per full 256 bits of width, saturating at 4 (the maximum BIND_OP supports for integer adds)
+    return (bits >= 1024) ? 4 : (bits < 256) ? 0 : bits / 256;
+}
+
 template <int bits>
 auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     typename std::enable_if<(bits > kMultBaseBits), ap_uint<2 * bits>>::type {
@@ -33,6 +38,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     Full a0a1b0b1 = _Karatsuba<bits / 2>(a0a1, b0b1);
     ap_int<bits + 2> a0a1b0b1_signed = a0a1b0b1_is_neg ? -ap_int<bits + 1>(a0a1b0b1) : ap_int<bits + 2>(a0a1b0b1);
     ap_uint<bits + 2> z1 = ap_uint<bits + 2>(a0a1b0b1_signed) + z0 + z2;
+#pragma HLS BIND_OP variable = z1 op = add impl = fabric latency = AddLatency(bits)
 
     // Align everything and combine
     ap_uint<(2 * bits)> z0z2 = z0 | (ap_uint<(2 * bits)>(z2) << bits);
@@ -41,6 +47,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
     // Workaround to avoid HLS padding an extra bit for the add. This is necessary to support 2048 bit multiplication,
     // which required adding two 4096 numbers, because 4096 bits is the maximum width support by the ap_uint type.
     z.V = z1_aligned.V + z0z2.V;
+#pragma HLS BIND_OP variable = z.V op = add impl = fabric latency = AddLatency(bits)
 
     return z;
 }