diff --git a/CMakeLists.txt b/CMakeLists.txt index c3c5ecf..8321b35 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,16 @@ cmake_minimum_required(VERSION 3.0) project(apfp) - +  set(CMAKE_CXX_STANDARD 17) # Target options set(APFP_PLATFORM "xilinx_u280_xdma_201920_3" CACHE STRING "Platform string for Vitis.") set(APFP_BITS 1024 CACHE STRING "Number of bits to use for a floating point number, including mantissa, exponent, and sign.") -set(APFP_MULT_BASE_BITS 16 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.") +set(APFP_MULT_BASE_BITS 18 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.") set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running matrix-matrix multiplication.") set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.") set(APFP_SEMANTICS "MPFR" CACHE STRING "Which semantics to use for floating point operations [GMP/MPFR].") +set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in the generated kernel.") set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR) # Validation and derived numbers @@ -26,7 +27,7 @@ find_package(MPFR REQUIRED) find_package(GMP REQUIRED) find_package(Threads REQUIRED) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -DAPFP_${APFP_SEMANTICS}_SEMANTICS") include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} ) configure_file(include/Config.h.in Config.h) @@ -49,7 +50,7 @@ add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES} m_axi_b:DDR[1] m_axi_c_read:DDR[1] m_axi_c_write:DDR[1]) -add_vitis_program(MatrixMultiplication ${APFP_PLATFORM}) +add_vitis_program(MatrixMultiplication ${APFP_PLATFORM} PROFILING ${APFP_PROFILING}) # Internal library add_library(apfp 
host/Random.cpp host/MatrixMultiplicationReference.cpp) diff --git a/device/ArithmeticOperations.cpp b/device/ArithmeticOperations.cpp index 066c23c..5558fdd 100644 --- a/device/ArithmeticOperations.cpp +++ b/device/ArithmeticOperations.cpp @@ -54,6 +54,7 @@ PackedFloat Add(PackedFloat const &a, PackedFloat const &b) { // Optionally shift by 1, 2, 4, 8, 16... log2(B), such that all bits have eventually traveled to // their designated position. const int kNumStages = hlslib::ConstLog2(kBits); +ShiftStages: for (int i = 0; i < kNumStages; ++i) { #pragma HLS UNROLL a_mantissa = ((shift_c & (1 << i)) == 0) ? a_mantissa : (a_mantissa >> (1 << i)); diff --git a/device/Karatsuba.cpp b/device/Karatsuba.cpp index 1a9b4a4..1b4724f 100644 --- a/device/Karatsuba.cpp +++ b/device/Karatsuba.cpp @@ -2,6 +2,11 @@ #include // std::enable_if +constexpr int AddLatency(int bits) { + // 4 is the maximum supported latency of integer adds using the BIND_OP pragma + return (bits >= 1024) ? 4 : (bits >= 768) ? 3 : (bits >= 512) ? 2 : (bits >= 256) ? 1 : 0; +} + template auto _Karatsuba(ap_uint const &a, ap_uint const &b) -> typename std::enable_if<(bits > kMultBaseBits), ap_uint<2 * bits>>::type { @@ -33,6 +38,7 @@ auto _Karatsuba(ap_uint const &a, ap_uint const &b) -> Full a0a1b0b1 = _Karatsuba(a0a1, b0b1); ap_int a0a1b0b1_signed = a0a1b0b1_is_neg ? -ap_int(a0a1b0b1) : ap_int(a0a1b0b1); ap_uint z1 = ap_uint(a0a1b0b1_signed) + z0 + z2; +#pragma HLS BIND_OP variable = z1 op = add impl = fabric latency = AddLatency(bits) // Align everything and combine ap_uint<(2 * bits)> z0z2 = z0 | (ap_uint<(2 * bits)>(z2) << bits); @@ -41,6 +47,7 @@ auto _Karatsuba(ap_uint const &a, ap_uint const &b) -> // Workaround to avoid HLS padding an extra bit for the add. This is necessary to support 2048-bit multiplication, // which requires adding two 4096-bit numbers, because 4096 bits is the maximum width supported by the ap_uint type. 
z.V = z1_aligned.V + z0z2.V; +#pragma HLS BIND_OP variable = z.V op = add impl = fabric latency = AddLatency(bits) return z; } diff --git a/device/MatrixMultiplication.cpp b/device/MatrixMultiplication.cpp index b1a8851..0dabc64 100644 --- a/device/MatrixMultiplication.cpp +++ b/device/MatrixMultiplication.cpp @@ -9,42 +9,78 @@ // Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, // because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. template -void ReadAInner(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_k, const int n0, +void ReadAInner(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, const int k) { #pragma HLS INLINE + DramLine num[kLinesPerNumber]; +ReadA_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { - DramLine num[kLinesPerNumber]; + ReadA_Flits: for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { - to_kernel.Push(*reinterpret_cast(num)); + a_to_feeder.Push(*reinterpret_cast(num)); } } } } template <> -void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_k, const int n0, +void ReadAInner<1>(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_k, const int n0, const int k) { #pragma HLS INLINE +ReadA_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN const auto num = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber]; - to_kernel.Push(*reinterpret_cast(&num)); + a_to_feeder.Push(*reinterpret_cast(&num)); } } -void ReadA(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_n, const int size_k, +void ReadA(DramLine const *const mem, hlslib::Stream &a_to_feeder, const int size_n, const int size_k, const int size_m) { const 
auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +ReadA_TilesN: for (int n0 = 0; n0 < tiles_n; ++n0) { + ReadA_TilesM: for (int m0 = 0; m0 < tiles_m; ++m0) { + ReadA_K: for (int k = 0; k < size_k; ++k) { - ReadAInner(mem, to_kernel, size_k, n0, k); + ReadAInner(mem, a_to_feeder, size_k, n0, k); + } + } + } +} + +// In order to eliminate control logic in the compute function, we introduce extra feeders that run in the iteration +// space of the computational module, but write to the kernel every iteration to absorb the conditional pipeline reads +void FeedA(hlslib::Stream &a_to_feeder, hlslib::Stream &a_to_kernel, const int size_n, + const int size_k, const int size_m) { + const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); + PackedFloat a; +FeedA_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + FeedA_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + FeedA_K: + for (int k = 0; k < size_k; ++k) { + FeedA_N: + for (int n1 = 0; n1 < kTileSizeN; ++n1) { + FeedA_M: + for (int m1 = 0; m1 < kTileSizeM; ++m1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS LOOP_FLATTEN + if (m1 == 0) { + a = a_to_feeder.Pop(); + } + a_to_kernel.Push(a); + } + } } } } @@ -53,42 +89,76 @@ void ReadA(DramLine const *const mem, hlslib::Stream &to_kernel, co //////////////////////////////////////////////////////////////////////////////// template -void ReadBInner(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_m, const int m0, +void ReadBInner(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, const int k) { #pragma HLS INLINE + DramLine num[kLinesPerNumber]; +ReadB_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { - DramLine num[kLinesPerNumber]; + ReadB_Flits: for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * 
kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { - to_kernel.Push(*reinterpret_cast(num)); + b_to_feeder.Push(*reinterpret_cast(num)); } } } } template <> -void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_m, const int m0, +void ReadBInner<1>(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_m, const int m0, const int k) { #pragma HLS INLINE +ReadB_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN const auto num = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber]; - to_kernel.Push(*reinterpret_cast(&num)); + b_to_feeder.Push(*reinterpret_cast(&num)); } } -void ReadB(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_n, const int size_k, +void ReadB(DramLine const *const mem, hlslib::Stream &b_to_feeder, const int size_n, const int size_k, const int size_m) { const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +ReadB_TilesN: for (int n0 = 0; n0 < tiles_n; ++n0) { + ReadB_TilesM: for (int m0 = 0; m0 < tiles_m; ++m0) { + ReadB_K: for (int k = 0; k < size_k; ++k) { - ReadBInner(mem, to_kernel, size_m, m0, k); + ReadBInner(mem, b_to_feeder, size_m, m0, k); + } + } + } +} + +void FeedB(hlslib::Stream &b_to_feeder, hlslib::Stream &b_to_kernel, const int size_n, + const int size_k, const int size_m) { + const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); + PackedFloat b; +FeedB_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + FeedB_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + FeedB_K: + for (int k = 0; k < size_k; ++k) { + FeedB_N: + for (int n1 = 0; n1 < kTileSizeN; ++n1) { + FeedB_M: + for (int m1 = 0; m1 < kTileSizeM; ++m1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS LOOP_FLATTEN + if (n1 == 0) { + b = b_to_feeder.Pop(); + } + b_to_kernel.Push(b); + } + } } } } @@ -97,41 +167,75 @@ void 
ReadB(DramLine const *const mem, hlslib::Stream &to_kernel, co //////////////////////////////////////////////////////////////////////////////// template -void ReadCInner(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_m, const int n0, +void ReadCInner(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, const int m0, const int n1) { #pragma HLS INLINE +ReadC_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { DramLine num[kLinesPerNumber]; + ReadC_Flits: for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; if (i == kLinesPerNumber - 1) { - to_kernel.Push(*reinterpret_cast(num)); + c_to_feeder.Push(*reinterpret_cast(num)); } } } } template <> -void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_m, const int n0, +void ReadCInner<1>(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_m, const int n0, const int m0, const int n1) { #pragma HLS INLINE +ReadC_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN const auto num = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber]; - to_kernel.Push(*reinterpret_cast(&num)); + c_to_feeder.Push(*reinterpret_cast(&num)); } } -void ReadC(DramLine const *const mem, hlslib::Stream &to_kernel, const int size_n, const int size_m) { +void ReadC(DramLine const *const mem, hlslib::Stream &c_to_feeder, const int size_n, const int size_m) { const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +ReadC_TilesN: for (int n0 = 0; n0 < tiles_n; ++n0) { + ReadC_TilesM: for (int m0 = 0; m0 < tiles_m; ++m0) { + ReadC_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { - ReadCInner(mem, to_kernel, size_m, n0, m0, n1); + ReadCInner(mem, c_to_feeder, size_m, n0, m0, n1); + } + } 
+ } +} + +void FeedC(hlslib::Stream &c_to_feeder, hlslib::Stream &c_to_kernel, const int size_n, + const int size_k, const int size_m) { + const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); + PackedFloat c; +FeedC_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + FeedC_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + FeedC_K: + for (int k = 0; k < size_k; ++k) { + FeedC_N: + for (int n1 = 0; n1 < kTileSizeN; ++n1) { + FeedC_M: + for (int m1 = 0; m1 < kTileSizeM; ++m1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS LOOP_FLATTEN + if (k == 0) { + c = c_to_feeder.Pop(); + } + c_to_kernel.Push(c); + } + } } } } @@ -139,13 +243,42 @@ void ReadC(DramLine const *const mem, hlslib::Stream &to_kernel, co //////////////////////////////////////////////////////////////////////////////// +void DrainC(hlslib::Stream &c_to_drainer, hlslib::Stream &drainer_to_c, const int size_n, + const int size_k, const int size_m) { + const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +DrainC_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + DrainC_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + DrainC_K: + for (int k = 0; k < size_k; ++k) { + DrainC_N: + for (int n1 = 0; n1 < kTileSizeN; ++n1) { + DrainC_M: + for (int m1 = 0; m1 < kTileSizeM; ++m1) { +#pragma HLS PIPELINE II = 1 +#pragma HLS LOOP_FLATTEN + const auto c = c_to_drainer.Pop(); + if (k == size_k - 1) { + drainer_to_c.Push(c); + } + } + } + } + } + } +} + template void WriteCInner(hlslib::Stream &from_kernel, DramLine *const mem, const int size_m, const int n0, const int m0, const int n1) { #pragma HLS INLINE +WriteC_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { DramLine num[kLinesPerNumber]; #pragma HLS ARRAY_PARTITION variable = num complete + WriteC_Flits: for (int i = 0; i < kLinesPerNumber; ++i) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN @@ -161,6 +294,7 @@ template <> 
void WriteCInner<1>(hlslib::Stream &from_kernel, DramLine *const mem, const int size_m, const int n0, const int m0, const int n1) { #pragma HLS INLINE +WriteC_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN @@ -171,8 +305,13 @@ void WriteCInner<1>(hlslib::Stream &from_kernel, DramLine *const me } void WriteC(hlslib::Stream &from_kernel, DramLine *const mem, const int size_n, int const size_m) { - for (int n0 = 0; n0 < hlslib::CeilDivide(size_n, kTileSizeN); ++n0) { - for (int m0 = 0; m0 < hlslib::CeilDivide(size_m, kTileSizeM); ++m0) { + const auto tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const auto tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +WriteC_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + WriteC_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + WriteC_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { WriteCInner(from_kernel, mem, size_m, n0, m0, n1); } @@ -187,41 +326,35 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i PackedFloat a_buffer; // Just to make A symmetric to B and C PackedFloat b_buffer[kTileSizeM]; PackedFloat c_buffer[kTileSizeN * kTileSizeM]; - for (int n0 = 0; n0 < hlslib::CeilDivide(size_n, kTileSizeN); ++n0) { - for (int m0 = 0; m0 < hlslib::CeilDivide(size_m, kTileSizeM); ++m0) { + const int tiles_n = hlslib::CeilDivide(size_n, kTileSizeN); + const int tiles_m = hlslib::CeilDivide(size_m, kTileSizeM); +Compute_TilesN: + for (int n0 = 0; n0 < tiles_n; ++n0) { + Compute_TilesM: + for (int m0 = 0; m0 < tiles_m; ++m0) { + Compute_K: for (int k = 0; k < size_k; ++k) { + Compute_N: for (int n1 = 0; n1 < kTileSizeN; ++n1) { + Compute_M: for (int m1 = 0; m1 < kTileSizeM; ++m1) { #pragma HLS PIPELINE II = 1 #pragma HLS LOOP_FLATTEN - PackedFloat a, b, c; - if (m1 == 0) { - a = a_in.Pop(); - a_buffer = a; - } else { - a = a_buffer; - } - if (n1 == 0) { - b = b_in.Pop(); - b_buffer[m1] = b; - } else { - b = b_buffer[m1]; - } - if (k == 0) { - c = c_in.Pop(); - } else { - c 
= c_buffer[n1 * kTileSizeM + m1]; - } + const PackedFloat a_read = a_in.Pop(); + const PackedFloat b_read = b_in.Pop(); + const PackedFloat c_read = c_in.Pop(); + const PackedFloat a = (m1 == 0) ? a_read : a_buffer; + const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; + const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; + a_buffer = a; + b_buffer[m1] = b; // Ignore contributions from out-of-bound indices const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); // Meat of the computation const auto res = in_bounds ? MultiplyAccumulate(a, b, c) : c; // Write back to buffer c_buffer[n1 * kTileSizeM + m1] = res; - // Write out on last slice - if (k == size_k - 1) { - c_out.Push(res); - } + c_out.Push(res); } } } @@ -229,6 +362,8 @@ void Compute(hlslib::Stream &a_in, hlslib::Stream &b_i } } +//////////////////////////////////////////////////////////////////////////////// + void MatrixMultiplication(DramLine const *const a, DramLine const *const b, DramLine const *const c_read, DramLine *const c_write, const int size_n, const int size_k, int const size_m) { #pragma HLS INTERFACE m_axi offset = slave port = a bundle = a @@ -237,16 +372,38 @@ void MatrixMultiplication(DramLine const *const a, DramLine const *const b, Dram // C, to make sure that the compiler doesn't try to look for dependencies/conflicts #pragma HLS INTERFACE m_axi offset = slave port = c_read bundle = c_read #pragma HLS INTERFACE m_axi offset = slave port = c_write bundle = c_write +#pragma HLS INTERFACE s_axilite port = a +#pragma HLS INTERFACE s_axilite port = b +#pragma HLS INTERFACE s_axilite port = c_read +#pragma HLS INTERFACE s_axilite port = c_write +#pragma HLS INTERFACE s_axilite port = size_n +#pragma HLS INTERFACE s_axilite port = size_k +#pragma HLS INTERFACE s_axilite port = size_m +#pragma HLS STABLE variable = a +#pragma HLS STABLE variable = b +#pragma HLS STABLE variable = c_read +#pragma HLS STABLE variable = c_write 
+#pragma HLS STABLE variable = size_n +#pragma HLS STABLE variable = size_k +#pragma HLS STABLE variable = size_m #pragma HLS DATAFLOW - hlslib::Stream a_to_kernel("a_to_kernel"); - hlslib::Stream b_to_kernel("b_to_kernel"); - hlslib::Stream c_to_kernel("c_to_kernel"); - hlslib::Stream kernel_to_c("kernel_to_c"); + hlslib::Stream a_to_feeder("a_to_feeder"); + hlslib::Stream a_to_kernel("a_to_kernel"); + hlslib::Stream b_to_feeder("b_to_feeder"); + hlslib::Stream b_to_kernel("b_to_kernel"); + hlslib::Stream c_to_feeder("c_to_feeder"); + hlslib::Stream c_to_kernel("c_to_kernel"); + hlslib::Stream c_from_kernel("c_from_kernel"); + hlslib::Stream c_from_drainer("c_from_drainer"); HLSLIB_DATAFLOW_INIT(); - HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_kernel, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_kernel, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_kernel, size_n, size_m); - HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, kernel_to_c, size_n, size_k, size_m); - HLSLIB_DATAFLOW_FUNCTION(WriteC, kernel_to_c, c_write, size_n, size_m); + HLSLIB_DATAFLOW_FUNCTION(ReadA, a, a_to_feeder, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(FeedA, a_to_feeder, a_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(ReadB, b, b_to_feeder, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(FeedB, b_to_feeder, b_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(ReadC, c_read, c_to_feeder, size_n, size_m); + HLSLIB_DATAFLOW_FUNCTION(FeedC, c_to_feeder, c_to_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(Compute, a_to_kernel, b_to_kernel, c_to_kernel, c_from_kernel, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(DrainC, c_from_kernel, c_from_drainer, size_n, size_k, size_m); + HLSLIB_DATAFLOW_FUNCTION(WriteC, c_from_drainer, c_write, size_n, size_m); HLSLIB_DATAFLOW_FINALIZE(); } diff --git a/hlslib b/hlslib index bdc88d0..7e2aacd 160000 --- a/hlslib +++ b/hlslib @@ -1 
+1 @@ -Subproject commit bdc88d044c62a722d4e1a21b439b1bb002864368 +Subproject commit 7e2aacd0d6352c963931efea46f140ef499e4b9b diff --git a/host/Random.cpp b/host/Random.cpp index c5f58c4..a534b59 100644 --- a/host/Random.cpp +++ b/host/Random.cpp @@ -24,7 +24,7 @@ __mpf_struct RandomNumberGenerator::GenerateGmp() { __mpfr_struct RandomNumberGenerator::GenerateMpfr() { mpfr_t num; - mpfr_init2(num, kBits); + mpfr_init2(num, kMantissaBits); Generate(num); return num[0]; } diff --git a/host/TestProgram.cpp b/host/TestProgram.cpp index 3b44916..efc0967 100644 --- a/host/TestProgram.cpp +++ b/host/TestProgram.cpp @@ -17,8 +17,11 @@ bool RunTestSimulation(int size_n, int size_k, int size_m) { bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) { #endif hlslib::ocl::Context context; + std::cout << "Configuring the device..." << std::flush; auto program = context.MakeProgram(kernel_path); + std::cout << " Done.\n"; // Initialize some random data + std::cout << "Initializing input data..." << std::flush; std::vector<__mpfr_struct> a_mpfr, b_mpfr, c_mpfr; RandomNumberGenerator rng; for (int n = 0; n < size_n; ++n) { @@ -47,7 +50,9 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) for (auto &x : c_mpfr) { c_host.emplace_back(&x); } + std::cout << " Done.\n"; // Allocate device memory, padding each buffer to the tile size + std::cout << "Copying data to the device..." 
<< std::flush; auto a_device = context.MakeBuffer( hlslib::ocl::StorageType::DDR, 1, kLinesPerNumber * (hlslib::CeilDivide(size_n, kTileSizeN) * kTileSizeN) * size_k); @@ -62,16 +67,23 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) a_device.CopyFromHost(0, kLinesPerNumber * size_n * size_k, reinterpret_cast(&a_host[0])); b_device.CopyFromHost(0, kLinesPerNumber * size_k * size_m, reinterpret_cast(&b_host[0])); c_device.CopyFromHost(0, kLinesPerNumber * size_n * size_m, reinterpret_cast(&c_host[0])); + std::cout << " Done.\n"; // In simulation mode, this will call the function "MatrixMultiplication" and run it in software. // Otherwise, the provided path to a kernel binary will be loaded and executed. auto kernel = program.MakeKernel(MatrixMultiplication, "MatrixMultiplication", a_device, b_device, c_device, c_device, size_n, size_k, size_m); + const unsigned long expected_cycles = hlslib::CeilDivide(size_n, kTileSizeN) * + hlslib::CeilDivide(size_m, kTileSizeM) * kTileSizeN * kTileSizeM * size_k; + const float expected_runtime = expected_cycles / 0.3e9; + std::cout << "The expected number of cycles to completion is " << expected_cycles << ", which is " + << expected_runtime << " seconds at 300 MHz.\n"; + const auto communication_volume = hlslib::CeilDivide(size_n, kTileSizeN) * hlslib::CeilDivide(size_m, kTileSizeM) * + ((kTileSizeN + kTileSizeM) * size_k + 2 * kTileSizeN * kTileSizeM); + std::cout << "This communicates " << 1e-6 * kBytes * communication_volume << " MB, requiring a bandwidth of " + << 1e-9 * kBytes * communication_volume / expected_runtime << " GB/s.\n"; std::cout << "Executing kernel...\n"; const auto elapsed = kernel.ExecuteTask(); std::cout << "Ran in " << elapsed.first << " seconds.\n"; - const unsigned long ideal_cycles = - hlslib::CeilDivide(size_n, kTileSizeN) * hlslib::CeilDivide(size_m, kTileSizeM) * kTileSizeN * kTileSizeM; - std::cout << "The ideal number of cycles to completion is " << ideal_cycles 
<< ".\n"; // Copy back result c_device.CopyToHost(0, kLinesPerNumber * size_n * size_m, reinterpret_cast(&c_host[0])); // Run reference implementation. Because of GMP's "clever" way of wrapping their struct in an array of size 1, @@ -95,6 +107,7 @@ bool RunTest(std::string const &kernel_path, int size_n, int size_k, int size_m) } } } + std::cout << "Results successfully verified against MPFR.\n"; // Clean up for (int n = 0; n < size_n; ++n) {