Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Free Running Kernel #8

Merged
merged 8 commits into from
Dec 29, 2021
9 changes: 5 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
cmake_minimum_required(VERSION 3.0)
project(apfp)

 
set(CMAKE_CXX_STANDARD 17)

# Target options
set(APFP_PLATFORM "xilinx_u280_xdma_201920_3" CACHE STRING "Platform string for Vitis.")
set(APFP_BITS 1024 CACHE STRING "Number of bits to use for a floating point number, including mantissa, exponent, and sign.")
set(APFP_MULT_BASE_BITS 16 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
set(APFP_MULT_BASE_BITS 18 CACHE STRING "Number of bits to bottom out the multiplication at and use native multiplication.")
set(APFP_TILE_SIZE_N 32 CACHE STRING "Tile size in the N-dimension when running matrix-matrix multiplication.")
set(APFP_TILE_SIZE_M 32 CACHE STRING "Tile size in the M-dimension when running matrix-matrix multiplication.")
set(APFP_SEMANTICS "MPFR" CACHE STRING "Which semantics to use for floating point operations [GMP/MPFR].")
set(APFP_PROFILING OFF CACHE BOOL "Enable profiling in the generated kernel.")
set_property(CACHE APFP_SEMANTICS PROPERTY STRINGS GMP MPFR)

# Validation and derived numbers
Expand All @@ -26,7 +27,7 @@ find_package(MPFR REQUIRED)
find_package(GMP REQUIRED)
find_package(Threads REQUIRED)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Wno-unused-label -DAPFP_${APFP_SEMANTICS}_SEMANTICS")
include_directories(${CMAKE_BINARY_DIR} include SYSTEM hlslib/include ${Vitis_INCLUDE_DIRS} )

configure_file(include/Config.h.in Config.h)
Expand All @@ -49,7 +50,7 @@ add_vitis_kernel(MatrixMultiplication FILES ${APFP_KERNEL_FILES}
m_axi_b:DDR[1]
m_axi_c_read:DDR[1]
m_axi_c_write:DDR[1])
add_vitis_program(MatrixMultiplication ${APFP_PLATFORM})
add_vitis_program(MatrixMultiplication ${APFP_PLATFORM} PROFILING ${APFP_PROFILING})

# Internal library
add_library(apfp host/Random.cpp host/MatrixMultiplicationReference.cpp)
Expand Down
1 change: 1 addition & 0 deletions device/ArithmeticOperations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ PackedFloat Add(PackedFloat const &a, PackedFloat const &b) {
// Optionally shift by 1, 2, 4, 8, 16... log2(B), such that all bits have eventually traveled to
// their designated position.
const int kNumStages = hlslib::ConstLog2(kBits);
ShiftStages:
for (int i = 0; i < kNumStages; ++i) {
#pragma HLS UNROLL
a_mantissa = ((shift_c & (1 << i)) == 0) ? a_mantissa : (a_mantissa >> (1 << i));
Expand Down
7 changes: 7 additions & 0 deletions device/Karatsuba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

#include <type_traits> // std::enable_if

// Maps an operand width in bits to the latency requested for an integer add:
// one unit per full 256 bits of width, saturating at 4. Widths below 256 bits
// (including non-positive values) map to 0.
constexpr int AddLatency(int bits) {
    // The cap of 4 is the maximum supported latency of integer adds using the BIND_OP pragma.
    // Kept as a single return expression so the function stays a valid C++11 constexpr.
    return (bits < 256) ? 0 : (bits < 512) ? 1 : (bits < 768) ? 2 : (bits < 1024) ? 3 : 4;
}

template <int bits>
auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
typename std::enable_if<(bits > kMultBaseBits), ap_uint<2 * bits>>::type {
Expand Down Expand Up @@ -33,6 +38,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
Full a0a1b0b1 = _Karatsuba<bits / 2>(a0a1, b0b1);
ap_int<bits + 2> a0a1b0b1_signed = a0a1b0b1_is_neg ? -ap_int<bits + 1>(a0a1b0b1) : ap_int<bits + 2>(a0a1b0b1);
ap_uint<bits + 2> z1 = ap_uint<bits + 2>(a0a1b0b1_signed) + z0 + z2;
#pragma HLS BIND_OP variable = z1 op = add impl = fabric latency = AddLatency(bits)

// Align everything and combine
ap_uint<(2 * bits)> z0z2 = z0 | (ap_uint<(2 * bits)>(z2) << bits);
Expand All @@ -41,6 +47,7 @@ auto _Karatsuba(ap_uint<bits> const &a, ap_uint<bits> const &b) ->
// Workaround to avoid HLS padding an extra bit for the add. This is necessary to support 2048-bit multiplication,
// which requires adding two 4096-bit numbers, because 4096 bits is the maximum width supported by the ap_uint type.
z.V = z1_aligned.V + z0z2.V;
#pragma HLS BIND_OP variable = z.V op = add impl = fabric latency = AddLatency(bits)

return z;
}
Expand Down
Loading