9 changes: 9 additions & 0 deletions include/fbgemm/Fbgemm.h
@@ -472,6 +472,15 @@ class FBGEMM_API PackBMatrix final
const T* smat_;
std::int32_t ld_;
std::int32_t row_interleave_;

/**
* @brief Internal helper that packs or unpacks a block of B, depending on the ispack flag.
*/
void pack_unpack_(
const block_type_t& block,
T* unpack_buf,
T* pack_buf,
bool ispack);
};
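
With pack_unpack_ in place, pack() and unpack() reduce to thin wrappers around one shared traversal, so a pack followed by an unpack is an exact round trip. A minimal usage sketch (hypothetical 64x32 shape and buffer names; the constructor argument order mirrors the new tests at the end of this diff):

// Packing happens inside the PackBMatrix constructor; unpack() reverses it.
// std::vector is used here for brevity; the tests use fbgemm's aligned_vector.
std::vector<int8_t> B(64 * 32, 1);  // k x n source matrix, k = 64, n = 32
fbgemm::PackBMatrix<int8_t> packedB(
    fbgemm::matrix_op_t::NoTranspose,
    /*nRow=*/64,
    /*nCol=*/32,
    B.data(),
    /*ld=*/32,
    /*pmat=*/nullptr,
    /*groups=*/1);
std::vector<int8_t> roundtrip(64 * 32, 0);
packedB.unpack(roundtrip.data());  // roundtrip now matches B element-wise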

117 changes: 47 additions & 70 deletions src/PackBMatrix.cc
@@ -234,15 +234,19 @@ PackBMatrix<T, accT>::PackBMatrix(
}

template <typename T, typename accT>
void PackBMatrix<T, accT>::pack(const block_type_t& block) {
void PackBMatrix<T, accT>::pack_unpack_(
const block_type_t& block,
T* unpack_buf,
T* pack_buf,
bool ispack) {
assert((BaseType::blockRowSize() % row_interleave_) == 0);
assert((block.row_start % BaseType::blockRowSize()) == 0);
assert((block.col_start % BaseType::blockColSize()) == 0);

BaseType::packedBlock(block);
bool tr = (trans_ == matrix_op_t::Transpose);
for (int g = 0; g < BaseType::numGroups(); ++g) {
T* out = BaseType::getBuf() +
T* pack_buf_cur = pack_buf +
g * BaseType::packedBufferSize(block.row_size, block.col_size);
for (int i = block.row_start; i < block.row_start + block.row_size; ++i) {
int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
@@ -268,10 +272,16 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
c_idx_offset * row_interleave_;

int out_idx = r_offset + c_offset;
T val = tr ? smat_[i + (g * block.col_size + j) * ld_]
: smat_[(g * block.row_size + i) * ld_ + j];
out[out_idx] = val;
if (ispack) {
pack_buf_cur[r_offset + c_offset] = tr
? unpack_buf[i + (g * block.col_size + j) * ld_]
: unpack_buf[(g * block.row_size + i) * ld_ + j];
} else {
T* unpack_buf_cur = tr
? &(unpack_buf[i + (g * block.col_size + j) * ld_])
: &(unpack_buf[(g * block.row_size + i) * ld_ + j]);
*unpack_buf_cur = pack_buf_cur[r_offset + c_offset];
}

c_idx_offset++;
if (c_idx_offset == BaseType::blockColSize()) {
Expand All @@ -280,78 +290,45 @@ void PackBMatrix<T, accT>::pack(const block_type_t& block) {
}
}
}
// fill the remaining with zero.
// Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
for (int i = block.row_start + block.row_size;
i < (block.row_start + block.row_size + row_interleave_ - 1) /
row_interleave_ * row_interleave_;
++i) {
int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
(BaseType::blockRowSize() * BaseType::blockColSize()) +
(i % BaseType::blockRowSize() / row_interleave_) *
BaseType::blockColSize() * row_interleave_ +
i % row_interleave_;
for (int j = block.col_start; j < block.col_start + block.col_size; j++) {
int c_offset = (j / BaseType::blockColSize()) *
BaseType::blockRowSize() * BaseType::blockColSize() +
(j % BaseType::blockColSize()) * row_interleave_;
if (ispack) {
// fill the remaining with zero.
// Please see the comment in PackAMatrix.cc on zero vs zero_pt fill.
for (int i = block.row_start + block.row_size;
i < (block.row_start + block.row_size + row_interleave_ - 1) /
row_interleave_ * row_interleave_;
++i) {
int r_offset =
((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
(BaseType::blockRowSize() * BaseType::blockColSize()) +
(i % BaseType::blockRowSize() / row_interleave_) *
BaseType::blockColSize() * row_interleave_ +
i % row_interleave_;
for (int j = block.col_start; j < block.col_start + block.col_size;
j++) {
int c_offset = (j / BaseType::blockColSize()) *
BaseType::blockRowSize() * BaseType::blockColSize() +
(j % BaseType::blockColSize()) * row_interleave_;

int out_idx = r_offset + c_offset;
out[out_idx] = 0;
int out_idx = r_offset + c_offset;
pack_buf_cur[out_idx] = 0;
}
}
}
} // for each group
}
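
For intuition, write R = blockRowSize(), C = blockColSize(), and r = row_interleave_. The routine above maps logical element (i, j) to packed offset r_offset + c_offset, where (in integer arithmetic)

r_offset = (i / R) * blockCols() * R * C + ((i % R) / r) * C * r + (i % r)
c_offset = (j / C) * R * C + (j % C) * r

As an illustrative check with assumed tile parameters R = 8, C = 16, r = 4 (not necessarily the shipped kernel values), element (i, j) = (5, 3) gives r_offset = 0 + (5 / 4) * 16 * 4 + 5 % 4 = 64 + 1 = 65 and c_offset = 0 + 3 * 4 = 12, i.e. packed offset 77. The same mapping is evaluated in both directions, which is what makes pack and unpack exact inverses on in-range elements.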

template <typename T, typename accT>
void PackBMatrix<T, accT>::unpack(T* origin_buf) {
bool tr = (trans_ == matrix_op_t::Transpose);
for (int g = 0; g < this->numGroups(); ++g) {
T* out = BaseType::getBuf() +
g *
BaseType::packedBufferSize(
BaseType::numPackedRows(), BaseType::numPackedCols());
for (int i = BaseType::packedRowStart();
i < BaseType::packedRowStart() + BaseType::numPackedRows();
++i) {
int r_offset = ((i / BaseType::blockRowSize()) * BaseType::blockCols()) *
(BaseType::blockRowSize() * BaseType::blockColSize()) +
(i % BaseType::blockRowSize() / row_interleave_) *
BaseType::blockColSize() * row_interleave_ +
i % row_interleave_;

int c_start_offset =
(BaseType::packedColStart() / BaseType::blockColSize()) *
BaseType::blockRowSize() * BaseType::blockColSize() +
(BaseType::packedColStart() % BaseType::blockColSize()) *
row_interleave_;

int c_idx_offset = 0;
int c_blk_offset = 0;
for (int j = BaseType::packedColStart();
j < BaseType::packedColStart() + BaseType::numPackedCols();
++j) {
int c_offset = c_start_offset +
c_blk_offset * BaseType::blockRowSize() * BaseType::blockColSize() +
c_idx_offset * row_interleave_;

int out_idx = r_offset + c_offset;

T val = out[out_idx];
if (tr) {
origin_buf[i + (g * BaseType::numPackedCols() + j) * ld_] = val;
} else {
origin_buf[(g * BaseType::numPackedRows() + i) * ld_ + j] = val;
}
void PackBMatrix<T, accT>::pack(const block_type_t& block) {
pack_unpack_(block, const_cast<T*>(smat_), BaseType::getBuf(), true);
}

c_idx_offset++;
if (c_idx_offset == BaseType::blockColSize()) {
c_idx_offset = 0;
c_blk_offset++;
}
}
}
} // for each group
template <typename T, typename accT>
void PackBMatrix<T, accT>::unpack(T* origin_buf) {
block_type_t blockB{BaseType::packedRowStart(),
BaseType::numPackedRows(),
BaseType::packedColStart(),
BaseType::numPackedCols()};
pack_unpack_(blockB, origin_buf, BaseType::getBuf(), false);
}
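
Two details worth noting in the wrappers above: pack() const_casts smat_ only to satisfy the shared signature — when ispack is true the unpack_buf argument is read, never written — and unpack() reconstructs the full packed extent as a block_type_t so the shared routine walks exactly the region the constructor packed.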

72 changes: 71 additions & 1 deletion test/PackedRequantizeAcc16Test.cc
@@ -26,7 +26,7 @@ using namespace std;
using namespace fbgemm;

vector<matrix_op_t> transposeVals{matrix_op_t::NoTranspose,
matrix_op_t::Transpose};

vector<QuantizationGranularity> qGranularityVals{
QuantizationGranularity::TENSOR,
Expand All @@ -39,6 +39,8 @@ class fbgemmu8s8acc16WithQuantGranularityTest
tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
class fbgemmu8s8acc16Test
: public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
class fbgemmPackUnpackAcc16Test
: public testing::TestWithParam<tuple<matrix_op_t, bool>> {};
}; // namespace

INSTANTIATE_TEST_CASE_P(
@@ -58,6 +60,11 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(transposeVals),
::testing::Bool()));

INSTANTIATE_TEST_CASE_P(
InstantiationName,
fbgemmPackUnpackAcc16Test,
::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool()));

/**
* @brief Shapes for unit test.
*/
@@ -809,3 +816,66 @@ TEST_P(fbgemmu8s8acc16Test, NoRequantizeTest) {
} // for each groups
} // for each shape
}

/**
* @brief Unit test for packing and unpacking the weight tensor.
*/
TEST_P(fbgemmPackUnpackAcc16Test, TestPackUnpack) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t btrans;
bool test_ld;
tie(btrans, test_ld) = GetParam();

for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
int n = shape[1];
int k = shape[2];

if (k % groups != 0) {
continue;
}
int k_per_group = k / groups;

// kxn matrix
aligned_vector<int8_t> Bint8(k * n);
randFill<int8_t>(Bint8, -128, 127);

// To test ld != n, we reduce n by half while still passing the original
// n as ld (non-transposed case only).
int n_adjusted = n;
if (test_ld) {
if (btrans == matrix_op_t::NoTranspose) {
n_adjusted = std::max(n / 2, 1);
}
}

// Note that packing for weight is performed during the constructor
// stage.
PackBMatrix<int8_t, int16_t> packedWeights(
btrans,
k,
n_adjusted,
Bint8.data(),
(btrans == matrix_op_t::Transpose) ? k_per_group : n,
nullptr,
groups);

// Setup a buffer to get pack -> unpacked results
aligned_vector<int8_t> unpack_buf(k * n, 0);

// Perform unpacking
packedWeights.unpack(unpack_buf.data());

// Sanity check
for (int i = 0; i < k; i++) {
for (int j = 0; j < n_adjusted; j++) {
EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j])
<< "Pack/Unpack results differ at index (" << i << ", " << j
<< ", Reference: " << static_cast<int>(Bint8.data()[i * n + j])
<< ", Pack-Unpacked: "
<< static_cast<int>(unpack_buf.data()[i * n + j]);
}
}
}
}
}
70 changes: 70 additions & 0 deletions test/PackedRequantizeTest.cc
@@ -39,6 +39,8 @@ class fbgemmu8s8acc32WithQuantGranularityTest
tuple<matrix_op_t, matrix_op_t, bool, QuantizationGranularity>> {};
class fbgemmu8s8acc32Test
: public testing::TestWithParam<tuple<matrix_op_t, matrix_op_t, bool>> {};
class fbgemmPackUnpackAcc32Test
: public testing::TestWithParam<tuple<matrix_op_t, bool>> {};
}; // namespace

INSTANTIATE_TEST_CASE_P(
@@ -58,6 +60,11 @@ INSTANTIATE_TEST_CASE_P(
::testing::ValuesIn(transposeVals),
::testing::Bool()));

INSTANTIATE_TEST_CASE_P(
InstantiationName,
fbgemmPackUnpackAcc32Test,
::testing::Combine(::testing::ValuesIn(transposeVals), ::testing::Bool()));

/**
* @brief Shapes for unit test.
*/
@@ -749,3 +756,66 @@ TEST_P(fbgemmu8s8acc32Test, TestSymmetricQuantizedInputOutput) {
} // for each groups
} // for each shape
}

/**
* @brief Unit test for packing and unpacking the weight tensor.
*/
TEST_P(fbgemmPackUnpackAcc32Test, TestPackUnpack) {
vector<vector<int>> shapes(GetShapes_());
matrix_op_t btrans;
bool test_ld;
tie(btrans, test_ld) = GetParam();

for (auto shape : shapes) {
for (int groups : {1, 3, 4}) {
int n = shape[1];
int k = shape[2];

if (k % groups != 0) {
continue;
}
int k_per_group = k / groups;

// kxn matrix
aligned_vector<int8_t> Bint8(k * n);
randFill<int8_t>(Bint8, -128, 127);

// To test ld != n, we reduce n by half while still passing the original
// n as ld (non-transposed case only).
int n_adjusted = n;
if (test_ld) {
if (btrans == matrix_op_t::NoTranspose) {
n_adjusted = std::max(n / 2, 1);
}
}

// Note that packing for weight is performed during the constructor
// stage.
PackBMatrix<int8_t> packedWeights(
btrans,
k,
n_adjusted,
Bint8.data(),
(btrans == matrix_op_t::Transpose) ? k_per_group : n,
nullptr,
groups);

// Setup a buffer to get pack -> unpacked results
aligned_vector<int8_t> unpack_buf(k * n, 0);

// Perform unpacking
packedWeights.unpack(unpack_buf.data());

// Sanity check
for (int i = 0; i < k; i++) {
for (int j = 0; j < n_adjusted; j++) {
EXPECT_EQ(Bint8.data()[i * n + j], unpack_buf.data()[i * n + j])
<< "Pack/Unpack results differ at index (" << i << ", " << j
<< ", Reference: " << static_cast<int>(Bint8.data()[i * n + j])
<< ", Pack-Unpacked: "
<< static_cast<int>(unpack_buf.data()[i * n + j]);
}
}
}
}
}