This repository has been archived by the owner on May 19, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 68
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #155 from brhodes10/feature/tokenizer-rmm
[REVIEW] Tokenizer with rmm integration
- Loading branch information
Showing
26 changed files
with
124,217 additions
and
123,609 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)

project(CLX_BENCHS LANGUAGES C CXX CUDA)

###################################################################################################
# - compiler function -----------------------------------------------------------------------------

set(BENCHMARK_LIST CACHE INTERNAL "BENCHMARK_LIST")

# Declares one google-benchmark executable built from CMAKE_BENCH_SRC, links it
# against benchmark/benchmark_main/pthread/clx, places the binary under
# ${CMAKE_BINARY_DIR}/gbenchmarks, and appends its name to the cached
# BENCHMARK_LIST so callers can enumerate all configured benchmarks.
function(ConfigureBench CMAKE_BENCH_NAME CMAKE_BENCH_SRC)
    add_executable(${CMAKE_BENCH_NAME} ${CMAKE_BENCH_SRC})
    set_target_properties(${CMAKE_BENCH_NAME} PROPERTIES
                          POSITION_INDEPENDENT_CODE ON
                          RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks")
    target_link_libraries(${CMAKE_BENCH_NAME} benchmark benchmark_main pthread clx)
    set(BENCHMARK_LIST ${BENCHMARK_LIST} ${CMAKE_BENCH_NAME} CACHE INTERNAL "BENCHMARK_LIST")
endfunction(ConfigureBench)

###################################################################################################
# - include paths ---------------------------------------------------------------------------------

# Order matters: build-tree and source-tree headers take precedence over
# third-party (gbench, RMM) headers.
include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
                    "${CMAKE_BINARY_DIR}/include"
                    "${CMAKE_SOURCE_DIR}/include"
                    "${CMAKE_SOURCE_DIR}"
                    "${CMAKE_SOURCE_DIR}/src"
                    "${GBENCH_INCLUDE_DIR}"
                    "${RMM_INCLUDE}"
                    "${CMAKE_CURRENT_SOURCE_DIR}")

###################################################################################################
# - library paths ---------------------------------------------------------------------------------

# NOTE: CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported
# variable containing the link directories for nvcc.
link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}"
                 "${CMAKE_BINARY_DIR}/lib"
                 "${CMAKE_BINARY_DIR}"
                 "${FLATBUFFERS_LIBRARY_DIR}"
                 "${GTEST_LIBRARY_DIR}"
                 "${GBENCH_LIBRARY_DIR}"
                 "${RMM_LIBRARY}")

###################################################################################################
# - tokenizer benchmarks --------------------------------------------------------------------------

set(TOKENIZER_BENCH_SRC
    "${CMAKE_CURRENT_SOURCE_DIR}/tokenizer_benchmark.cu")

ConfigureBench(TOKENIZER_BENCH "${TOKENIZER_BENCH_SRC}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#include <benchmark/benchmark.h> | ||
#include <for_cython.h> | ||
|
||
#include <thrust/device_vector.h> | ||
#include <rmm/rmm.h> | ||
#include <rmm/thrust_rmm_allocator.h> | ||
|
||
#define MAX_NUM_SENTENCES 101 | ||
#define MAX_NUM_CHARS 150000 | ||
#define MAX_ROWS_TENSOR 300 | ||
|
||
static void BM_cuda_tokenizer_file(benchmark::State& state) { | ||
std::string input_file_name = "cpp/benchmarks/tokenizer_benchmark.txt"; | ||
std::string hash_file = "python/clx/analytics/resources/bert_hash_table.txt"; | ||
uint32_t max_sequence_length = 64; | ||
uint32_t stride = 48; | ||
uint32_t do_truncate = 0; | ||
uint32_t do_lower = 1; | ||
TokenizerResult* result = new TokenizerResult(); | ||
for (auto _ : state){ | ||
cuda_tokenizer_file(input_file_name, hash_file, max_sequence_length, stride, do_lower, do_truncate, | ||
MAX_NUM_SENTENCES, MAX_NUM_CHARS, MAX_ROWS_TENSOR, result); | ||
} | ||
} | ||
BENCHMARK(BM_cuda_tokenizer_file); | ||
|
||
// Packs a list of sentences into one contiguous character buffer.
//
// For each sentence i, its characters are copied (without a NUL terminator)
// into `flattened_sentences` and `sentence_offsets[i]` receives the index of
// its first character; `sentence_offsets[sentences.size()]` receives the total
// character count. The caller must provide a buffer large enough for all
// characters and an offsets array of `sentences.size() + 1` entries.
void flatten_sentences(const std::vector<std::string>& sentences,
                       char* flattened_sentences,
                       uint32_t* sentence_offsets) {

  uint32_t write_pos = 0;
  uint32_t idx = 0;
  for (const auto& sentence : sentences) {
    sentence_offsets[idx++] = write_pos;
    sentence.copy(flattened_sentences + write_pos, sentence.size());
    write_pos += static_cast<uint32_t>(sentence.size());
  }
  // Sentinel entry: one-past-the-end offset of the final sentence.
  sentence_offsets[idx] = write_pos;
}
|
||
static void BM_cuda_tokenizer_cudf(benchmark::State& state) { | ||
rmm::device_vector<char> device_sentences{}; | ||
device_sentences.resize(MAX_NUM_CHARS); | ||
|
||
std::string sentences = "This is a test"; | ||
std::vector<char> char_sentences(sentences.length()); | ||
std::copy(sentences.begin(), sentences.end(), char_sentences.begin()); | ||
device_sentences = char_sentences; | ||
|
||
std::string hash_file = "python/clx/analytics/resources/bert_hash_table.txt"; | ||
std::vector<uint32_t> offsets{14}; | ||
uint32_t max_sequence_length = 64; | ||
uint32_t stride = 48; | ||
uint32_t do_truncate = 0; | ||
uint32_t do_lower = 1; | ||
TokenizerResult* result = new TokenizerResult(); | ||
for (auto _ : state){ | ||
cuda_tokenizer_cudf(thrust::raw_pointer_cast(device_sentences.data()), offsets.data(), offsets.size(), hash_file, max_sequence_length, stride, do_lower, do_truncate, | ||
MAX_NUM_SENTENCES, MAX_NUM_CHARS, MAX_ROWS_TENSOR, result); | ||
} | ||
} | ||
BENCHMARK(BM_cuda_tokenizer_cudf); | ||
|
||
BENCHMARK_MAIN(); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
This is a test. |
Oops, something went wrong.