From 32949765f1fa3a4f84f86e7640521922787036a8 Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Thu, 28 May 2026 14:32:04 -0400 Subject: [PATCH] Upgrade Core to `e586f557af367c74d08d2818bc4bd0d07c8b20bc` Signed-off-by: Juan Cruz Viotti --- DEPENDENCIES | 2 +- .../core/cmake/common/compiler/options.cmake | 16 +- .../cmake/common/targets/executable.cmake | 8 +- .../common/targets/googlebenchmark.cmake | 3 + .../core/cmake/common/targets/library.cmake | 3 + vendor/core/src/core/idna/CMakeLists.txt | 20 +- vendor/core/src/core/idna/codegen.cc | 203 +++++ vendor/core/src/core/idna/codegen.py | 117 --- .../idna/include/sourcemeta/core/idna_ucd.h | 17 +- vendor/core/src/core/regex/preprocess.h | 4 + vendor/core/src/core/unicode/CMakeLists.txt | 20 +- vendor/core/src/core/unicode/codegen.cc | 730 ++++++++++++++++++ vendor/core/src/core/unicode/codegen.py | 516 ------------- .../include/sourcemeta/core/unicode_ucd.h | 448 ++++++----- .../src/lang/io/include/sourcemeta/core/io.h | 24 + .../include/sourcemeta/core/numeric_parse.h | 26 +- vendor/core/src/lang/numeric/parse.cc | 36 +- .../lang/text/include/sourcemeta/core/text.h | 132 ++++ vendor/core/src/lang/text/text.cc | 63 ++ 19 files changed, 1500 insertions(+), 888 deletions(-) create mode 100644 vendor/core/src/core/idna/codegen.cc delete mode 100644 vendor/core/src/core/idna/codegen.py create mode 100644 vendor/core/src/core/unicode/codegen.cc delete mode 100644 vendor/core/src/core/unicode/codegen.py diff --git a/DEPENDENCIES b/DEPENDENCIES index fd038337c..756d9aab4 100644 --- a/DEPENDENCIES +++ b/DEPENDENCIES @@ -1,4 +1,4 @@ vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b863e17ad84ba02 -core https://github.com/sourcemeta/core cd56ace324a42f067b4b8f651f73b9aa0313ca2a +core https://github.com/sourcemeta/core e586f557af367c74d08d2818bc4bd0d07c8b20bc blaze https://github.com/sourcemeta/blaze bc1f434acafd38803f58a941a756a6f788e556e2 bootstrap https://github.com/twbs/bootstrap 1a6fdfae6be09b09eaced8f0e442ca6f7680a61e diff --git a/vendor/core/cmake/common/compiler/options.cmake b/vendor/core/cmake/common/compiler/options.cmake index d43bb2d91..54cca1ad3 100644 --- a/vendor/core/cmake/common/compiler/options.cmake +++ b/vendor/core/cmake/common/compiler/options.cmake @@ -17,7 +17,9 @@ function(sourcemeta_add_default_options visibility target) $<$,$>:/W4> $<$,$>:/WL> $<$,$>:/MP> - $<$,$>:/sdl>) + $<$,$>:/sdl> + # See https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-control-flow-guard + $<$,$>:/guard:cf>) elseif(SOURCEMETA_COMPILER_LLVM OR SOURCEMETA_COMPILER_GCC) target_compile_options("${target}" ${visibility} -Wall @@ -107,7 +109,17 @@ function(sourcemeta_add_default_options visibility target) # GCC seems to print a lot of false-positives here -Wno-free-nonheap-object # Disables runtime type information - $<$,$>:-fno-rtti>) + $<$,$>:-fno-rtti> + # See https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html + -fstack-clash-protection) + + # _GLIBCXX_ASSERTIONS is libstdc++ (GNU) specific, not honored by libc++ + # (which the LLVM toolchain on Apple ships). Restrict to non-Apple GCC + # to avoid emitting a Debug-only definition that does nothing on macOS + if(NOT APPLE) + target_compile_definitions("${target}" ${visibility} + $<$:_GLIBCXX_ASSERTIONS>) + endif() endif() endfunction() diff --git a/vendor/core/cmake/common/targets/executable.cmake b/vendor/core/cmake/common/targets/executable.cmake index 63f61cc2e..84814a5b6 100644 --- a/vendor/core/cmake/common/targets/executable.cmake +++ b/vendor/core/cmake/common/targets/executable.cmake @@ -32,12 +32,9 @@ function(sourcemeta_executable) sourcemeta_add_default_options(PRIVATE ${TARGET_NAME}) # See https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html - # Position Independent Executable (PIE) for ASLR support + # PIE linker flags for ASLR support. The compile-time -fPIE is already + # enabled globally via CMAKE_POSITION_INDEPENDENT_CODE in defaults.cmake. if(SOURCEMETA_COMPILER_LLVM OR SOURCEMETA_COMPILER_GCC) - target_compile_options(${TARGET_NAME} PRIVATE - $<$:-fPIE> - $<$:-fPIE> - $<$:-fPIE>) target_link_options(${TARGET_NAME} PRIVATE $<$:-pie> $<$:-pie> @@ -47,7 +44,6 @@ function(sourcemeta_executable) # See https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-control-flow-guard # See https://learn.microsoft.com/en-us/cpp/build/reference/cetcompat if(SOURCEMETA_COMPILER_MSVC) - target_compile_options(${TARGET_NAME} PRIVATE /guard:cf) target_link_options(${TARGET_NAME} PRIVATE /guard:cf /CETCOMPAT) endif() diff --git a/vendor/core/cmake/common/targets/googlebenchmark.cmake b/vendor/core/cmake/common/targets/googlebenchmark.cmake index d911a93b8..2b14b815c 100644 --- a/vendor/core/cmake/common/targets/googlebenchmark.cmake +++ b/vendor/core/cmake/common/targets/googlebenchmark.cmake @@ -19,6 +19,9 @@ function(sourcemeta_googlebenchmark) add_executable("${TARGET_NAME}" ${SOURCEMETA_GOOGLEBENCHMARK_SOURCES}) sourcemeta_add_default_options(PRIVATE ${TARGET_NAME}) + if(SOURCEMETA_COMPILER_MSVC) + target_link_options("${TARGET_NAME}" PRIVATE /guard:cf /CETCOMPAT) + endif() set_target_properties("${TARGET_NAME}" PROPERTIES FOLDER "${FOLDER_NAME}") target_link_libraries("${TARGET_NAME}" PRIVATE benchmark::benchmark) target_link_libraries("${TARGET_NAME}" PRIVATE benchmark::benchmark_main) diff --git a/vendor/core/cmake/common/targets/library.cmake b/vendor/core/cmake/common/targets/library.cmake index 216f8b025..35dd39a77 100644 --- a/vendor/core/cmake/common/targets/library.cmake +++ b/vendor/core/cmake/common/targets/library.cmake @@ -55,6 +55,9 @@ function(sourcemeta_library) add_library(${TARGET_NAME} ${PUBLIC_HEADER} ${ABSOLUTE_PRIVATE_HEADERS} ${SOURCEMETA_LIBRARY_SOURCES}) sourcemeta_add_default_options(PRIVATE ${TARGET_NAME}) + if(SOURCEMETA_COMPILER_MSVC) + target_link_options(${TARGET_NAME} PRIVATE /guard:cf /CETCOMPAT) + endif() else() add_library(${TARGET_NAME} INTERFACE ${PUBLIC_HEADER} ${ABSOLUTE_PRIVATE_HEADERS}) diff --git a/vendor/core/src/core/idna/CMakeLists.txt b/vendor/core/src/core/idna/CMakeLists.txt index b37440dd1..21006e31e 100644 --- a/vendor/core/src/core/idna/CMakeLists.txt +++ b/vendor/core/src/core/idna/CMakeLists.txt @@ -1,19 +1,25 @@ -find_package(Python3 REQUIRED COMPONENTS Interpreter) - +# Codegen set(SOURCEMETA_CORE_IDNA_UCD_DIR "${core_SOURCE_DIR}/vendor/unicodetools/unicodetools/data/idna/dev") - set(SOURCEMETA_CORE_IDNA_DATA_HEADER "${CMAKE_CURRENT_BINARY_DIR}/idna_data.h") - +sourcemeta_executable(NAMESPACE sourcemeta PROJECT core NAME idna_codegen + OUTPUT SOURCEMETA_CORE_IDNA_CODEGEN_TARGET + SOURCES codegen.cc include/sourcemeta/core/idna_ucd.h) +target_link_libraries("${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" PRIVATE + sourcemeta::core::io + sourcemeta::core::options + sourcemeta::core::numeric + sourcemeta::core::text) +target_include_directories("${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/include") add_custom_command( OUTPUT "${SOURCEMETA_CORE_IDNA_DATA_HEADER}" - COMMAND "${Python3_EXECUTABLE}" - "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" + COMMAND "${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" "${SOURCEMETA_CORE_IDNA_DATA_HEADER}" "${SOURCEMETA_CORE_IDNA_UCD_DIR}/Idna2008.txt" DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" + "${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" "${SOURCEMETA_CORE_IDNA_UCD_DIR}/Idna2008.txt" COMMENT "Generating IDNA property tables" VERBATIM) diff --git a/vendor/core/src/core/idna/codegen.cc b/vendor/core/src/core/idna/codegen.cc new file mode 100644 index 000000000..9c57203df --- /dev/null +++ b/vendor/core/src/core/idna/codegen.cc @@ -0,0 +1,203 @@ +#include + +#include +#include +#include +#include + +#include // std::size_t, std::ptrdiff_t +#include // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t +#include // EXIT_FAILURE, EXIT_SUCCESS +#include // std::exception +#include // std::filesystem::path +#include // std::cerr +#include // std::ostream +#include // std::views::transform +#include // std::span +#include // std::runtime_error +#include // std::string +#include // std::string_view +#include // std::unordered_map +#include // std::vector + +namespace { + +constexpr std::size_t TOTAL_CODEPOINTS{0x110000}; +constexpr std::size_t PAGE_SHIFT{10}; +constexpr std::size_t PAGE_SIZE{1 << PAGE_SHIFT}; +constexpr std::size_t NUM_PAGES{TOTAL_CODEPOINTS / PAGE_SIZE}; + +struct PropertyEntry { + std::uint32_t first; + std::uint32_t last; + sourcemeta::core::IDNAProperty value; +}; + +struct TwoStageTable { + std::vector stage1; + std::vector stage2; +}; + +auto property_from_token(const std::string_view token) + -> sourcemeta::core::IDNAProperty { +#define SOURCEMETA_CORE_IDNA_PROPERTY_CASE(name, alias) \ + if (token == alias) { \ + return sourcemeta::core::IDNAProperty::name; \ + } + SOURCEMETA_CORE_IDNA_PROPERTY_LIST(SOURCEMETA_CORE_IDNA_PROPERTY_CASE) +#undef SOURCEMETA_CORE_IDNA_PROPERTY_CASE + throw std::runtime_error{ + std::string{"Unknown IDNA property value: "}.append(token)}; +} + +auto parse_hex_codepoint(const std::string_view token) -> std::uint32_t { + const auto parsed{sourcemeta::core::to_uint32_t(token, 16)}; + if (!parsed.has_value() || parsed.value() > 0x10FFFF) { + throw std::runtime_error{std::string{"Invalid codepoint: "}.append(token)}; + } + return parsed.value(); +} + +auto parse_entry(const std::string_view payload) -> PropertyEntry { + const auto trimmed{ + sourcemeta::core::trim(sourcemeta::core::take_until(payload, '#'))}; + const auto parts{sourcemeta::core::split_once(trimmed, ';')}; + if (!parts.has_value()) { + throw std::runtime_error{ + std::string{"Missing ';' in line: "}.append(payload)}; + } + const auto range_part{sourcemeta::core::trim(parts->first)}; + const auto value_part{sourcemeta::core::trim(parts->second)}; + const auto range_split{ + sourcemeta::core::split_once(range_part, std::string_view{".."})}; + const auto first{parse_hex_codepoint( + range_split.has_value() ? range_split->first : range_part)}; + const auto last{range_split.has_value() + ? parse_hex_codepoint(range_split->second) + : first}; + return {first, last, property_from_token(value_part)}; +} + +auto parse_idna_file(const std::filesystem::path &input_path) + -> std::vector { + auto stream{sourcemeta::core::read_file(input_path)}; + std::vector missing; + std::vector data; + constexpr std::string_view missing_prefix{"@missing:"}; + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{sourcemeta::core::trim(raw_line)}; + if (line.empty()) { + return; + } + if (line.front() == '#') { + const auto comment_body{sourcemeta::core::trim(line.substr(1))}; + if (comment_body.size() < missing_prefix.size() || + comment_body.substr(0, missing_prefix.size()) != missing_prefix) { + return; + } + missing.push_back( + parse_entry(comment_body.substr(missing_prefix.size()))); + return; + } + data.push_back(parse_entry(line)); + }); + std::vector result; + result.reserve(missing.size() + data.size()); + result.insert(result.end(), missing.begin(), missing.end()); + result.insert(result.end(), data.begin(), data.end()); + return result; +} + +auto build_pages(const std::vector &entries) -> TwoStageTable { + std::vector values( + TOTAL_CODEPOINTS, + static_cast(sourcemeta::core::IDNAProperty::PValid)); + for (const auto &entry : entries) { + for (std::uint32_t codepoint{entry.first}; codepoint <= entry.last; + codepoint += 1) { + values[codepoint] = static_cast(entry.value); + } + } + + std::unordered_map page_to_id; + TwoStageTable table; + table.stage1.reserve(NUM_PAGES); + for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) { + const auto page_start{page_index * PAGE_SIZE}; + const std::string page_key{ + reinterpret_cast(values.data() + page_start), PAGE_SIZE}; + const auto existing{page_to_id.find(page_key)}; + if (existing != page_to_id.end()) { + table.stage1.push_back(existing->second); + continue; + } + const auto new_id{ + static_cast(table.stage2.size() / PAGE_SIZE)}; + page_to_id.emplace(page_key, new_id); + table.stage2.insert( + table.stage2.end(), + values.begin() + static_cast(page_start), + values.begin() + static_cast(page_start + PAGE_SIZE)); + table.stage1.push_back(new_id); + } + return table; +} + +template +auto emit_row(std::ostream &stream, const std::span items) -> void { + constexpr std::size_t row_width{16}; + for (std::size_t offset{0}; offset < items.size(); offset += row_width) { + stream << " "; + const auto upper{offset + row_width < items.size() ? offset + row_width + : items.size()}; + const auto row{items.subspan(offset, upper - offset)}; + const auto widened{row | std::views::transform([](const T value) { + return static_cast(value); + })}; + sourcemeta::core::join_to(stream, widened, ", "); + stream << ",\n"; + } +} + +auto emit_property(std::ostream &stream, const std::string_view prefix, + const TwoStageTable &table) -> void { + stream << "constexpr std::uint16_t " << prefix << "_STAGE1[" + << table.stage1.size() << "] = {\n"; + emit_row(stream, table.stage1); + stream << "};\n\n"; + stream << "constexpr std::uint8_t " << prefix << "_STAGE2[" + << table.stage2.size() << "] = {\n"; + emit_row(stream, table.stage2); + stream << "};\n\n"; +} + +} // namespace + +auto main(const int argc, const char *const argv[]) -> int { + try { + sourcemeta::core::Options app; + app.parse(argc, argv); + const auto &positional{app.positional()}; + if (positional.size() != 2) { + std::cerr << "Usage: " << (argc > 0 ? argv[0] : "codegen") + << " \n"; + return EXIT_FAILURE; + } + + const std::filesystem::path output_path{positional.at(0)}; + const std::filesystem::path input_path{positional.at(1)}; + + const auto entries{parse_idna_file(input_path)}; + const auto table{build_pages(entries)}; + sourcemeta::core::write_file(output_path, [&](std::ostream &stream) { + stream << "#include \n\n"; + stream << "namespace {\n\n"; + emit_property(stream, "IDNA_PROPERTY", table); + stream << "} // namespace\n"; + }); + } catch (const std::exception &error) { + std::cerr << "codegen: " << error.what() << "\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/vendor/core/src/core/idna/codegen.py b/vendor/core/src/core/idna/codegen.py deleted file mode 100644 index 65c64fff3..000000000 --- a/vendor/core/src/core/idna/codegen.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -import re -import sys - -LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)") -MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*") - -TOTAL_CODEPOINTS = 0x110000 -PAGE_SHIFT = 10 -PAGE_SIZE = 1 << PAGE_SHIFT -NUM_PAGES = TOTAL_CODEPOINTS // PAGE_SIZE - -# Integer values must match the IDNAProperty enum in idna_ucd.h. -IDNA_PROPERTY_VALUES = { - "PVALID": 0, - "CONTEXTJ": 1, - "CONTEXTO": 2, - "DISALLOWED": 3, - "UNASSIGNED": 4, -} - - -def parse_file(path, value_map): - missing = [] - data = [] - with open(path) as source: - for line_number, line in enumerate(source, start=1): - stripped = line.strip() - if not stripped: - continue - target = data - if stripped.startswith("#"): - prefix = MISSING_PREFIX.match(stripped) - if not prefix: - continue - stripped = stripped[prefix.end():] - target = missing - match = LINE.match(stripped) - if not match: - raise ValueError( - f"{path}:{line_number}: unparseable line: {stripped!r}" - ) - first = int(match.group(1), 16) - last = int(match.group(2), 16) if match.group(2) else first - raw_value = match.group(3) - try: - value = value_map[raw_value] - except KeyError as error: - raise ValueError( - f"{path}:{line_number}: invalid value {raw_value!r}: {error}" - ) from error - target.append((first, last, value)) - return missing + data - - -def build_pages(entries): - values = [0] * TOTAL_CODEPOINTS - for first, last, value in entries: - values[first : last + 1] = [value] * (last - first + 1) - page_to_id = {} - unique_pages = [] - stage1 = [] - for page_index in range(NUM_PAGES): - start = page_index * PAGE_SIZE - page = tuple(values[start : start + PAGE_SIZE]) - if page not in page_to_id: - page_to_id[page] = len(unique_pages) - unique_pages.append(page) - stage1.append(page_to_id[page]) - return stage1, unique_pages - - -def emit_row(output, items): - for offset in range(0, len(items), 16): - chunk = items[offset : offset + 16] - output.write(" " + ", ".join(str(value) for value in chunk) + ",\n") - - -def emit_property(output, prefix, stage1, unique_pages): - output.write( - f"constexpr std::uint16_t {prefix}_STAGE1[{len(stage1)}] = {{\n" - ) - emit_row(output, stage1) - output.write("};\n\n") - stage2_size = len(unique_pages) * PAGE_SIZE - output.write( - f"constexpr std::uint8_t {prefix}_STAGE2[{stage2_size}] = {{\n" - ) - for page in unique_pages: - emit_row(output, list(page)) - output.write("};\n\n") - - -def main(): - if len(sys.argv) != 3: - print( - f"Usage: {sys.argv[0]} ", - file=sys.stderr, - ) - sys.exit(1) - - output_path = sys.argv[1] - idna_property_input = sys.argv[2] - - with open(output_path, "w") as output: - output.write("#include \n\n") - output.write("namespace {\n\n") - stage1, pages = build_pages( - parse_file(idna_property_input, IDNA_PROPERTY_VALUES) - ) - emit_property(output, "IDNA_PROPERTY", stage1, pages) - output.write("} // namespace\n") - - -if __name__ == "__main__": - main() diff --git a/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h b/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h index 89b694cb0..737537cb0 100644 --- a/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h +++ b/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h @@ -5,15 +5,22 @@ namespace sourcemeta::core { +/// @ingroup idna +/// Each entry maps an `IDNAProperty` enum name to its RFC 5892 token. +#define SOURCEMETA_CORE_IDNA_PROPERTY_LIST(X) \ + X(PValid, "PVALID") \ + X(ContextJ, "CONTEXTJ") \ + X(ContextO, "CONTEXTO") \ + X(Disallowed, "DISALLOWED") \ + X(Unassigned, "UNASSIGNED") + /// @ingroup idna /// The RFC 5892 derived property of a Unicode codepoint. See /// https://www.rfc-editor.org/rfc/rfc5892 for the property's definition. enum class IDNAProperty : std::uint8_t { - PValid = 0, - ContextJ = 1, - ContextO = 2, - Disallowed = 3, - Unassigned = 4, +#define SOURCEMETA_CORE_IDNA_ENUM_ENTRY(name, alias) name, + SOURCEMETA_CORE_IDNA_PROPERTY_LIST(SOURCEMETA_CORE_IDNA_ENUM_ENTRY) +#undef SOURCEMETA_CORE_IDNA_ENUM_ENTRY }; } // namespace sourcemeta::core diff --git a/vendor/core/src/core/regex/preprocess.h b/vendor/core/src/core/regex/preprocess.h index c6ced47ab..8164c82b4 100644 --- a/vendor/core/src/core/regex/preprocess.h +++ b/vendor/core/src/core/regex/preprocess.h @@ -561,6 +561,10 @@ inline auto expand_set_ops(const std::string &content, std::bitset<128> &result) result = op_char == '-' ? (result & ~operand_chars) : (result & operand_chars); + if (next == std::string::npos) { + break; + } + position = next; } diff --git a/vendor/core/src/core/unicode/CMakeLists.txt b/vendor/core/src/core/unicode/CMakeLists.txt index fc6b64dec..0b781358a 100644 --- a/vendor/core/src/core/unicode/CMakeLists.txt +++ b/vendor/core/src/core/unicode/CMakeLists.txt @@ -1,15 +1,21 @@ -find_package(Python3 REQUIRED COMPONENTS Interpreter) - +# Codegen set(SOURCEMETA_CORE_UNICODE_UCD_DIR "${core_SOURCE_DIR}/vendor/unicodetools/unicodetools/data/ucd/dev") - set(SOURCEMETA_CORE_UNICODE_DATA_HEADER "${CMAKE_CURRENT_BINARY_DIR}/unicode_data.h") - +sourcemeta_executable(NAMESPACE sourcemeta PROJECT core NAME unicode_codegen + OUTPUT SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET + SOURCES codegen.cc include/sourcemeta/core/unicode_ucd.h) +target_link_libraries("${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" PRIVATE + sourcemeta::core::io + sourcemeta::core::options + sourcemeta::core::numeric + sourcemeta::core::text) +target_include_directories("${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/include") add_custom_command( OUTPUT "${SOURCEMETA_CORE_UNICODE_DATA_HEADER}" - COMMAND "${Python3_EXECUTABLE}" - "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" + COMMAND "${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" "${SOURCEMETA_CORE_UNICODE_DATA_HEADER}" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt" @@ -21,7 +27,7 @@ add_custom_command( "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/CompositionExclusions.txt" DEPENDS - "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py" + "${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt" "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt" diff --git a/vendor/core/src/core/unicode/codegen.cc b/vendor/core/src/core/unicode/codegen.cc new file mode 100644 index 000000000..7e478003b --- /dev/null +++ b/vendor/core/src/core/unicode/codegen.cc @@ -0,0 +1,730 @@ +#include + +#include +#include +#include +#include + +#include // std::sort +#include // std::array +#include // std::size_t, std::ptrdiff_t +#include // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t +#include // EXIT_FAILURE, EXIT_SUCCESS +#include // std::exception +#include // std::filesystem::path +#include // std::hex, std::uppercase, std::dec +#include // std::ios +#include // std::cerr +#include // std::map +#include // std::optional +#include // std::ostream +#include // std::views::transform +#include // std::span +#include // std::runtime_error +#include // std::string +#include // std::string_view +#include // std::unordered_map +#include // std::unordered_set +#include // std::vector + +namespace { + +constexpr std::size_t TOTAL_CODEPOINTS{0x110000}; +constexpr std::size_t PAGE_SHIFT{10}; +constexpr std::size_t PAGE_SIZE{1 << PAGE_SHIFT}; +constexpr std::size_t NUM_PAGES{TOTAL_CODEPOINTS / PAGE_SIZE}; +constexpr std::size_t DECOMPOSITION_OFFSET_BITS{14}; +constexpr std::size_t DECOMPOSITION_OFFSET_MASK{ + (1U << DECOMPOSITION_OFFSET_BITS) - 1U}; + +constexpr auto JOINING_TYPE_ORDER{std::to_array({ +#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias, + SOURCEMETA_CORE_JOINING_TYPE_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY) +#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY +})}; + +constexpr auto BIDI_CLASS_ORDER{std::to_array({ +#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias, + SOURCEMETA_CORE_BIDI_CLASS_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY) +#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY +})}; + +constexpr auto NFC_QUICK_CHECK_ORDER{std::to_array({ +#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias, + SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY) +#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY +})}; + +constexpr auto UNICODE_SCRIPT_ORDER{std::to_array({ +#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias, + SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY) +#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY +})}; + +using ValueMap = std::map>; + +struct PropertyEntry { + std::uint32_t first; + std::uint32_t last; + std::uint8_t value; +}; + +struct TwoStageTable { + std::vector stage1; + std::vector stage2; +}; + +struct DecompositionTable { + std::vector blob; + std::vector stage1; + std::vector stage2; +}; + +struct CanonicalCompositionTriple { + std::uint32_t starter; + std::uint32_t combining; + std::uint32_t composed; +}; + +auto parse_hex_codepoint(const std::string_view token) -> std::uint32_t { + const auto parsed{sourcemeta::core::to_uint32_t(token, 16)}; + if (!parsed.has_value() || parsed.value() > 0x10FFFF) { + throw std::runtime_error{std::string{"Invalid codepoint: "}.append(token)}; + } + return parsed.value(); +} + +auto parse_property_file(const std::filesystem::path &input_path, + const ValueMap &value_map, + const std::optional property_filter) + -> std::vector { + auto stream{sourcemeta::core::read_file(input_path)}; + std::vector missing; + std::vector data; + constexpr std::string_view missing_prefix{"@missing:"}; + + const auto parse_payload{ + [&](const std::string_view payload) -> std::optional { + const auto trimmed{ + sourcemeta::core::trim(sourcemeta::core::take_until(payload, '#'))}; + const auto first_split{sourcemeta::core::split_once(trimmed, ';')}; + if (!first_split.has_value()) { + throw std::runtime_error{ + std::string{"Unparseable line: "}.append(payload)}; + } + const auto range_part{sourcemeta::core::trim(first_split->first)}; + const auto after_first{sourcemeta::core::trim(first_split->second)}; + + std::string_view value_token; + if (property_filter.has_value()) { + const auto second_split{ + sourcemeta::core::split_once(after_first, ';')}; + std::string_view property_token; + std::string_view tail; + if (second_split.has_value()) { + property_token = sourcemeta::core::trim(second_split->first); + tail = sourcemeta::core::trim(second_split->second); + } else { + property_token = after_first; + tail = {}; + } + if (property_token != property_filter.value()) { + return std::nullopt; + } + value_token = tail; + } else { + value_token = after_first; + } + + const auto range_split{ + sourcemeta::core::split_once(range_part, std::string_view{".."})}; + const auto first{parse_hex_codepoint( + range_split.has_value() ? range_split->first : range_part)}; + const auto last{range_split.has_value() + ? parse_hex_codepoint(range_split->second) + : first}; + + const auto value_it{value_map.find(value_token)}; + if (value_it == value_map.end()) { + throw std::runtime_error{ + std::string{"Unknown property value: "}.append(value_token)}; + } + return PropertyEntry{first, last, value_it->second}; + }}; + + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{sourcemeta::core::trim(raw_line)}; + if (line.empty()) { + return; + } + if (line.front() == '#') { + const auto comment_body{sourcemeta::core::trim(line.substr(1))}; + if (comment_body.size() < missing_prefix.size() || + comment_body.substr(0, missing_prefix.size()) != missing_prefix) { + return; + } + const auto entry{ + parse_payload(comment_body.substr(missing_prefix.size()))}; + if (entry.has_value()) { + missing.push_back(entry.value()); + } + return; + } + const auto entry{parse_payload(line)}; + if (entry.has_value()) { + data.push_back(entry.value()); + } + }); + + std::vector result; + result.reserve(missing.size() + data.size()); + result.insert(result.end(), missing.begin(), missing.end()); + result.insert(result.end(), data.begin(), data.end()); + return result; +} + +auto parse_alias_rows(const std::filesystem::path &aliases_path, + const std::string_view property_short) + -> std::vector> { + auto stream{sourcemeta::core::read_file(aliases_path)}; + std::vector> rows; + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{ + sourcemeta::core::trim(sourcemeta::core::take_until(raw_line, '#'))}; + if (line.empty()) { + return; + } + std::vector row; + bool matched{false}; + std::size_t field_index{0}; + sourcemeta::core::split(line, ';', [&](const std::string_view field) { + const auto trimmed{sourcemeta::core::trim(field)}; + if (field_index == 0) { + matched = (trimmed == property_short); + } else if (matched && !trimmed.empty()) { + row.emplace_back(trimmed); + } + field_index += 1; + }); + if (matched) { + rows.push_back(std::move(row)); + } + }); + return rows; +} + +auto build_combining_mark_value_map(const std::filesystem::path &aliases_path) + -> ValueMap { + static constexpr std::array combining{ + {"M", "Mn", "Mc", "Me"}}; + ValueMap result; + for (const auto &row : parse_alias_rows(aliases_path, "gc")) { + std::uint8_t value{0}; + for (const auto &field : row) { + for (const auto &candidate : combining) { + if (field == candidate) { + value = 1; + break; + } + } + if (value == 1) { + break; + } + } + for (const auto &field : row) { + result[field] = value; + } + } + return result; +} + +auto build_value_map(const std::filesystem::path &aliases_path, + const std::string_view property_short, + const std::span canonical_order) + -> ValueMap { + std::unordered_map canonical_to_int; + canonical_to_int.reserve(canonical_order.size()); + for (std::size_t index{0}; index < canonical_order.size(); index += 1) { + canonical_to_int.emplace(canonical_order[index], + static_cast(index)); + } + ValueMap result; + std::vector> unmatched; + for (const auto &row : parse_alias_rows(aliases_path, property_short)) { + std::optional value; + for (const auto &field : row) { + const auto found{canonical_to_int.find(field)}; + if (found != canonical_to_int.end()) { + value = found->second; + break; + } + } + if (!value.has_value()) { + unmatched.push_back(row); + continue; + } + for (const auto &field : row) { + result[field] = value.value(); + } + } + if (!unmatched.empty()) { + throw std::runtime_error{ + std::string{"Property has values not in canonical order: "}.append( + property_short)}; + } + return result; +} + +auto build_integer_value_map(const std::filesystem::path &aliases_path, + const std::string_view property_short) + -> ValueMap { + ValueMap result; + for (const auto &row : parse_alias_rows(aliases_path, property_short)) { + if (row.empty()) { + continue; + } + const auto parsed{sourcemeta::core::to_uint32_t(row.front())}; + if (!parsed.has_value() || parsed.value() > 0xFFU) { + throw std::runtime_error{ + std::string{"Invalid integer property value: "}.append(row.front())}; + } + const auto value{static_cast(parsed.value())}; + for (const auto &field : row) { + result[field] = value; + } + } + return result; +} + +struct UnicodeData { + std::map> decompositions; + std::unordered_map ccc; +}; + +auto parse_unicode_data(const std::filesystem::path &input_path) + -> UnicodeData { + auto stream{sourcemeta::core::read_file(input_path)}; + UnicodeData result; + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{sourcemeta::core::trim(raw_line)}; + if (line.empty() || line.front() == '#') { + return; + } + std::array fields{}; + std::size_t field_count{0}; + sourcemeta::core::split(line, ';', [&](const std::string_view field) { + if (field_count < fields.size()) { + fields[field_count] = field; + } + field_count += 1; + }); + if (field_count < fields.size()) { + throw std::runtime_error{ + std::string{"UnicodeData.txt: too few fields in line: "}.append( + line)}; + } + const auto codepoint{ + parse_hex_codepoint(sourcemeta::core::trim(fields[0]))}; + const auto ccc_token{sourcemeta::core::trim(fields[3])}; + const auto ccc_value{sourcemeta::core::to_uint32_t(ccc_token)}; + if (!ccc_value.has_value() || ccc_value.value() > 0xFFU) { + throw std::runtime_error{ + std::string{"UnicodeData.txt: invalid CCC: "}.append(ccc_token)}; + } + if (ccc_value.value() != 0) { + result.ccc[codepoint] = static_cast(ccc_value.value()); + } + const auto decomp_field{sourcemeta::core::trim(fields[5])}; + if (decomp_field.empty() || decomp_field.front() == '<') { + return; + } + std::vector decomposition; + std::string_view rest{decomp_field}; + while (!rest.empty()) { + const auto token_end{rest.find(' ')}; + const auto token{token_end == std::string_view::npos + ? rest + : rest.substr(0, token_end)}; + decomposition.push_back(parse_hex_codepoint(token)); + if (token_end == std::string_view::npos) { + break; + } + rest.remove_prefix(token_end + 1); + while (!rest.empty() && rest.front() == ' ') { + rest.remove_prefix(1); + } + } + if (decomposition.size() > 2) { + throw std::runtime_error{ + std::string{"UnicodeData.txt: canonical decomposition has more " + "than 2 codepoints"}}; + } + result.decompositions[codepoint] = std::move(decomposition); + }); + return result; +} + +auto parse_full_composition_exclusions(const std::filesystem::path &input_path) + -> std::unordered_set { + auto stream{sourcemeta::core::read_file(input_path)}; + std::unordered_set result; + constexpr std::string_view target{"Full_Composition_Exclusion"}; + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{sourcemeta::core::trim(raw_line)}; + if (line.empty() || line.front() == '#') { + return; + } + const auto trimmed{ + sourcemeta::core::trim(sourcemeta::core::take_until(line, '#'))}; + const auto first_split{sourcemeta::core::split_once(trimmed, ';')}; + if (!first_split.has_value()) { + throw std::runtime_error{ + std::string{"DerivedNormalizationProps.txt: unparseable line: "} + .append(line)}; + } + const auto range_part{sourcemeta::core::trim(first_split->first)}; + const auto after_first{sourcemeta::core::trim(first_split->second)}; + const auto second_split{sourcemeta::core::split_once(after_first, ';')}; + const auto property_token{second_split.has_value() + ? sourcemeta::core::trim(second_split->first) + : after_first}; + if (property_token != target) { + return; + } + const auto range_split{ + sourcemeta::core::split_once(range_part, std::string_view{".."})}; + const auto first{parse_hex_codepoint( + range_split.has_value() ? range_split->first : range_part)}; + const auto last{range_split.has_value() + ? parse_hex_codepoint(range_split->second) + : first}; + for (std::uint32_t codepoint{first}; codepoint <= last; codepoint += 1) { + result.insert(codepoint); + } + }); + return result; +} + +auto parse_explicit_composition_exclusions( + const std::filesystem::path &input_path) + -> std::unordered_set { + auto stream{sourcemeta::core::read_file(input_path)}; + std::unordered_set result; + sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) { + const auto line{sourcemeta::core::trim(raw_line)}; + if (line.empty() || line.front() == '#') { + return; + } + const auto trimmed{ + sourcemeta::core::trim(sourcemeta::core::take_until(line, '#'))}; + if (trimmed.empty()) { + return; + } + result.insert(parse_hex_codepoint(trimmed)); + }); + return result; +} + +auto build_canonical_compositions( + const std::map> &decompositions, + const std::unordered_map &ccc, + const std::unordered_set &full_exclusions, + const std::unordered_set &explicit_exclusions) + -> std::vector { + for (const auto codepoint : explicit_exclusions) { + if (!full_exclusions.contains(codepoint)) { + throw std::runtime_error{ + std::string{"CompositionExclusions.txt has entries missing from " + "Full_Composition_Exclusion"}}; + } + } + + const auto ccc_of{[&](const std::uint32_t codepoint) -> std::uint8_t { + const auto found{ccc.find(codepoint)}; + return found == ccc.end() ? std::uint8_t{0} : found->second; + }}; + + std::vector triples; + for (const auto &[composed, decomposition] : decompositions) { + if (decomposition.size() != 2) { + continue; + } + if (ccc_of(composed) != 0) { + continue; + } + if (ccc_of(decomposition[0]) != 0) { + continue; + } + if (full_exclusions.contains(composed)) { + continue; + } + triples.push_back({decomposition[0], decomposition[1], composed}); + } + std::sort(triples.begin(), triples.end(), + [](const CanonicalCompositionTriple &left, + const CanonicalCompositionTriple &right) { + if (left.starter != right.starter) { + return left.starter < right.starter; + } + if (left.combining != right.combining) { + return left.combining < right.combining; + } + return left.composed < right.composed; + }); + return triples; +} + +auto build_canonical_decomposition_pages( + const std::map> &decompositions) + -> DecompositionTable { + std::vector blob; + std::vector packed(TOTAL_CODEPOINTS, 0); + for (const auto &[codepoint, decomposition] : decompositions) { + const auto offset{blob.size()}; + if (offset > DECOMPOSITION_OFFSET_MASK) { + throw std::runtime_error{ + std::string{"canonical decomposition blob exceeds offset cap"}}; + } + for (const auto value : decomposition) { + blob.push_back(static_cast(value)); + } + packed[codepoint] = static_cast( + (decomposition.size() << DECOMPOSITION_OFFSET_BITS) | offset); + } + + std::unordered_map page_to_id; + DecompositionTable table; + table.blob = std::move(blob); + table.stage1.reserve(NUM_PAGES); + for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) { + const auto page_start{page_index * PAGE_SIZE}; + const std::string page_key{ + reinterpret_cast(packed.data() + page_start), + PAGE_SIZE * sizeof(std::uint16_t)}; + const auto existing{page_to_id.find(page_key)}; + if (existing != page_to_id.end()) { + table.stage1.push_back(existing->second); + continue; + } + const auto new_id{ + static_cast(table.stage2.size() / PAGE_SIZE)}; + page_to_id.emplace(page_key, new_id); + table.stage2.insert( + table.stage2.end(), + packed.begin() + static_cast(page_start), + packed.begin() + static_cast(page_start + PAGE_SIZE)); + table.stage1.push_back(new_id); + } + return table; +} + +auto build_pages(const std::vector &entries) -> TwoStageTable { + std::vector values(TOTAL_CODEPOINTS, 0); + for (const auto &entry : entries) { + for (std::uint32_t codepoint{entry.first}; codepoint <= entry.last; + codepoint += 1) { + values[codepoint] = entry.value; + } + } + + std::unordered_map page_to_id; + TwoStageTable table; + table.stage1.reserve(NUM_PAGES); + for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) { + const auto page_start{page_index * PAGE_SIZE}; + const std::string page_key{ + reinterpret_cast(values.data() + page_start), PAGE_SIZE}; + const auto existing{page_to_id.find(page_key)}; + if (existing != page_to_id.end()) { + table.stage1.push_back(existing->second); + continue; + } + const auto new_id{ + static_cast(table.stage2.size() / PAGE_SIZE)}; + page_to_id.emplace(page_key, new_id); + table.stage2.insert( + table.stage2.end(), + values.begin() + static_cast(page_start), + values.begin() + static_cast(page_start + PAGE_SIZE)); + table.stage1.push_back(new_id); + } + return table; +} + +template +auto emit_row_decimal(std::ostream &stream, const std::span items) + -> void { + constexpr std::size_t row_width{16}; + for (std::size_t offset{0}; offset < items.size(); offset += row_width) { + stream << " "; + const auto upper{offset + row_width < items.size() ? offset + row_width + : items.size()}; + const auto row{items.subspan(offset, upper - offset)}; + const auto widened{row | std::views::transform([](const T value) { + return static_cast(value); + })}; + sourcemeta::core::join_to(stream, widened, ", "); + stream << ",\n"; + } +} + +template +auto emit_row_hex(std::ostream &stream, const std::span items, + const std::size_t row_width) -> void { + for (std::size_t offset{0}; offset < items.size(); offset += row_width) { + stream << " "; + const auto upper{offset + row_width < items.size() ? offset + row_width + : items.size()}; + for (std::size_t column{offset}; column < upper; column += 1) { + if (column > offset) { + stream << ", "; + } + stream << "0x" << std::hex << std::uppercase + << static_cast(items[column]) << std::dec; + } + stream << ",\n"; + } +} + +auto emit_property(std::ostream &stream, const std::string_view prefix, + const TwoStageTable &table) -> void { + stream << "constexpr std::uint16_t " << prefix << "_STAGE1[" + << table.stage1.size() << "] = {\n"; + emit_row_decimal(stream, table.stage1); + stream << "};\n\n"; + stream << "constexpr std::uint8_t " << prefix << "_STAGE2[" + << table.stage2.size() << "] = {\n"; + emit_row_decimal(stream, table.stage2); + stream << "};\n\n"; +} + +auto emit_canonical_decomposition(std::ostream &stream, + const DecompositionTable &table) -> void { + stream << "constexpr char32_t CANONICAL_DECOMPOSITION_BLOB[" + << table.blob.size() << "] = {\n"; + emit_row_hex(stream, table.blob, 8); + stream << "};\n\n"; + stream << "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1[" + << table.stage1.size() << "] = {\n"; + emit_row_decimal(stream, table.stage1); + stream << "};\n\n"; + stream << "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2[" + << table.stage2.size() << "] = {\n"; + emit_row_decimal(stream, table.stage2); + stream << "};\n\n"; +} + +auto emit_canonical_composition( + std::ostream &stream, + const std::vector &triples) -> void { + stream << "struct CanonicalCompositionEntry {\n"; + stream << " char32_t starter;\n"; + stream << " char32_t combining;\n"; + stream << " char32_t composed;\n"; + stream << "};\n\n"; + stream << "constexpr CanonicalCompositionEntry CANONICAL_COMPOSITIONS[" + << triples.size() << "] = {\n"; + for (const auto &triple : triples) { + stream << " {0x" << std::hex << std::uppercase + << static_cast(triple.starter) << ", 0x" + << static_cast(triple.combining) << ", 0x" + << static_cast(triple.composed) << std::dec << "},\n"; + } + stream << "};\n\n"; +} + +} // namespace + +auto main(const int argc, const char *const argv[]) -> int { + try { + sourcemeta::core::Options app; + app.parse(argc, argv); + const auto &positional{app.positional()}; + if (positional.size() != 10) { + std::cerr + << "Usage: " << (argc > 0 ? argv[0] : "codegen") + << " " + " " + " " + " " + " \n"; + return EXIT_FAILURE; + } + + const std::filesystem::path output_path{positional.at(0)}; + const std::filesystem::path aliases_path{positional.at(1)}; + const std::filesystem::path combining_class_path{positional.at(2)}; + const std::filesystem::path joining_type_path{positional.at(3)}; + const std::filesystem::path bidi_class_path{positional.at(4)}; + const std::filesystem::path scripts_path{positional.at(5)}; + const std::filesystem::path general_category_path{positional.at(6)}; + const std::filesystem::path normalization_props_path{positional.at(7)}; + const std::filesystem::path unicode_data_path{positional.at(8)}; + const std::filesystem::path composition_exclusions_path{positional.at(9)}; + + const auto unicode_data{parse_unicode_data(unicode_data_path)}; + const auto full_exclusions{ + parse_full_composition_exclusions(normalization_props_path)}; + const auto explicit_exclusions{ + parse_explicit_composition_exclusions(composition_exclusions_path)}; + + sourcemeta::core::write_file(output_path, [&](std::ostream &stream) { + stream << "#include \n"; + stream << "#include \n\n"; + stream << "namespace {\n\n"; + + struct PropertySpec { + std::string_view prefix; + const std::filesystem::path &input_path; + std::optional property_filter; + ValueMap value_map; + }; + + const auto combining_class_map{ + build_integer_value_map(aliases_path, "ccc")}; + const auto joining_type_map{ + build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)}; + const auto bidi_class_map{ + build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)}; + const auto script_map{ + build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)}; + const auto combining_mark_map{ + build_combining_mark_value_map(aliases_path)}; + const auto nfc_quick_check_map{ + build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)}; + + const std::array properties{ + {{"COMBINING_CLASS", combining_class_path, std::nullopt, + combining_class_map}, + {"JOINING_TYPE", joining_type_path, std::nullopt, joining_type_map}, + {"BIDI_CLASS", bidi_class_path, std::nullopt, bidi_class_map}, + {"UNICODE_SCRIPT", scripts_path, std::nullopt, script_map}, + {"IS_COMBINING_MARK", general_category_path, std::nullopt, + combining_mark_map}, + {"NFC_QUICK_CHECK", normalization_props_path, + std::optional{"NFC_QC"}, nfc_quick_check_map}}}; + + for (const auto &spec : properties) { + const auto entries{parse_property_file(spec.input_path, spec.value_map, + spec.property_filter)}; + const auto table{build_pages(entries)}; + emit_property(stream, spec.prefix, table); + } + + const auto decomposition_table{ + build_canonical_decomposition_pages(unicode_data.decompositions)}; + emit_canonical_decomposition(stream, decomposition_table); + + const auto triples{build_canonical_compositions( + unicode_data.decompositions, unicode_data.ccc, full_exclusions, + explicit_exclusions)}; + emit_canonical_composition(stream, triples); + + stream << "} // namespace\n"; + }); + } catch (const std::exception &error) { + std::cerr << "codegen: " << error.what() << "\n"; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} diff --git a/vendor/core/src/core/unicode/codegen.py b/vendor/core/src/core/unicode/codegen.py deleted file mode 100644 index d76411c3f..000000000 --- a/vendor/core/src/core/unicode/codegen.py +++ /dev/null @@ -1,516 +0,0 @@ -#!/usr/bin/env python3 - -import re -import sys - -LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)") -MULTI_PROPERTY_LINE = re.compile( - r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*;\s*(\S+)" -) -# Boolean-property rows in multi-property files use a two-field shape, -# with no value column. Used to recognise the row instead of silently -# skipping it. -BOOLEAN_PROPERTY_LINE = re.compile( - r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*$" -) -MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*") - -TOTAL_CODEPOINTS = 0x110000 -PAGE_SHIFT = 10 -PAGE_SIZE = 1 << PAGE_SHIFT -NUM_PAGES = TOTAL_CODEPOINTS // PAGE_SIZE - -# Per-property canonical order. Position in this list defines the integer -# value of the matching C++ enum entry. PropertyValueAliases.txt supplies -# the short/long alias mappings at codegen time, so we only need to -# declare one form per value here. - -JOINING_TYPE_ORDER = ["U", "T", "L", "R", "D", "C"] - -BIDI_CLASS_ORDER = [ - "L", "R", "AL", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", - "B", "S", "WS", "ON", "LRE", "LRO", "RLE", "RLO", "PDF", - "LRI", "RLI", "FSI", "PDI", -] - -NFC_QUICK_CHECK_ORDER = ["Y", "N", "M"] - -UNICODE_SCRIPT_ORDER = [ - "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian", - "Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali", - "Beria_Erfe", "Bhaiksuki", "Bopomofo", "Brahmi", "Braille", - "Buginese", "Buhid", "Canadian_Aboriginal", "Carian", - "Caucasian_Albanian", "Chakma", "Cham", "Cherokee", "Chorasmian", - "Common", "Coptic", "Cuneiform", "Cypriot", "Cypro_Minoan", - "Cyrillic", "Deseret", "Devanagari", "Dives_Akuru", "Dogra", - "Duployan", "Egyptian_Hieroglyphs", "Elbasan", "Elymaic", - "Ethiopic", "Garay", "Georgian", "Glagolitic", "Gothic", "Grantha", - "Greek", "Gujarati", "Gunjala_Gondi", "Gurmukhi", "Gurung_Khema", - "Han", "Hangul", "Hanifi_Rohingya", "Hanunoo", "Hatran", "Hebrew", - "Hiragana", "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi", - "Inscriptional_Parthian", "Javanese", "Kaithi", "Kannada", "Katakana", - "Kawi", "Kayah_Li", "Kharoshthi", "Khitan_Small_Script", "Khmer", - "Khojki", "Khudawadi", "Kirat_Rai", "Lao", "Latin", "Lepcha", "Limbu", - "Linear_A", "Linear_B", "Lisu", "Lycian", "Lydian", "Mahajani", - "Makasar", "Malayalam", "Mandaic", "Manichaean", "Marchen", - "Masaram_Gondi", "Medefaidrin", "Meetei_Mayek", "Mende_Kikakui", - "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao", "Modi", - "Mongolian", "Mro", "Multani", "Myanmar", "Nabataean", "Nag_Mundari", - "Nandinagari", "New_Tai_Lue", "Newa", "Nko", "Nushu", - "Nyiakeng_Puachue_Hmong", "Ogham", "Ol_Chiki", "Ol_Onal", - "Old_Hungarian", "Old_Italic", "Old_North_Arabian", "Old_Permic", - "Old_Persian", "Old_Sogdian", "Old_South_Arabian", "Old_Turkic", - "Old_Uyghur", "Oriya", "Osage", "Osmanya", "Pahawh_Hmong", - "Palmyrene", "Pau_Cin_Hau", "Phags_Pa", "Phoenician", - "Psalter_Pahlavi", "Rejang", "Runic", "Samaritan", "Saurashtra", - "Sharada", "Shavian", "Siddham", "Sidetic", "SignWriting", "Sinhala", - "Sogdian", "Sora_Sompeng", "Soyombo", "Sundanese", "Sunuwar", - "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham", - "Tai_Viet", "Tai_Yo", "Takri", "Tamil", "Tangsa", "Tangut", "Telugu", - "Thaana", "Thai", "Tibetan", "Tifinagh", "Tirhuta", "Todhri", - "Tolong_Siki", "Toto", "Tulu_Tigalari", "Ugaritic", "Unknown", "Vai", - "Vithkuqi", "Wancho", "Warang_Citi", "Yezidi", "Yi", "Zanabazar_Square", - "Katakana_Or_Hiragana", -] - - -def parse_alias_lines(aliases_path, property_short): - rows = [] - with open(aliases_path) as source: - for line in source: - stripped = line.split("#", 1)[0].strip() - if not stripped: - continue - parts = [part.strip() for part in stripped.split(";")] - if parts[0] == property_short: - rows.append([field for field in parts[1:] if field]) - return rows - - -def build_combining_mark_value_map(aliases_path): - """Build {form: int} from PropertyValueAliases.txt mapping each - General_Category alias to 1 if it is a combining mark (Mn, Mc, Me, - or the supergroup M / Mark / Combining_Mark) and to 0 otherwise.""" - combining = {"M", "Mn", "Mc", "Me"} - result = {} - for row in parse_alias_lines(aliases_path, "gc"): - value = 1 if any(field in combining for field in row) else 0 - for field in row: - result[field] = value - return result - - -def build_value_map(aliases_path, property_short, canonical_order=None): - """Build {form: int} for a property. With canonical_order, each row's - integer is its canonical's position in that list; without, the row's - first field is read as the integer directly (used for ccc).""" - canonical_to_int = ( - {name: index for index, name in enumerate(canonical_order)} - if canonical_order is not None - else None - ) - result = {} - unmatched = [] - for row in parse_alias_lines(aliases_path, property_short): - if canonical_to_int is None: - value = int(row[0]) - else: - value = next( - (canonical_to_int[field] for field in row if field in canonical_to_int), - None, - ) - if value is None: - unmatched.append(row) - continue - for field in row: - result[field] = value - if unmatched: - raise ValueError( - f"{aliases_path}: property {property_short!r} has values not " - f"declared in canonical order: {unmatched}" - ) - return result - - -def parse_file(path, value_map, property_filter=None): - """Read a UCD file and return a list of (first, last, value) entries - with @missing defaults first and data ranges second, so callers can - apply them in order regardless of where @missing appears in the file. - - With property_filter set, lines have shape `codepoint; property; value` - (as in DerivedNormalizationProps.txt) and only rows whose property - name matches are returned. Without it, lines have shape - `codepoint; value` and every row contributes.""" - line_re = MULTI_PROPERTY_LINE if property_filter is not None else LINE - missing = [] - data = [] - with open(path) as source: - for line_number, line in enumerate(source, start=1): - stripped = line.strip() - if not stripped: - continue - target = data - if stripped.startswith("#"): - prefix = MISSING_PREFIX.match(stripped) - if not prefix: - continue - stripped = stripped[prefix.end():] - target = missing - match = line_re.match(stripped) - if not match: - # Recognise the boolean-property shape used in multi-property - # files, but only for properties other than the one we are - # filtering for. A boolean-shape row that names our target - # property would be malformed data and must raise. - data_only = stripped.split("#", 1)[0].strip() - if property_filter is not None: - boolean = BOOLEAN_PROPERTY_LINE.fullmatch(data_only) - if boolean and boolean.group(3) != property_filter: - continue - raise ValueError( - f"{path}:{line_number}: unparseable line: {stripped!r}" - ) - if property_filter is not None and match.group(3) != property_filter: - continue - first = int(match.group(1), 16) - last = int(match.group(2), 16) if match.group(2) else first - raw_value = match.group(4 if property_filter is not None else 3) - try: - value = value_map[raw_value] - except KeyError as error: - raise ValueError( - f"{path}:{line_number}: invalid value {raw_value!r}: {error}" - ) from error - target.append((first, last, value)) - return missing + data - - -def parse_unicode_data(path): - """Read UnicodeData.txt once and return (decompositions, ccc) where - decompositions is {codepoint: [decomposition codepoints]} for canonical - decompositions only (compatibility decompositions, those whose field 5 - starts with a `` prefix per UAX #44, are excluded), and ccc is - {codepoint: canonical_combining_class} for codepoints with non-zero CCC. - - Raises if any canonical decomposition has more than two codepoints, which - would indicate a format change in UnicodeData.txt.""" - decompositions = {} - ccc = {} - with open(path) as source: - for line_number, line in enumerate(source, start=1): - stripped = line.strip() - if not stripped or stripped.startswith("#"): - continue - fields = stripped.split(";") - if len(fields) < 6: - raise ValueError( - f"{path}:{line_number}: too few fields: {stripped!r}" - ) - try: - codepoint = int(fields[0], 16) - except ValueError as error: - raise ValueError( - f"{path}:{line_number}: invalid codepoint: {fields[0]!r}" - ) from error - try: - ccc_value = int(fields[3]) - except ValueError as error: - raise ValueError( - f"{path}:{line_number}: invalid CCC: {fields[3]!r}" - ) from error - if ccc_value != 0: - ccc[codepoint] = ccc_value - decomp_field = fields[5].strip() - if not decomp_field or decomp_field.startswith("<"): - continue - decomposition = [int(token, 16) for token in decomp_field.split()] - if len(decomposition) > 2: - raise ValueError( - f"{path}:{line_number}: canonical decomposition of " - f"U+{codepoint:04X} has {len(decomposition)} codepoints, " - f"expected 1 or 2" - ) - decompositions[codepoint] = decomposition - return decompositions, ccc - - -def parse_full_composition_exclusions(path): - """Read DerivedNormalizationProps.txt and return the set of codepoints - for which Full_Composition_Exclusion=Yes. Lines that match neither the - three-field nor the two-field property shape raise, so a file format - change cannot silently drop exclusion data.""" - result = set() - with open(path) as source: - for line_number, line in enumerate(source, start=1): - stripped = line.strip() - if not stripped or stripped.startswith("#"): - continue - match = MULTI_PROPERTY_LINE.match(stripped) - if match: - if match.group(3) != "Full_Composition_Exclusion": - continue - first = int(match.group(1), 16) - last = int(match.group(2), 16) if match.group(2) else first - for codepoint in range(first, last + 1): - result.add(codepoint) - continue - data_only = stripped.split("#", 1)[0].strip() - boolean_match = BOOLEAN_PROPERTY_LINE.fullmatch(data_only) - if not boolean_match: - raise ValueError( - f"{path}:{line_number}: unparseable line: {stripped!r}" - ) - if boolean_match.group(3) != "Full_Composition_Exclusion": - continue - first = int(boolean_match.group(1), 16) - last = (int(boolean_match.group(2), 16) - if boolean_match.group(2) else first) - for codepoint in range(first, last + 1): - result.add(codepoint) - return result - - -EXPLICIT_COMPOSITION_EXCLUSION_LINE = re.compile( - r"^([0-9A-Fa-f]+)(?:\s+#.*)?$" -) - - -def parse_explicit_composition_exclusions(path): - """Read the script-specific list from CompositionExclusions.txt. The - file has a flat `codepoint # NAME` shape with no semicolons. The full - line is anchored so prefix-only matches and trailing junk fail loud.""" - result = set() - with open(path) as source: - for line_number, line in enumerate(source, start=1): - stripped = line.strip() - if not stripped or stripped.startswith("#"): - continue - match = EXPLICIT_COMPOSITION_EXCLUSION_LINE.fullmatch(stripped) - if not match: - raise ValueError( - f"{path}:{line_number}: unparseable line: {stripped!r}" - ) - result.add(int(match.group(1), 16)) - return result - - -def build_canonical_compositions(decompositions, ccc, full_exclusions, - explicit_exclusions): - """Apply the UAX #15 §1.3 primary-composite filters and return a sorted - list of (starter, combining, composed) triples. - - Raises if the explicit CompositionExclusions.txt list is not a subset of - the derived Full_Composition_Exclusion set in - DerivedNormalizationProps.txt, which would indicate the two data files - have drifted out of sync.""" - missing = explicit_exclusions - full_exclusions - if missing: - raise ValueError( - "CompositionExclusions.txt entries missing from " - "Full_Composition_Exclusion: " - + ", ".join(f"U+{codepoint:04X}" for codepoint in sorted(missing)) - ) - - triples = [] - for composed, decomposition in decompositions.items(): - if len(decomposition) != 2: - continue - if ccc.get(composed, 0) != 0: - continue - if ccc.get(decomposition[0], 0) != 0: - continue - if composed in full_exclusions: - continue - triples.append((decomposition[0], decomposition[1], composed)) - triples.sort() - return triples - - -def emit_canonical_composition(output, triples): - output.write("struct CanonicalCompositionEntry {\n") - output.write(" char32_t starter;\n") - output.write(" char32_t combining;\n") - output.write(" char32_t composed;\n") - output.write("};\n\n") - output.write( - f"constexpr CanonicalCompositionEntry " - f"CANONICAL_COMPOSITIONS[{len(triples)}] = {{\n" - ) - for starter, combining, composed in triples: - output.write( - f" {{0x{starter:X}, 0x{combining:X}, 0x{composed:X}}},\n" - ) - output.write("};\n\n") - - -# Packed per-codepoint entry: (length << OFFSET_BITS) | offset. A zero entry -# means no decomposition. Length 1 / 2 covers the entire canonical space. -DECOMPOSITION_OFFSET_BITS = 14 -DECOMPOSITION_OFFSET_MASK = (1 << DECOMPOSITION_OFFSET_BITS) - 1 - - -def build_canonical_decomposition_pages(decompositions): - """Build the flat blob plus per-codepoint packed entries, then run the - standard two-stage page-table dedup on top of the packed array.""" - blob = [] - packed = [0] * TOTAL_CODEPOINTS - for codepoint in sorted(decompositions): - decomposition = decompositions[codepoint] - offset = len(blob) - if offset > DECOMPOSITION_OFFSET_MASK: - raise ValueError( - f"canonical decomposition blob exceeds " - f"{DECOMPOSITION_OFFSET_BITS}-bit offset cap at " - f"U+{codepoint:04X}" - ) - blob.extend(decomposition) - packed[codepoint] = (len(decomposition) << DECOMPOSITION_OFFSET_BITS) | offset - - page_to_id = {} - unique_pages = [] - stage1 = [] - for page_index in range(NUM_PAGES): - start = page_index * PAGE_SIZE - page = tuple(packed[start : start + PAGE_SIZE]) - if page not in page_to_id: - page_to_id[page] = len(unique_pages) - unique_pages.append(page) - stage1.append(page_to_id[page]) - return blob, stage1, unique_pages - - -def emit_canonical_decomposition(output, blob, stage1, unique_pages): - output.write( - f"constexpr char32_t CANONICAL_DECOMPOSITION_BLOB[{len(blob)}] = {{\n" - ) - for offset in range(0, len(blob), 8): - chunk = blob[offset : offset + 8] - output.write( - " " + ", ".join(f"0x{value:X}" for value in chunk) + ",\n" - ) - output.write("};\n\n") - - output.write( - f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1" - f"[{len(stage1)}] = {{\n" - ) - emit_row(output, stage1) - output.write("};\n\n") - stage2_size = len(unique_pages) * PAGE_SIZE - output.write( - f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2" - f"[{stage2_size}] = {{\n" - ) - for page in unique_pages: - emit_row(output, list(page)) - output.write("};\n\n") - - -def build_pages(entries): - values = [0] * TOTAL_CODEPOINTS - for first, last, value in entries: - values[first : last + 1] = [value] * (last - first + 1) - page_to_id = {} - unique_pages = [] - stage1 = [] - for page_index in range(NUM_PAGES): - start = page_index * PAGE_SIZE - page = tuple(values[start : start + PAGE_SIZE]) - if page not in page_to_id: - page_to_id[page] = len(unique_pages) - unique_pages.append(page) - stage1.append(page_to_id[page]) - return stage1, unique_pages - - -def emit_row(output, items): - for offset in range(0, len(items), 16): - chunk = items[offset : offset + 16] - output.write(" " + ", ".join(str(value) for value in chunk) + ",\n") - - -def emit_property(output, prefix, stage1, unique_pages): - output.write( - f"constexpr std::uint16_t {prefix}_STAGE1[{len(stage1)}] = {{\n" - ) - emit_row(output, stage1) - output.write("};\n\n") - stage2_size = len(unique_pages) * PAGE_SIZE - output.write( - f"constexpr std::uint8_t {prefix}_STAGE2[{stage2_size}] = {{\n" - ) - for page in unique_pages: - emit_row(output, list(page)) - output.write("};\n\n") - - -def main(): - if len(sys.argv) != 11: - print( - f"Usage: {sys.argv[0]} " - " " - " " - " " - " " - " " - " " - " " - " " - " " - "", - file=sys.stderr, - ) - sys.exit(1) - - output_path = sys.argv[1] - aliases_path = sys.argv[2] - derived_normalization_props_path = sys.argv[8] - - properties = [ - ("COMBINING_CLASS", sys.argv[3], None, - build_value_map(aliases_path, "ccc")), - ("JOINING_TYPE", sys.argv[4], None, - build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)), - ("BIDI_CLASS", sys.argv[5], None, - build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)), - ("UNICODE_SCRIPT", sys.argv[6], None, - build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)), - ("IS_COMBINING_MARK", sys.argv[7], None, - build_combining_mark_value_map(aliases_path)), - ("NFC_QUICK_CHECK", derived_normalization_props_path, "NFC_QC", - build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)), - ] - - unicode_data_path = sys.argv[9] - composition_exclusions_path = sys.argv[10] - - decompositions, ccc = parse_unicode_data(unicode_data_path) - full_exclusions = parse_full_composition_exclusions( - derived_normalization_props_path - ) - explicit_exclusions = parse_explicit_composition_exclusions( - composition_exclusions_path - ) - - with open(output_path, "w") as output: - output.write("#include \n") - output.write("#include \n\n") - output.write("namespace {\n\n") - for prefix, input_path, property_filter, value_map in properties: - stage1, pages = build_pages( - parse_file(input_path, value_map, property_filter) - ) - emit_property(output, prefix, stage1, pages) - blob, stage1, pages = build_canonical_decomposition_pages( - decompositions - ) - emit_canonical_decomposition(output, blob, stage1, pages) - triples = build_canonical_compositions( - decompositions, ccc, full_exclusions, explicit_exclusions - ) - emit_canonical_composition(output, triples) - output.write("} // namespace\n") - - -if __name__ == "__main__": - main() diff --git a/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h b/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h index 536054c68..2093e51d4 100644 --- a/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h +++ b/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h @@ -5,238 +5,266 @@ namespace sourcemeta::core { +/// @ingroup unicode +/// Each entry maps a `JoiningType` enum name to its UCD short alias. +#define SOURCEMETA_CORE_JOINING_TYPE_LIST(X) \ + X(NonJoining, "U") \ + X(Transparent, "T") \ + X(LeftJoining, "L") \ + X(RightJoining, "R") \ + X(DualJoining, "D") \ + X(JoinCausing, "C") + /// @ingroup unicode /// The joining type of a Unicode codepoint per UAX #44. See /// https://www.unicode.org/reports/tr44/ for the property's definition. enum class JoiningType : std::uint8_t { - NonJoining = 0, - Transparent = 1, - LeftJoining = 2, - RightJoining = 3, - DualJoining = 4, - JoinCausing = 5, +#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name, + SOURCEMETA_CORE_JOINING_TYPE_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY) +#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY }; +/// @ingroup unicode +/// Each entry maps a `BidiClass` enum name to its UCD short alias. +#define SOURCEMETA_CORE_BIDI_CLASS_LIST(X) \ + X(LeftToRight, "L") \ + X(RightToLeft, "R") \ + X(ArabicLetter, "AL") \ + X(EuropeanNumber, "EN") \ + X(EuropeanSeparator, "ES") \ + X(EuropeanTerminator, "ET") \ + X(ArabicNumber, "AN") \ + X(CommonSeparator, "CS") \ + X(NonspacingMark, "NSM") \ + X(BoundaryNeutral, "BN") \ + X(ParagraphSeparator, "B") \ + X(SegmentSeparator, "S") \ + X(WhiteSpace, "WS") \ + X(OtherNeutral, "ON") \ + X(LeftToRightEmbedding, "LRE") \ + X(LeftToRightOverride, "LRO") \ + X(RightToLeftEmbedding, "RLE") \ + X(RightToLeftOverride, "RLO") \ + X(PopDirectionalFormat, "PDF") \ + X(LeftToRightIsolate, "LRI") \ + X(RightToLeftIsolate, "RLI") \ + X(FirstStrongIsolate, "FSI") \ + X(PopDirectionalIsolate, "PDI") + /// @ingroup unicode /// The bidirectional class of a Unicode codepoint per UAX #44. See /// https://www.unicode.org/reports/tr44/ for the property's definition. enum class BidiClass : std::uint8_t { - LeftToRight = 0, - RightToLeft = 1, - ArabicLetter = 2, - EuropeanNumber = 3, - EuropeanSeparator = 4, - EuropeanTerminator = 5, - ArabicNumber = 6, - CommonSeparator = 7, - NonspacingMark = 8, - BoundaryNeutral = 9, - ParagraphSeparator = 10, - SegmentSeparator = 11, - WhiteSpace = 12, - OtherNeutral = 13, - LeftToRightEmbedding = 14, - LeftToRightOverride = 15, - RightToLeftEmbedding = 16, - RightToLeftOverride = 17, - PopDirectionalFormat = 18, - LeftToRightIsolate = 19, - RightToLeftIsolate = 20, - FirstStrongIsolate = 21, - PopDirectionalIsolate = 22, +#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name, + SOURCEMETA_CORE_BIDI_CLASS_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY) +#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY }; +/// @ingroup unicode +/// Each entry maps a `UnicodeScript` enum name to its UCD long alias. +/// Per UAX #24 §1.4, `Katakana_Or_Hiragana` only appears in the +/// `Script_Extensions` property and never in the `Script` property itself. +#define SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(X) \ + X(Adlam, "Adlam") \ + X(Ahom, "Ahom") \ + X(AnatolianHieroglyphs, "Anatolian_Hieroglyphs") \ + X(Arabic, "Arabic") \ + X(Armenian, "Armenian") \ + X(Avestan, "Avestan") \ + X(Balinese, "Balinese") \ + X(Bamum, "Bamum") \ + X(BassaVah, "Bassa_Vah") \ + X(Batak, "Batak") \ + X(Bengali, "Bengali") \ + X(BeriaErfe, "Beria_Erfe") \ + X(Bhaiksuki, "Bhaiksuki") \ + X(Bopomofo, "Bopomofo") \ + X(Brahmi, "Brahmi") \ + X(Braille, "Braille") \ + X(Buginese, "Buginese") \ + X(Buhid, "Buhid") \ + X(CanadianAboriginal, "Canadian_Aboriginal") \ + X(Carian, "Carian") \ + X(CaucasianAlbanian, "Caucasian_Albanian") \ + X(Chakma, "Chakma") \ + X(Cham, "Cham") \ + X(Cherokee, "Cherokee") \ + X(Chorasmian, "Chorasmian") \ + X(Common, "Common") \ + X(Coptic, "Coptic") \ + X(Cuneiform, "Cuneiform") \ + X(Cypriot, "Cypriot") \ + X(CyproMinoan, "Cypro_Minoan") \ + X(Cyrillic, "Cyrillic") \ + X(Deseret, "Deseret") \ + X(Devanagari, "Devanagari") \ + X(DivesAkuru, "Dives_Akuru") \ + X(Dogra, "Dogra") \ + X(Duployan, "Duployan") \ + X(EgyptianHieroglyphs, "Egyptian_Hieroglyphs") \ + X(Elbasan, "Elbasan") \ + X(Elymaic, "Elymaic") \ + X(Ethiopic, "Ethiopic") \ + X(Garay, "Garay") \ + X(Georgian, "Georgian") \ + X(Glagolitic, "Glagolitic") \ + X(Gothic, "Gothic") \ + X(Grantha, "Grantha") \ + X(Greek, "Greek") \ + X(Gujarati, "Gujarati") \ + X(GunjalaGondi, "Gunjala_Gondi") \ + X(Gurmukhi, "Gurmukhi") \ + X(GurungKhema, "Gurung_Khema") \ + X(Han, "Han") \ + X(Hangul, "Hangul") \ + X(HanifiRohingya, "Hanifi_Rohingya") \ + X(Hanunoo, "Hanunoo") \ + X(Hatran, "Hatran") \ + X(Hebrew, "Hebrew") \ + X(Hiragana, "Hiragana") \ + X(ImperialAramaic, "Imperial_Aramaic") \ + X(Inherited, "Inherited") \ + X(InscriptionalPahlavi, "Inscriptional_Pahlavi") \ + X(InscriptionalParthian, "Inscriptional_Parthian") \ + X(Javanese, "Javanese") \ + X(Kaithi, "Kaithi") \ + X(Kannada, "Kannada") \ + X(Katakana, "Katakana") \ + X(Kawi, "Kawi") \ + X(KayahLi, "Kayah_Li") \ + X(Kharoshthi, "Kharoshthi") \ + X(KhitanSmallScript, "Khitan_Small_Script") \ + X(Khmer, "Khmer") \ + X(Khojki, "Khojki") \ + X(Khudawadi, "Khudawadi") \ + X(KiratRai, "Kirat_Rai") \ + X(Lao, "Lao") \ + X(Latin, "Latin") \ + X(Lepcha, "Lepcha") \ + X(Limbu, "Limbu") \ + X(LinearA, "Linear_A") \ + X(LinearB, "Linear_B") \ + X(Lisu, "Lisu") \ + X(Lycian, "Lycian") \ + X(Lydian, "Lydian") \ + X(Mahajani, "Mahajani") \ + X(Makasar, "Makasar") \ + X(Malayalam, "Malayalam") \ + X(Mandaic, "Mandaic") \ + X(Manichaean, "Manichaean") \ + X(Marchen, "Marchen") \ + X(MasaramGondi, "Masaram_Gondi") \ + X(Medefaidrin, "Medefaidrin") \ + X(MeeteiMayek, "Meetei_Mayek") \ + X(MendeKikakui, "Mende_Kikakui") \ + X(MeroiticCursive, "Meroitic_Cursive") \ + X(MeroiticHieroglyphs, "Meroitic_Hieroglyphs") \ + X(Miao, "Miao") \ + X(Modi, "Modi") \ + X(Mongolian, "Mongolian") \ + X(Mro, "Mro") \ + X(Multani, "Multani") \ + X(Myanmar, "Myanmar") \ + X(Nabataean, "Nabataean") \ + X(NagMundari, "Nag_Mundari") \ + X(Nandinagari, "Nandinagari") \ + X(NewTaiLue, "New_Tai_Lue") \ + X(Newa, "Newa") \ + X(Nko, "Nko") \ + X(Nushu, "Nushu") \ + X(NyiakengPuachueHmong, "Nyiakeng_Puachue_Hmong") \ + X(Ogham, "Ogham") \ + X(OlChiki, "Ol_Chiki") \ + X(OlOnal, "Ol_Onal") \ + X(OldHungarian, "Old_Hungarian") \ + X(OldItalic, "Old_Italic") \ + X(OldNorthArabian, "Old_North_Arabian") \ + X(OldPermic, "Old_Permic") \ + X(OldPersian, "Old_Persian") \ + X(OldSogdian, "Old_Sogdian") \ + X(OldSouthArabian, "Old_South_Arabian") \ + X(OldTurkic, "Old_Turkic") \ + X(OldUyghur, "Old_Uyghur") \ + X(Oriya, "Oriya") \ + X(Osage, "Osage") \ + X(Osmanya, "Osmanya") \ + X(PahawhHmong, "Pahawh_Hmong") \ + X(Palmyrene, "Palmyrene") \ + X(PauCinHau, "Pau_Cin_Hau") \ + X(PhagsPa, "Phags_Pa") \ + X(Phoenician, "Phoenician") \ + X(PsalterPahlavi, "Psalter_Pahlavi") \ + X(Rejang, "Rejang") \ + X(Runic, "Runic") \ + X(Samaritan, "Samaritan") \ + X(Saurashtra, "Saurashtra") \ + X(Sharada, "Sharada") \ + X(Shavian, "Shavian") \ + X(Siddham, "Siddham") \ + X(Sidetic, "Sidetic") \ + X(SignWriting, "SignWriting") \ + X(Sinhala, "Sinhala") \ + X(Sogdian, "Sogdian") \ + X(SoraSompeng, "Sora_Sompeng") \ + X(Soyombo, "Soyombo") \ + X(Sundanese, "Sundanese") \ + X(Sunuwar, "Sunuwar") \ + X(SylotiNagri, "Syloti_Nagri") \ + X(Syriac, "Syriac") \ + X(Tagalog, "Tagalog") \ + X(Tagbanwa, "Tagbanwa") \ + X(TaiLe, "Tai_Le") \ + X(TaiTham, "Tai_Tham") \ + X(TaiViet, "Tai_Viet") \ + X(TaiYo, "Tai_Yo") \ + X(Takri, "Takri") \ + X(Tamil, "Tamil") \ + X(Tangsa, "Tangsa") \ + X(Tangut, "Tangut") \ + X(Telugu, "Telugu") \ + X(Thaana, "Thaana") \ + X(Thai, "Thai") \ + X(Tibetan, "Tibetan") \ + X(Tifinagh, "Tifinagh") \ + X(Tirhuta, "Tirhuta") \ + X(Todhri, "Todhri") \ + X(TolongSiki, "Tolong_Siki") \ + X(Toto, "Toto") \ + X(TuluTigalari, "Tulu_Tigalari") \ + X(Ugaritic, "Ugaritic") \ + X(Unknown, "Unknown") \ + X(Vai, "Vai") \ + X(Vithkuqi, "Vithkuqi") \ + X(Wancho, "Wancho") \ + X(WarangCiti, "Warang_Citi") \ + X(Yezidi, "Yezidi") \ + X(Yi, "Yi") \ + X(ZanabazarSquare, "Zanabazar_Square") \ + X(KatakanaOrHiragana, "Katakana_Or_Hiragana") + /// @ingroup unicode /// The script of a Unicode codepoint per UAX #24. See /// https://www.unicode.org/reports/tr24/ for the property's definition. enum class UnicodeScript : std::uint8_t { - Adlam = 0, - Ahom = 1, - AnatolianHieroglyphs = 2, - Arabic = 3, - Armenian = 4, - Avestan = 5, - Balinese = 6, - Bamum = 7, - BassaVah = 8, - Batak = 9, - Bengali = 10, - BeriaErfe = 11, - Bhaiksuki = 12, - Bopomofo = 13, - Brahmi = 14, - Braille = 15, - Buginese = 16, - Buhid = 17, - CanadianAboriginal = 18, - Carian = 19, - CaucasianAlbanian = 20, - Chakma = 21, - Cham = 22, - Cherokee = 23, - Chorasmian = 24, - Common = 25, - Coptic = 26, - Cuneiform = 27, - Cypriot = 28, - CyproMinoan = 29, - Cyrillic = 30, - Deseret = 31, - Devanagari = 32, - DivesAkuru = 33, - Dogra = 34, - Duployan = 35, - EgyptianHieroglyphs = 36, - Elbasan = 37, - Elymaic = 38, - Ethiopic = 39, - Garay = 40, - Georgian = 41, - Glagolitic = 42, - Gothic = 43, - Grantha = 44, - Greek = 45, - Gujarati = 46, - GunjalaGondi = 47, - Gurmukhi = 48, - GurungKhema = 49, - Han = 50, - Hangul = 51, - HanifiRohingya = 52, - Hanunoo = 53, - Hatran = 54, - Hebrew = 55, - Hiragana = 56, - ImperialAramaic = 57, - Inherited = 58, - InscriptionalPahlavi = 59, - InscriptionalParthian = 60, - Javanese = 61, - Kaithi = 62, - Kannada = 63, - Katakana = 64, - Kawi = 65, - KayahLi = 66, - Kharoshthi = 67, - KhitanSmallScript = 68, - Khmer = 69, - Khojki = 70, - Khudawadi = 71, - KiratRai = 72, - Lao = 73, - Latin = 74, - Lepcha = 75, - Limbu = 76, - LinearA = 77, - LinearB = 78, - Lisu = 79, - Lycian = 80, - Lydian = 81, - Mahajani = 82, - Makasar = 83, - Malayalam = 84, - Mandaic = 85, - Manichaean = 86, - Marchen = 87, - MasaramGondi = 88, - Medefaidrin = 89, - MeeteiMayek = 90, - MendeKikakui = 91, - MeroiticCursive = 92, - MeroiticHieroglyphs = 93, - Miao = 94, - Modi = 95, - Mongolian = 96, - Mro = 97, - Multani = 98, - Myanmar = 99, - Nabataean = 100, - NagMundari = 101, - Nandinagari = 102, - NewTaiLue = 103, - Newa = 104, - Nko = 105, - Nushu = 106, - NyiakengPuachueHmong = 107, - Ogham = 108, - OlChiki = 109, - OlOnal = 110, - OldHungarian = 111, - OldItalic = 112, - OldNorthArabian = 113, - OldPermic = 114, - OldPersian = 115, - OldSogdian = 116, - OldSouthArabian = 117, - OldTurkic = 118, - OldUyghur = 119, - Oriya = 120, - Osage = 121, - Osmanya = 122, - PahawhHmong = 123, - Palmyrene = 124, - PauCinHau = 125, - PhagsPa = 126, - Phoenician = 127, - PsalterPahlavi = 128, - Rejang = 129, - Runic = 130, - Samaritan = 131, - Saurashtra = 132, - Sharada = 133, - Shavian = 134, - Siddham = 135, - Sidetic = 136, - SignWriting = 137, - Sinhala = 138, - Sogdian = 139, - SoraSompeng = 140, - Soyombo = 141, - Sundanese = 142, - Sunuwar = 143, - SylotiNagri = 144, - Syriac = 145, - Tagalog = 146, - Tagbanwa = 147, - TaiLe = 148, - TaiTham = 149, - TaiViet = 150, - TaiYo = 151, - Takri = 152, - Tamil = 153, - Tangsa = 154, - Tangut = 155, - Telugu = 156, - Thaana = 157, - Thai = 158, - Tibetan = 159, - Tifinagh = 160, - Tirhuta = 161, - Todhri = 162, - TolongSiki = 163, - Toto = 164, - TuluTigalari = 165, - Ugaritic = 166, - Unknown = 167, - Vai = 168, - Vithkuqi = 169, - Wancho = 170, - WarangCiti = 171, - Yezidi = 172, - Yi = 173, - ZanabazarSquare = 174, - // Per UAX #24 §1.4, the value Katakana_Or_Hiragana only appears in the - // Script_Extensions property and never in the Script property itself. - KatakanaOrHiragana = 175, +#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name, + SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY) +#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY }; +/// @ingroup unicode +/// Each entry maps an `NFCQuickCheck` enum name to its UCD short alias. +#define SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(X) \ + X(Yes, "Y") \ + X(No, "N") \ + X(Maybe, "M") + /// @ingroup unicode /// The NFC quick-check result for a Unicode codepoint per UAX #15. /// See https://www.unicode.org/reports/tr15/ for the property's definition. enum class NFCQuickCheck : std::uint8_t { - Yes = 0, - No = 1, - Maybe = 2, +#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name, + SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY) +#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY }; } // namespace sourcemeta::core diff --git a/vendor/core/src/lang/io/include/sourcemeta/core/io.h b/vendor/core/src/lang/io/include/sourcemeta/core/io.h index 46c43b27c..82632183b 100644 --- a/vendor/core/src/lang/io/include/sourcemeta/core/io.h +++ b/vendor/core/src/lang/io/include/sourcemeta/core/io.h @@ -220,6 +220,30 @@ auto read_file_to_string(const std::filesystem::path &path) /// ``` inline auto read_stdin() -> std::string { return read_to_string(std::cin); } +/// @ingroup io +/// +/// Iterate the lines of `stream`, invoking `callback` with each line. The +/// line view is only valid for the duration of the callback. For example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// std::istringstream stream{"alpha\nbeta\ngamma\n"}; +/// sourcemeta::core::for_each_line(stream, +/// [](const std::string_view line) { +/// std::cout << line << '\n'; +/// }); +/// ``` +template +auto for_each_line(std::istream &stream, Callback callback) -> void { + std::string line; + while (std::getline(stream, line)) { + callback(std::string_view{line}); + } +} + /// @ingroup io /// /// Recursively mirror a directory tree using hard links for regular files. diff --git a/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h b/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h index ff745ecf3..c30def5f9 100644 --- a/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h +++ b/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h @@ -5,35 +5,47 @@ #include #endif -#include // std::int64_t, std::uint64_t -#include // std::optional -#include // std::string +#include // std::int64_t, std::uint32_t, std::uint64_t +#include // std::optional +#include // std::string_view namespace sourcemeta::core { /// @ingroup numeric /// Attempt to parse a string as a double SOURCEMETA_CORE_NUMERIC_EXPORT -auto to_double(const std::string &input) noexcept -> std::optional; +auto to_double(const std::string_view input) noexcept -> std::optional; /// @ingroup numeric /// Attempt to parse a string as a signed 64-bit integer SOURCEMETA_CORE_NUMERIC_EXPORT -auto to_int64_t(const std::string &input) noexcept +auto to_int64_t(const std::string_view input) noexcept -> std::optional; /// @ingroup numeric /// Attempt to parse a string as a signed 64-bit integer in a given base SOURCEMETA_CORE_NUMERIC_EXPORT -auto to_int64_t(const std::string &input, const int base) noexcept +auto to_int64_t(const std::string_view input, const int base) noexcept -> std::optional; /// @ingroup numeric /// Attempt to parse a string as an unsigned 64-bit decimal integer. SOURCEMETA_CORE_NUMERIC_EXPORT -auto to_uint64_t(const std::string &input) noexcept +auto to_uint64_t(const std::string_view input) noexcept -> std::optional; +/// @ingroup numeric +/// Attempt to parse a string as an unsigned 32-bit decimal integer +SOURCEMETA_CORE_NUMERIC_EXPORT +auto to_uint32_t(const std::string_view input) noexcept + -> std::optional; + +/// @ingroup numeric +/// Attempt to parse a string as an unsigned 32-bit integer in a given base +SOURCEMETA_CORE_NUMERIC_EXPORT +auto to_uint32_t(const std::string_view input, const int base) noexcept + -> std::optional; + } // namespace sourcemeta::core #endif diff --git a/vendor/core/src/lang/numeric/parse.cc b/vendor/core/src/lang/numeric/parse.cc index 645d7f869..0edd740b2 100644 --- a/vendor/core/src/lang/numeric/parse.cc +++ b/vendor/core/src/lang/numeric/parse.cc @@ -2,32 +2,31 @@ #include // std::from_chars #include // std::size_t -#include // std::invalid_argument, std::out_of_range +#include // std::string #include // std::errc namespace sourcemeta::core { -auto to_double(const std::string &input) noexcept -> std::optional { +auto to_double(const std::string_view input) noexcept -> std::optional { try { + const std::string owned{input}; std::size_t position{0}; - const auto value{std::stod(input, &position)}; - if (position != input.size()) { + const auto value{std::stod(owned, &position)}; + if (position != owned.size()) { return std::nullopt; } return value; - } catch (const std::invalid_argument &) { - return std::nullopt; - } catch (const std::out_of_range &) { + } catch (...) { return std::nullopt; } } -auto to_int64_t(const std::string &input) noexcept +auto to_int64_t(const std::string_view input) noexcept -> std::optional { return to_int64_t(input, 10); } -auto to_int64_t(const std::string &input, const int base) noexcept +auto to_int64_t(const std::string_view input, const int base) noexcept -> std::optional { std::int64_t value{}; const auto result = @@ -39,7 +38,7 @@ auto to_int64_t(const std::string &input, const int base) noexcept return value; } -auto to_uint64_t(const std::string &input) noexcept +auto to_uint64_t(const std::string_view input) noexcept -> std::optional { std::uint64_t value{}; const auto result = @@ -51,4 +50,21 @@ auto to_uint64_t(const std::string &input) noexcept return value; } +auto to_uint32_t(const std::string_view input) noexcept + -> std::optional { + return to_uint32_t(input, 10); +} + +auto to_uint32_t(const std::string_view input, const int base) noexcept + -> std::optional { + std::uint32_t value{}; + const auto result = + std::from_chars(input.data(), input.data() + input.size(), value, base); + if (result.ec != std::errc{} || result.ptr != input.data() + input.size()) { + return std::nullopt; + } + + return value; +} + } // namespace sourcemeta::core diff --git a/vendor/core/src/lang/text/include/sourcemeta/core/text.h b/vendor/core/src/lang/text/include/sourcemeta/core/text.h index 599892230..26ffc0616 100644 --- a/vendor/core/src/lang/text/include/sourcemeta/core/text.h +++ b/vendor/core/src/lang/text/include/sourcemeta/core/text.h @@ -6,8 +6,11 @@ #endif #include // std::size_t +#include // std::optional +#include // std::ostream #include // std::string #include // std::string_view +#include // std::pair /// @defgroup text Text /// @brief A collection of general-purpose text manipulation utilities @@ -71,6 +74,135 @@ SOURCEMETA_CORE_TEXT_EXPORT auto truncate(std::string &input, const std::size_t maximum_length, const std::string_view marker) -> void; +/// @ingroup text +/// +/// Return `input` with leading and trailing ASCII whitespace removed. For +/// example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::trim(" hello ") == "hello"); +/// assert(sourcemeta::core::trim("\t\nfoo\r\n") == "foo"); +/// assert(sourcemeta::core::trim(" ").empty()); +/// ``` +SOURCEMETA_CORE_TEXT_EXPORT +auto trim(const std::string_view input) noexcept -> std::string_view; + +/// @ingroup text +/// +/// Return the prefix of `input` up to (but excluding) the first occurrence +/// of `marker`, or the full input when `marker` is absent. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// assert(sourcemeta::core::take_until("foo # bar", '#') == "foo "); +/// assert(sourcemeta::core::take_until("no marker", '#') == "no marker"); +/// assert(sourcemeta::core::take_until("#leading", '#').empty()); +/// ``` +SOURCEMETA_CORE_TEXT_EXPORT +auto take_until(const std::string_view input, const char marker) noexcept + -> std::string_view; + +/// @ingroup text +/// +/// Split `input` at the first occurrence of `delimiter`, returning the +/// parts before and after it. Return `std::nullopt` when the delimiter is +/// absent. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// const auto parts{sourcemeta::core::split_once("key=value", '=')}; +/// assert(parts.has_value()); +/// assert(parts->first == "key"); +/// assert(parts->second == "value"); +/// assert(!sourcemeta::core::split_once("no separator", '=').has_value()); +/// ``` +SOURCEMETA_CORE_TEXT_EXPORT +auto split_once(const std::string_view input, const char delimiter) noexcept + -> std::optional>; + +/// @ingroup text +/// +/// Split `input` at the first occurrence of `delimiter`, returning the +/// parts before and after it. Return `std::nullopt` when the delimiter is +/// absent or empty. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// const auto parts{sourcemeta::core::split_once("1..5", "..")}; +/// assert(parts.has_value()); +/// assert(parts->first == "1"); +/// assert(parts->second == "5"); +/// ``` +SOURCEMETA_CORE_TEXT_EXPORT +auto split_once(const std::string_view input, + const std::string_view delimiter) noexcept + -> std::optional>; + +/// @ingroup text +/// +/// Iterate the parts of `input` separated by `delimiter`, invoking +/// `callback` with each part. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// sourcemeta::core::split("alpha;beta;gamma", ';', +/// [](const std::string_view part) { +/// std::cout << part << '\n'; +/// }); +/// ``` +template +auto split(const std::string_view input, const char delimiter, + Callback callback) -> void { + std::string_view rest{input}; + while (true) { + const auto next{sourcemeta::core::split_once(rest, delimiter)}; + if (!next.has_value()) { + callback(rest); + return; + } + callback(next->first); + rest = next->second; + } +} + +/// @ingroup text +/// +/// Stream each item of `items` to `stream`, separated by `separator`. For +/// example: +/// +/// ```cpp +/// #include +/// #include +/// #include +/// +/// constexpr std::array values{1, 2, 3}; +/// sourcemeta::core::join_to(std::cout, values, ", "); +/// // prints: 1, 2, 3 +/// ``` +template +auto join_to(std::ostream &stream, const Range &items, + const std::string_view separator) -> void { + bool first{true}; + for (const auto &item : items) { + if (!first) { + stream << separator; + } + stream << item; + first = false; + } +} + /// @ingroup text /// /// Return `input` with `suffix` removed from the end under ASCII diff --git a/vendor/core/src/lang/text/text.cc b/vendor/core/src/lang/text/text.cc index 07e641263..368e91528 100644 --- a/vendor/core/src/lang/text/text.cc +++ b/vendor/core/src/lang/text/text.cc @@ -2,7 +2,18 @@ #include // std::isalpha, std::toupper #include // std::size_t +#include // std::optional, std::nullopt #include // std::string_view +#include // std::pair + +namespace { + +auto is_ascii_whitespace(const char character) noexcept -> bool { + return character == ' ' || character == '\t' || character == '\n' || + character == '\v' || character == '\f' || character == '\r'; +} + +} // namespace namespace sourcemeta::core { @@ -56,6 +67,58 @@ auto truncate(std::string &input, const std::size_t maximum_length, input.append(marker); } +auto trim(const std::string_view input) noexcept -> std::string_view { + std::string_view result{input}; + while (!result.empty() && is_ascii_whitespace(result.front())) { + result.remove_prefix(1); + } + while (!result.empty() && is_ascii_whitespace(result.back())) { + result.remove_suffix(1); + } + return result; +} + +auto take_until(const std::string_view input, const char marker) noexcept + -> std::string_view { + const auto position{input.find(marker)}; + if (position == std::string_view::npos) { + return input; + } + std::string_view result{input}; + result.remove_suffix(input.size() - position); + return result; +} + +auto split_once(const std::string_view input, const char delimiter) noexcept + -> std::optional> { + const auto position{input.find(delimiter)}; + if (position == std::string_view::npos) { + return std::nullopt; + } + std::string_view before{input}; + before.remove_suffix(input.size() - position); + std::string_view after{input}; + after.remove_prefix(position + 1); + return std::pair{before, after}; +} + +auto split_once(const std::string_view input, + const std::string_view delimiter) noexcept + -> std::optional> { + if (delimiter.empty()) { + return std::nullopt; + } + const auto position{input.find(delimiter)}; + if (position == std::string_view::npos) { + return std::nullopt; + } + std::string_view before{input}; + before.remove_suffix(input.size() - position); + std::string_view after{input}; + after.remove_prefix(position + delimiter.size()); + return std::pair{before, after}; +} + auto remove_suffix_ignore_case(const std::string_view input, const std::string_view suffix) noexcept -> std::string_view {