From 32949765f1fa3a4f84f86e7640521922787036a8 Mon Sep 17 00:00:00 2001
From: Juan Cruz Viotti <jv@jviotti.com>
Date: Thu, 28 May 2026 14:32:04 -0400
Subject: [PATCH] Upgrade Core to `e586f557af367c74d08d2818bc4bd0d07c8b20bc`

Signed-off-by: Juan Cruz Viotti <jv@jviotti.com>
---
 DEPENDENCIES                                  |   2 +-
 .../core/cmake/common/compiler/options.cmake  |  16 +-
 .../cmake/common/targets/executable.cmake     |   8 +-
 .../common/targets/googlebenchmark.cmake      |   3 +
 .../core/cmake/common/targets/library.cmake   |   3 +
 vendor/core/src/core/idna/CMakeLists.txt      |  20 +-
 vendor/core/src/core/idna/codegen.cc          | 203 +++++
 vendor/core/src/core/idna/codegen.py          | 117 ---
 .../idna/include/sourcemeta/core/idna_ucd.h   |  17 +-
 vendor/core/src/core/regex/preprocess.h       |   4 +
 vendor/core/src/core/unicode/CMakeLists.txt   |  20 +-
 vendor/core/src/core/unicode/codegen.cc       | 730 ++++++++++++++++++
 vendor/core/src/core/unicode/codegen.py       | 516 -------------
 .../include/sourcemeta/core/unicode_ucd.h     | 448 ++++++-----
 .../src/lang/io/include/sourcemeta/core/io.h  |  24 +
 .../include/sourcemeta/core/numeric_parse.h   |  26 +-
 vendor/core/src/lang/numeric/parse.cc         |  36 +-
 .../lang/text/include/sourcemeta/core/text.h  | 132 ++++
 vendor/core/src/lang/text/text.cc             |  63 ++
 19 files changed, 1500 insertions(+), 888 deletions(-)
 create mode 100644 vendor/core/src/core/idna/codegen.cc
 delete mode 100644 vendor/core/src/core/idna/codegen.py
 create mode 100644 vendor/core/src/core/unicode/codegen.cc
 delete mode 100644 vendor/core/src/core/unicode/codegen.py

diff --git a/DEPENDENCIES b/DEPENDENCIES
index fd038337c..756d9aab4 100644
--- a/DEPENDENCIES
+++ b/DEPENDENCIES
@@ -1,4 +1,4 @@
 vendorpull https://github.com/sourcemeta/vendorpull 1dcbac42809cf87cb5b045106b863e17ad84ba02
-core https://github.com/sourcemeta/core cd56ace324a42f067b4b8f651f73b9aa0313ca2a
+core https://github.com/sourcemeta/core e586f557af367c74d08d2818bc4bd0d07c8b20bc
 blaze https://github.com/sourcemeta/blaze bc1f434acafd38803f58a941a756a6f788e556e2
 bootstrap https://github.com/twbs/bootstrap 1a6fdfae6be09b09eaced8f0e442ca6f7680a61e
diff --git a/vendor/core/cmake/common/compiler/options.cmake b/vendor/core/cmake/common/compiler/options.cmake
index d43bb2d91..54cca1ad3 100644
--- a/vendor/core/cmake/common/compiler/options.cmake
+++ b/vendor/core/cmake/common/compiler/options.cmake
@@ -17,7 +17,9 @@ function(sourcemeta_add_default_options visibility target)
       $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/W4>
       $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/WL>
       $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/MP>
-      $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/sdl>)
+      $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/sdl>
+      # See https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-control-flow-guard
+      $<$<OR:$<COMPILE_LANGUAGE:C>,$<COMPILE_LANGUAGE:CXX>>:/guard:cf>)
   elseif(SOURCEMETA_COMPILER_LLVM OR SOURCEMETA_COMPILER_GCC)
     target_compile_options("${target}" ${visibility}
       -Wall
@@ -107,7 +109,17 @@ function(sourcemeta_add_default_options visibility target)
       # GCC seems to print a lot of false-positives here
       -Wno-free-nonheap-object
       # Disables runtime type information
-      $<$<OR:$<COMPILE_LANGUAGE:CXX>,$<COMPILE_LANGUAGE:OBJCXX>>:-fno-rtti>)
+      $<$<OR:$<COMPILE_LANGUAGE:CXX>,$<COMPILE_LANGUAGE:OBJCXX>>:-fno-rtti>
+      # See https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html
+      -fstack-clash-protection)
+
+    # _GLIBCXX_ASSERTIONS is libstdc++ (GNU) specific, not honored by libc++
+    # (which the LLVM toolchain on Apple ships). Restrict to non-Apple
+    # platforms so no Debug-only definition is emitted needlessly on macOS
+    if(NOT APPLE)
+      target_compile_definitions("${target}" ${visibility}
+        $<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
+    endif()
   endif()
 endfunction()
 
diff --git a/vendor/core/cmake/common/targets/executable.cmake b/vendor/core/cmake/common/targets/executable.cmake
index 63f61cc2e..84814a5b6 100644
--- a/vendor/core/cmake/common/targets/executable.cmake
+++ b/vendor/core/cmake/common/targets/executable.cmake
@@ -32,12 +32,9 @@ function(sourcemeta_executable)
   sourcemeta_add_default_options(PRIVATE ${TARGET_NAME})
 
   # See https://best.openssf.org/Compiler-Hardening-Guides/Compiler-Options-Hardening-Guide-for-C-and-C++.html
-  # Position Independent Executable (PIE) for ASLR support
+  # PIE linker flags for ASLR support. The compile-time -fPIE is already
+  # enabled globally via CMAKE_POSITION_INDEPENDENT_CODE in defaults.cmake.
   if(SOURCEMETA_COMPILER_LLVM OR SOURCEMETA_COMPILER_GCC)
-    target_compile_options(${TARGET_NAME} PRIVATE
-      $<$<CONFIG:Release>:-fPIE>
-      $<$<CONFIG:RelWithDebInfo>:-fPIE>
-      $<$<CONFIG:MinSizeRel>:-fPIE>)
     target_link_options(${TARGET_NAME} PRIVATE
       $<$<CONFIG:Release>:-pie>
       $<$<CONFIG:RelWithDebInfo>:-pie>
@@ -47,7 +44,6 @@ function(sourcemeta_executable)
   # See https://learn.microsoft.com/en-us/cpp/build/reference/guard-enable-control-flow-guard
   # See https://learn.microsoft.com/en-us/cpp/build/reference/cetcompat
   if(SOURCEMETA_COMPILER_MSVC)
-    target_compile_options(${TARGET_NAME} PRIVATE /guard:cf)
     target_link_options(${TARGET_NAME} PRIVATE /guard:cf /CETCOMPAT)
   endif()
 
diff --git a/vendor/core/cmake/common/targets/googlebenchmark.cmake b/vendor/core/cmake/common/targets/googlebenchmark.cmake
index d911a93b8..2b14b815c 100644
--- a/vendor/core/cmake/common/targets/googlebenchmark.cmake
+++ b/vendor/core/cmake/common/targets/googlebenchmark.cmake
@@ -19,6 +19,9 @@ function(sourcemeta_googlebenchmark)
 
   add_executable("${TARGET_NAME}" ${SOURCEMETA_GOOGLEBENCHMARK_SOURCES})
   sourcemeta_add_default_options(PRIVATE ${TARGET_NAME})
+  if(SOURCEMETA_COMPILER_MSVC)
+    target_link_options("${TARGET_NAME}" PRIVATE /guard:cf /CETCOMPAT)
+  endif()
   set_target_properties("${TARGET_NAME}" PROPERTIES FOLDER "${FOLDER_NAME}")
   target_link_libraries("${TARGET_NAME}" PRIVATE benchmark::benchmark)
   target_link_libraries("${TARGET_NAME}" PRIVATE benchmark::benchmark_main)
diff --git a/vendor/core/cmake/common/targets/library.cmake b/vendor/core/cmake/common/targets/library.cmake
index 216f8b025..35dd39a77 100644
--- a/vendor/core/cmake/common/targets/library.cmake
+++ b/vendor/core/cmake/common/targets/library.cmake
@@ -55,6 +55,9 @@ function(sourcemeta_library)
     add_library(${TARGET_NAME}
       ${PUBLIC_HEADER} ${ABSOLUTE_PRIVATE_HEADERS} ${SOURCEMETA_LIBRARY_SOURCES})
     sourcemeta_add_default_options(PRIVATE ${TARGET_NAME})
+    if(SOURCEMETA_COMPILER_MSVC)
+      target_link_options(${TARGET_NAME} PRIVATE /guard:cf /CETCOMPAT)
+    endif()
   else()
     add_library(${TARGET_NAME} INTERFACE
       ${PUBLIC_HEADER} ${ABSOLUTE_PRIVATE_HEADERS})
diff --git a/vendor/core/src/core/idna/CMakeLists.txt b/vendor/core/src/core/idna/CMakeLists.txt
index b37440dd1..21006e31e 100644
--- a/vendor/core/src/core/idna/CMakeLists.txt
+++ b/vendor/core/src/core/idna/CMakeLists.txt
@@ -1,19 +1,25 @@
-find_package(Python3 REQUIRED COMPONENTS Interpreter)
-
+# Codegen
 set(SOURCEMETA_CORE_IDNA_UCD_DIR
   "${core_SOURCE_DIR}/vendor/unicodetools/unicodetools/data/idna/dev")
-
 set(SOURCEMETA_CORE_IDNA_DATA_HEADER
   "${CMAKE_CURRENT_BINARY_DIR}/idna_data.h")
-
+sourcemeta_executable(NAMESPACE sourcemeta PROJECT core NAME idna_codegen
+  OUTPUT SOURCEMETA_CORE_IDNA_CODEGEN_TARGET
+  SOURCES codegen.cc include/sourcemeta/core/idna_ucd.h)
+target_link_libraries("${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" PRIVATE
+  sourcemeta::core::io
+  sourcemeta::core::options
+  sourcemeta::core::numeric
+  sourcemeta::core::text)
+target_include_directories("${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}" PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/include")
 add_custom_command(
   OUTPUT "${SOURCEMETA_CORE_IDNA_DATA_HEADER}"
-  COMMAND "${Python3_EXECUTABLE}"
-    "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
+  COMMAND "${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}"
     "${SOURCEMETA_CORE_IDNA_DATA_HEADER}"
     "${SOURCEMETA_CORE_IDNA_UCD_DIR}/Idna2008.txt"
   DEPENDS
-    "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
+    "${SOURCEMETA_CORE_IDNA_CODEGEN_TARGET}"
     "${SOURCEMETA_CORE_IDNA_UCD_DIR}/Idna2008.txt"
   COMMENT "Generating IDNA property tables"
   VERBATIM)
diff --git a/vendor/core/src/core/idna/codegen.cc b/vendor/core/src/core/idna/codegen.cc
new file mode 100644
index 000000000..9c57203df
--- /dev/null
+++ b/vendor/core/src/core/idna/codegen.cc
@@ -0,0 +1,203 @@
+#include <sourcemeta/core/idna_ucd.h>
+
+#include <sourcemeta/core/io.h>
+#include <sourcemeta/core/numeric.h>
+#include <sourcemeta/core/options.h>
+#include <sourcemeta/core/text.h>
+
+#include <cstddef> // std::size_t, std::ptrdiff_t
+#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
+#include <cstdlib> // EXIT_FAILURE, EXIT_SUCCESS
+#include <exception>     // std::exception
+#include <filesystem>    // std::filesystem::path
+#include <iostream>      // std::cerr
+#include <ostream>       // std::ostream
+#include <ranges>        // std::views::transform
+#include <span>          // std::span
+#include <stdexcept>     // std::runtime_error
+#include <string>        // std::string
+#include <string_view>   // std::string_view
+#include <unordered_map> // std::unordered_map
+#include <vector>        // std::vector
+
+namespace {
+
+constexpr std::size_t TOTAL_CODEPOINTS{0x110000};
+constexpr std::size_t PAGE_SHIFT{10};
+constexpr std::size_t PAGE_SIZE{1 << PAGE_SHIFT};
+constexpr std::size_t NUM_PAGES{TOTAL_CODEPOINTS / PAGE_SIZE};
+
+struct PropertyEntry {
+  std::uint32_t first; // First codepoint of the range
+  std::uint32_t last;  // Last codepoint of the range (inclusive)
+  sourcemeta::core::IDNAProperty value; // Property applied to the range
+};
+
+struct TwoStageTable {
+  std::vector<std::uint16_t> stage1; // Per page: id of its deduplicated page
+  std::vector<std::uint8_t> stage2;  // Unique pages, concatenated in id order
+};
+
+auto property_from_token(const std::string_view token)
+    -> sourcemeta::core::IDNAProperty { // Enumerator whose alias equals token
+#define SOURCEMETA_CORE_IDNA_PROPERTY_CASE(name, alias)                        \
+  if (token == alias) {                                                        \
+    return sourcemeta::core::IDNAProperty::name;                               \
+  }
+  SOURCEMETA_CORE_IDNA_PROPERTY_LIST(SOURCEMETA_CORE_IDNA_PROPERTY_CASE) // One `if` per list entry
+#undef SOURCEMETA_CORE_IDNA_PROPERTY_CASE
+  throw std::runtime_error{ // Reached only when no alias matched
+      std::string{"Unknown IDNA property value: "}.append(token)};
+}
+
+auto parse_hex_codepoint(const std::string_view token) -> std::uint32_t {
+  const auto codepoint{sourcemeta::core::to_uint32_t(token, 16)};
+  if (codepoint.has_value() && codepoint.value() <= 0x10FFFF) {
+    return codepoint.value(); // Parsed and no greater than U+10FFFF
+  }
+  throw std::runtime_error{std::string{"Invalid codepoint: "}.append(token)};
+}
+
+auto parse_entry(const std::string_view payload) -> PropertyEntry {
+  const auto trimmed{ // Expected payload: first[..last] ; property
+      sourcemeta::core::trim(sourcemeta::core::take_until(payload, '#'))};
+  const auto parts{sourcemeta::core::split_once(trimmed, ';')};
+  if (!parts.has_value()) { // No ';' between the range and the property
+    throw std::runtime_error{
+        std::string{"Missing ';' in line: "}.append(payload)};
+  }
+  const auto range_part{sourcemeta::core::trim(parts->first)};
+  const auto value_part{sourcemeta::core::trim(parts->second)};
+  const auto range_split{
+      sourcemeta::core::split_once(range_part, std::string_view{".."})};
+  const auto first{parse_hex_codepoint(
+      range_split.has_value() ? range_split->first : range_part)};
+  const auto last{range_split.has_value()
+                      ? parse_hex_codepoint(range_split->second)
+                      : first}; // No "..": a range of a single codepoint
+  return {first, last, property_from_token(value_part)};
+}
+
+auto parse_idna_file(const std::filesystem::path &input_path)
+    -> std::vector<PropertyEntry> {
+  // `@missing:` defaults are collected apart so they end up before the
+  // explicit entries in the result, whatever their position in the file
+  constexpr std::string_view marker{"@missing:"};
+  std::vector<PropertyEntry> defaults;
+  std::vector<PropertyEntry> explicit_entries;
+  auto input{sourcemeta::core::read_file(input_path)};
+  sourcemeta::core::for_each_line(input, [&](const std::string_view raw_line) {
+    const auto text{sourcemeta::core::trim(raw_line)};
+    if (text.empty()) {
+      return;
+    }
+    if (text.front() != '#') {
+      explicit_entries.push_back(parse_entry(text));
+      return;
+    }
+    const auto comment{sourcemeta::core::trim(text.substr(1))};
+    if (comment.starts_with(marker)) {
+      defaults.push_back(parse_entry(comment.substr(marker.size())));
+    }
+  });
+  std::vector<PropertyEntry> merged;
+  merged.reserve(defaults.size() + explicit_entries.size());
+  merged.insert(merged.end(), defaults.begin(), defaults.end());
+  merged.insert(merged.end(), explicit_entries.begin(),
+                explicit_entries.end());
+  return merged;
+}
+
+auto build_pages(const std::vector<PropertyEntry> &entries) -> TwoStageTable {
+  std::vector<std::uint8_t> values( // One property per codepoint
+      TOTAL_CODEPOINTS,
+      static_cast<std::uint8_t>(sourcemeta::core::IDNAProperty::PValid));
+  for (const auto &entry : entries) { // Later entries overwrite earlier ones
+    for (std::uint32_t codepoint{entry.first}; codepoint <= entry.last;
+         codepoint += 1) {
+      values[codepoint] = static_cast<std::uint8_t>(entry.value);
+    }
+  }
+  // Split `values` into fixed-size pages and store each distinct page once
+  std::unordered_map<std::string, std::uint16_t> page_to_id;
+  TwoStageTable table;
+  table.stage1.reserve(NUM_PAGES);
+  for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) {
+    const auto page_start{page_index * PAGE_SIZE};
+    const std::string page_key{ // Raw bytes of the page, used as the map key
+        reinterpret_cast<const char *>(values.data() + page_start), PAGE_SIZE};
+    const auto existing{page_to_id.find(page_key)};
+    if (existing != page_to_id.end()) { // Page already stored: reuse its id
+      table.stage1.push_back(existing->second);
+      continue;
+    }
+    const auto new_id{ // Number of pages stored in stage2 so far
+        static_cast<std::uint16_t>(table.stage2.size() / PAGE_SIZE)};
+    page_to_id.emplace(page_key, new_id);
+    table.stage2.insert(
+        table.stage2.end(),
+        values.begin() + static_cast<std::ptrdiff_t>(page_start),
+        values.begin() + static_cast<std::ptrdiff_t>(page_start + PAGE_SIZE));
+    table.stage1.push_back(new_id);
+  }
+  return table;
+}
+
+template <typename T>
+auto emit_row(std::ostream &stream, const std::span<const T> items) -> void {
+  constexpr std::size_t row_width{16}; // Values per emitted line
+  for (std::size_t offset{0}; offset < items.size(); offset += row_width) {
+    stream << "    ";
+    const auto upper{offset + row_width < items.size() ? offset + row_width
+                                                       : items.size()};
+    const auto row{items.subspan(offset, upper - offset)}; // Last may be short
+    const auto widened{row | std::views::transform([](const T value) {
+                         return static_cast<std::uint64_t>(value);
+                       })}; // Presumably so std::uint8_t prints as a number
+    sourcemeta::core::join_to(stream, widened, ", ");
+    stream << ",\n";
+  }
+}
+
+auto emit_property(std::ostream &stream, const std::string_view prefix,
+                   const TwoStageTable &table) -> void { // Two constexpr arrays
+  stream << "constexpr std::uint16_t " << prefix << "_STAGE1["
+         << table.stage1.size() << "] = {\n";
+  emit_row<std::uint16_t>(stream, table.stage1); // Page ids, one per page
+  stream << "};\n\n";
+  stream << "constexpr std::uint8_t " << prefix << "_STAGE2["
+         << table.stage2.size() << "] = {\n";
+  emit_row<std::uint8_t>(stream, table.stage2); // Contents of the unique pages
+  stream << "};\n\n";
+}
+
+} // namespace
+
+auto main(const int argc, const char *const argv[]) -> int {
+  try {
+    sourcemeta::core::Options app;
+    app.parse(argc, argv);
+    const auto &positional{app.positional()};
+    if (positional.size() != 2) { // Exactly <output.h> and <Idna2008.txt>
+      std::cerr << "Usage: " << (argc > 0 ? argv[0] : "codegen")
+                << " <output.h> <Idna2008.txt>\n";
+      return EXIT_FAILURE;
+    }
+
+    const std::filesystem::path output_path{positional.at(0)};
+    const std::filesystem::path input_path{positional.at(1)};
+    // Parse the data file, build the lookup tables and write the header
+    const auto entries{parse_idna_file(input_path)};
+    const auto table{build_pages(entries)};
+    sourcemeta::core::write_file(output_path, [&](std::ostream &stream) {
+      stream << "#include <cstdint>\n\n";
+      stream << "namespace {\n\n";
+      emit_property(stream, "IDNA_PROPERTY", table); // *_STAGE1 and *_STAGE2
+      stream << "} // namespace\n";
+    });
+  } catch (const std::exception &error) { // Any failure exits non-zero
+    std::cerr << "codegen: " << error.what() << "\n";
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/vendor/core/src/core/idna/codegen.py b/vendor/core/src/core/idna/codegen.py
deleted file mode 100644
index 65c64fff3..000000000
--- a/vendor/core/src/core/idna/codegen.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import sys
-
-LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)")
-MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*")
-
-TOTAL_CODEPOINTS = 0x110000
-PAGE_SHIFT = 10
-PAGE_SIZE = 1 << PAGE_SHIFT
-NUM_PAGES = TOTAL_CODEPOINTS // PAGE_SIZE
-
-# Integer values must match the IDNAProperty enum in idna_ucd.h.
-IDNA_PROPERTY_VALUES = {
-    "PVALID": 0,
-    "CONTEXTJ": 1,
-    "CONTEXTO": 2,
-    "DISALLOWED": 3,
-    "UNASSIGNED": 4,
-}
-
-
-def parse_file(path, value_map):
-    missing = []
-    data = []
-    with open(path) as source:
-        for line_number, line in enumerate(source, start=1):
-            stripped = line.strip()
-            if not stripped:
-                continue
-            target = data
-            if stripped.startswith("#"):
-                prefix = MISSING_PREFIX.match(stripped)
-                if not prefix:
-                    continue
-                stripped = stripped[prefix.end():]
-                target = missing
-            match = LINE.match(stripped)
-            if not match:
-                raise ValueError(
-                    f"{path}:{line_number}: unparseable line: {stripped!r}"
-                )
-            first = int(match.group(1), 16)
-            last = int(match.group(2), 16) if match.group(2) else first
-            raw_value = match.group(3)
-            try:
-                value = value_map[raw_value]
-            except KeyError as error:
-                raise ValueError(
-                    f"{path}:{line_number}: invalid value {raw_value!r}: {error}"
-                ) from error
-            target.append((first, last, value))
-    return missing + data
-
-
-def build_pages(entries):
-    values = [0] * TOTAL_CODEPOINTS
-    for first, last, value in entries:
-        values[first : last + 1] = [value] * (last - first + 1)
-    page_to_id = {}
-    unique_pages = []
-    stage1 = []
-    for page_index in range(NUM_PAGES):
-        start = page_index * PAGE_SIZE
-        page = tuple(values[start : start + PAGE_SIZE])
-        if page not in page_to_id:
-            page_to_id[page] = len(unique_pages)
-            unique_pages.append(page)
-        stage1.append(page_to_id[page])
-    return stage1, unique_pages
-
-
-def emit_row(output, items):
-    for offset in range(0, len(items), 16):
-        chunk = items[offset : offset + 16]
-        output.write("    " + ", ".join(str(value) for value in chunk) + ",\n")
-
-
-def emit_property(output, prefix, stage1, unique_pages):
-    output.write(
-        f"constexpr std::uint16_t {prefix}_STAGE1[{len(stage1)}] = {{\n"
-    )
-    emit_row(output, stage1)
-    output.write("};\n\n")
-    stage2_size = len(unique_pages) * PAGE_SIZE
-    output.write(
-        f"constexpr std::uint8_t {prefix}_STAGE2[{stage2_size}] = {{\n"
-    )
-    for page in unique_pages:
-        emit_row(output, list(page))
-    output.write("};\n\n")
-
-
-def main():
-    if len(sys.argv) != 3:
-        print(
-            f"Usage: {sys.argv[0]} <output.h> <Idna2008.txt>",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-
-    output_path = sys.argv[1]
-    idna_property_input = sys.argv[2]
-
-    with open(output_path, "w") as output:
-        output.write("#include <cstdint>\n\n")
-        output.write("namespace {\n\n")
-        stage1, pages = build_pages(
-            parse_file(idna_property_input, IDNA_PROPERTY_VALUES)
-        )
-        emit_property(output, "IDNA_PROPERTY", stage1, pages)
-        output.write("} // namespace\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h b/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h
index 89b694cb0..737537cb0 100644
--- a/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h
+++ b/vendor/core/src/core/idna/include/sourcemeta/core/idna_ucd.h
@@ -5,15 +5,22 @@
 
 namespace sourcemeta::core {
 
+/// @ingroup idna
+/// Each entry maps an `IDNAProperty` enum name to its RFC 5892 token.
+#define SOURCEMETA_CORE_IDNA_PROPERTY_LIST(X)                                  \
+  X(PValid, "PVALID")                                                          \
+  X(ContextJ, "CONTEXTJ")                                                      \
+  X(ContextO, "CONTEXTO")                                                      \
+  X(Disallowed, "DISALLOWED")                                                  \
+  X(Unassigned, "UNASSIGNED")
+
 /// @ingroup idna
 /// The RFC 5892 derived property of a Unicode codepoint. See
 /// https://www.rfc-editor.org/rfc/rfc5892 for the property's definition.
 enum class IDNAProperty : std::uint8_t {
-  PValid = 0,
-  ContextJ = 1,
-  ContextO = 2,
-  Disallowed = 3,
-  Unassigned = 4,
+#define SOURCEMETA_CORE_IDNA_ENUM_ENTRY(name, alias) name,
+  SOURCEMETA_CORE_IDNA_PROPERTY_LIST(SOURCEMETA_CORE_IDNA_ENUM_ENTRY)
+#undef SOURCEMETA_CORE_IDNA_ENUM_ENTRY
 };
 
 } // namespace sourcemeta::core
diff --git a/vendor/core/src/core/regex/preprocess.h b/vendor/core/src/core/regex/preprocess.h
index c6ced47ab..8164c82b4 100644
--- a/vendor/core/src/core/regex/preprocess.h
+++ b/vendor/core/src/core/regex/preprocess.h
@@ -561,6 +561,10 @@ inline auto expand_set_ops(const std::string &content, std::bitset<128> &result)
 
     result =
         op_char == '-' ? (result & ~operand_chars) : (result & operand_chars);
+    if (next == std::string::npos) {
+      break;
+    }
+
     position = next;
   }
 
diff --git a/vendor/core/src/core/unicode/CMakeLists.txt b/vendor/core/src/core/unicode/CMakeLists.txt
index fc6b64dec..0b781358a 100644
--- a/vendor/core/src/core/unicode/CMakeLists.txt
+++ b/vendor/core/src/core/unicode/CMakeLists.txt
@@ -1,15 +1,21 @@
-find_package(Python3 REQUIRED COMPONENTS Interpreter)
-
+# Codegen
 set(SOURCEMETA_CORE_UNICODE_UCD_DIR
   "${core_SOURCE_DIR}/vendor/unicodetools/unicodetools/data/ucd/dev")
-
 set(SOURCEMETA_CORE_UNICODE_DATA_HEADER
   "${CMAKE_CURRENT_BINARY_DIR}/unicode_data.h")
-
+sourcemeta_executable(NAMESPACE sourcemeta PROJECT core NAME unicode_codegen
+  OUTPUT SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET
+  SOURCES codegen.cc include/sourcemeta/core/unicode_ucd.h)
+target_link_libraries("${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" PRIVATE
+  sourcemeta::core::io
+  sourcemeta::core::options
+  sourcemeta::core::numeric
+  sourcemeta::core::text)
+target_include_directories("${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}" PRIVATE
+  "${CMAKE_CURRENT_SOURCE_DIR}/include")
 add_custom_command(
   OUTPUT "${SOURCEMETA_CORE_UNICODE_DATA_HEADER}"
-  COMMAND "${Python3_EXECUTABLE}"
-    "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
+  COMMAND "${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}"
     "${SOURCEMETA_CORE_UNICODE_DATA_HEADER}"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt"
@@ -21,7 +27,7 @@ add_custom_command(
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/UnicodeData.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/CompositionExclusions.txt"
   DEPENDS
-    "${CMAKE_CURRENT_SOURCE_DIR}/codegen.py"
+    "${SOURCEMETA_CORE_UNICODE_CODEGEN_TARGET}"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/PropertyValueAliases.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedCombiningClass.txt"
     "${SOURCEMETA_CORE_UNICODE_UCD_DIR}/extracted/DerivedJoiningType.txt"
diff --git a/vendor/core/src/core/unicode/codegen.cc b/vendor/core/src/core/unicode/codegen.cc
new file mode 100644
index 000000000..7e478003b
--- /dev/null
+++ b/vendor/core/src/core/unicode/codegen.cc
@@ -0,0 +1,730 @@
+#include <sourcemeta/core/unicode_ucd.h>
+
+#include <sourcemeta/core/io.h>
+#include <sourcemeta/core/numeric.h>
+#include <sourcemeta/core/options.h>
+#include <sourcemeta/core/text.h>
+
+#include <algorithm> // std::sort
+#include <array>     // std::array
+#include <cstddef>   // std::size_t, std::ptrdiff_t
+#include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
+#include <cstdlib> // EXIT_FAILURE, EXIT_SUCCESS
+#include <exception>     // std::exception
+#include <filesystem>    // std::filesystem::path
+#include <iomanip>       // std::hex, std::uppercase, std::dec
+#include <ios>           // std::ios
+#include <iostream>      // std::cerr
+#include <map>           // std::map
+#include <optional>      // std::optional
+#include <ostream>       // std::ostream
+#include <ranges>        // std::views::transform
+#include <span>          // std::span
+#include <stdexcept>     // std::runtime_error
+#include <string>        // std::string
+#include <string_view>   // std::string_view
+#include <unordered_map> // std::unordered_map
+#include <unordered_set> // std::unordered_set
+#include <vector>        // std::vector
+
+namespace {
+
+constexpr std::size_t TOTAL_CODEPOINTS{0x110000};
+constexpr std::size_t PAGE_SHIFT{10};
+constexpr std::size_t PAGE_SIZE{1 << PAGE_SHIFT};
+constexpr std::size_t NUM_PAGES{TOTAL_CODEPOINTS / PAGE_SIZE};
+constexpr std::size_t DECOMPOSITION_OFFSET_BITS{14};
+constexpr std::size_t DECOMPOSITION_OFFSET_MASK{
+    (1U << DECOMPOSITION_OFFSET_BITS) - 1U};
+
+constexpr auto JOINING_TYPE_ORDER{std::to_array<std::string_view>({
+#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias,
+    SOURCEMETA_CORE_JOINING_TYPE_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY
+})};
+
+constexpr auto BIDI_CLASS_ORDER{std::to_array<std::string_view>({
+#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias,
+    SOURCEMETA_CORE_BIDI_CLASS_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY
+})};
+
+constexpr auto NFC_QUICK_CHECK_ORDER{std::to_array<std::string_view>({
+#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias,
+    SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY
+})};
+
+constexpr auto UNICODE_SCRIPT_ORDER{std::to_array<std::string_view>({
+#define SOURCEMETA_CORE_UCD_ALIAS_ENTRY(name, alias) alias,
+    SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(SOURCEMETA_CORE_UCD_ALIAS_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ALIAS_ENTRY
+})};
+
+using ValueMap = std::map<std::string, std::uint8_t, std::less<>>;
+
+struct PropertyEntry {
+  std::uint32_t first;
+  std::uint32_t last;
+  std::uint8_t value;
+};
+
+struct TwoStageTable {
+  std::vector<std::uint16_t> stage1;
+  std::vector<std::uint8_t> stage2;
+};
+
+struct DecompositionTable {
+  std::vector<char32_t> blob;
+  std::vector<std::uint16_t> stage1;
+  std::vector<std::uint16_t> stage2;
+};
+
+struct CanonicalCompositionTriple {
+  std::uint32_t starter;
+  std::uint32_t combining;
+  std::uint32_t composed;
+};
+
+auto parse_hex_codepoint(const std::string_view token) -> std::uint32_t {
+  const auto codepoint{sourcemeta::core::to_uint32_t(token, 16)};
+  if (codepoint.has_value() && codepoint.value() <= 0x10FFFF) {
+    return codepoint.value(); // Parsed and no greater than U+10FFFF
+  }
+  throw std::runtime_error{std::string{"Invalid codepoint: "}.append(token)};
+}
+
+auto parse_property_file(const std::filesystem::path &input_path,
+                         const ValueMap &value_map,
+                         const std::optional<std::string_view> property_filter)
+    -> std::vector<PropertyEntry> {
+  auto stream{sourcemeta::core::read_file(input_path)};
+  std::vector<PropertyEntry> missing;
+  std::vector<PropertyEntry> data;
+  constexpr std::string_view missing_prefix{"@missing:"};
+  // Parses one payload; an empty result means "filtered out", not an error
+  const auto parse_payload{
+      [&](const std::string_view payload) -> std::optional<PropertyEntry> {
+        const auto trimmed{
+            sourcemeta::core::trim(sourcemeta::core::take_until(payload, '#'))};
+        const auto first_split{sourcemeta::core::split_once(trimmed, ';')};
+        if (!first_split.has_value()) { // No ';' after the codepoint range
+          throw std::runtime_error{
+              std::string{"Unparseable line: "}.append(payload)};
+        }
+        const auto range_part{sourcemeta::core::trim(first_split->first)};
+        const auto after_first{sourcemeta::core::trim(first_split->second)};
+
+        std::string_view value_token;
+        if (property_filter.has_value()) { // Shape: range ; property ; value
+          const auto second_split{
+              sourcemeta::core::split_once(after_first, ';')};
+          std::string_view property_token;
+          std::string_view tail;
+          if (second_split.has_value()) {
+            property_token = sourcemeta::core::trim(second_split->first);
+            tail = sourcemeta::core::trim(second_split->second);
+          } else { // Nothing follows the property name
+            property_token = after_first;
+            tail = {};
+          }
+          if (property_token != property_filter.value()) { // Other property
+            return std::nullopt;
+          }
+          value_token = tail;
+        } else { // Shape: range ; value
+          value_token = after_first;
+        }
+
+        const auto range_split{
+            sourcemeta::core::split_once(range_part, std::string_view{".."})};
+        const auto first{parse_hex_codepoint(
+            range_split.has_value() ? range_split->first : range_part)};
+        const auto last{range_split.has_value()
+                            ? parse_hex_codepoint(range_split->second)
+                            : first}; // No "..": a single codepoint
+
+        const auto value_it{value_map.find(value_token)};
+        if (value_it == value_map.end()) {
+          throw std::runtime_error{
+              std::string{"Unknown property value: "}.append(value_token)};
+        }
+        return PropertyEntry{first, last, value_it->second};
+      }};
+
+  sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) {
+    const auto line{sourcemeta::core::trim(raw_line)};
+    if (line.empty()) {
+      return;
+    }
+    if (line.front() == '#') { // Comments only matter when they are @missing
+      const auto comment_body{sourcemeta::core::trim(line.substr(1))};
+      if (comment_body.size() < missing_prefix.size() ||
+          comment_body.substr(0, missing_prefix.size()) != missing_prefix) {
+        return;
+      }
+      const auto entry{
+          parse_payload(comment_body.substr(missing_prefix.size()))};
+      if (entry.has_value()) {
+        missing.push_back(entry.value());
+      }
+      return;
+    }
+    const auto entry{parse_payload(line)};
+    if (entry.has_value()) {
+      data.push_back(entry.value());
+    }
+  });
+
+  std::vector<PropertyEntry> result;
+  result.reserve(missing.size() + data.size());
+  result.insert(result.end(), missing.begin(), missing.end()); // Defaults first
+  result.insert(result.end(), data.begin(), data.end());
+  return result;
+}
+
+auto parse_alias_rows(const std::filesystem::path &aliases_path,
+                      const std::string_view property_short)
+    -> std::vector<std::vector<std::string>> {
+  auto stream{sourcemeta::core::read_file(aliases_path)};
+  std::vector<std::vector<std::string>> rows; // One list of fields per line
+  sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) {
+    const auto line{
+        sourcemeta::core::trim(sourcemeta::core::take_until(raw_line, '#'))};
+    if (line.empty()) {
+      return;
+    }
+    std::vector<std::string> row;
+    bool matched{false};
+    std::size_t field_index{0};
+    sourcemeta::core::split(line, ';', [&](const std::string_view field) {
+      const auto trimmed{sourcemeta::core::trim(field)};
+      if (field_index == 0) { // The first field selects the property
+        matched = (trimmed == property_short);
+      } else if (matched && !trimmed.empty()) { // Keep non-empty fields only
+        row.emplace_back(trimmed);
+      }
+      field_index += 1;
+    });
+    if (matched) { // A matching line is kept even if `row` ended up empty
+      rows.push_back(std::move(row));
+    }
+  });
+  return rows;
+}
+// Maps each gc alias to 1 when its row names M, Mn, Mc or Me, otherwise to 0
+auto build_combining_mark_value_map(const std::filesystem::path &aliases_path)
+    -> ValueMap {
+  static constexpr std::array<std::string_view, 4> combining{
+      {"M", "Mn", "Mc", "Me"}};
+  ValueMap result;
+  for (const auto &row : parse_alias_rows(aliases_path, "gc")) {
+    std::uint8_t value{0};
+    for (const auto &field : row) {
+      for (const auto &candidate : combining) {
+        if (field == candidate) {
+          value = 1;
+          break;
+        }
+      }
+      if (value == 1) { // one hit is enough for the whole row
+        break;
+      }
+    }
+    for (const auto &field : row) {
+      result[field] = value; // every alias in the row shares the value
+    }
+  }
+  return result;
+}
+// Maps each alias of property_short to its row's index in canonical_order
+auto build_value_map(const std::filesystem::path &aliases_path,
+                     const std::string_view property_short,
+                     const std::span<const std::string_view> canonical_order)
+    -> ValueMap {
+  std::unordered_map<std::string_view, std::uint8_t> canonical_to_int;
+  canonical_to_int.reserve(canonical_order.size());
+  for (std::size_t index{0}; index < canonical_order.size(); index += 1) {
+    canonical_to_int.emplace(canonical_order[index],
+                             static_cast<std::uint8_t>(index));
+  }
+  ValueMap result;
+  std::vector<std::vector<std::string>> unmatched;
+  for (const auto &row : parse_alias_rows(aliases_path, property_short)) {
+    std::optional<std::uint8_t> value;
+    for (const auto &field : row) {
+      const auto found{canonical_to_int.find(field)};
+      if (found != canonical_to_int.end()) {
+        value = found->second;
+        break; // the first field found in canonical_order wins
+      }
+    }
+    if (!value.has_value()) {
+      unmatched.push_back(row); // makes the call throw after the loop
+      continue;
+    }
+    for (const auto &field : row) {
+      result[field] = value.value();
+    }
+  }
+  if (!unmatched.empty()) { // some row had no field in canonical_order
+    throw std::runtime_error{
+        std::string{"Property has values not in canonical order: "}.append(
+            property_short)};
+  }
+  return result;
+}
+// Maps each alias in a row to the row's first field read as an integer <= 255
+auto build_integer_value_map(const std::filesystem::path &aliases_path,
+                             const std::string_view property_short)
+    -> ValueMap {
+  ValueMap result;
+  for (const auto &row : parse_alias_rows(aliases_path, property_short)) {
+    if (row.empty()) {
+      continue; // no fields to parse or map
+    }
+    const auto parsed{sourcemeta::core::to_uint32_t(row.front())};
+    if (!parsed.has_value() || parsed.value() > 0xFFU) {
+      throw std::runtime_error{
+          std::string{"Invalid integer property value: "}.append(row.front())};
+    }
+    const auto value{static_cast<std::uint8_t>(parsed.value())};
+    for (const auto &field : row) { // includes the numeric field itself
+      result[field] = value;
+    }
+  }
+  return result;
+}
+// Output of parse_unicode_data: canonical decompositions and non-zero CCC
+struct UnicodeData {
+  std::map<std::uint32_t, std::vector<std::uint32_t>> decompositions;
+  std::unordered_map<std::uint32_t, std::uint8_t> ccc; // zero CCC is omitted
+};
+// Reads CCC (field 3) and canonical decomposition (field 5) of every row
+auto parse_unicode_data(const std::filesystem::path &input_path)
+    -> UnicodeData {
+  auto stream{sourcemeta::core::read_file(input_path)};
+  UnicodeData result;
+  sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) {
+    const auto line{sourcemeta::core::trim(raw_line)};
+    if (line.empty() || line.front() == '#') {
+      return;
+    }
+    std::array<std::string_view, 6> fields{};
+    std::size_t field_count{0};
+    sourcemeta::core::split(line, ';', [&](const std::string_view field) {
+      if (field_count < fields.size()) {
+        fields[field_count] = field;
+      }
+      field_count += 1; // counts every field, stored or not
+    });
+    if (field_count < fields.size()) {
+      throw std::runtime_error{
+          std::string{"UnicodeData.txt: too few fields in line: "}.append(
+              line)};
+    }
+    const auto codepoint{
+        parse_hex_codepoint(sourcemeta::core::trim(fields[0]))};
+    const auto ccc_token{sourcemeta::core::trim(fields[3])};
+    const auto ccc_value{sourcemeta::core::to_uint32_t(ccc_token)};
+    if (!ccc_value.has_value() || ccc_value.value() > 0xFFU) {
+      throw std::runtime_error{
+          std::string{"UnicodeData.txt: invalid CCC: "}.append(ccc_token)};
+    }
+    if (ccc_value.value() != 0) { // a zero CCC is not stored
+      result.ccc[codepoint] = static_cast<std::uint8_t>(ccc_value.value());
+    }
+    const auto decomp_field{sourcemeta::core::trim(fields[5])};
+    if (decomp_field.empty() || decomp_field.front() == '<') {
+      return; // empty, or starting with '<': no decomposition is recorded
+    }
+    std::vector<std::uint32_t> decomposition;
+    std::string_view rest{decomp_field};
+    while (!rest.empty()) { // one space-separated hex token per iteration
+      const auto token_end{rest.find(' ')};
+      const auto token{token_end == std::string_view::npos
+                           ? rest
+                           : rest.substr(0, token_end)};
+      decomposition.push_back(parse_hex_codepoint(token));
+      if (token_end == std::string_view::npos) {
+        break;
+      }
+      rest.remove_prefix(token_end + 1);
+      while (!rest.empty() && rest.front() == ' ') { // skip repeated spaces
+        rest.remove_prefix(1);
+      }
+    }
+    if (decomposition.size() > 2) {
+      throw std::runtime_error{
+          std::string{"UnicodeData.txt: canonical decomposition has more "
+                      "than 2 codepoints"}};
+    }
+    result.decompositions[codepoint] = std::move(decomposition);
+  });
+  return result;
+}
+// Collects every codepoint in a row naming Full_Composition_Exclusion
+auto parse_full_composition_exclusions(const std::filesystem::path &input_path)
+    -> std::unordered_set<std::uint32_t> {
+  auto stream{sourcemeta::core::read_file(input_path)};
+  std::unordered_set<std::uint32_t> result;
+  constexpr std::string_view target{"Full_Composition_Exclusion"};
+  sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) {
+    const auto line{sourcemeta::core::trim(raw_line)};
+    if (line.empty() || line.front() == '#') {
+      return;
+    }
+    const auto trimmed{
+        sourcemeta::core::trim(sourcemeta::core::take_until(line, '#'))};
+    const auto first_split{sourcemeta::core::split_once(trimmed, ';')};
+    if (!first_split.has_value()) {
+      throw std::runtime_error{
+          std::string{"DerivedNormalizationProps.txt: unparseable line: "}
+              .append(line)};
+    }
+    const auto range_part{sourcemeta::core::trim(first_split->first)};
+    const auto after_first{sourcemeta::core::trim(first_split->second)};
+    const auto second_split{sourcemeta::core::split_once(after_first, ';')};
+    const auto property_token{second_split.has_value()
+                                  ? sourcemeta::core::trim(second_split->first)
+                                  : after_first}; // no further ';' split
+    if (property_token != target) {
+      return; // row names a different property
+    }
+    const auto range_split{
+        sourcemeta::core::split_once(range_part, std::string_view{".."})};
+    const auto first{parse_hex_codepoint(
+        range_split.has_value() ? range_split->first : range_part)};
+    const auto last{range_split.has_value()
+                        ? parse_hex_codepoint(range_split->second)
+                        : first}; // a lone codepoint is a one-element range
+    for (std::uint32_t codepoint{first}; codepoint <= last; codepoint += 1) {
+      result.insert(codepoint);
+    }
+  });
+  return result;
+}
+// Collects one codepoint per line that is neither blank nor a '#' comment
+auto parse_explicit_composition_exclusions(
+    const std::filesystem::path &input_path)
+    -> std::unordered_set<std::uint32_t> {
+  auto stream{sourcemeta::core::read_file(input_path)};
+  std::unordered_set<std::uint32_t> result;
+  sourcemeta::core::for_each_line(stream, [&](const std::string_view raw_line) {
+    const auto line{sourcemeta::core::trim(raw_line)};
+    if (line.empty() || line.front() == '#') {
+      return;
+    }
+    const auto trimmed{
+        sourcemeta::core::trim(sourcemeta::core::take_until(line, '#'))};
+    if (trimmed.empty()) {
+      return; // nothing to parse
+    }
+    result.insert(parse_hex_codepoint(trimmed)); // whole remainder is parsed
+  });
+  return result;
+}
+// Builds sorted (starter, combining, composed) triples, skipping exclusions
+auto build_canonical_compositions(
+    const std::map<std::uint32_t, std::vector<std::uint32_t>> &decompositions,
+    const std::unordered_map<std::uint32_t, std::uint8_t> &ccc,
+    const std::unordered_set<std::uint32_t> &full_exclusions,
+    const std::unordered_set<std::uint32_t> &explicit_exclusions)
+    -> std::vector<CanonicalCompositionTriple> {
+  for (const auto codepoint : explicit_exclusions) {
+    if (!full_exclusions.contains(codepoint)) {
+      throw std::runtime_error{
+          std::string{"CompositionExclusions.txt has entries missing from "
+                      "Full_Composition_Exclusion"}};
+    }
+  }
+  // Codepoints absent from the ccc map are treated as class 0.
+  const auto ccc_of{[&](const std::uint32_t codepoint) -> std::uint8_t {
+    const auto found{ccc.find(codepoint)};
+    return found == ccc.end() ? std::uint8_t{0} : found->second;
+  }};
+
+  std::vector<CanonicalCompositionTriple> triples;
+  for (const auto &[composed, decomposition] : decompositions) {
+    if (decomposition.size() != 2) {
+      continue; // not a two-codepoint decomposition
+    }
+    if (ccc_of(composed) != 0) {
+      continue; // composed codepoint has a non-zero class
+    }
+    if (ccc_of(decomposition[0]) != 0) {
+      continue; // first codepoint has a non-zero class
+    }
+    if (full_exclusions.contains(composed)) {
+      continue; // listed in full_exclusions
+    }
+    triples.push_back({decomposition[0], decomposition[1], composed});
+  }
+  std::sort(triples.begin(), triples.end(),
+            [](const CanonicalCompositionTriple &left,
+               const CanonicalCompositionTriple &right) {
+              if (left.starter != right.starter) {
+                return left.starter < right.starter;
+              }
+              if (left.combining != right.combining) {
+                return left.combining < right.combining;
+              }
+              return left.composed < right.composed;
+            });
+  return triples;
+}
+// Packs decompositions into a blob and a page-deduplicated two-level index
+auto build_canonical_decomposition_pages(
+    const std::map<std::uint32_t, std::vector<std::uint32_t>> &decompositions)
+    -> DecompositionTable {
+  std::vector<char32_t> blob;
+  std::vector<std::uint16_t> packed(TOTAL_CODEPOINTS, 0); // 0 when absent
+  for (const auto &[codepoint, decomposition] : decompositions) {
+    const auto offset{blob.size()}; // where this entry starts in the blob
+    if (offset > DECOMPOSITION_OFFSET_MASK) {
+      throw std::runtime_error{
+          std::string{"canonical decomposition blob exceeds offset cap"}};
+    }
+    for (const auto value : decomposition) {
+      blob.push_back(static_cast<char32_t>(value));
+    }
+    packed[codepoint] = static_cast<std::uint16_t>(
+        (decomposition.size() << DECOMPOSITION_OFFSET_BITS) | offset);
+  }
+  // Pages are compared by their raw bytes; equal pages share one stage2 slot.
+  std::unordered_map<std::string, std::uint16_t> page_to_id;
+  DecompositionTable table;
+  table.blob = std::move(blob);
+  table.stage1.reserve(NUM_PAGES);
+  for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) {
+    const auto page_start{page_index * PAGE_SIZE};
+    const std::string page_key{
+        reinterpret_cast<const char *>(packed.data() + page_start),
+        PAGE_SIZE * sizeof(std::uint16_t)};
+    const auto existing{page_to_id.find(page_key)};
+    if (existing != page_to_id.end()) {
+      table.stage1.push_back(existing->second);
+      continue; // identical page already stored
+    }
+    const auto new_id{
+        static_cast<std::uint16_t>(table.stage2.size() / PAGE_SIZE)};
+    page_to_id.emplace(page_key, new_id);
+    table.stage2.insert(
+        table.stage2.end(),
+        packed.begin() + static_cast<std::ptrdiff_t>(page_start),
+        packed.begin() + static_cast<std::ptrdiff_t>(page_start + PAGE_SIZE));
+    table.stage1.push_back(new_id);
+  }
+  return table;
+}
+// Expands entries to one value per codepoint and deduplicates equal pages
+auto build_pages(const std::vector<PropertyEntry> &entries) -> TwoStageTable {
+  std::vector<std::uint8_t> values(TOTAL_CODEPOINTS, 0);
+  for (const auto &entry : entries) { // later entries overwrite earlier ones
+    for (std::uint32_t codepoint{entry.first}; codepoint <= entry.last;
+         codepoint += 1) { // NOTE(review): no bounds check on last here
+      values[codepoint] = entry.value;
+    }
+  }
+  // Pages are compared by their raw bytes; equal pages share one stage2 slot.
+  std::unordered_map<std::string, std::uint16_t> page_to_id;
+  TwoStageTable table;
+  table.stage1.reserve(NUM_PAGES);
+  for (std::size_t page_index{0}; page_index < NUM_PAGES; page_index += 1) {
+    const auto page_start{page_index * PAGE_SIZE};
+    const std::string page_key{
+        reinterpret_cast<const char *>(values.data() + page_start), PAGE_SIZE};
+    const auto existing{page_to_id.find(page_key)};
+    if (existing != page_to_id.end()) {
+      table.stage1.push_back(existing->second);
+      continue; // identical page already stored
+    }
+    const auto new_id{
+        static_cast<std::uint16_t>(table.stage2.size() / PAGE_SIZE)};
+    page_to_id.emplace(page_key, new_id);
+    table.stage2.insert(
+        table.stage2.end(),
+        values.begin() + static_cast<std::ptrdiff_t>(page_start),
+        values.begin() + static_cast<std::ptrdiff_t>(page_start + PAGE_SIZE));
+    table.stage1.push_back(new_id);
+  }
+  return table;
+}
+// Writes items in decimal, 16 per indented row, each row ending in a comma
+template <typename T>
+auto emit_row_decimal(std::ostream &stream, const std::span<const T> items)
+    -> void {
+  constexpr std::size_t row_width{16};
+  for (std::size_t offset{0}; offset < items.size(); offset += row_width) {
+    stream << "    ";
+    const auto upper{offset + row_width < items.size() ? offset + row_width
+                                                       : items.size()};
+    const auto row{items.subspan(offset, upper - offset)};
+    const auto widened{row | std::views::transform([](const T value) {
+                         return static_cast<std::uint64_t>(value);
+                       })}; // presumably so 8-bit values print as numbers
+    sourcemeta::core::join_to(stream, widened, ", ");
+    stream << ",\n";
+  }
+}
+// Writes items as uppercase 0x hex, row_width per indented row, comma-ended
+template <typename T>
+auto emit_row_hex(std::ostream &stream, const std::span<const T> items,
+                  const std::size_t row_width) -> void {
+  for (std::size_t offset{0}; offset < items.size(); offset += row_width) {
+    stream << "    ";
+    const auto upper{offset + row_width < items.size() ? offset + row_width
+                                                       : items.size()};
+    for (std::size_t column{offset}; column < upper; column += 1) {
+      if (column > offset) { // separator only between items of a row
+        stream << ", ";
+      }
+      stream << "0x" << std::hex << std::uppercase // uppercase stays set
+             << static_cast<std::uint64_t>(items[column]) << std::dec;
+    }
+    stream << ",\n";
+  }
+}
+// Emits the <prefix>_STAGE1 (uint16_t) and <prefix>_STAGE2 (uint8_t) arrays
+auto emit_property(std::ostream &stream, const std::string_view prefix,
+                   const TwoStageTable &table) -> void {
+  stream << "constexpr std::uint16_t " << prefix << "_STAGE1["
+         << table.stage1.size() << "] = {\n";
+  emit_row_decimal<std::uint16_t>(stream, table.stage1);
+  stream << "};\n\n";
+  stream << "constexpr std::uint8_t " << prefix << "_STAGE2["
+         << table.stage2.size() << "] = {\n";
+  emit_row_decimal<std::uint8_t>(stream, table.stage2);
+  stream << "};\n\n";
+}
+// Emits the canonical decomposition blob, stage1 and stage2 constexpr arrays
+auto emit_canonical_decomposition(std::ostream &stream,
+                                  const DecompositionTable &table) -> void {
+  stream << "constexpr char32_t CANONICAL_DECOMPOSITION_BLOB["
+         << table.blob.size() << "] = {\n";
+  emit_row_hex<char32_t>(stream, table.blob, 8); // eight values per row
+  stream << "};\n\n";
+  stream << "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1["
+         << table.stage1.size() << "] = {\n";
+  emit_row_decimal<std::uint16_t>(stream, table.stage1);
+  stream << "};\n\n";
+  stream << "constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2["
+         << table.stage2.size() << "] = {\n";
+  emit_row_decimal<std::uint16_t>(stream, table.stage2);
+  stream << "};\n\n";
+}
+// Emits struct CanonicalCompositionEntry and the CANONICAL_COMPOSITIONS array
+auto emit_canonical_composition(
+    std::ostream &stream,
+    const std::vector<CanonicalCompositionTriple> &triples) -> void {
+  stream << "struct CanonicalCompositionEntry {\n";
+  stream << "  char32_t starter;\n";
+  stream << "  char32_t combining;\n";
+  stream << "  char32_t composed;\n";
+  stream << "};\n\n";
+  stream << "constexpr CanonicalCompositionEntry CANONICAL_COMPOSITIONS["
+         << triples.size() << "] = {\n";
+  for (const auto &triple : triples) { // one initializer row per triple
+    stream << "    {0x" << std::hex << std::uppercase
+           << static_cast<std::uint64_t>(triple.starter) << ", 0x"
+           << static_cast<std::uint64_t>(triple.combining) << ", 0x"
+           << static_cast<std::uint64_t>(triple.composed) << std::dec << "},\n";
+  }
+  stream << "};\n\n";
+}
+
+} // namespace
+// Entry point: takes ten positional paths and writes the generated header
+auto main(const int argc, const char *const argv[]) -> int {
+  try {
+    sourcemeta::core::Options app;
+    app.parse(argc, argv);
+    const auto &positional{app.positional()};
+    if (positional.size() != 10) { // one output path plus nine input paths
+      std::cerr
+          << "Usage: " << (argc > 0 ? argv[0] : "codegen")
+          << " <output.h> <PropertyValueAliases.txt>"
+             " <DerivedCombiningClass.txt> <DerivedJoiningType.txt>"
+             " <DerivedBidiClass.txt> <Scripts.txt>"
+             " <DerivedGeneralCategory.txt> <DerivedNormalizationProps.txt>"
+             " <UnicodeData.txt> <CompositionExclusions.txt>\n";
+      return EXIT_FAILURE;
+    }
+    // Positional order matches the usage text printed above.
+    const std::filesystem::path output_path{positional.at(0)};
+    const std::filesystem::path aliases_path{positional.at(1)};
+    const std::filesystem::path combining_class_path{positional.at(2)};
+    const std::filesystem::path joining_type_path{positional.at(3)};
+    const std::filesystem::path bidi_class_path{positional.at(4)};
+    const std::filesystem::path scripts_path{positional.at(5)};
+    const std::filesystem::path general_category_path{positional.at(6)};
+    const std::filesystem::path normalization_props_path{positional.at(7)};
+    const std::filesystem::path unicode_data_path{positional.at(8)};
+    const std::filesystem::path composition_exclusions_path{positional.at(9)};
+    // These three inputs are parsed before write_file is called.
+    const auto unicode_data{parse_unicode_data(unicode_data_path)};
+    const auto full_exclusions{
+        parse_full_composition_exclusions(normalization_props_path)};
+    const auto explicit_exclusions{
+        parse_explicit_composition_exclusions(composition_exclusions_path)};
+
+    sourcemeta::core::write_file(output_path, [&](std::ostream &stream) {
+      stream << "#include <cstddef>\n";
+      stream << "#include <cstdint>\n\n";
+      stream << "namespace {\n\n";
+      // One spec per two-stage property table emitted below.
+      struct PropertySpec {
+        std::string_view prefix;
+        const std::filesystem::path &input_path; // refers to a local of main
+        std::optional<std::string_view> property_filter;
+        ValueMap value_map; // each spec holds its own copy
+      };
+
+      const auto combining_class_map{
+          build_integer_value_map(aliases_path, "ccc")};
+      const auto joining_type_map{
+          build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)};
+      const auto bidi_class_map{
+          build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)};
+      const auto script_map{
+          build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)};
+      const auto combining_mark_map{
+          build_combining_mark_value_map(aliases_path)};
+      const auto nfc_quick_check_map{
+          build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)};
+      // Tables are emitted in the order they are listed here.
+      const std::array<PropertySpec, 6> properties{
+          {{"COMBINING_CLASS", combining_class_path, std::nullopt,
+            combining_class_map},
+           {"JOINING_TYPE", joining_type_path, std::nullopt, joining_type_map},
+           {"BIDI_CLASS", bidi_class_path, std::nullopt, bidi_class_map},
+           {"UNICODE_SCRIPT", scripts_path, std::nullopt, script_map},
+           {"IS_COMBINING_MARK", general_category_path, std::nullopt,
+            combining_mark_map},
+           {"NFC_QUICK_CHECK", normalization_props_path,
+            std::optional<std::string_view>{"NFC_QC"}, nfc_quick_check_map}}};
+
+      for (const auto &spec : properties) {
+        const auto entries{parse_property_file(spec.input_path, spec.value_map,
+                                               spec.property_filter)};
+        const auto table{build_pages(entries)};
+        emit_property(stream, spec.prefix, table);
+      }
+
+      const auto decomposition_table{
+          build_canonical_decomposition_pages(unicode_data.decompositions)};
+      emit_canonical_decomposition(stream, decomposition_table);
+
+      const auto triples{build_canonical_compositions(
+          unicode_data.decompositions, unicode_data.ccc, full_exclusions,
+          explicit_exclusions)};
+      emit_canonical_composition(stream, triples);
+
+      stream << "} // namespace\n";
+    });
+  } catch (const std::exception &error) { // any failure becomes exit code 1
+    std::cerr << "codegen: " << error.what() << "\n";
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/vendor/core/src/core/unicode/codegen.py b/vendor/core/src/core/unicode/codegen.py
deleted file mode 100644
index d76411c3f..000000000
--- a/vendor/core/src/core/unicode/codegen.py
+++ /dev/null
@@ -1,516 +0,0 @@
-#!/usr/bin/env python3
-
-import re
-import sys
-
-LINE = re.compile(r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)")
-MULTI_PROPERTY_LINE = re.compile(
-    r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*;\s*(\S+)"
-)
-# Boolean-property rows in multi-property files use a two-field shape,
-# with no value column. Used to recognise the row instead of silently
-# skipping it.
-BOOLEAN_PROPERTY_LINE = re.compile(
-    r"^([0-9A-Fa-f]+)(?:\.\.([0-9A-Fa-f]+))?\s*;\s*(\S+)\s*$"
-)
-MISSING_PREFIX = re.compile(r"^#\s*@missing:\s*")
-
-TOTAL_CODEPOINTS = 0x110000
-PAGE_SHIFT = 10
-PAGE_SIZE = 1 << PAGE_SHIFT
-NUM_PAGES = TOTAL_CODEPOINTS // PAGE_SIZE
-
-# Per-property canonical order. Position in this list defines the integer
-# value of the matching C++ enum entry. PropertyValueAliases.txt supplies
-# the short/long alias mappings at codegen time, so we only need to
-# declare one form per value here.
-
-JOINING_TYPE_ORDER = ["U", "T", "L", "R", "D", "C"]
-
-BIDI_CLASS_ORDER = [
-    "L", "R", "AL", "EN", "ES", "ET", "AN", "CS", "NSM", "BN",
-    "B", "S", "WS", "ON", "LRE", "LRO", "RLE", "RLO", "PDF",
-    "LRI", "RLI", "FSI", "PDI",
-]
-
-NFC_QUICK_CHECK_ORDER = ["Y", "N", "M"]
-
-UNICODE_SCRIPT_ORDER = [
-    "Adlam", "Ahom", "Anatolian_Hieroglyphs", "Arabic", "Armenian",
-    "Avestan", "Balinese", "Bamum", "Bassa_Vah", "Batak", "Bengali",
-    "Beria_Erfe", "Bhaiksuki", "Bopomofo", "Brahmi", "Braille",
-    "Buginese", "Buhid", "Canadian_Aboriginal", "Carian",
-    "Caucasian_Albanian", "Chakma", "Cham", "Cherokee", "Chorasmian",
-    "Common", "Coptic", "Cuneiform", "Cypriot", "Cypro_Minoan",
-    "Cyrillic", "Deseret", "Devanagari", "Dives_Akuru", "Dogra",
-    "Duployan", "Egyptian_Hieroglyphs", "Elbasan", "Elymaic",
-    "Ethiopic", "Garay", "Georgian", "Glagolitic", "Gothic", "Grantha",
-    "Greek", "Gujarati", "Gunjala_Gondi", "Gurmukhi", "Gurung_Khema",
-    "Han", "Hangul", "Hanifi_Rohingya", "Hanunoo", "Hatran", "Hebrew",
-    "Hiragana", "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi",
-    "Inscriptional_Parthian", "Javanese", "Kaithi", "Kannada", "Katakana",
-    "Kawi", "Kayah_Li", "Kharoshthi", "Khitan_Small_Script", "Khmer",
-    "Khojki", "Khudawadi", "Kirat_Rai", "Lao", "Latin", "Lepcha", "Limbu",
-    "Linear_A", "Linear_B", "Lisu", "Lycian", "Lydian", "Mahajani",
-    "Makasar", "Malayalam", "Mandaic", "Manichaean", "Marchen",
-    "Masaram_Gondi", "Medefaidrin", "Meetei_Mayek", "Mende_Kikakui",
-    "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao", "Modi",
-    "Mongolian", "Mro", "Multani", "Myanmar", "Nabataean", "Nag_Mundari",
-    "Nandinagari", "New_Tai_Lue", "Newa", "Nko", "Nushu",
-    "Nyiakeng_Puachue_Hmong", "Ogham", "Ol_Chiki", "Ol_Onal",
-    "Old_Hungarian", "Old_Italic", "Old_North_Arabian", "Old_Permic",
-    "Old_Persian", "Old_Sogdian", "Old_South_Arabian", "Old_Turkic",
-    "Old_Uyghur", "Oriya", "Osage", "Osmanya", "Pahawh_Hmong",
-    "Palmyrene", "Pau_Cin_Hau", "Phags_Pa", "Phoenician",
-    "Psalter_Pahlavi", "Rejang", "Runic", "Samaritan", "Saurashtra",
-    "Sharada", "Shavian", "Siddham", "Sidetic", "SignWriting", "Sinhala",
-    "Sogdian", "Sora_Sompeng", "Soyombo", "Sundanese", "Sunuwar",
-    "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham",
-    "Tai_Viet", "Tai_Yo", "Takri", "Tamil", "Tangsa", "Tangut", "Telugu",
-    "Thaana", "Thai", "Tibetan", "Tifinagh", "Tirhuta", "Todhri",
-    "Tolong_Siki", "Toto", "Tulu_Tigalari", "Ugaritic", "Unknown", "Vai",
-    "Vithkuqi", "Wancho", "Warang_Citi", "Yezidi", "Yi", "Zanabazar_Square",
-    "Katakana_Or_Hiragana",
-]
-
-
-def parse_alias_lines(aliases_path, property_short):
-    rows = []
-    with open(aliases_path) as source:
-        for line in source:
-            stripped = line.split("#", 1)[0].strip()
-            if not stripped:
-                continue
-            parts = [part.strip() for part in stripped.split(";")]
-            if parts[0] == property_short:
-                rows.append([field for field in parts[1:] if field])
-    return rows
-
-
-def build_combining_mark_value_map(aliases_path):
-    """Build {form: int} from PropertyValueAliases.txt mapping each
-    General_Category alias to 1 if it is a combining mark (Mn, Mc, Me,
-    or the supergroup M / Mark / Combining_Mark) and to 0 otherwise."""
-    combining = {"M", "Mn", "Mc", "Me"}
-    result = {}
-    for row in parse_alias_lines(aliases_path, "gc"):
-        value = 1 if any(field in combining for field in row) else 0
-        for field in row:
-            result[field] = value
-    return result
-
-
-def build_value_map(aliases_path, property_short, canonical_order=None):
-    """Build {form: int} for a property. With canonical_order, each row's
-    integer is its canonical's position in that list; without, the row's
-    first field is read as the integer directly (used for ccc)."""
-    canonical_to_int = (
-        {name: index for index, name in enumerate(canonical_order)}
-        if canonical_order is not None
-        else None
-    )
-    result = {}
-    unmatched = []
-    for row in parse_alias_lines(aliases_path, property_short):
-        if canonical_to_int is None:
-            value = int(row[0])
-        else:
-            value = next(
-                (canonical_to_int[field] for field in row if field in canonical_to_int),
-                None,
-            )
-            if value is None:
-                unmatched.append(row)
-                continue
-        for field in row:
-            result[field] = value
-    if unmatched:
-        raise ValueError(
-            f"{aliases_path}: property {property_short!r} has values not "
-            f"declared in canonical order: {unmatched}"
-        )
-    return result
-
-
-def parse_file(path, value_map, property_filter=None):
-    """Read a UCD file and return a list of (first, last, value) entries
-    with @missing defaults first and data ranges second, so callers can
-    apply them in order regardless of where @missing appears in the file.
-
-    With property_filter set, lines have shape `codepoint; property; value`
-    (as in DerivedNormalizationProps.txt) and only rows whose property
-    name matches are returned. Without it, lines have shape
-    `codepoint; value` and every row contributes."""
-    line_re = MULTI_PROPERTY_LINE if property_filter is not None else LINE
-    missing = []
-    data = []
-    with open(path) as source:
-        for line_number, line in enumerate(source, start=1):
-            stripped = line.strip()
-            if not stripped:
-                continue
-            target = data
-            if stripped.startswith("#"):
-                prefix = MISSING_PREFIX.match(stripped)
-                if not prefix:
-                    continue
-                stripped = stripped[prefix.end():]
-                target = missing
-            match = line_re.match(stripped)
-            if not match:
-                # Recognise the boolean-property shape used in multi-property
-                # files, but only for properties other than the one we are
-                # filtering for. A boolean-shape row that names our target
-                # property would be malformed data and must raise.
-                data_only = stripped.split("#", 1)[0].strip()
-                if property_filter is not None:
-                    boolean = BOOLEAN_PROPERTY_LINE.fullmatch(data_only)
-                    if boolean and boolean.group(3) != property_filter:
-                        continue
-                raise ValueError(
-                    f"{path}:{line_number}: unparseable line: {stripped!r}"
-                )
-            if property_filter is not None and match.group(3) != property_filter:
-                continue
-            first = int(match.group(1), 16)
-            last = int(match.group(2), 16) if match.group(2) else first
-            raw_value = match.group(4 if property_filter is not None else 3)
-            try:
-                value = value_map[raw_value]
-            except KeyError as error:
-                raise ValueError(
-                    f"{path}:{line_number}: invalid value {raw_value!r}: {error}"
-                ) from error
-            target.append((first, last, value))
-    return missing + data
-
-
-def parse_unicode_data(path):
-    """Read UnicodeData.txt once and return (decompositions, ccc) where
-    decompositions is {codepoint: [decomposition codepoints]} for canonical
-    decompositions only (compatibility decompositions, those whose field 5
-    starts with a `<tag>` prefix per UAX #44, are excluded), and ccc is
-    {codepoint: canonical_combining_class} for codepoints with non-zero CCC.
-
-    Raises if any canonical decomposition has more than two codepoints, which
-    would indicate a format change in UnicodeData.txt."""
-    decompositions = {}
-    ccc = {}
-    with open(path) as source:
-        for line_number, line in enumerate(source, start=1):
-            stripped = line.strip()
-            if not stripped or stripped.startswith("#"):
-                continue
-            fields = stripped.split(";")
-            if len(fields) < 6:
-                raise ValueError(
-                    f"{path}:{line_number}: too few fields: {stripped!r}"
-                )
-            try:
-                codepoint = int(fields[0], 16)
-            except ValueError as error:
-                raise ValueError(
-                    f"{path}:{line_number}: invalid codepoint: {fields[0]!r}"
-                ) from error
-            try:
-                ccc_value = int(fields[3])
-            except ValueError as error:
-                raise ValueError(
-                    f"{path}:{line_number}: invalid CCC: {fields[3]!r}"
-                ) from error
-            if ccc_value != 0:
-                ccc[codepoint] = ccc_value
-            decomp_field = fields[5].strip()
-            if not decomp_field or decomp_field.startswith("<"):
-                continue
-            decomposition = [int(token, 16) for token in decomp_field.split()]
-            if len(decomposition) > 2:
-                raise ValueError(
-                    f"{path}:{line_number}: canonical decomposition of "
-                    f"U+{codepoint:04X} has {len(decomposition)} codepoints, "
-                    f"expected 1 or 2"
-                )
-            decompositions[codepoint] = decomposition
-    return decompositions, ccc
-
-
-def parse_full_composition_exclusions(path):
-    """Read DerivedNormalizationProps.txt and return the set of codepoints
-    for which Full_Composition_Exclusion=Yes. Lines that match neither the
-    three-field nor the two-field property shape raise, so a file format
-    change cannot silently drop exclusion data."""
-    result = set()
-    with open(path) as source:
-        for line_number, line in enumerate(source, start=1):
-            stripped = line.strip()
-            if not stripped or stripped.startswith("#"):
-                continue
-            match = MULTI_PROPERTY_LINE.match(stripped)
-            if match:
-                if match.group(3) != "Full_Composition_Exclusion":
-                    continue
-                first = int(match.group(1), 16)
-                last = int(match.group(2), 16) if match.group(2) else first
-                for codepoint in range(first, last + 1):
-                    result.add(codepoint)
-                continue
-            data_only = stripped.split("#", 1)[0].strip()
-            boolean_match = BOOLEAN_PROPERTY_LINE.fullmatch(data_only)
-            if not boolean_match:
-                raise ValueError(
-                    f"{path}:{line_number}: unparseable line: {stripped!r}"
-                )
-            if boolean_match.group(3) != "Full_Composition_Exclusion":
-                continue
-            first = int(boolean_match.group(1), 16)
-            last = (int(boolean_match.group(2), 16)
-                    if boolean_match.group(2) else first)
-            for codepoint in range(first, last + 1):
-                result.add(codepoint)
-    return result
-
-
-EXPLICIT_COMPOSITION_EXCLUSION_LINE = re.compile(
-    r"^([0-9A-Fa-f]+)(?:\s+#.*)?$"
-)
-
-
-def parse_explicit_composition_exclusions(path):
-    """Read the script-specific list from CompositionExclusions.txt. The
-    file has a flat `codepoint  # NAME` shape with no semicolons. The full
-    line is anchored so prefix-only matches and trailing junk fail loud."""
-    result = set()
-    with open(path) as source:
-        for line_number, line in enumerate(source, start=1):
-            stripped = line.strip()
-            if not stripped or stripped.startswith("#"):
-                continue
-            match = EXPLICIT_COMPOSITION_EXCLUSION_LINE.fullmatch(stripped)
-            if not match:
-                raise ValueError(
-                    f"{path}:{line_number}: unparseable line: {stripped!r}"
-                )
-            result.add(int(match.group(1), 16))
-    return result
-
-
-def build_canonical_compositions(decompositions, ccc, full_exclusions,
-                                  explicit_exclusions):
-    """Apply the UAX #15 §1.3 primary-composite filters and return a sorted
-    list of (starter, combining, composed) triples.
-
-    Raises if the explicit CompositionExclusions.txt list is not a subset of
-    the derived Full_Composition_Exclusion set in
-    DerivedNormalizationProps.txt, which would indicate the two data files
-    have drifted out of sync."""
-    missing = explicit_exclusions - full_exclusions
-    if missing:
-        raise ValueError(
-            "CompositionExclusions.txt entries missing from "
-            "Full_Composition_Exclusion: "
-            + ", ".join(f"U+{codepoint:04X}" for codepoint in sorted(missing))
-        )
-
-    triples = []
-    for composed, decomposition in decompositions.items():
-        if len(decomposition) != 2:
-            continue
-        if ccc.get(composed, 0) != 0:
-            continue
-        if ccc.get(decomposition[0], 0) != 0:
-            continue
-        if composed in full_exclusions:
-            continue
-        triples.append((decomposition[0], decomposition[1], composed))
-    triples.sort()
-    return triples
-
-
-def emit_canonical_composition(output, triples):
-    output.write("struct CanonicalCompositionEntry {\n")
-    output.write("  char32_t starter;\n")
-    output.write("  char32_t combining;\n")
-    output.write("  char32_t composed;\n")
-    output.write("};\n\n")
-    output.write(
-        f"constexpr CanonicalCompositionEntry "
-        f"CANONICAL_COMPOSITIONS[{len(triples)}] = {{\n"
-    )
-    for starter, combining, composed in triples:
-        output.write(
-            f"    {{0x{starter:X}, 0x{combining:X}, 0x{composed:X}}},\n"
-        )
-    output.write("};\n\n")
-
-
-# Packed per-codepoint entry: (length << OFFSET_BITS) | offset. A zero entry
-# means no decomposition. Length 1 / 2 covers the entire canonical space.
-DECOMPOSITION_OFFSET_BITS = 14
-DECOMPOSITION_OFFSET_MASK = (1 << DECOMPOSITION_OFFSET_BITS) - 1
-
-
-def build_canonical_decomposition_pages(decompositions):
-    """Build the flat blob plus per-codepoint packed entries, then run the
-    standard two-stage page-table dedup on top of the packed array."""
-    blob = []
-    packed = [0] * TOTAL_CODEPOINTS
-    for codepoint in sorted(decompositions):
-        decomposition = decompositions[codepoint]
-        offset = len(blob)
-        if offset > DECOMPOSITION_OFFSET_MASK:
-            raise ValueError(
-                f"canonical decomposition blob exceeds "
-                f"{DECOMPOSITION_OFFSET_BITS}-bit offset cap at "
-                f"U+{codepoint:04X}"
-            )
-        blob.extend(decomposition)
-        packed[codepoint] = (len(decomposition) << DECOMPOSITION_OFFSET_BITS) | offset
-
-    page_to_id = {}
-    unique_pages = []
-    stage1 = []
-    for page_index in range(NUM_PAGES):
-        start = page_index * PAGE_SIZE
-        page = tuple(packed[start : start + PAGE_SIZE])
-        if page not in page_to_id:
-            page_to_id[page] = len(unique_pages)
-            unique_pages.append(page)
-        stage1.append(page_to_id[page])
-    return blob, stage1, unique_pages
-
-
-def emit_canonical_decomposition(output, blob, stage1, unique_pages):
-    output.write(
-        f"constexpr char32_t CANONICAL_DECOMPOSITION_BLOB[{len(blob)}] = {{\n"
-    )
-    for offset in range(0, len(blob), 8):
-        chunk = blob[offset : offset + 8]
-        output.write(
-            "    " + ", ".join(f"0x{value:X}" for value in chunk) + ",\n"
-        )
-    output.write("};\n\n")
-
-    output.write(
-        f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE1"
-        f"[{len(stage1)}] = {{\n"
-    )
-    emit_row(output, stage1)
-    output.write("};\n\n")
-    stage2_size = len(unique_pages) * PAGE_SIZE
-    output.write(
-        f"constexpr std::uint16_t CANONICAL_DECOMPOSITION_STAGE2"
-        f"[{stage2_size}] = {{\n"
-    )
-    for page in unique_pages:
-        emit_row(output, list(page))
-    output.write("};\n\n")
-
-
-def build_pages(entries):
-    values = [0] * TOTAL_CODEPOINTS
-    for first, last, value in entries:
-        values[first : last + 1] = [value] * (last - first + 1)
-    page_to_id = {}
-    unique_pages = []
-    stage1 = []
-    for page_index in range(NUM_PAGES):
-        start = page_index * PAGE_SIZE
-        page = tuple(values[start : start + PAGE_SIZE])
-        if page not in page_to_id:
-            page_to_id[page] = len(unique_pages)
-            unique_pages.append(page)
-        stage1.append(page_to_id[page])
-    return stage1, unique_pages
-
-
-def emit_row(output, items):
-    for offset in range(0, len(items), 16):
-        chunk = items[offset : offset + 16]
-        output.write("    " + ", ".join(str(value) for value in chunk) + ",\n")
-
-
-def emit_property(output, prefix, stage1, unique_pages):
-    output.write(
-        f"constexpr std::uint16_t {prefix}_STAGE1[{len(stage1)}] = {{\n"
-    )
-    emit_row(output, stage1)
-    output.write("};\n\n")
-    stage2_size = len(unique_pages) * PAGE_SIZE
-    output.write(
-        f"constexpr std::uint8_t {prefix}_STAGE2[{stage2_size}] = {{\n"
-    )
-    for page in unique_pages:
-        emit_row(output, list(page))
-    output.write("};\n\n")
-
-
-def main():
-    if len(sys.argv) != 11:
-        print(
-            f"Usage: {sys.argv[0]} "
-            "<output.h> "
-            "<PropertyValueAliases.txt> "
-            "<DerivedCombiningClass.txt> "
-            "<DerivedJoiningType.txt> "
-            "<DerivedBidiClass.txt> "
-            "<Scripts.txt> "
-            "<DerivedGeneralCategory.txt> "
-            "<DerivedNormalizationProps.txt> "
-            "<UnicodeData.txt> "
-            "<CompositionExclusions.txt>",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-
-    output_path = sys.argv[1]
-    aliases_path = sys.argv[2]
-    derived_normalization_props_path = sys.argv[8]
-
-    properties = [
-        ("COMBINING_CLASS", sys.argv[3], None,
-         build_value_map(aliases_path, "ccc")),
-        ("JOINING_TYPE", sys.argv[4], None,
-         build_value_map(aliases_path, "jt", JOINING_TYPE_ORDER)),
-        ("BIDI_CLASS", sys.argv[5], None,
-         build_value_map(aliases_path, "bc", BIDI_CLASS_ORDER)),
-        ("UNICODE_SCRIPT", sys.argv[6], None,
-         build_value_map(aliases_path, "sc", UNICODE_SCRIPT_ORDER)),
-        ("IS_COMBINING_MARK", sys.argv[7], None,
-         build_combining_mark_value_map(aliases_path)),
-        ("NFC_QUICK_CHECK", derived_normalization_props_path, "NFC_QC",
-         build_value_map(aliases_path, "NFC_QC", NFC_QUICK_CHECK_ORDER)),
-    ]
-
-    unicode_data_path = sys.argv[9]
-    composition_exclusions_path = sys.argv[10]
-
-    decompositions, ccc = parse_unicode_data(unicode_data_path)
-    full_exclusions = parse_full_composition_exclusions(
-        derived_normalization_props_path
-    )
-    explicit_exclusions = parse_explicit_composition_exclusions(
-        composition_exclusions_path
-    )
-
-    with open(output_path, "w") as output:
-        output.write("#include <cstddef>\n")
-        output.write("#include <cstdint>\n\n")
-        output.write("namespace {\n\n")
-        for prefix, input_path, property_filter, value_map in properties:
-            stage1, pages = build_pages(
-                parse_file(input_path, value_map, property_filter)
-            )
-            emit_property(output, prefix, stage1, pages)
-        blob, stage1, pages = build_canonical_decomposition_pages(
-            decompositions
-        )
-        emit_canonical_decomposition(output, blob, stage1, pages)
-        triples = build_canonical_compositions(
-            decompositions, ccc, full_exclusions, explicit_exclusions
-        )
-        emit_canonical_composition(output, triples)
-        output.write("} // namespace\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h b/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h
index 536054c68..2093e51d4 100644
--- a/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h
+++ b/vendor/core/src/core/unicode/include/sourcemeta/core/unicode_ucd.h
@@ -5,238 +5,266 @@
 
 namespace sourcemeta::core {
 
+/// @ingroup unicode
+/// Pairs each `JoiningType` name with its UCD short alias, in value order.
+#define SOURCEMETA_CORE_JOINING_TYPE_LIST(X)                                   \
+  X(NonJoining, "U")                                                           \
+  X(Transparent, "T")                                                          \
+  X(LeftJoining, "L")                                                          \
+  X(RightJoining, "R")                                                         \
+  X(DualJoining, "D")                                                          \
+  X(JoinCausing, "C")
+
 /// @ingroup unicode
 /// The joining type of a Unicode codepoint per UAX #44. See
 /// https://www.unicode.org/reports/tr44/ for the property's definition.
 enum class JoiningType : std::uint8_t {
-  NonJoining = 0,
-  Transparent = 1,
-  LeftJoining = 2,
-  RightJoining = 3,
-  DualJoining = 4,
-  JoinCausing = 5,
+#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name,
+  SOURCEMETA_CORE_JOINING_TYPE_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY
 };
 
+/// @ingroup unicode
+/// Pairs each `BidiClass` name with its UCD short alias, in value order.
+#define SOURCEMETA_CORE_BIDI_CLASS_LIST(X)                                     \
+  X(LeftToRight, "L")                                                          \
+  X(RightToLeft, "R")                                                          \
+  X(ArabicLetter, "AL")                                                        \
+  X(EuropeanNumber, "EN")                                                      \
+  X(EuropeanSeparator, "ES")                                                   \
+  X(EuropeanTerminator, "ET")                                                  \
+  X(ArabicNumber, "AN")                                                        \
+  X(CommonSeparator, "CS")                                                     \
+  X(NonspacingMark, "NSM")                                                     \
+  X(BoundaryNeutral, "BN")                                                     \
+  X(ParagraphSeparator, "B")                                                   \
+  X(SegmentSeparator, "S")                                                     \
+  X(WhiteSpace, "WS")                                                          \
+  X(OtherNeutral, "ON")                                                        \
+  X(LeftToRightEmbedding, "LRE")                                               \
+  X(LeftToRightOverride, "LRO")                                                \
+  X(RightToLeftEmbedding, "RLE")                                               \
+  X(RightToLeftOverride, "RLO")                                                \
+  X(PopDirectionalFormat, "PDF")                                               \
+  X(LeftToRightIsolate, "LRI")                                                 \
+  X(RightToLeftIsolate, "RLI")                                                 \
+  X(FirstStrongIsolate, "FSI")                                                 \
+  X(PopDirectionalIsolate, "PDI")
+
 /// @ingroup unicode
 /// The bidirectional class of a Unicode codepoint per UAX #44. See
 /// https://www.unicode.org/reports/tr44/ for the property's definition.
 enum class BidiClass : std::uint8_t {
-  LeftToRight = 0,
-  RightToLeft = 1,
-  ArabicLetter = 2,
-  EuropeanNumber = 3,
-  EuropeanSeparator = 4,
-  EuropeanTerminator = 5,
-  ArabicNumber = 6,
-  CommonSeparator = 7,
-  NonspacingMark = 8,
-  BoundaryNeutral = 9,
-  ParagraphSeparator = 10,
-  SegmentSeparator = 11,
-  WhiteSpace = 12,
-  OtherNeutral = 13,
-  LeftToRightEmbedding = 14,
-  LeftToRightOverride = 15,
-  RightToLeftEmbedding = 16,
-  RightToLeftOverride = 17,
-  PopDirectionalFormat = 18,
-  LeftToRightIsolate = 19,
-  RightToLeftIsolate = 20,
-  FirstStrongIsolate = 21,
-  PopDirectionalIsolate = 22,
+#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name,
+  SOURCEMETA_CORE_BIDI_CLASS_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY
 };
 
+/// @ingroup unicode
+/// Pairs each `UnicodeScript` name with its UCD long alias, in value order.
+/// Per UAX #24 §1.4, `Katakana_Or_Hiragana` only appears in the
+/// `Script_Extensions` property and never in the `Script` property itself.
+#define SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(X)                                 \
+  X(Adlam, "Adlam")                                                            \
+  X(Ahom, "Ahom")                                                              \
+  X(AnatolianHieroglyphs, "Anatolian_Hieroglyphs")                             \
+  X(Arabic, "Arabic")                                                          \
+  X(Armenian, "Armenian")                                                      \
+  X(Avestan, "Avestan")                                                        \
+  X(Balinese, "Balinese")                                                      \
+  X(Bamum, "Bamum")                                                            \
+  X(BassaVah, "Bassa_Vah")                                                     \
+  X(Batak, "Batak")                                                            \
+  X(Bengali, "Bengali")                                                        \
+  X(BeriaErfe, "Beria_Erfe")                                                   \
+  X(Bhaiksuki, "Bhaiksuki")                                                    \
+  X(Bopomofo, "Bopomofo")                                                      \
+  X(Brahmi, "Brahmi")                                                          \
+  X(Braille, "Braille")                                                        \
+  X(Buginese, "Buginese")                                                      \
+  X(Buhid, "Buhid")                                                            \
+  X(CanadianAboriginal, "Canadian_Aboriginal")                                 \
+  X(Carian, "Carian")                                                          \
+  X(CaucasianAlbanian, "Caucasian_Albanian")                                   \
+  X(Chakma, "Chakma")                                                          \
+  X(Cham, "Cham")                                                              \
+  X(Cherokee, "Cherokee")                                                      \
+  X(Chorasmian, "Chorasmian")                                                  \
+  X(Common, "Common")                                                          \
+  X(Coptic, "Coptic")                                                          \
+  X(Cuneiform, "Cuneiform")                                                    \
+  X(Cypriot, "Cypriot")                                                        \
+  X(CyproMinoan, "Cypro_Minoan")                                               \
+  X(Cyrillic, "Cyrillic")                                                      \
+  X(Deseret, "Deseret")                                                        \
+  X(Devanagari, "Devanagari")                                                  \
+  X(DivesAkuru, "Dives_Akuru")                                                 \
+  X(Dogra, "Dogra")                                                            \
+  X(Duployan, "Duployan")                                                      \
+  X(EgyptianHieroglyphs, "Egyptian_Hieroglyphs")                               \
+  X(Elbasan, "Elbasan")                                                        \
+  X(Elymaic, "Elymaic")                                                        \
+  X(Ethiopic, "Ethiopic")                                                      \
+  X(Garay, "Garay")                                                            \
+  X(Georgian, "Georgian")                                                      \
+  X(Glagolitic, "Glagolitic")                                                  \
+  X(Gothic, "Gothic")                                                          \
+  X(Grantha, "Grantha")                                                        \
+  X(Greek, "Greek")                                                            \
+  X(Gujarati, "Gujarati")                                                      \
+  X(GunjalaGondi, "Gunjala_Gondi")                                             \
+  X(Gurmukhi, "Gurmukhi")                                                      \
+  X(GurungKhema, "Gurung_Khema")                                               \
+  X(Han, "Han")                                                                \
+  X(Hangul, "Hangul")                                                          \
+  X(HanifiRohingya, "Hanifi_Rohingya")                                         \
+  X(Hanunoo, "Hanunoo")                                                        \
+  X(Hatran, "Hatran")                                                          \
+  X(Hebrew, "Hebrew")                                                          \
+  X(Hiragana, "Hiragana")                                                      \
+  X(ImperialAramaic, "Imperial_Aramaic")                                       \
+  X(Inherited, "Inherited")                                                    \
+  X(InscriptionalPahlavi, "Inscriptional_Pahlavi")                             \
+  X(InscriptionalParthian, "Inscriptional_Parthian")                           \
+  X(Javanese, "Javanese")                                                      \
+  X(Kaithi, "Kaithi")                                                          \
+  X(Kannada, "Kannada")                                                        \
+  X(Katakana, "Katakana")                                                      \
+  X(Kawi, "Kawi")                                                              \
+  X(KayahLi, "Kayah_Li")                                                       \
+  X(Kharoshthi, "Kharoshthi")                                                  \
+  X(KhitanSmallScript, "Khitan_Small_Script")                                  \
+  X(Khmer, "Khmer")                                                            \
+  X(Khojki, "Khojki")                                                          \
+  X(Khudawadi, "Khudawadi")                                                    \
+  X(KiratRai, "Kirat_Rai")                                                     \
+  X(Lao, "Lao")                                                                \
+  X(Latin, "Latin")                                                            \
+  X(Lepcha, "Lepcha")                                                          \
+  X(Limbu, "Limbu")                                                            \
+  X(LinearA, "Linear_A")                                                       \
+  X(LinearB, "Linear_B")                                                       \
+  X(Lisu, "Lisu")                                                              \
+  X(Lycian, "Lycian")                                                          \
+  X(Lydian, "Lydian")                                                          \
+  X(Mahajani, "Mahajani")                                                      \
+  X(Makasar, "Makasar")                                                        \
+  X(Malayalam, "Malayalam")                                                    \
+  X(Mandaic, "Mandaic")                                                        \
+  X(Manichaean, "Manichaean")                                                  \
+  X(Marchen, "Marchen")                                                        \
+  X(MasaramGondi, "Masaram_Gondi")                                             \
+  X(Medefaidrin, "Medefaidrin")                                                \
+  X(MeeteiMayek, "Meetei_Mayek")                                               \
+  X(MendeKikakui, "Mende_Kikakui")                                             \
+  X(MeroiticCursive, "Meroitic_Cursive")                                       \
+  X(MeroiticHieroglyphs, "Meroitic_Hieroglyphs")                               \
+  X(Miao, "Miao")                                                              \
+  X(Modi, "Modi")                                                              \
+  X(Mongolian, "Mongolian")                                                    \
+  X(Mro, "Mro")                                                                \
+  X(Multani, "Multani")                                                        \
+  X(Myanmar, "Myanmar")                                                        \
+  X(Nabataean, "Nabataean")                                                    \
+  X(NagMundari, "Nag_Mundari")                                                 \
+  X(Nandinagari, "Nandinagari")                                                \
+  X(NewTaiLue, "New_Tai_Lue")                                                  \
+  X(Newa, "Newa")                                                              \
+  X(Nko, "Nko")                                                                \
+  X(Nushu, "Nushu")                                                            \
+  X(NyiakengPuachueHmong, "Nyiakeng_Puachue_Hmong")                            \
+  X(Ogham, "Ogham")                                                            \
+  X(OlChiki, "Ol_Chiki")                                                       \
+  X(OlOnal, "Ol_Onal")                                                         \
+  X(OldHungarian, "Old_Hungarian")                                             \
+  X(OldItalic, "Old_Italic")                                                   \
+  X(OldNorthArabian, "Old_North_Arabian")                                      \
+  X(OldPermic, "Old_Permic")                                                   \
+  X(OldPersian, "Old_Persian")                                                 \
+  X(OldSogdian, "Old_Sogdian")                                                 \
+  X(OldSouthArabian, "Old_South_Arabian")                                      \
+  X(OldTurkic, "Old_Turkic")                                                   \
+  X(OldUyghur, "Old_Uyghur")                                                   \
+  X(Oriya, "Oriya")                                                            \
+  X(Osage, "Osage")                                                            \
+  X(Osmanya, "Osmanya")                                                        \
+  X(PahawhHmong, "Pahawh_Hmong")                                               \
+  X(Palmyrene, "Palmyrene")                                                    \
+  X(PauCinHau, "Pau_Cin_Hau")                                                  \
+  X(PhagsPa, "Phags_Pa")                                                       \
+  X(Phoenician, "Phoenician")                                                  \
+  X(PsalterPahlavi, "Psalter_Pahlavi")                                         \
+  X(Rejang, "Rejang")                                                          \
+  X(Runic, "Runic")                                                            \
+  X(Samaritan, "Samaritan")                                                    \
+  X(Saurashtra, "Saurashtra")                                                  \
+  X(Sharada, "Sharada")                                                        \
+  X(Shavian, "Shavian")                                                        \
+  X(Siddham, "Siddham")                                                        \
+  X(Sidetic, "Sidetic")                                                        \
+  X(SignWriting, "SignWriting")                                                \
+  X(Sinhala, "Sinhala")                                                        \
+  X(Sogdian, "Sogdian")                                                        \
+  X(SoraSompeng, "Sora_Sompeng")                                               \
+  X(Soyombo, "Soyombo")                                                        \
+  X(Sundanese, "Sundanese")                                                    \
+  X(Sunuwar, "Sunuwar")                                                        \
+  X(SylotiNagri, "Syloti_Nagri")                                               \
+  X(Syriac, "Syriac")                                                          \
+  X(Tagalog, "Tagalog")                                                        \
+  X(Tagbanwa, "Tagbanwa")                                                      \
+  X(TaiLe, "Tai_Le")                                                           \
+  X(TaiTham, "Tai_Tham")                                                       \
+  X(TaiViet, "Tai_Viet")                                                       \
+  X(TaiYo, "Tai_Yo")                                                           \
+  X(Takri, "Takri")                                                            \
+  X(Tamil, "Tamil")                                                            \
+  X(Tangsa, "Tangsa")                                                          \
+  X(Tangut, "Tangut")                                                          \
+  X(Telugu, "Telugu")                                                          \
+  X(Thaana, "Thaana")                                                          \
+  X(Thai, "Thai")                                                              \
+  X(Tibetan, "Tibetan")                                                        \
+  X(Tifinagh, "Tifinagh")                                                      \
+  X(Tirhuta, "Tirhuta")                                                        \
+  X(Todhri, "Todhri")                                                          \
+  X(TolongSiki, "Tolong_Siki")                                                 \
+  X(Toto, "Toto")                                                              \
+  X(TuluTigalari, "Tulu_Tigalari")                                             \
+  X(Ugaritic, "Ugaritic")                                                      \
+  X(Unknown, "Unknown")                                                        \
+  X(Vai, "Vai")                                                                \
+  X(Vithkuqi, "Vithkuqi")                                                      \
+  X(Wancho, "Wancho")                                                          \
+  X(WarangCiti, "Warang_Citi")                                                 \
+  X(Yezidi, "Yezidi")                                                          \
+  X(Yi, "Yi")                                                                  \
+  X(ZanabazarSquare, "Zanabazar_Square")                                       \
+  X(KatakanaOrHiragana, "Katakana_Or_Hiragana")
+
 /// @ingroup unicode
 /// The script of a Unicode codepoint per UAX #24. See
 /// https://www.unicode.org/reports/tr24/ for the property's definition.
 enum class UnicodeScript : std::uint8_t {
-  Adlam = 0,
-  Ahom = 1,
-  AnatolianHieroglyphs = 2,
-  Arabic = 3,
-  Armenian = 4,
-  Avestan = 5,
-  Balinese = 6,
-  Bamum = 7,
-  BassaVah = 8,
-  Batak = 9,
-  Bengali = 10,
-  BeriaErfe = 11,
-  Bhaiksuki = 12,
-  Bopomofo = 13,
-  Brahmi = 14,
-  Braille = 15,
-  Buginese = 16,
-  Buhid = 17,
-  CanadianAboriginal = 18,
-  Carian = 19,
-  CaucasianAlbanian = 20,
-  Chakma = 21,
-  Cham = 22,
-  Cherokee = 23,
-  Chorasmian = 24,
-  Common = 25,
-  Coptic = 26,
-  Cuneiform = 27,
-  Cypriot = 28,
-  CyproMinoan = 29,
-  Cyrillic = 30,
-  Deseret = 31,
-  Devanagari = 32,
-  DivesAkuru = 33,
-  Dogra = 34,
-  Duployan = 35,
-  EgyptianHieroglyphs = 36,
-  Elbasan = 37,
-  Elymaic = 38,
-  Ethiopic = 39,
-  Garay = 40,
-  Georgian = 41,
-  Glagolitic = 42,
-  Gothic = 43,
-  Grantha = 44,
-  Greek = 45,
-  Gujarati = 46,
-  GunjalaGondi = 47,
-  Gurmukhi = 48,
-  GurungKhema = 49,
-  Han = 50,
-  Hangul = 51,
-  HanifiRohingya = 52,
-  Hanunoo = 53,
-  Hatran = 54,
-  Hebrew = 55,
-  Hiragana = 56,
-  ImperialAramaic = 57,
-  Inherited = 58,
-  InscriptionalPahlavi = 59,
-  InscriptionalParthian = 60,
-  Javanese = 61,
-  Kaithi = 62,
-  Kannada = 63,
-  Katakana = 64,
-  Kawi = 65,
-  KayahLi = 66,
-  Kharoshthi = 67,
-  KhitanSmallScript = 68,
-  Khmer = 69,
-  Khojki = 70,
-  Khudawadi = 71,
-  KiratRai = 72,
-  Lao = 73,
-  Latin = 74,
-  Lepcha = 75,
-  Limbu = 76,
-  LinearA = 77,
-  LinearB = 78,
-  Lisu = 79,
-  Lycian = 80,
-  Lydian = 81,
-  Mahajani = 82,
-  Makasar = 83,
-  Malayalam = 84,
-  Mandaic = 85,
-  Manichaean = 86,
-  Marchen = 87,
-  MasaramGondi = 88,
-  Medefaidrin = 89,
-  MeeteiMayek = 90,
-  MendeKikakui = 91,
-  MeroiticCursive = 92,
-  MeroiticHieroglyphs = 93,
-  Miao = 94,
-  Modi = 95,
-  Mongolian = 96,
-  Mro = 97,
-  Multani = 98,
-  Myanmar = 99,
-  Nabataean = 100,
-  NagMundari = 101,
-  Nandinagari = 102,
-  NewTaiLue = 103,
-  Newa = 104,
-  Nko = 105,
-  Nushu = 106,
-  NyiakengPuachueHmong = 107,
-  Ogham = 108,
-  OlChiki = 109,
-  OlOnal = 110,
-  OldHungarian = 111,
-  OldItalic = 112,
-  OldNorthArabian = 113,
-  OldPermic = 114,
-  OldPersian = 115,
-  OldSogdian = 116,
-  OldSouthArabian = 117,
-  OldTurkic = 118,
-  OldUyghur = 119,
-  Oriya = 120,
-  Osage = 121,
-  Osmanya = 122,
-  PahawhHmong = 123,
-  Palmyrene = 124,
-  PauCinHau = 125,
-  PhagsPa = 126,
-  Phoenician = 127,
-  PsalterPahlavi = 128,
-  Rejang = 129,
-  Runic = 130,
-  Samaritan = 131,
-  Saurashtra = 132,
-  Sharada = 133,
-  Shavian = 134,
-  Siddham = 135,
-  Sidetic = 136,
-  SignWriting = 137,
-  Sinhala = 138,
-  Sogdian = 139,
-  SoraSompeng = 140,
-  Soyombo = 141,
-  Sundanese = 142,
-  Sunuwar = 143,
-  SylotiNagri = 144,
-  Syriac = 145,
-  Tagalog = 146,
-  Tagbanwa = 147,
-  TaiLe = 148,
-  TaiTham = 149,
-  TaiViet = 150,
-  TaiYo = 151,
-  Takri = 152,
-  Tamil = 153,
-  Tangsa = 154,
-  Tangut = 155,
-  Telugu = 156,
-  Thaana = 157,
-  Thai = 158,
-  Tibetan = 159,
-  Tifinagh = 160,
-  Tirhuta = 161,
-  Todhri = 162,
-  TolongSiki = 163,
-  Toto = 164,
-  TuluTigalari = 165,
-  Ugaritic = 166,
-  Unknown = 167,
-  Vai = 168,
-  Vithkuqi = 169,
-  Wancho = 170,
-  WarangCiti = 171,
-  Yezidi = 172,
-  Yi = 173,
-  ZanabazarSquare = 174,
-  // Per UAX #24 §1.4, the value Katakana_Or_Hiragana only appears in the
-  // Script_Extensions property and never in the Script property itself.
-  KatakanaOrHiragana = 175,
+#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name,
+  SOURCEMETA_CORE_UNICODE_SCRIPT_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY
 };
 
+/// @ingroup unicode
+/// Pairs each `NFCQuickCheck` name with its UCD short alias, in value order.
+#define SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(X)                                \
+  X(Yes, "Y")                                                                  \
+  X(No, "N")                                                                   \
+  X(Maybe, "M")
+
 /// @ingroup unicode
 /// The NFC quick-check result for a Unicode codepoint per UAX #15.
 /// See https://www.unicode.org/reports/tr15/ for the property's definition.
 enum class NFCQuickCheck : std::uint8_t {
-  Yes = 0,
-  No = 1,
-  Maybe = 2,
+#define SOURCEMETA_CORE_UCD_ENUM_ENTRY(name, alias) name,
+  SOURCEMETA_CORE_NFC_QUICK_CHECK_LIST(SOURCEMETA_CORE_UCD_ENUM_ENTRY)
+#undef SOURCEMETA_CORE_UCD_ENUM_ENTRY
 };
 
 } // namespace sourcemeta::core
diff --git a/vendor/core/src/lang/io/include/sourcemeta/core/io.h b/vendor/core/src/lang/io/include/sourcemeta/core/io.h
index 46c43b27c..82632183b 100644
--- a/vendor/core/src/lang/io/include/sourcemeta/core/io.h
+++ b/vendor/core/src/lang/io/include/sourcemeta/core/io.h
@@ -220,6 +220,30 @@ auto read_file_to_string(const std::filesystem::path &path)
 /// ```
 inline auto read_stdin() -> std::string { return read_to_string(std::cin); }
 
+/// @ingroup io
+///
+/// Read `stream` line by line using `std::getline`, invoking `callback` with
+/// a view of each line, valid only during that call. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/io.h>
+/// #include <iostream>
+/// #include <sstream>
+///
+/// std::istringstream stream{"alpha\nbeta\ngamma\n"};
+/// sourcemeta::core::for_each_line(stream,
+///     [](const std::string_view line) {
+///       std::cout << line << '\n';
+///     });
+/// ```
+template <typename Callback>
+auto for_each_line(std::istream &stream, Callback callback) -> void {
+  std::string line; // Reused for every line, so views must not be retained
+  while (std::getline(stream, line)) {
+    callback(std::string_view{line});
+  }
+}
+
 /// @ingroup io
 ///
 /// Recursively mirror a directory tree using hard links for regular files.
diff --git a/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h b/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h
index ff745ecf3..c30def5f9 100644
--- a/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h
+++ b/vendor/core/src/lang/numeric/include/sourcemeta/core/numeric_parse.h
@@ -5,35 +5,47 @@
 #include <sourcemeta/core/numeric_export.h>
 #endif
 
-#include <cstdint>  // std::int64_t, std::uint64_t
-#include <optional> // std::optional
-#include <string>   // std::string
+#include <cstdint>     // std::int64_t, std::uint32_t, std::uint64_t
+#include <optional>    // std::optional
+#include <string_view> // std::string_view
 
 namespace sourcemeta::core {
 
 /// @ingroup numeric
 /// Attempt to parse a string as a double
 SOURCEMETA_CORE_NUMERIC_EXPORT
-auto to_double(const std::string &input) noexcept -> std::optional<double>;
+auto to_double(const std::string_view input) noexcept -> std::optional<double>;
 
 /// @ingroup numeric
 /// Attempt to parse a string as a signed 64-bit integer
 SOURCEMETA_CORE_NUMERIC_EXPORT
-auto to_int64_t(const std::string &input) noexcept
+auto to_int64_t(const std::string_view input) noexcept
     -> std::optional<std::int64_t>;
 
 /// @ingroup numeric
 /// Attempt to parse a string as a signed 64-bit integer in a given base
 SOURCEMETA_CORE_NUMERIC_EXPORT
-auto to_int64_t(const std::string &input, const int base) noexcept
+auto to_int64_t(const std::string_view input, const int base) noexcept
     -> std::optional<std::int64_t>;
 
 /// @ingroup numeric
 /// Attempt to parse a string as an unsigned 64-bit decimal integer.
 SOURCEMETA_CORE_NUMERIC_EXPORT
-auto to_uint64_t(const std::string &input) noexcept
+auto to_uint64_t(const std::string_view input) noexcept
     -> std::optional<std::uint64_t>;
 
+/// @ingroup numeric
+/// Attempt to parse the entire string as an unsigned 32-bit decimal integer
+SOURCEMETA_CORE_NUMERIC_EXPORT
+auto to_uint32_t(const std::string_view input) noexcept
+    -> std::optional<std::uint32_t>;
+
+/// @ingroup numeric
+/// Attempt to parse the entire string as an unsigned 32-bit integer in `base`
+SOURCEMETA_CORE_NUMERIC_EXPORT
+auto to_uint32_t(const std::string_view input, const int base) noexcept
+    -> std::optional<std::uint32_t>;
+
 } // namespace sourcemeta::core
 
 #endif
diff --git a/vendor/core/src/lang/numeric/parse.cc b/vendor/core/src/lang/numeric/parse.cc
index 645d7f869..0edd740b2 100644
--- a/vendor/core/src/lang/numeric/parse.cc
+++ b/vendor/core/src/lang/numeric/parse.cc
@@ -2,32 +2,31 @@
 
 #include <charconv>     // std::from_chars
 #include <cstddef>      // std::size_t
-#include <stdexcept>    // std::invalid_argument, std::out_of_range
+#include <string>       // std::string
 #include <system_error> // std::errc
 
 namespace sourcemeta::core {
 
-auto to_double(const std::string &input) noexcept -> std::optional<double> {
+auto to_double(const std::string_view input) noexcept -> std::optional<double> {
   try {
+    const std::string owned{input}; // std::stod cannot take a std::string_view
     std::size_t position{0};
-    const auto value{std::stod(input, &position)};
-    if (position != input.size()) {
+    const auto value{std::stod(owned, &position)};
+    if (position != owned.size()) {
       return std::nullopt;
     }
     return value;
-  } catch (const std::invalid_argument &) {
-    return std::nullopt;
-  } catch (const std::out_of_range &) {
+  } catch (...) { // Also covers std::bad_alloc from the copy above
     return std::nullopt;
   }
 }
 
-auto to_int64_t(const std::string &input) noexcept
+auto to_int64_t(const std::string_view input) noexcept
     -> std::optional<std::int64_t> {
   return to_int64_t(input, 10);
 }
 
-auto to_int64_t(const std::string &input, const int base) noexcept
+auto to_int64_t(const std::string_view input, const int base) noexcept
     -> std::optional<std::int64_t> {
   std::int64_t value{};
   const auto result =
@@ -39,7 +38,7 @@ auto to_int64_t(const std::string &input, const int base) noexcept
   return value;
 }
 
-auto to_uint64_t(const std::string &input) noexcept
+auto to_uint64_t(const std::string_view input) noexcept
     -> std::optional<std::uint64_t> {
   std::uint64_t value{};
   const auto result =
@@ -51,4 +50,21 @@ auto to_uint64_t(const std::string &input) noexcept
   return value;
 }
 
+auto to_uint32_t(const std::string_view input) noexcept
+    -> std::optional<std::uint32_t> {
+  return to_uint32_t(input, 10); // Decimal is the default base
+}
+
+auto to_uint32_t(const std::string_view input, const int base) noexcept
+    -> std::optional<std::uint32_t> {
+  std::uint32_t value{};
+  const auto result =
+      std::from_chars(input.data(), input.data() + input.size(), value, base);
+  if (result.ec != std::errc{} || result.ptr != input.data() + input.size()) {
+    return std::nullopt; // Conversion failed or trailing characters remain
+  }
+
+  return value;
+}
+
 } // namespace sourcemeta::core
diff --git a/vendor/core/src/lang/text/include/sourcemeta/core/text.h b/vendor/core/src/lang/text/include/sourcemeta/core/text.h
index 599892230..26ffc0616 100644
--- a/vendor/core/src/lang/text/include/sourcemeta/core/text.h
+++ b/vendor/core/src/lang/text/include/sourcemeta/core/text.h
@@ -6,8 +6,11 @@
 #endif
 
 #include <cstddef>     // std::size_t
+#include <optional>    // std::optional
+#include <ostream>     // std::ostream
 #include <string>      // std::string
 #include <string_view> // std::string_view
+#include <utility>     // std::pair
 
 /// @defgroup text Text
 /// @brief A collection of general-purpose text manipulation utilities
@@ -71,6 +74,135 @@ SOURCEMETA_CORE_TEXT_EXPORT
 auto truncate(std::string &input, const std::size_t maximum_length,
               const std::string_view marker) -> void;
 
+/// @ingroup text
+///
+/// Return a view of `input` without leading and trailing ASCII whitespace:
+/// space, tab, newline, vertical tab, form feed, carriage return. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::trim("  hello  ") == "hello");
+/// assert(sourcemeta::core::trim("\t\nfoo\r\n") == "foo");
+/// assert(sourcemeta::core::trim("   ").empty());
+/// ```
+SOURCEMETA_CORE_TEXT_EXPORT
+auto trim(const std::string_view input) noexcept -> std::string_view;
+
+/// @ingroup text
+///
+/// Return a view of `input` up to (but excluding) the first occurrence of
+/// `marker`, or all of `input` when `marker` is absent. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <cassert>
+///
+/// assert(sourcemeta::core::take_until("foo # bar", '#') == "foo ");
+/// assert(sourcemeta::core::take_until("no marker", '#') == "no marker");
+/// assert(sourcemeta::core::take_until("#leading", '#').empty());
+/// ```
+SOURCEMETA_CORE_TEXT_EXPORT
+auto take_until(const std::string_view input, const char marker) noexcept
+    -> std::string_view;
+
+/// @ingroup text
+///
+/// Split `input` at the first occurrence of `delimiter`, returning views of
+/// the parts before and after it (the delimiter itself is dropped). Return
+/// `std::nullopt` when the delimiter is absent. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <cassert>
+///
+/// const auto parts{sourcemeta::core::split_once("key=value", '=')};
+/// assert(parts.has_value());
+/// assert(parts->first == "key");
+/// assert(parts->second == "value");
+/// assert(!sourcemeta::core::split_once("no separator", '=').has_value());
+/// ```
+SOURCEMETA_CORE_TEXT_EXPORT
+auto split_once(const std::string_view input, const char delimiter) noexcept
+    -> std::optional<std::pair<std::string_view, std::string_view>>;
+
+/// @ingroup text
+///
+/// Split `input` at the first occurrence of `delimiter`, returning views of
+/// the parts before and after it (the delimiter itself is dropped). Return
+/// `std::nullopt` when the delimiter is absent or empty. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <cassert>
+///
+/// const auto parts{sourcemeta::core::split_once("1..5", "..")};
+/// assert(parts.has_value());
+/// assert(parts->first == "1");
+/// assert(parts->second == "5");
+/// ```
+SOURCEMETA_CORE_TEXT_EXPORT
+auto split_once(const std::string_view input,
+                const std::string_view delimiter) noexcept
+    -> std::optional<std::pair<std::string_view, std::string_view>>;
+
+/// @ingroup text
+///
+/// Invoke `callback` with each part of `input` separated by `delimiter`,
+/// including empty parts, so it runs at least once. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <iostream>
+///
+/// sourcemeta::core::split("alpha;beta;gamma", ';',
+///     [](const std::string_view part) {
+///       std::cout << part << '\n';
+///     });
+/// ```
+template <typename Callback>
+auto split(const std::string_view input, const char delimiter,
+           Callback callback) -> void {
+  std::string_view rest{input};
+  while (true) {
+    const auto next{sourcemeta::core::split_once(rest, delimiter)};
+    if (!next.has_value()) {
+      callback(rest); // No delimiter left: the remainder is the last part
+      return;
+    }
+    callback(next->first);
+    rest = next->second;
+  }
+}
+
+/// @ingroup text
+///
+/// Write each item of `items` to `stream` using `operator<<`, placing
+/// `separator` between consecutive items. For example:
+///
+/// ```cpp
+/// #include <sourcemeta/core/text.h>
+/// #include <array>
+/// #include <iostream>
+///
+/// constexpr std::array<int, 3> values{1, 2, 3};
+/// sourcemeta::core::join_to(std::cout, values, ", ");
+/// // prints: 1, 2, 3
+/// ```
+template <typename Range>
+auto join_to(std::ostream &stream, const Range &items,
+             const std::string_view separator) -> void {
+  bool first{true};
+  for (const auto &item : items) {
+    if (!first) { // Nothing is written before the first item
+      stream << separator;
+    }
+    stream << item;
+    first = false;
+  }
+}
+
 /// @ingroup text
 ///
 /// Return `input` with `suffix` removed from the end under ASCII
diff --git a/vendor/core/src/lang/text/text.cc b/vendor/core/src/lang/text/text.cc
index 07e641263..368e91528 100644
--- a/vendor/core/src/lang/text/text.cc
+++ b/vendor/core/src/lang/text/text.cc
@@ -2,7 +2,18 @@
 
 #include <cctype>      // std::isalpha, std::toupper
 #include <cstddef>     // std::size_t
+#include <optional>    // std::optional, std::nullopt
 #include <string_view> // std::string_view
+#include <utility>     // std::pair
+
+namespace {
+// Locale-independent check for the six ASCII whitespace characters
+auto is_ascii_whitespace(const char character) noexcept -> bool {
+  return character == ' ' || character == '\t' || character == '\n' ||
+         character == '\v' || character == '\f' || character == '\r';
+}
+
+} // namespace
 
 namespace sourcemeta::core {
 
@@ -56,6 +67,58 @@ auto truncate(std::string &input, const std::size_t maximum_length,
   input.append(marker);
 }
 
+auto trim(const std::string_view input) noexcept -> std::string_view {
+  std::string_view result{input}; // Narrowed in place; nothing is copied
+  while (!result.empty() && is_ascii_whitespace(result.front())) {
+    result.remove_prefix(1);
+  }
+  while (!result.empty() && is_ascii_whitespace(result.back())) {
+    result.remove_suffix(1);
+  }
+  return result;
+}
+
+auto take_until(const std::string_view input, const char marker) noexcept
+    -> std::string_view {
+  const auto position{input.find(marker)};
+  if (position == std::string_view::npos) {
+    return input; // No marker: the whole input is the prefix
+  }
+  std::string_view result{input};
+  result.remove_suffix(input.size() - position); // Keep [0, position)
+  return result;
+}
+
+auto split_once(const std::string_view input, const char delimiter) noexcept
+    -> std::optional<std::pair<std::string_view, std::string_view>> {
+  const auto position{input.find(delimiter)};
+  if (position == std::string_view::npos) {
+    return std::nullopt;
+  }
+  std::string_view before{input};
+  before.remove_suffix(input.size() - position); // Keep [0, position)
+  std::string_view after{input};
+  after.remove_prefix(position + 1); // Drop the prefix and the delimiter
+  return std::pair{before, after};
+}
+
+auto split_once(const std::string_view input,
+                const std::string_view delimiter) noexcept
+    -> std::optional<std::pair<std::string_view, std::string_view>> {
+  if (delimiter.empty()) { // find() would match an empty delimiter at 0
+    return std::nullopt;
+  }
+  const auto position{input.find(delimiter)};
+  if (position == std::string_view::npos) {
+    return std::nullopt;
+  }
+  std::string_view before{input};
+  before.remove_suffix(input.size() - position); // Keep [0, position)
+  std::string_view after{input};
+  after.remove_prefix(position + delimiter.size()); // Drop the delimiter too
+  return std::pair{before, after};
+}
+
 auto remove_suffix_ignore_case(const std::string_view input,
                                const std::string_view suffix) noexcept
     -> std::string_view {