From 0c3cf72757fcb6ff72a1020589446bc55a08ad2d Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Tue, 26 May 2026 16:43:00 -0400 Subject: [PATCH 1/2] Implement `idna_is_valid_a_label` Signed-off-by: Juan Cruz Viotti --- config.cmake.in | 1 + src/core/idna/CMakeLists.txt | 1 + src/core/idna/idna.cc | 50 ++++++++++++++++- src/core/idna/include/sourcemeta/core/idna.h | 22 ++++++++ test/idna/CMakeLists.txt | 3 +- test/idna/idna_is_valid_a_label_test.cc | 56 ++++++++++++++++++++ 6 files changed, 131 insertions(+), 2 deletions(-) create mode 100644 test/idna/idna_is_valid_a_label_test.cc diff --git a/config.cmake.in b/config.cmake.in index b448a0cb3..bbcd7477b 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -71,6 +71,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") elseif(component STREQUAL "idna") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_punycode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_idna.cmake") elseif(component STREQUAL "dns") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") diff --git a/src/core/idna/CMakeLists.txt b/src/core/idna/CMakeLists.txt index 38483f467..b37440dd1 100644 --- a/src/core/idna/CMakeLists.txt +++ b/src/core/idna/CMakeLists.txt @@ -30,3 +30,4 @@ if(SOURCEMETA_CORE_INSTALL) endif() target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::unicode) +target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::punycode) diff --git a/src/core/idna/idna.cc b/src/core/idna/idna.cc index 685c91f33..0af38bd44 100644 --- a/src/core/idna/idna.cc +++ b/src/core/idna/idna.cc @@ -1,9 +1,11 @@ #include +#include #include #include // std::size_t -#include // std::u32string_view +#include // std::string, std::u32string +#include // std::string_view, std::u32string_view #include "idna_data.h" @@ -280,4 +282,50 @@ auto idna_passes_bidi_rule(const 
std::u32string_view label) noexcept -> bool { return false; } +auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool { + try { + constexpr std::string_view prefix{"xn--"}; + if (!label.starts_with(prefix)) { + return false; + } + + // RFC 5890 §2.3.2.1: A-labels are pure ASCII + for (const auto byte : label) { + if (static_cast(byte) > 0x7F) { + return false; + } + } + + const auto encoded{label.substr(prefix.size())}; + if (encoded.empty()) { + return false; + } + + const auto decoded{punycode_to_utf32(encoded)}; + + // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. + // A Punycode body that decodes to pure ASCII is not a real A-label. + bool has_non_ascii{false}; + for (const auto codepoint : decoded) { + if (codepoint > 0x7F) { + has_non_ascii = true; + break; + } + } + if (!has_non_ascii) { + return false; + } + + if (!idna_is_valid_u_label(decoded)) { + return false; + } + + // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so + // re-encoding the decoded U-label must yield the original bytes. + return utf32_to_punycode(decoded) == encoded; + } catch (...) { + return false; + } +} + } // namespace sourcemeta::core diff --git a/src/core/idna/include/sourcemeta/core/idna.h b/src/core/idna/include/sourcemeta/core/idna.h index 198312bf3..d6f2c7406 100644 --- a/src/core/idna/include/sourcemeta/core/idna.h +++ b/src/core/idna/include/sourcemeta/core/idna.h @@ -115,6 +115,28 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool; SOURCEMETA_CORE_IDNA_EXPORT auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool; +/// @ingroup idna +/// Return whether the given label is a valid A-label per RFC 5891 §4. See +/// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. 
+/// A valid A-label starts with the ACE prefix "xn--", is pure ASCII, has a +/// non-empty Punycode body that decodes to a U-label containing at least +/// one non-ASCII codepoint, and round-trips through Punycode in its +/// canonical form. For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// // xn--mnchen-3ya decodes to "münchen" +/// assert(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya")); +/// // Missing "xn--" prefix +/// assert(!sourcemeta::core::idna_is_valid_a_label("abc")); +/// // Decodes to "abc" (no non-ASCII codepoint) +/// assert(!sourcemeta::core::idna_is_valid_a_label("xn--abc-")); +/// ``` +SOURCEMETA_CORE_IDNA_EXPORT +auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool; + } // namespace sourcemeta::core #endif diff --git a/test/idna/CMakeLists.txt b/test/idna/CMakeLists.txt index e22d99397..906b4041d 100644 --- a/test/idna/CMakeLists.txt +++ b/test/idna/CMakeLists.txt @@ -4,7 +4,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME idna idna_passes_contexto_test.cc idna_passes_contextj_test.cc idna_passes_bidi_rule_test.cc - idna_is_valid_u_label_test.cc) + idna_is_valid_u_label_test.cc + idna_is_valid_a_label_test.cc) target_link_libraries(sourcemeta_core_idna_unit PRIVATE sourcemeta::core::idna) diff --git a/test/idna/idna_is_valid_a_label_test.cc b/test/idna/idna_is_valid_a_label_test.cc new file mode 100644 index 000000000..b6b1299a2 --- /dev/null +++ b/test/idna/idna_is_valid_a_label_test.cc @@ -0,0 +1,56 @@ +#include + +#include + +TEST(IDNA_is_valid_a_label, munich_german) { + // xn--mnchen-3ya decodes to "m\u00FCnchen" + EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya")); +} + +TEST(IDNA_is_valid_a_label, volos_greek) { + // xn--nxasmq6b decodes to "\u03B2\u03CC\u03BB\u03BF\u03C2" + EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--nxasmq6b")); +} + +TEST(IDNA_is_valid_a_label, deja_french) { + // xn--dj-kia8a decodes to "d\u00E9j\u00E0" + 
EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--dj-kia8a")); +} + +TEST(IDNA_is_valid_a_label, missing_prefix) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("abc")); +} + +TEST(IDNA_is_valid_a_label, partial_prefix) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn-abc")); +} + +TEST(IDNA_is_valid_a_label, empty_input) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("")); +} + +TEST(IDNA_is_valid_a_label, only_prefix) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--")); +} + +TEST(IDNA_is_valid_a_label, decodes_to_pure_ascii) { + // The body "abc-" is a Punycode encoding whose result has no non-ASCII codepoint + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-")); +} + +TEST(IDNA_is_valid_a_label, malformed_punycode_body) { + // The extension after the delimiter contains '!', not a valid Punycode digit + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-zzzzzzzz!")); +} + +TEST(IDNA_is_valid_a_label, non_ascii_byte_in_input) { + // U+00E4 ä is non-ASCII, so the whole input fails the ASCII requirement + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--\u00E4")); +} + +TEST(IDNA_is_valid_a_label, uppercase_in_body) { + // Punycode copies basic code points verbatim, so this body decodes to a + // label containing uppercase ASCII. IDNA2008 disallows uppercase letters + // (RFC 5892), so this is presumably rejected by the U-label check. 
+ EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--MNCHEN-3ya")); +} From 306747e1d8406a7b66fe0b0da3a074adc4b86225 Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Tue, 26 May 2026 16:50:51 -0400 Subject: [PATCH 2/2] Avoid throwing `substr` and narrow try/catch scope in `idna_is_valid_a_label` Signed-off-by: Juan Cruz Viotti --- src/core/idna/idna.cc | 70 ++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/src/core/idna/idna.cc b/src/core/idna/idna.cc index 0af38bd44..d9f932966 100644 --- a/src/core/idna/idna.cc +++ b/src/core/idna/idna.cc @@ -283,45 +283,53 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool { } auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool { - try { - constexpr std::string_view prefix{"xn--"}; - if (!label.starts_with(prefix)) { - return false; - } - - // RFC 5890 §2.3.2.1: A-labels are pure ASCII - for (const auto byte : label) { - if (static_cast(byte) > 0x7F) { - return false; - } - } + constexpr std::string_view prefix{"xn--"}; + if (!label.starts_with(prefix)) { + return false; + } - const auto encoded{label.substr(prefix.size())}; - if (encoded.empty()) { + // RFC 5890 §2.3.2.1: A-labels are pure ASCII + for (const auto byte : label) { + if (static_cast(byte) > 0x7F) { return false; } + } - const auto decoded{punycode_to_utf32(encoded)}; + // The substring after the prefix. Constructing the view via (data, size) + // avoids `std::string_view::substr`, which is not noexcept. + const std::string_view encoded{label.data() + prefix.size(), + label.size() - prefix.size()}; + if (encoded.empty()) { + return false; + } - // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. - // A Punycode body that decodes to pure ASCII is not a real A-label. 
- bool has_non_ascii{false}; - for (const auto codepoint : decoded) { - if (codepoint > 0x7F) { - has_non_ascii = true; - break; - } - } - if (!has_non_ascii) { - return false; - } + std::u32string decoded; + try { + decoded = punycode_to_utf32(encoded); + } catch (...) { + return false; + } - if (!idna_is_valid_u_label(decoded)) { - return false; + // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. + // A Punycode body that decodes to pure ASCII is not a real A-label. + bool has_non_ascii{false}; + for (const auto codepoint : decoded) { + if (codepoint > 0x7F) { + has_non_ascii = true; + break; } + } + if (!has_non_ascii) { + return false; + } - // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so - // re-encoding the decoded U-label must yield the original bytes. + if (!idna_is_valid_u_label(decoded)) { + return false; + } + + // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so + // re-encoding the decoded U-label must yield the original bytes. + try { return utf32_to_punycode(decoded) == encoded; } catch (...) { return false;