diff --git a/config.cmake.in b/config.cmake.in index b448a0cb3..bbcd7477b 100644 --- a/config.cmake.in +++ b/config.cmake.in @@ -71,6 +71,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS}) include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake") elseif(component STREQUAL "idna") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_punycode.cmake") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_idna.cmake") elseif(component STREQUAL "dns") include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake") diff --git a/src/core/idna/CMakeLists.txt b/src/core/idna/CMakeLists.txt index 38483f467..b37440dd1 100644 --- a/src/core/idna/CMakeLists.txt +++ b/src/core/idna/CMakeLists.txt @@ -30,3 +30,4 @@ if(SOURCEMETA_CORE_INSTALL) endif() target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::unicode) +target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::punycode) diff --git a/src/core/idna/idna.cc b/src/core/idna/idna.cc index 685c91f33..d9f932966 100644 --- a/src/core/idna/idna.cc +++ b/src/core/idna/idna.cc @@ -1,9 +1,11 @@ #include +#include #include #include // std::size_t -#include // std::u32string_view +#include // std::string, std::u32string +#include // std::string_view, std::u32string_view #include "idna_data.h" @@ -280,4 +282,58 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool { return false; } +auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool { + constexpr std::string_view prefix{"xn--"}; + if (!label.starts_with(prefix)) { + return false; + } + + // RFC 5890 §2.3.2.1: A-labels are pure ASCII + for (const auto byte : label) { + if (static_cast(byte) > 0x7F) { + return false; + } + } + + // The substring after the prefix. Constructing the view via (data, size) + // avoids `std::string_view::substr`, which is not noexcept. 
+ const std::string_view encoded{label.data() + prefix.size(), + label.size() - prefix.size()}; + if (encoded.empty()) { + return false; + } + + std::u32string decoded; + try { + decoded = punycode_to_utf32(encoded); + } catch (...) { + return false; + } + + // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint. + // A Punycode body that decodes to pure ASCII is not a real A-label. + bool has_non_ascii{false}; + for (const auto codepoint : decoded) { + if (codepoint > 0x7F) { + has_non_ascii = true; + break; + } + } + if (!has_non_ascii) { + return false; + } + + if (!idna_is_valid_u_label(decoded)) { + return false; + } + + // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so + // re-encoding the decoded U-label must yield the original bytes. + try { + return utf32_to_punycode(decoded) == encoded; + } catch (...) { + return false; + } +} + } // namespace sourcemeta::core diff --git a/src/core/idna/include/sourcemeta/core/idna.h b/src/core/idna/include/sourcemeta/core/idna.h index 198312bf3..d6f2c7406 100644 --- a/src/core/idna/include/sourcemeta/core/idna.h +++ b/src/core/idna/include/sourcemeta/core/idna.h @@ -115,6 +115,28 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool; SOURCEMETA_CORE_IDNA_EXPORT auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool; +/// @ingroup idna +/// Return whether the given label is a valid A-label per RFC 5891 §4. See +/// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria. +/// A valid A-label starts with the ACE prefix "xn--", is pure ASCII, has a +/// non-empty Punycode body that decodes to a U-label containing at least +/// one non-ASCII codepoint, and round-trips through Punycode in its +/// canonical form. 
For example: +/// +/// ```cpp +/// #include +/// #include +/// +/// // xn--mnchen-3ya decodes to "münchen" +/// assert(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya")); +/// // Missing "xn--" prefix +/// assert(!sourcemeta::core::idna_is_valid_a_label("abc")); +/// // Decodes to "abc" (no non-ASCII codepoint) +/// assert(!sourcemeta::core::idna_is_valid_a_label("xn--abc-")); +/// ``` +SOURCEMETA_CORE_IDNA_EXPORT +auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool; + } // namespace sourcemeta::core #endif diff --git a/test/idna/CMakeLists.txt b/test/idna/CMakeLists.txt index e22d99397..906b4041d 100644 --- a/test/idna/CMakeLists.txt +++ b/test/idna/CMakeLists.txt @@ -4,7 +4,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME idna idna_passes_contexto_test.cc idna_passes_contextj_test.cc idna_passes_bidi_rule_test.cc - idna_is_valid_u_label_test.cc) + idna_is_valid_u_label_test.cc + idna_is_valid_a_label_test.cc) target_link_libraries(sourcemeta_core_idna_unit PRIVATE sourcemeta::core::idna) diff --git a/test/idna/idna_is_valid_a_label_test.cc b/test/idna/idna_is_valid_a_label_test.cc new file mode 100644 index 000000000..b6b1299a2 --- /dev/null +++ b/test/idna/idna_is_valid_a_label_test.cc @@ -0,0 +1,56 @@ +#include + +#include + +TEST(IDNA_is_valid_a_label, munich_german) { + // xn--mnchen-3ya decodes to "m\u00FCnchen" + EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya")); +} + +TEST(IDNA_is_valid_a_label, volos_greek) { + // xn--nxasmq6b decodes to "\u03B2\u03CC\u03BB\u03BF\u03C2" + EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--nxasmq6b")); +} + +TEST(IDNA_is_valid_a_label, deja_french) { + // xn--dj-kia8a decodes to "d\u00E9j\u00E0" + EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--dj-kia8a")); +} + +TEST(IDNA_is_valid_a_label, missing_prefix) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("abc")); +} + +TEST(IDNA_is_valid_a_label, partial_prefix) { + 
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn-abc")); +} + +TEST(IDNA_is_valid_a_label, empty_input) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("")); +} + +TEST(IDNA_is_valid_a_label, only_prefix) { + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--")); +} + +TEST(IDNA_is_valid_a_label, decodes_to_pure_ascii) { + // The body "abc-" decodes to "abc", which has no non-ASCII codepoint + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-")); +} + +TEST(IDNA_is_valid_a_label, malformed_punycode_body) { + // '!' after the hyphen delimiter is not a valid Punycode digit (RFC 3492) + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-zzzzzzzz!")); +} + +TEST(IDNA_is_valid_a_label, non_ascii_byte_in_input) { + // U+00E4 ä is non-ASCII, so the whole input fails the ASCII requirement + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--\u00E4")); +} + +TEST(IDNA_is_valid_a_label, uppercase_in_body) { + // Punycode is case-insensitive but A-labels are conventionally lowercase. + // An uppercase letter in the Punycode body is not the canonical + // representation, so the round-trip check rejects it. + EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--MNCHEN-3ya")); +}