Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ foreach(component ${SOURCEMETA_CORE_COMPONENTS})
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_ip.cmake")
elseif(component STREQUAL "idna")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_punycode.cmake")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_idna.cmake")
elseif(component STREQUAL "dns")
include("${CMAKE_CURRENT_LIST_DIR}/sourcemeta_core_unicode.cmake")
Expand Down
1 change: 1 addition & 0 deletions src/core/idna/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ if(SOURCEMETA_CORE_INSTALL)
endif()

target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::unicode)
target_link_libraries(sourcemeta_core_idna PRIVATE sourcemeta::core::punycode)
Comment thread
jviotti marked this conversation as resolved.
58 changes: 57 additions & 1 deletion src/core/idna/idna.cc
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#include <sourcemeta/core/idna.h>

#include <sourcemeta/core/punycode.h>
#include <sourcemeta/core/unicode.h>

#include <cstddef> // std::size_t
#include <string_view> // std::u32string_view
#include <string> // std::string, std::u32string
#include <string_view> // std::string_view, std::u32string_view

#include "idna_data.h"

Expand Down Expand Up @@ -280,4 +282,58 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool {
return false;
}

// Decide whether `label` is a valid A-label: it must carry the ACE prefix,
// consist solely of ASCII bytes, hold a non-empty Punycode payload that
// decodes to a valid U-label with at least one non-ASCII codepoint, and be
// the canonical Punycode encoding of that U-label. Never throws: failures
// reported by the Punycode helpers are turned into a `false` result.
auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool {
  // The ACE prefix is matched byte-for-byte, so it must be lowercase
  constexpr std::string_view ace_prefix{"xn--"};
  if (!label.starts_with(ace_prefix)) {
    return false;
  }

  // RFC 5890 §2.3.2.1: A-labels are pure ASCII, so no byte of the input
  // may have its high bit set
  for (std::size_t index = 0; index < label.size(); ++index) {
    if ((static_cast<unsigned char>(label[index]) & 0x80U) != 0U) {
      return false;
    }
  }

  // A bare prefix without any Punycode payload is not an A-label
  const std::size_t payload_size{label.size() - ace_prefix.size()};
  if (payload_size == 0) {
    return false;
  }

  // The payload view is assembled from a pointer and a length rather than
  // through `std::string_view::substr`, which is not noexcept
  const std::string_view payload{label.data() + ace_prefix.size(),
                                 payload_size};

  // Any failure while decoding means the payload is not valid Punycode
  std::u32string unicode;
  try {
    unicode = punycode_to_utf32(payload);
  } catch (...) {
    return false;
  }

  // RFC 5890 §2.3.2.1: a U-label contains at least one non-ASCII codepoint.
  // Skip over the leading ASCII codepoints: reaching the end means the
  // payload decoded to pure ASCII, which is not a real A-label.
  std::size_t cursor{0};
  while (cursor < unicode.size() && unicode[cursor] <= 0x7F) {
    ++cursor;
  }
  if (cursor == unicode.size()) {
    return false;
  }

  if (idna_is_valid_u_label(unicode)) {
    // RFC 5891 §4.2: A-labels must be in canonical Punycode form, so
    // encoding the decoded U-label again must reproduce the original bytes
    try {
      return utf32_to_punycode(unicode) == payload;
    } catch (...) {
      return false;
    }
  }

  return false;
}

} // namespace sourcemeta::core
22 changes: 22 additions & 0 deletions src/core/idna/include/sourcemeta/core/idna.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,28 @@ auto idna_passes_bidi_rule(const std::u32string_view label) noexcept -> bool;
SOURCEMETA_CORE_IDNA_EXPORT
auto idna_is_valid_u_label(const std::u32string_view label) noexcept -> bool;

/// @ingroup idna
/// Return whether the given label is a valid A-label per RFC 5891 §4. See
/// https://www.rfc-editor.org/rfc/rfc5891#section-4 for the criteria.
/// A valid A-label starts with the ACE prefix "xn--" (matched
/// case-sensitively, so it must be lowercase), is pure ASCII, has a
/// non-empty Punycode body that decodes to a valid U-label containing at
/// least one non-ASCII codepoint, and round-trips through Punycode in its
/// canonical form. This function never throws: malformed Punycode simply
/// yields `false`. For example:
///
/// ```cpp
/// #include <sourcemeta/core/idna.h>
/// #include <cassert>
///
/// // xn--mnchen-3ya decodes to "münchen"
/// assert(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya"));
/// // Missing "xn--" prefix
/// assert(!sourcemeta::core::idna_is_valid_a_label("abc"));
/// // Decodes to "abc" (no non-ASCII codepoint)
/// assert(!sourcemeta::core::idna_is_valid_a_label("xn--abc-"));
/// ```
SOURCEMETA_CORE_IDNA_EXPORT
auto idna_is_valid_a_label(const std::string_view label) noexcept -> bool;

} // namespace sourcemeta::core

#endif
3 changes: 2 additions & 1 deletion test/idna/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ sourcemeta_googletest(NAMESPACE sourcemeta PROJECT core NAME idna
idna_passes_contexto_test.cc
idna_passes_contextj_test.cc
idna_passes_bidi_rule_test.cc
idna_is_valid_u_label_test.cc)
idna_is_valid_u_label_test.cc
idna_is_valid_a_label_test.cc)

target_link_libraries(sourcemeta_core_idna_unit
PRIVATE sourcemeta::core::idna)
56 changes: 56 additions & 0 deletions test/idna/idna_is_valid_a_label_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <gtest/gtest.h>

#include <sourcemeta/core/idna.h>

// A canonical A-label whose U-label has one non-ASCII codepoint is accepted
TEST(IDNA_is_valid_a_label, munich_german) {
// xn--mnchen-3ya decodes to "m\u00FCnchen" (Punycode copies the ASCII
// letters verbatim, so the decoded label is entirely lowercase)
EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--mnchen-3ya"));
}

// A canonical A-label whose U-label has no ASCII codepoints is accepted
TEST(IDNA_is_valid_a_label, volos_greek) {
// xn--nxasmq6b decodes to "\u03B2\u03CC\u03BB\u03BF\u03C2"
EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--nxasmq6b"));
}

// A canonical A-label whose U-label has several non-ASCII codepoints is
// accepted
TEST(IDNA_is_valid_a_label, deja_french) {
// xn--dj-kia8a decodes to "d\u00E9j\u00E0"
EXPECT_TRUE(sourcemeta::core::idna_is_valid_a_label("xn--dj-kia8a"));
}

// A plain ASCII label without the "xn--" ACE prefix is not an A-label
TEST(IDNA_is_valid_a_label, missing_prefix) {
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("abc"));
}

// "xn-" has only one hyphen, so the input does not start with "xn--"
TEST(IDNA_is_valid_a_label, partial_prefix) {
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn-abc"));
}

// The empty string cannot carry the ACE prefix and is therefore rejected
TEST(IDNA_is_valid_a_label, empty_input) {
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label(""));
}

// The ACE prefix on its own leaves an empty Punycode body, which is rejected
TEST(IDNA_is_valid_a_label, only_prefix) {
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--"));
}

// A syntactically valid Punycode body is still rejected when the decoded
// label has no non-ASCII codepoint
TEST(IDNA_is_valid_a_label, decodes_to_pure_ascii) {
// The body "abc-" is a Punycode encoding of "abc": only basic code points
// before the delimiter and nothing after it
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-"));
}

// A body that cannot be decoded as Punycode is rejected rather than
// propagating an error to the caller
TEST(IDNA_is_valid_a_label, malformed_punycode_body) {
// The "-" delimiter is followed by garbage extension characters: '!' is
// not a Punycode digit, so the body is invalid Punycode
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--abc-zzzzzzzz!"));
}

// Input that contains a non-ASCII byte after the prefix is rejected
TEST(IDNA_is_valid_a_label, non_ascii_byte_in_input) {
// U+00E4 (a with diaeresis) is non-ASCII, so its encoded bytes make the
// whole input fail the ASCII requirement
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--\u00E4"));
}

// Uppercase letters in the Punycode body make the label invalid
TEST(IDNA_is_valid_a_label, uppercase_in_body) {
// A-labels are conventionally lowercase. Punycode copies basic (ASCII)
// code points verbatim, so this body decodes to a label that still
// contains the uppercase letters "MNCHEN".
// NOTE(review): because Punycode preserves the case of basic code points,
// re-encoding would presumably reproduce the same bytes; the rejection
// most likely comes from U-label validation (uppercase letters are
// disallowed in IDNA2008) rather than from the round-trip check - confirm
// against idna_is_valid_u_label.
EXPECT_FALSE(sourcemeta::core::idna_is_valid_a_label("xn--MNCHEN-3ya"));
}
Loading