Skip to content

Commit

Permalink
Encode ' ', '"', '<', '>', and '`' in URL fragments.
Browse files Browse the repository at this point in the history
Implements the changes to fragment processing described in
whatwg/url#347, which adds a new "fragment
percent-encode set" which contains the C0 control percent-encode set,
along with:

* 0x20 SPACE
* 0x22 (")
* 0x3C (<)
* 0x3E (>)
* 0x60 (`)

This brings our implementation into line with Firefox.

Bug: 758523
Change-Id: I25de642017ccb69473626a327ad194b3431a11ed
Reviewed-on: https://chromium-review.googlesource.com/719004
Commit-Queue: Mike West <mkwst@chromium.org>
Reviewed-by: Jochen Eisinger <jochen@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#523383}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: 01c25d47d2d22456368363e576083d766eedf8f6
  • Loading branch information
mikewest authored and Commit Bot committed Dec 12, 2017
1 parent 66687cf commit bcbfce4
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 11 deletions.
53 changes: 45 additions & 8 deletions url_canon_etc.cc
Expand Up @@ -244,6 +244,43 @@ bool DoPort(const CHAR* spec,
return true;
}

// clang-format off
// Lookup table, indexed by a 7-bit ASCII code point, that says whether the
// character must be percent-escaped when it appears in a URL fragment ("ref").
//
// This is the WHATWG URL "fragment percent-encode set" restricted to ASCII:
//   * the C0 control percent-encode set, i.e. the "C0 controls" (0x00-0x1F,
//     https://infra.spec.whatwg.org/#c0-control) plus every code point above
//     '~', which within ASCII means DEL (0x7F); and
//   * the characters ' ' (0x20), '"' (0x22), '<' (0x3C), '>' (0x3E), and
//     '`' (0x60).
//
// All 0x80 entries are spelled out explicitly. A missing trailing initializer
// would be silently value-initialized to false, which would let DEL through
// unescaped.
const bool kShouldEscapeCharInRef[0x80] = {
  // Control characters (0x00-0x1F)
  true,  true,  true,  true,  true,  true,  true,  true,
  true,  true,  true,  true,  true,  true,  true,  true,
  true,  true,  true,  true,  true,  true,  true,  true,
  true,  true,  true,  true,  true,  true,  true,  true,
  // ' '  !      "      #      $      %      &      '
  true,  false, true,  false, false, false, false, false,
  // (    )      *      +      ,      -      .      /
  false, false, false, false, false, false, false, false,
  // 0    1      2      3      4      5      6      7
  false, false, false, false, false, false, false, false,
  // 8    9      :      ;      <      =      >      ?
  false, false, false, false, true,  false, true,  false,
  // @    A      B      C      D      E      F      G
  false, false, false, false, false, false, false, false,
  // H    I      J      K      L      M      N      O
  false, false, false, false, false, false, false, false,
  // P    Q      R      S      T      U      V      W
  false, false, false, false, false, false, false, false,
  // X    Y      Z      [      \      ]      ^      _
  false, false, false, false, false, false, false, false,
  // `    a      b      c      d      e      f      g
  true,  false, false, false, false, false, false, false,
  // h    i      j      k      l      m      n      o
  false, false, false, false, false, false, false, false,
  // p    q      r      s      t      u      v      w
  false, false, false, false, false, false, false, false,
  // x    y      z      {      |      }      ~      DEL
  false, false, false, false, false, false, false, true
};
// clang-format on

template<typename CHAR, typename UCHAR>
void DoCanonicalizeRef(const CHAR* spec,
const Component& ref,
Expand All @@ -266,14 +303,14 @@ void DoCanonicalizeRef(const CHAR* spec,
if (spec[i] == 0) {
// IE just strips NULLs, so we do too.
continue;
} else if (static_cast<UCHAR>(spec[i]) < 0x20) {
// Unlike what IE seems to do, we escape control characters. This will probably
// make the reference fragment unusable on a web page, but people
// shouldn't be using control characters in their anchor names.
AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
} else if (static_cast<UCHAR>(spec[i]) < 0x80) {
// Normal ASCII characters are just appended.
output->push_back(static_cast<char>(spec[i]));
}

UCHAR current_char = static_cast<UCHAR>(spec[i]);
if (current_char < 0x80) {
if (kShouldEscapeCharInRef[current_char])
AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
else
output->push_back(static_cast<char>(spec[i]));
} else {
AppendUTF8EscapedChar(spec, &i, end, output);
}
Expand Down
13 changes: 11 additions & 2 deletions url_canon_unittest.cc
Expand Up @@ -1277,8 +1277,17 @@ TEST(URLCanonTest, Query) {
TEST(URLCanonTest, Ref) {
// Refs are trivial, it just checks the encoding.
DualComponentCase ref_cases[] = {
// Regular one, we shouldn't escape spaces, et al.
{"hello, world", L"hello, world", "#hello, world", Component(1, 12),
{"hello!", L"hello!", "#hello!", Component(1, 6), true},
// We should escape spaces, double-quotes, angle brackets, and backticks.
{"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
true},
{"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
true},
{"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
true},
{"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
true},
{"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
true},
// UTF-8/wide input should be preserved
{"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
Expand Down
2 changes: 1 addition & 1 deletion url_util_unittest.cc
Expand Up @@ -179,7 +179,7 @@ TEST(URLUtilTest, ReplaceScheme) {
EXPECT_EQ("about://google.com/",
CheckReplaceScheme("http://google.com/", "about"));

EXPECT_EQ("http://example.com/%20hello%20# world",
EXPECT_EQ("http://example.com/%20hello%20#%20world",
CheckReplaceScheme("myscheme:example.com/ hello # world ", "http"));
}

Expand Down

0 comments on commit bcbfce4

Please sign in to comment.