Update dangling markup mitigations.

Still behind a flag, just updating the checks to look for both `\n` and `<` rather than just the former. This is in line with the patches up at whatwg/url#284 and whatwg/fetch#519. Intent to Remove: https://groups.google.com/a/chromium.org/d/msg/blink-dev/KaA_YNOlTPk/VmmoV88xBgAJ. Bug: 680970 Change-Id: Ifda61a0afe1f0e97620acef7dc54b005c6f74840 Reviewed-on: https://chromium-review.googlesource.com/514024 Commit-Queue: Mike West <mkwst@chromium.org> Reviewed-by: Jochen Eisinger <jochen@chromium.org> Cr-Original-Commit-Position: refs/heads/master@{#474341} Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src Cr-Mirrored-Commit: 9e5ae901660de47ef1b844c6113eae91b5ae8e9e
scrapy · May 24, 2017 · 6b19814 · 6b19814
1 parent c0cb55b
commit 6b19814
Show file tree

Hide file tree

Showing 8 changed files with 88 additions and 48 deletions.
diff --git a/gurl_unittest.cc b/gurl_unittest.cc
@@ -645,12 +645,26 @@ TEST(GURLTest, Newlines) {
   // Constructor.
   GURL url_1(" \t ht\ntp://\twww.goo\rgle.com/as\ndf \n ");
   EXPECT_EQ("http://www.google.com/asdf", url_1.spec());
-  EXPECT_TRUE(url_1.parsed_for_possibly_invalid_spec().whitespace_removed);
+  EXPECT_FALSE(
+      url_1.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
 
   // Relative path resolver.
   GURL url_2 = url_1.Resolve(" \n /fo\to\r ");
   EXPECT_EQ("http://www.google.com/foo", url_2.spec());
-  EXPECT_TRUE(url_2.parsed_for_possibly_invalid_spec().whitespace_removed);
+  EXPECT_FALSE(
+      url_2.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Constructor.
+  GURL url_3(" \t ht\ntp://\twww.goo\rgle.com/as\ndf< \n ");
+  EXPECT_EQ("http://www.google.com/asdf%3C", url_3.spec());
+  EXPECT_TRUE(
+      url_3.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
+
+  // Relative path resolver.
+  GURL url_4 = url_1.Resolve(" \n /fo\to<\r ");
+  EXPECT_EQ("http://www.google.com/foo%3C", url_4.spec());
+  EXPECT_TRUE(
+      url_4.parsed_for_possibly_invalid_spec().potentially_dangling_markup);
 
   // Note that newlines are NOT stripped from ReplaceComponents.
 }

diff --git a/third_party/mozilla/url_parse.cc b/third_party/mozilla/url_parse.cc
@@ -690,7 +690,7 @@ bool DoExtractQueryKeyValue(const CHAR* spec,
 
 }  // namespace
 
-Parsed::Parsed() : whitespace_removed(false), inner_parsed_(NULL) {}
+Parsed::Parsed() : potentially_dangling_markup(false), inner_parsed_(NULL) {}
 
 Parsed::Parsed(const Parsed& other)
     : scheme(other.scheme),
@@ -701,7 +701,7 @@ Parsed::Parsed(const Parsed& other)
       path(other.path),
       query(other.query),
       ref(other.ref),
-      whitespace_removed(other.whitespace_removed),
+      potentially_dangling_markup(other.potentially_dangling_markup),
       inner_parsed_(NULL) {
   if (other.inner_parsed_)
     set_inner_parsed(*other.inner_parsed_);
@@ -717,7 +717,7 @@ Parsed& Parsed::operator=(const Parsed& other) {
     path = other.path;
     query = other.query;
     ref = other.ref;
-    whitespace_removed = other.whitespace_removed;
+    potentially_dangling_markup = other.potentially_dangling_markup;
     if (other.inner_parsed_)
       set_inner_parsed(*other.inner_parsed_);
     else

diff --git a/third_party/mozilla/url_parse.h b/third_party/mozilla/url_parse.h
@@ -177,8 +177,12 @@ struct URL_EXPORT Parsed {
   // the string with the scheme stripped off.
   Component GetContent() const;
 
-  // True if whitespace was removed from the URL during parsing.
-  bool whitespace_removed;
+  // True if the URL's source contained a raw `<` character, and whitespace was
+  // removed from the URL during parsing
+  //
+  // TODO(mkwst): Link this to something in a spec if
+  // https://github.com/whatwg/url/pull/284 lands.
+  bool potentially_dangling_markup;
 
   // This is used for nested URL types, currently only filesystem.  If you
   // parse a filesystem URL, the resulting Parsed will have a nested

diff --git a/url_canon.h b/url_canon.h
@@ -231,14 +231,21 @@ class URL_EXPORT CharsetConverter {
 //
 // Therefore, callers should not use the buffer, since it may actually be empty,
 // use the computed pointer and |*output_len| instead.
-URL_EXPORT const char* RemoveURLWhitespace(const char* input, int input_len,
+//
+// If |input| contained both removable whitespace and a raw `<` character,
+// |potentially_dangling_markup| will be set to `true`. Otherwise, it will be
+// left untouched.
+URL_EXPORT const char* RemoveURLWhitespace(const char* input,
+                                           int input_len,
                                            CanonOutputT<char>* buffer,
-                                           int* output_len);
+                                           int* output_len,
+                                           bool* potentially_dangling_markup);
 URL_EXPORT const base::char16* RemoveURLWhitespace(
     const base::char16* input,
     int input_len,
     CanonOutputT<base::char16>* buffer,
-    int* output_len);
+    int* output_len,
+    bool* potentially_dangling_markup);
 
 // IDN ------------------------------------------------------------------------
 

diff --git a/url_canon_etc.cc b/url_canon_etc.cc
@@ -22,10 +22,12 @@ inline bool IsRemovableURLWhitespace(int ch) {
 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
 // It sucks that we have to do this, since this takes about 13% of the total URL
 // canonicalization time.
-template<typename CHAR>
-const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
+template <typename CHAR>
+const CHAR* DoRemoveURLWhitespace(const CHAR* input,
+                                  int input_len,
                                   CanonOutputT<CHAR>* buffer,
-                                  int* output_len) {
+                                  int* output_len,
+                                  bool* potentially_dangling_markup) {
   // Fast verification that there's nothing that needs removal. This is the 99%
   // case, so we want it to be fast and don't care about impacting the speed
   // when we do find whitespace.
@@ -46,8 +48,11 @@ const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
 
   // Remove the whitespace into the new buffer and return it.
   for (int i = 0; i < input_len; i++) {
-    if (!IsRemovableURLWhitespace(input[i]))
+    if (!IsRemovableURLWhitespace(input[i])) {
+      if (potentially_dangling_markup && input[i] == 0x3C)
+        *potentially_dangling_markup = true;
       buffer->push_back(input[i]);
+    }
   }
   *output_len = buffer->length();
   return buffer->data();
@@ -274,17 +279,22 @@ void DoCanonicalizeRef(const CHAR* spec,
 
 }  // namespace
 
-const char* RemoveURLWhitespace(const char* input, int input_len,
+const char* RemoveURLWhitespace(const char* input,
+                                int input_len,
                                 CanonOutputT<char>* buffer,
-                                int* output_len) {
-  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+                                int* output_len,
+                                bool* potentially_dangling_markup) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
+                               potentially_dangling_markup);
 }
 
 const base::char16* RemoveURLWhitespace(const base::char16* input,
                                         int input_len,
                                         CanonOutputT<base::char16>* buffer,
-                                        int* output_len) {
-  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
+                                        int* output_len,
+                                        bool* potentially_dangling_markup) {
+  return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
+                               potentially_dangling_markup);
 }
 
 char CanonicalSchemeChar(base::char16 ch) {

diff --git a/url_canon_relative.cc b/url_canon_relative.cc
@@ -443,11 +443,11 @@ bool DoResolveRelativeURL(const char* base_url,
                           Parsed* out_parsed) {
   // |base_parsed| is the starting point for our output. Since we may have
   // removed whitespace from |relative_url| before entering this method, we'll
-  // carry over the |whitespace_removed| flag.
-  bool whitespace_removed = out_parsed->whitespace_removed;
+  // carry over the |potentially_dangling_markup| flag.
+  bool potentially_dangling_markup = out_parsed->potentially_dangling_markup;
   *out_parsed = base_parsed;
-  if (whitespace_removed)
-    out_parsed->whitespace_removed = true;
+  if (potentially_dangling_markup)
+    out_parsed->potentially_dangling_markup = true;
 
   // Sanity check: the input should have a host or we'll break badly below.
   // We can only resolve relative URLs with base URLs that have hosts and

diff --git a/url_util.cc b/url_util.cc
@@ -185,8 +185,8 @@ bool DoFindAndCompareScheme(const CHAR* str,
   // This matches the canonicalization done in DoCanonicalize function.
   RawCanonOutputT<CHAR> whitespace_buffer;
   int spec_len;
-  const CHAR* spec = RemoveURLWhitespace(str, str_len,
-                                         &whitespace_buffer, &spec_len);
+  const CHAR* spec =
+      RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr);
 
   Component our_scheme;
   if (!ExtractScheme(spec, spec_len, &our_scheme)) {
@@ -214,11 +214,8 @@ bool DoCanonicalize(const CHAR* spec,
   // Possibly this will result in copying to the new buffer.
   RawCanonOutputT<CHAR> whitespace_buffer;
   if (whitespace_policy == REMOVE_WHITESPACE) {
-    int original_len = spec_len;
-    spec =
-        RemoveURLWhitespace(spec, original_len, &whitespace_buffer, &spec_len);
-    if (spec_len != original_len)
-      output_parsed->whitespace_removed = true;
+    spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len,
+                               &output_parsed->potentially_dangling_markup);
   }
 
   Parsed parsed_input;
@@ -296,11 +293,9 @@ bool DoResolveRelative(const char* base_spec,
   // copying to the new buffer.
   RawCanonOutputT<CHAR> whitespace_buffer;
   int relative_length;
-  const CHAR* relative = RemoveURLWhitespace(in_relative, in_relative_length,
-                                             &whitespace_buffer,
-                                             &relative_length);
-  if (in_relative_length != relative_length)
-    output_parsed->whitespace_removed = true;
+  const CHAR* relative = RemoveURLWhitespace(
+      in_relative, in_relative_length, &whitespace_buffer, &relative_length,
+      &output_parsed->potentially_dangling_markup);
 
   bool base_is_authority_based = false;
   bool base_is_hierarchical = false;

diff --git a/url_util_unittest.cc b/url_util_unittest.cc
@@ -374,23 +374,32 @@ TEST(URLUtilTest, TestNoRefComponent) {
   EXPECT_FALSE(resolved_parsed.ref.is_valid());
 }
 
-TEST(URLUtilTest, RelativeWhitespaceRemoved) {
+TEST(URLUtilTest, PotentiallyDanglingMarkup) {
   struct ResolveRelativeCase {
     const char* base;
     const char* rel;
-    bool whitespace_removed;
+    bool potentially_dangling_markup;
     const char* out;
   } cases[] = {
-      {"https://example.com/", "/path", false, "https://example.com/path"},
-      {"https://example.com/", "\n/path", true, "https://example.com/path"},
-      {"https://example.com/", "\r/path", true, "https://example.com/path"},
-      {"https://example.com/", "\t/path", true, "https://example.com/path"},
-      {"https://example.com/", "/pa\nth", true, "https://example.com/path"},
-      {"https://example.com/", "/pa\rth", true, "https://example.com/path"},
-      {"https://example.com/", "/pa\tth", true, "https://example.com/path"},
-      {"https://example.com/", "/path\n", true, "https://example.com/path"},
-      {"https://example.com/", "/path\r", true, "https://example.com/path"},
-      {"https://example.com/", "/path\r", true, "https://example.com/path"},
+      {"https://example.com/", "/path<", false, "https://example.com/path%3C"},
+      {"https://example.com/", "\n/path<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "\r/path<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "\t/path<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/pa\nth<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/pa\rth<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/pa\tth<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/path\n<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "/path\r<", true, "https://example.com/path%3C"},
+      {"https://example.com/", "\n/<path", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "\r/<path", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "\t/<path", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<pa\nth", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<pa\rth", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<pa\tth", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<path\n", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<path\r", true, "https://example.com/%3Cpath"},
+      {"https://example.com/", "/<path\r", true, "https://example.com/%3Cpath"},
   };
 
   for (const auto& test : cases) {
@@ -407,7 +416,8 @@ TEST(URLUtilTest, RelativeWhitespaceRemoved) {
     ASSERT_TRUE(valid);
     output.Complete();
 
-    EXPECT_EQ(test.whitespace_removed, resolved_parsed.whitespace_removed);
+    EXPECT_EQ(test.potentially_dangling_markup,
+              resolved_parsed.potentially_dangling_markup);
     EXPECT_EQ(test.out, resolved);
   }
 }