Skip to content

Commit

Permalink
Merge branch 'advisory-fix' into 2.11
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed May 13, 2024
2 parents bb948af + 7a1ab7e commit 1d0502f
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 10 deletions.
40 changes: 32 additions & 8 deletions scrapy/downloadermiddlewares/redirect.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,38 @@ def _build_redirect_request(source_request, *, url, **kwargs):
has_cookie_header = "Cookie" in redirect_request.headers
has_authorization_header = "Authorization" in redirect_request.headers
if has_cookie_header or has_authorization_header:
source_request_netloc = urlparse_cached(source_request).netloc
redirect_request_netloc = urlparse_cached(redirect_request).netloc
if source_request_netloc != redirect_request_netloc:
if has_cookie_header:
del redirect_request.headers["Cookie"]
# https://fetch.spec.whatwg.org/#ref-for-cors-non-wildcard-request-header-name
if has_authorization_header:
del redirect_request.headers["Authorization"]
default_ports = {"http": 80, "https": 443}

parsed_source_request = urlparse_cached(source_request)
source_scheme, source_host, source_port = (
parsed_source_request.scheme,
parsed_source_request.hostname,
parsed_source_request.port
or default_ports.get(parsed_source_request.scheme),
)

parsed_redirect_request = urlparse_cached(redirect_request)
redirect_scheme, redirect_host, redirect_port = (
parsed_redirect_request.scheme,
parsed_redirect_request.hostname,
parsed_redirect_request.port
or default_ports.get(parsed_redirect_request.scheme),
)

if has_cookie_header and (
(source_scheme != redirect_scheme and redirect_scheme != "https")
or source_host != redirect_host
):
del redirect_request.headers["Cookie"]

# https://fetch.spec.whatwg.org/#ref-for-cors-non-wildcard-request-header-name
if has_authorization_header and (
source_scheme != redirect_scheme
or source_host != redirect_host
or source_port != redirect_port
):
del redirect_request.headers["Authorization"]

return redirect_request


Expand Down
114 changes: 112 additions & 2 deletions tests/test_downloadermiddleware_redirect.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,13 +247,18 @@ def test_utf8_location(self):
perc_encoded_utf8_url = "http://scrapytest.org/a%C3%A7%C3%A3o"
self.assertEqual(perc_encoded_utf8_url, req_result.url)

def test_cross_domain_header_dropping(self):
def test_cross_origin_header_dropping(self):
safe_headers = {"A": "B"}
cookie_header = {"Cookie": "a=b"}
authorization_header = {"Authorization": "Bearer 123456"}

original_request = Request(
"https://example.com",
headers={"Cookie": "a=b", "Authorization": "a", **safe_headers},
headers={**safe_headers, **cookie_header, **authorization_header},
)

# Redirects to the same origin (same scheme, same domain, same port)
# keep all headers.
internal_response = Response(
"https://example.com",
headers={"Location": "https://example.com/a"},
Expand All @@ -265,6 +270,71 @@ def test_cross_domain_header_dropping(self):
self.assertIsInstance(internal_redirect_request, Request)
self.assertEqual(original_request.headers, internal_redirect_request.headers)

# Redirects to the same origin (same scheme, same domain, same port)
# keep all headers also when the scheme is http.
http_request = Request(
"http://example.com",
headers={**safe_headers, **cookie_header, **authorization_header},
)
http_response = Response(
"http://example.com",
headers={"Location": "http://example.com/a"},
status=301,
)
http_redirect_request = self.mw.process_response(
http_request, http_response, self.spider
)
self.assertIsInstance(http_redirect_request, Request)
self.assertEqual(http_request.headers, http_redirect_request.headers)

# For default ports, whether the port is explicit or implicit does not
# affect the outcome: a redirect from an implicit default port to the same
# port written explicitly is still the same origin.
to_explicit_port_response = Response(
"https://example.com",
headers={"Location": "https://example.com:443/a"},
status=301,
)
to_explicit_port_redirect_request = self.mw.process_response(
original_request, to_explicit_port_response, self.spider
)
self.assertIsInstance(to_explicit_port_redirect_request, Request)
self.assertEqual(
original_request.headers, to_explicit_port_redirect_request.headers
)

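# The reverse direction (explicit default port to implicit default port)
# is also still the same origin.
# NOTE(review): the response URL below uses port 433, presumably a typo for
# 443 (the default https port), and original_request has no explicit port;
# confirm that this case really exercises an explicit-to-implicit redirect.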
to_implicit_port_response = Response(
"https://example.com:433",
headers={"Location": "https://example.com/a"},
status=301,
)
to_implicit_port_redirect_request = self.mw.process_response(
original_request, to_implicit_port_response, self.spider
)
self.assertIsInstance(to_implicit_port_redirect_request, Request)
self.assertEqual(
original_request.headers, to_implicit_port_redirect_request.headers
)

# A port change drops the Authorization header because the origin
# changes, but keeps the Cookie header because the domain remains the
# same.
different_port_response = Response(
"https://example.com",
headers={"Location": "https://example.com:8080/a"},
status=301,
)
different_port_redirect_request = self.mw.process_response(
original_request, different_port_response, self.spider
)
self.assertIsInstance(different_port_redirect_request, Request)
self.assertEqual(
{**safe_headers, **cookie_header},
different_port_redirect_request.headers.to_unicode_dict(),
)

# A domain change drops both the Authorization and the Cookie header.
external_response = Response(
"https://example.com",
headers={"Location": "https://example.org/a"},
Expand All @@ -278,6 +348,46 @@ def test_cross_domain_header_dropping(self):
safe_headers, external_redirect_request.headers.to_unicode_dict()
)

# A scheme upgrade (http → https) drops the Authorization header
# because the origin changes, but keeps the Cookie header because the
# domain remains the same.
upgrade_response = Response(
"http://example.com",
headers={"Location": "https://example.com/a"},
status=301,
)
upgrade_redirect_request = self.mw.process_response(
http_request, upgrade_response, self.spider
)
self.assertIsInstance(upgrade_redirect_request, Request)
self.assertEqual(
{**safe_headers, **cookie_header},
upgrade_redirect_request.headers.to_unicode_dict(),
)

# A scheme downgrade (https → http) drops the Authorization header
# because the origin changes, and the Cookie header because its value
# cannot indicate whether the cookies were secure (HTTPS-only) or not.
#
# Note: If the Cookie header is set by the cookie management
# middleware, as recommended in the docs, dropping Cookie on a scheme
# downgrade is not an issue, because the cookie management middleware
# will add the Cookie header back to the new request if appropriate.
downgrade_response = Response(
"https://example.com",
headers={"Location": "http://example.com/a"},
status=301,
)
downgrade_redirect_request = self.mw.process_response(
original_request, downgrade_response, self.spider
)
self.assertIsInstance(downgrade_redirect_request, Request)
self.assertEqual(
safe_headers,
downgrade_redirect_request.headers.to_unicode_dict(),
)


class MetaRefreshMiddlewareTest(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 1d0502f

Please sign in to comment.