Skip to content

Commit

Permalink
Merge branch 'redirect-protocols' into 2.11
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed May 13, 2024
2 parents f138d5d + 3cbb6fe commit 36287cb
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 4 deletions.
10 changes: 8 additions & 2 deletions scrapy/downloadermiddlewares/redirect.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def process_response(self, request, response, spider):
location = request_scheme + "://" + location.lstrip("/")

redirected_url = urljoin(request.url, location)
if urlparse(redirected_url).scheme not in {"http", "https"}:
return response

if response.status in (301, 307, 308) or request.method == "HEAD":
redirected = _build_redirect_request(request, url=redirected_url)
Expand All @@ -164,12 +166,16 @@ def process_response(self, request, response, spider):
request.meta.get("dont_redirect", False)
or request.method == "HEAD"
or not isinstance(response, HtmlResponse)
or urlparse_cached(request).scheme not in {"http", "https"}
):
return response

interval, url = get_meta_refresh(response, ignore_tags=self._ignore_tags)
if url and interval < self._maxdelay:
if not url:
return response
if urlparse(url).scheme not in {"http", "https"}:
return response
if interval < self._maxdelay:
redirected = self._redirect_request_using_get(request, url)
return self._redirect(redirected, request, spider, "meta refresh")

return response
104 changes: 102 additions & 2 deletions tests/test_downloadermiddleware_redirect.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import unittest
from itertools import chain, product

import pytest

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.downloadermiddlewares.redirect import (
Expand Down Expand Up @@ -1138,6 +1141,64 @@ def test_no_location(self):
assert self.mw.process_response(request, response, self.spider) is response


# Parameter names shared by the scheme-handling tests in this module.
SCHEME_PARAMS = ("url", "location", "target")
HTTP_SCHEMES = ("http", "https")
NON_HTTP_SCHEMES = ("data", "file", "ftp", "s3", "foo")

# (url, location, target) triples; a ``None`` target means that no redirect
# is expected.
#
# Note: We do not test data/file/ftp/s3 schemes for the initial URL
# because their download handlers cannot return a status code of 3xx.
REDIRECT_SCHEME_CASES = (
    # http/https → http/https redirects
    tuple(
        (
            f"{source}://example.com/a",
            f"{destination}://example.com/b",
            f"{destination}://example.com/b",
        )
        for source, destination in product(HTTP_SCHEMES, repeat=2)
    )
    # http/https → data/file/ftp/s3/foo does not redirect
    + tuple(
        (f"{source}://example.com/a", f"{destination}://example.com/b", None)
        for source, destination in product(HTTP_SCHEMES, NON_HTTP_SCHEMES)
    )
    # http/https → relative redirects
    + tuple(
        (f"{source}://example.com/a", relative, f"{source}://example.com/b")
        for source, relative in product(HTTP_SCHEMES, ("//example.com/b", "/b"))
    )
)


@pytest.mark.parametrize(SCHEME_PARAMS, REDIRECT_SCHEME_CASES)
def test_redirect_schemes(url, location, target):
    """Check which ``Location`` schemes RedirectMiddleware follows.

    A 301 response for *url* carrying a ``Location`` header of *location*
    must produce a request for *target*. When *target* is ``None`` the
    middleware must return the response untouched instead of redirecting.
    """
    crawler = get_crawler(Spider)
    spider = crawler._create_spider("foo")
    mw = RedirectMiddleware.from_crawler(crawler)
    request = Request(url)
    response = Response(url, headers={"Location": location}, status=301)
    redirect = mw.process_response(request, response, spider)
    if target is None:
        # No redirect: the very same response object must come back, so an
        # identity check is used (as in the other no-redirect tests here).
        assert redirect is response
    else:
        assert isinstance(redirect, Request)
        assert redirect.url == target


def meta_refresh_body(url, interval=5):
    """Return the UTF-8 encoded body of an HTML page with a meta refresh tag.

    The page consists of a single ``<meta http-equiv="refresh">`` tag whose
    ``content`` attribute is ``"{interval};url={url}"``. Both values are
    interpolated verbatim, without any escaping.
    """
    html = f"""<html><head><meta http-equiv="refresh" content="{interval};url={url}"/></head></html>"""
    return html.encode("utf-8")


class MetaRefreshMiddlewareTest(Base.Test):
mwcls = MetaRefreshMiddleware
reason = "meta refresh"
Expand All @@ -1148,8 +1209,7 @@ def setUp(self):
self.mw = self.mwcls.from_crawler(crawler)

def _body(self, interval=5, url="http://example.org/newpage"):
html = f"""<html><head><meta http-equiv="refresh" content="{interval};url={url}"/></head></html>"""
return html.encode("utf-8")
return meta_refresh_body(url, interval)

def get_response(self, request, location):
return HtmlResponse(request.url, body=self._body(url=location))
Expand Down Expand Up @@ -1216,5 +1276,45 @@ def test_ignore_tags_1_x_list(self):
assert isinstance(response, Response)


@pytest.mark.parametrize(
    SCHEME_PARAMS,
    (
        *REDIRECT_SCHEME_CASES,
        # data/file/ftp/s3/foo → * does not redirect
        *(
            (
                f"{input_scheme}://example.com/a",
                f"{output_scheme}://example.com/b",
                None,
            )
            for input_scheme in NON_HTTP_SCHEMES
            for output_scheme in chain(HTTP_SCHEMES, NON_HTTP_SCHEMES)
        ),
        # data/file/ftp/s3/foo → relative does not redirect
        *(
            (
                f"{scheme}://example.com/a",
                location,
                None,
            )
            for scheme in NON_HTTP_SCHEMES
            for location in ("//example.com/b", "/b")
        ),
    ),
)
def test_meta_refresh_schemes(url, location, target):
    """Check which URL schemes MetaRefreshMiddleware follows.

    An HTML response for *url* whose meta refresh tag points at *location*
    must produce a request for *target*. When *target* is ``None`` the
    middleware must return the response untouched instead of redirecting.
    """
    crawler = get_crawler(Spider)
    spider = crawler._create_spider("foo")
    mw = MetaRefreshMiddleware.from_crawler(crawler)
    request = Request(url)
    response = HtmlResponse(url, body=meta_refresh_body(location))
    redirect = mw.process_response(request, response, spider)
    if target is None:
        # No redirect: the very same response object must come back, so an
        # identity check is used (as in the other no-redirect tests here).
        assert redirect is response
    else:
        assert isinstance(redirect, Request)
        assert redirect.url == target


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 36287cb

Please sign in to comment.