Replace urlparse with urlparse_cached where possible (#6229)
Laerte committed Feb 20, 2024
1 parent c4e4b9b commit ee11895
Showing 7 changed files with 25 additions and 22 deletions.
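For context on the change: urlparse_cached takes a Request or Response object instead of a URL string and memoizes the parse result per object, so repeated lookups of scheme, netloc, or path on the same object parse the URL only once. A minimal sketch of the idea, lightly simplified from scrapy/utils/httpobj.py (details from memory, not a verbatim copy):

    # A WeakKeyDictionary maps each Request/Response object to its parsed
    # URL; entries disappear automatically when the object is collected.
    from urllib.parse import ParseResult, urlparse
    from weakref import WeakKeyDictionary

    _urlparse_cache: WeakKeyDictionary = WeakKeyDictionary()


    def urlparse_cached(request_or_response) -> ParseResult:
        # Parse the object's URL once; later calls return the cached result.
        if request_or_response not in _urlparse_cache:
            _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
        return _urlparse_cache[request_or_response]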
8 changes: 4 additions & 4 deletions docs/topics/media-pipeline.rst
@@ -532,14 +532,14 @@ See here the methods that you can override in your custom Files Pipeline:
 .. code-block:: python

     from pathlib import PurePosixPath
-    from urllib.parse import urlparse
+    from scrapy.utils.httpobj import urlparse_cached

     from scrapy.pipelines.files import FilesPipeline

     class MyFilesPipeline(FilesPipeline):
         def file_path(self, request, response=None, info=None, *, item=None):
-            return "files/" + PurePosixPath(urlparse(request.url).path).name
+            return "files/" + PurePosixPath(urlparse_cached(request).path).name

 Similarly, you can use the ``item`` to determine the file path based on some item
 property.
@@ -690,14 +690,14 @@ See here the methods that you can override in your custom Images Pipeline:
 .. code-block:: python

     from pathlib import PurePosixPath
-    from urllib.parse import urlparse
+    from scrapy.utils.httpobj import urlparse_cached

     from scrapy.pipelines.images import ImagesPipeline

     class MyImagesPipeline(ImagesPipeline):
         def file_path(self, request, response=None, info=None, *, item=None):
-            return "files/" + PurePosixPath(urlparse(request.url).path).name
+            return "files/" + PurePosixPath(urlparse_cached(request).path).name

 Similarly, you can use the ``item`` to determine the file path based on some item
 property.
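The behavior of file_path is unchanged; only the parse is now cached. For a made-up URL like https://example.com/some/path/photo.jpg, the old and new expressions reduce to the same name:

    from pathlib import PurePosixPath
    from urllib.parse import urlparse

    # Hypothetical URL, for illustration only.
    url = "https://example.com/some/path/photo.jpg"
    print("files/" + PurePosixPath(urlparse(url).path).name)  # files/photo.jpg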
6 changes: 3 additions & 3 deletions scrapy/core/http2/stream.py
@@ -2,7 +2,6 @@
 from enum import Enum
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
-from urllib.parse import urlparse

 from h2.errors import ErrorCodes
 from h2.exceptions import H2Error, ProtocolError, StreamClosedError
@@ -15,6 +14,7 @@
 from scrapy.http import Request
 from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
+from scrapy.utils.httpobj import urlparse_cached

 if TYPE_CHECKING:
     from scrapy.core.http2.protocol import H2ClientProtocol
@@ -185,7 +185,7 @@ def get_response(self) -> Deferred:

     def check_request_url(self) -> bool:
         # Make sure that we are sending the request to the correct URL
-        url = urlparse(self._request.url)
+        url = urlparse_cached(self._request)
         return (
             url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
             or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
@@ -194,7 +194,7 @@ def check_request_url(self) -> bool:
         )

     def _get_request_headers(self) -> List[Tuple[str, str]]:
-        url = urlparse(self._request.url)
+        url = urlparse_cached(self._request)

         path = url.path
         if url.query:
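This file is where the caching pays off most directly: the same request is parsed in both check_request_url() and _get_request_headers(), so the second call is now a dictionary lookup returning the very same ParseResult. A quick sanity check (example URL made up):

    from scrapy.http import Request
    from scrapy.utils.httpobj import urlparse_cached

    req = Request("https://example.com/index?q=1")  # hypothetical URL
    # Both calls return the identical cached ParseResult object.
    assert urlparse_cached(req) is urlparse_cached(req)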
4 changes: 2 additions & 2 deletions scrapy/downloadermiddlewares/redirect.py
@@ -2,7 +2,7 @@

 import logging
 from typing import TYPE_CHECKING, Any, List, Union, cast
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin

 from w3lib.url import safe_url_string

@@ -125,7 +125,7 @@ def process_response(
         assert response.headers["Location"] is not None
         location = safe_url_string(response.headers["Location"])
         if response.headers["Location"].startswith(b"//"):
-            request_scheme = urlparse(request.url).scheme
+            request_scheme = urlparse_cached(request).scheme
             location = request_scheme + "://" + location.lstrip("/")

         redirected_url = urljoin(request.url, location)
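This branch handles protocol-relative Location headers; only the scheme lookup changed. Roughly, for a request that was sent over https (the values below are made up):

    # Location: //other.example/path  ->  https://other.example/path
    location = "//other.example/path"  # hypothetical header value
    request_scheme = "https"           # what urlparse_cached(request).scheme yields
    print(request_scheme + "://" + location.lstrip("/"))
    # https://other.example/path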
3 changes: 2 additions & 1 deletion tests/CrawlerRunner/ip_address.py
@@ -9,6 +9,7 @@

 from scrapy import Request, Spider
 from scrapy.crawler import CrawlerRunner
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.log import configure_logging
 from tests.mockserver import MockDNSServer, MockServer

@@ -30,7 +31,7 @@ def start_requests(self):
         yield Request(self.url)

     def parse(self, response):
-        netloc = urlparse(response.url).netloc
+        netloc = urlparse_cached(response).netloc
         host = netloc.split(":")[0]
         self.logger.info(f"Host: {host}")
         self.logger.info(f"Type: {type(response.ip_address)}")
10 changes: 5 additions & 5 deletions tests/test_http_cookies.py
@@ -1,8 +1,8 @@
 from unittest import TestCase
-from urllib.parse import urlparse

 from scrapy.http import Request, Response
 from scrapy.http.cookies import WrappedRequest, WrappedResponse
+from scrapy.utils.httpobj import urlparse_cached


 class WrappedRequestTest(TestCase):
@@ -17,12 +17,12 @@ def test_get_full_url(self):
         self.assertEqual(self.wrapped.full_url, self.request.url)

     def test_get_host(self):
-        self.assertEqual(self.wrapped.get_host(), urlparse(self.request.url).netloc)
-        self.assertEqual(self.wrapped.host, urlparse(self.request.url).netloc)
+        self.assertEqual(self.wrapped.get_host(), urlparse_cached(self.request).netloc)
+        self.assertEqual(self.wrapped.host, urlparse_cached(self.request).netloc)

     def test_get_type(self):
-        self.assertEqual(self.wrapped.get_type(), urlparse(self.request.url).scheme)
-        self.assertEqual(self.wrapped.type, urlparse(self.request.url).scheme)
+        self.assertEqual(self.wrapped.get_type(), urlparse_cached(self.request).scheme)
+        self.assertEqual(self.wrapped.type, urlparse_cached(self.request).scheme)

     def test_is_unverifiable(self):
         self.assertFalse(self.wrapped.is_unverifiable())
11 changes: 6 additions & 5 deletions tests/test_http_request.py
@@ -5,7 +5,7 @@
 import xmlrpc.client
 from typing import Any, Dict, List
 from unittest import mock
-from urllib.parse import parse_qs, unquote_to_bytes, urlparse
+from urllib.parse import parse_qs, unquote_to_bytes

 from scrapy.http import (
     FormRequest,
@@ -16,6 +16,7 @@
     XmlRpcRequest,
 )
 from scrapy.http.request import NO_CALLBACK
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes, to_unicode

@@ -617,8 +618,8 @@ def test_from_response_duplicate_form_key(self):
             method="GET",
             formdata=(("foo", "bar"), ("foo", "baz")),
         )
-        self.assertEqual(urlparse(req.url).hostname, "www.example.com")
-        self.assertEqual(urlparse(req.url).query, "foo=bar&foo=baz")
+        self.assertEqual(urlparse_cached(req).hostname, "www.example.com")
+        self.assertEqual(urlparse_cached(req).query, "foo=bar&foo=baz")

     def test_from_response_override_duplicate_form_key(self):
         response = _buildresponse(
@@ -666,8 +667,8 @@ def test_from_response_get(self):
             response, formdata={"one": ["two", "three"], "six": "seven"}
         )
         self.assertEqual(r1.method, "GET")
-        self.assertEqual(urlparse(r1.url).hostname, "www.example.com")
-        self.assertEqual(urlparse(r1.url).path, "/this/get.php")
+        self.assertEqual(urlparse_cached(r1).hostname, "www.example.com")
+        self.assertEqual(urlparse_cached(r1).path, "/this/get.php")
         fs = _qs(r1)
         self.assertEqual(set(fs[b"test"]), {b"val1", b"val2"})
         self.assertEqual(set(fs[b"one"]), {b"two", b"three"})
5 changes: 3 additions & 2 deletions tests/test_scheduler_base.py
@@ -1,6 +1,6 @@
 from typing import Dict, Optional
 from unittest import TestCase
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin

 from testfixtures import LogCapture
 from twisted.internet import defer
@@ -9,6 +9,7 @@
 from scrapy.core.scheduler import BaseScheduler
 from scrapy.http import Request
 from scrapy.spiders import Spider
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.request import fingerprint
 from scrapy.utils.test import get_crawler
 from tests.mockserver import MockServer
@@ -57,7 +58,7 @@ def __init__(self, mockserver, *args, **kwargs):
         self.start_urls = map(mockserver.url, PATHS)

     def parse(self, response):
-        return {"path": urlparse(response.url).path}
+        return {"path": urlparse_cached(response).path}


 class InterfaceCheckMixin:
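Note that ip_address.py and test_scheduler_base.py call the helper on a Response rather than a Request; since the cache is keyed on the object itself, both work the same way. A small illustrative sketch (URL made up):

    from scrapy.http import Response
    from scrapy.utils.httpobj import urlparse_cached

    resp = Response("https://example.com/this/get.php?foo=bar")  # hypothetical
    print(urlparse_cached(resp).path)   # /this/get.php
    print(urlparse_cached(resp).query)  # foo=bar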
