Replace urlparse with urlparse_cached where possible (#6229)
Laerte committed Feb 20, 2024
1 parent c4e4b9b commit ee11895
Showing 7 changed files with 25 additions and 22 deletions.
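For context on the change: urlparse_cached takes a Request or Response object instead of a URL string and memoizes the parse result per object, so repeated lookups of scheme, netloc, or path on the same object parse the URL only once. A minimal sketch of the idea, lightly simplified from scrapy/utils/httpobj.py (details from memory, not a verbatim copy):

    # A WeakKeyDictionary maps each Request/Response object to its parsed
    # URL; entries disappear automatically when the object is collected.
    from urllib.parse import ParseResult, urlparse
    from weakref import WeakKeyDictionary

    _urlparse_cache: WeakKeyDictionary = WeakKeyDictionary()


    def urlparse_cached(request_or_response) -> ParseResult:
        # Parse the object's URL once; later calls return the cached result.
        if request_or_response not in _urlparse_cache:
            _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
        return _urlparse_cache[request_or_response]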
8 changes: 4 additions & 4 deletions docs/topics/media-pipeline.rst
@@ -532,14 +532,14 @@ See here the methods that you can override in your custom Files Pipeline:
 .. code-block:: python

     from pathlib import PurePosixPath
-    from urllib.parse import urlparse
+    from scrapy.utils.httpobj import urlparse_cached

     from scrapy.pipelines.files import FilesPipeline

     class MyFilesPipeline(FilesPipeline):
         def file_path(self, request, response=None, info=None, *, item=None):
-            return "files/" + PurePosixPath(urlparse(request.url).path).name
+            return "files/" + PurePosixPath(urlparse_cached(request).path).name

 Similarly, you can use the ``item`` to determine the file path based on some item
 property.
@@ -690,14 +690,14 @@ See here the methods that you can override in your custom Images Pipeline:
 .. code-block:: python

     from pathlib import PurePosixPath
-    from urllib.parse import urlparse
+    from scrapy.utils.httpobj import urlparse_cached

     from scrapy.pipelines.images import ImagesPipeline

     class MyImagesPipeline(ImagesPipeline):
         def file_path(self, request, response=None, info=None, *, item=None):
-            return "files/" + PurePosixPath(urlparse(request.url).path).name
+            return "files/" + PurePosixPath(urlparse_cached(request).path).name

 Similarly, you can use the ``item`` to determine the file path based on some item
 property.
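The behavior of file_path is unchanged; only the parse is now cached. For a made-up URL like https://example.com/some/path/photo.jpg, the old and new expressions reduce to the same name:

    from pathlib import PurePosixPath
    from urllib.parse import urlparse

    # Hypothetical URL, for illustration only.
    url = "https://example.com/some/path/photo.jpg"
    print("files/" + PurePosixPath(urlparse(url).path).name)  # files/photo.jpg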
6 changes: 3 additions & 3 deletions scrapy/core/http2/stream.py
@@ -2,7 +2,6 @@
 from enum import Enum
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
-from urllib.parse import urlparse

 from h2.errors import ErrorCodes
 from h2.exceptions import H2Error, ProtocolError, StreamClosedError
@@ -15,6 +14,7 @@
 from scrapy.http import Request
 from scrapy.http.headers import Headers
 from scrapy.responsetypes import responsetypes
+from scrapy.utils.httpobj import urlparse_cached

 if TYPE_CHECKING:
     from scrapy.core.http2.protocol import H2ClientProtocol
@@ -185,7 +185,7 @@ def get_response(self) -> Deferred:

     def check_request_url(self) -> bool:
         # Make sure that we are sending the request to the correct URL
-        url = urlparse(self._request.url)
+        url = urlparse_cached(self._request)
         return (
             url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
             or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
@@ -194,7 +194,7 @@ def check_request_url(self) -> bool:
         )

     def _get_request_headers(self) -> List[Tuple[str, str]]:
-        url = urlparse(self._request.url)
+        url = urlparse_cached(self._request)

         path = url.path
         if url.query:
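This file is where the caching pays off most directly: the same request is parsed in both check_request_url() and _get_request_headers(), so the second call is now a dictionary lookup returning the very same ParseResult. A quick sanity check (example URL made up):

    from scrapy.http import Request
    from scrapy.utils.httpobj import urlparse_cached

    req = Request("https://example.com/index?q=1")  # hypothetical URL
    # Both calls return the identical cached ParseResult object.
    assert urlparse_cached(req) is urlparse_cached(req)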
4 changes: 2 additions & 2 deletions scrapy/downloadermiddlewares/redirect.py
@@ -2,7 +2,7 @@

 import logging
 from typing import TYPE_CHECKING, Any, List, Union, cast
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin

 from w3lib.url import safe_url_string

@@ -125,7 +125,7 @@ def process_response(
         assert response.headers["Location"] is not None
         location = safe_url_string(response.headers["Location"])
         if response.headers["Location"].startswith(b"//"):
-            request_scheme = urlparse(request.url).scheme
+            request_scheme = urlparse_cached(request).scheme
             location = request_scheme + "://" + location.lstrip("/")

         redirected_url = urljoin(request.url, location)
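This branch handles protocol-relative Location headers; only the scheme lookup changed. Roughly, for a request that was sent over https (the values below are made up):

    # Location: //other.example/path  ->  https://other.example/path
    location = "//other.example/path"  # hypothetical header value
    request_scheme = "https"           # what urlparse_cached(request).scheme yields
    print(request_scheme + "://" + location.lstrip("/"))
    # https://other.example/path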
3 changes: 2 additions & 1 deletion tests/CrawlerRunner/ip_address.py
@@ -9,6 +9,7 @@

 from scrapy import Request, Spider
 from scrapy.crawler import CrawlerRunner
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.log import configure_logging
 from tests.mockserver import MockDNSServer, MockServer

@@ -30,7 +31,7 @@ def start_requests(self):
         yield Request(self.url)

     def parse(self, response):
-        netloc = urlparse(response.url).netloc
+        netloc = urlparse_cached(response).netloc
         host = netloc.split(":")[0]
         self.logger.info(f"Host: {host}")
         self.logger.info(f"Type: {type(response.ip_address)}")
10 changes: 5 additions & 5 deletions tests/test_http_cookies.py
@@ -1,8 +1,8 @@
 from unittest import TestCase
-from urllib.parse import urlparse

 from scrapy.http import Request, Response
 from scrapy.http.cookies import WrappedRequest, WrappedResponse
+from scrapy.utils.httpobj import urlparse_cached


 class WrappedRequestTest(TestCase):
@@ -17,12 +17,12 @@ def test_get_full_url(self):
         self.assertEqual(self.wrapped.full_url, self.request.url)

     def test_get_host(self):
-        self.assertEqual(self.wrapped.get_host(), urlparse(self.request.url).netloc)
-        self.assertEqual(self.wrapped.host, urlparse(self.request.url).netloc)
+        self.assertEqual(self.wrapped.get_host(), urlparse_cached(self.request).netloc)
+        self.assertEqual(self.wrapped.host, urlparse_cached(self.request).netloc)

     def test_get_type(self):
-        self.assertEqual(self.wrapped.get_type(), urlparse(self.request.url).scheme)
-        self.assertEqual(self.wrapped.type, urlparse(self.request.url).scheme)
+        self.assertEqual(self.wrapped.get_type(), urlparse_cached(self.request).scheme)
+        self.assertEqual(self.wrapped.type, urlparse_cached(self.request).scheme)

     def test_is_unverifiable(self):
         self.assertFalse(self.wrapped.is_unverifiable())
11 changes: 6 additions & 5 deletions tests/test_http_request.py
@@ -5,7 +5,7 @@
 import xmlrpc.client
 from typing import Any, Dict, List
 from unittest import mock
-from urllib.parse import parse_qs, unquote_to_bytes, urlparse
+from urllib.parse import parse_qs, unquote_to_bytes

 from scrapy.http import (
     FormRequest,
@@ -16,6 +16,7 @@
     XmlRpcRequest,
 )
 from scrapy.http.request import NO_CALLBACK
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.python import to_bytes, to_unicode

@@ -617,8 +618,8 @@ def test_from_response_duplicate_form_key(self):
             method="GET",
             formdata=(("foo", "bar"), ("foo", "baz")),
         )
-        self.assertEqual(urlparse(req.url).hostname, "www.example.com")
-        self.assertEqual(urlparse(req.url).query, "foo=bar&foo=baz")
+        self.assertEqual(urlparse_cached(req).hostname, "www.example.com")
+        self.assertEqual(urlparse_cached(req).query, "foo=bar&foo=baz")

     def test_from_response_override_duplicate_form_key(self):
         response = _buildresponse(
@@ -666,8 +667,8 @@ def test_from_response_get(self):
             response, formdata={"one": ["two", "three"], "six": "seven"}
         )
         self.assertEqual(r1.method, "GET")
-        self.assertEqual(urlparse(r1.url).hostname, "www.example.com")
-        self.assertEqual(urlparse(r1.url).path, "/this/get.php")
+        self.assertEqual(urlparse_cached(r1).hostname, "www.example.com")
+        self.assertEqual(urlparse_cached(r1).path, "/this/get.php")
         fs = _qs(r1)
         self.assertEqual(set(fs[b"test"]), {b"val1", b"val2"})
         self.assertEqual(set(fs[b"one"]), {b"two", b"three"})
5 changes: 3 additions & 2 deletions tests/test_scheduler_base.py
@@ -1,6 +1,6 @@
 from typing import Dict, Optional
 from unittest import TestCase
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin

 from testfixtures import LogCapture
 from twisted.internet import defer
@@ -9,6 +9,7 @@
 from scrapy.core.scheduler import BaseScheduler
 from scrapy.http import Request
 from scrapy.spiders import Spider
+from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.request import fingerprint
 from scrapy.utils.test import get_crawler
 from tests.mockserver import MockServer
@@ -57,7 +58,7 @@ def __init__(self, mockserver, *args, **kwargs):
         self.start_urls = map(mockserver.url, PATHS)

     def parse(self, response):
-        return {"path": urlparse(response.url).path}
+        return {"path": urlparse_cached(response).path}


 class InterfaceCheckMixin:
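Note that ip_address.py and test_scheduler_base.py call the helper on a Response rather than a Request; since the cache is keyed on the object itself, both work the same way. A small illustrative sketch (URL made up):

    from scrapy.http import Response
    from scrapy.utils.httpobj import urlparse_cached

    resp = Response("https://example.com/this/get.php?foo=bar")  # hypothetical
    print(urlparse_cached(resp).path)   # /this/get.php
    print(urlparse_cached(resp).query)  # foo=bar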
