diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py index 56a58a7508a..5bd1b94ac42 100644 --- a/scrapy/downloadermiddlewares/httpcompression.py +++ b/scrapy/downloadermiddlewares/httpcompression.py @@ -1,12 +1,13 @@ from __future__ import annotations import io +import logging import zlib -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union from scrapy import Request, Spider from scrapy.crawler import Crawler -from scrapy.exceptions import NotConfigured +from scrapy.exceptions import NotConfigured, NotSupported from scrapy.http import Response, TextResponse from scrapy.responsetypes import responsetypes from scrapy.statscollectors import StatsCollector @@ -17,6 +18,10 @@ from typing_extensions import Self ACCEPTED_ENCODINGS: List[bytes] = [b"gzip", b"deflate"] +ENCODINGS_DELIMETER: bytes = b", " + +logger = logging.getLogger(__name__) + try: import brotli @@ -35,7 +40,7 @@ class HttpCompressionMiddleware: """This middleware allows compressed (gzip, deflate) traffic to be - sent/received from web sites""" + sent/received from websites""" def __init__(self, stats: Optional[StatsCollector] = None): self.stats = stats @@ -49,7 +54,10 @@ def from_crawler(cls, crawler: Crawler) -> Self: def process_request( self, request: Request, spider: Spider ) -> Union[Request, Response, None]: - request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS)) + self._raise_unsupported_compressors(request) + request.headers.setdefault( + "Accept-Encoding", ENCODINGS_DELIMETER.join(ACCEPTED_ENCODINGS) + ) return None def process_response( @@ -85,6 +93,22 @@ def process_response( return response + @property + def _raise_unsupported(self) -> Tuple[bytes]: + return (b"br",) + + def _raise_unsupported_compressors(self, request: Request): + encodings = request.headers.getlist("Accept-Encoding") + if encodings and len(encodings): + encodings = 
encodings.pop().split(ENCODINGS_DELIMETER) + unsupported = [key for key in encodings if key not in ACCEPTED_ENCODINGS] + for unsupp in unsupported: + if unsupp in self._raise_unsupported: + raise NotSupported( + f"Request is configured with Accept-Encoding header with unsupported encoding(s): " + f"{unsupp.decode()}" + ) + def _decode(self, body: bytes, encoding: bytes) -> bytes: if encoding == b"gzip" or encoding == b"x-gzip": body = gunzip(body) @@ -99,8 +123,13 @@ def _decode(self, body: bytes, encoding: bytes) -> bytes: # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx # http://www.gzip.org/zlib/zlib_faq.html#faq38 body = zlib.decompress(body, -15) - if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS: - body = brotli.decompress(body) + if encoding == b"br": + if b"br" in ACCEPTED_ENCODINGS: + body = brotli.decompress(body) + else: + logger.warning( + "Brotli encoding received. Cannot decompress the body as Brotli is not installed." + ) if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS: # Using its streaming API since its simple API could handle only cases # where there is content size data embedded in the frame diff --git a/tests/test_downloadermiddleware_httpcompression.py b/tests/test_downloadermiddleware_httpcompression.py index a96b710f307..5505a64d405 100644 --- a/tests/test_downloadermiddleware_httpcompression.py +++ b/tests/test_downloadermiddleware_httpcompression.py @@ -1,6 +1,7 @@ from gzip import GzipFile from io import BytesIO from pathlib import Path +from typing import Tuple from unittest import SkipTest, TestCase from w3lib.encoding import resolve_encoding @@ -9,7 +10,7 @@ ACCEPTED_ENCODINGS, HttpCompressionMiddleware, ) -from scrapy.exceptions import NotConfigured +from scrapy.exceptions import NotConfigured, NotSupported from scrapy.http import HtmlResponse, Request, Response from scrapy.responsetypes import responsetypes from scrapy.spiders import Spider @@ -37,6 +38,12 @@ } +class 
BroHttpCompressionMiddleware(HttpCompressionMiddleware):
+    @property
+    def _raise_unsupported(self) -> Tuple[bytes]:
+        return (b"bro",)
+
+
 class HttpCompressionTest(TestCase):
     def setUp(self):
         self.crawler = get_crawler(Spider)
@@ -102,6 +109,31 @@ def test_process_request(self):
             request.headers.get("Accept-Encoding"), b", ".join(ACCEPTED_ENCODINGS)
         )
 
+    def test_process_request_checks_encodings(self):
+        initial_encodings = ACCEPTED_ENCODINGS.copy()
+
+        mw = BroHttpCompressionMiddleware.from_crawler(self.crawler)
+
+        ACCEPTED_ENCODINGS.append(b"bro")
+
+        request = Request(
+            "http://scrapytest.org",
+            headers={"Accept-Encoding": b", ".join((b"bro", b"gzip"))},
+        )
+        mw.process_request(request, self.spider)
+        """Expecting no Exception raised here as `bro` encoding is forced to be allowed."""
+
+        request = Request(
+            "http://scrapytest.org",
+            headers={"Accept-Encoding": b", ".join((b"bro", b"gzip"))},
+        )
+        ACCEPTED_ENCODINGS.pop()
+        mw = BroHttpCompressionMiddleware.from_crawler(self.crawler)
+        self.assertRaises(NotSupported, mw.process_request, request, self.spider)
+
+        """Checking that valid encodings are back"""
+        self.assertListEqual(ACCEPTED_ENCODINGS, initial_encodings)
+
     def test_process_response_gzip(self):
         response = self._getresponse("gzip")
         request = response.request