From 16e457abcf20e1981fc6aa43ff62c4df3b65cadc Mon Sep 17 00:00:00 2001 From: mazen-r Date: Thu, 23 Jan 2025 11:57:56 +0200 Subject: [PATCH 1/4] support zstd and deflate auto compression --- .../extract/passing_compressed_document.py | 4 +- scrapfly/extraction_config.py | 38 +++++++++++++------ 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/examples/extract/passing_compressed_document.py b/examples/extract/passing_compressed_document.py index 0f7bbe1..88bf1f6 100644 --- a/examples/extract/passing_compressed_document.py +++ b/examples/extract/passing_compressed_document.py @@ -22,8 +22,8 @@ is_document_compressed=False, # specify that the sent document is not compressed to compress it document_compression_format=CompressionFormat.GZIP # specify that compression format # If both is_document_compressed and document_compression_format are ignored, the raw HTML sould be sent - # If is_document_compressed is set to false and CompressionFormat set to GZIP, the SDK will automatically compress the document to gzip - # is_document_compressed is set to false and CompressionFormat set to ZSTD or DEFLATE, the document passed to ExtractionConfig must be manually compressed + # If is_document_compressed is set to false and CompressionFormat is set, the SDK will automatically compress the document to the specified format + # is_document_compressed is set to false and CompressionFormat set to ZSTD or DEFLATE, the document passed to ExtractionConfig must be manually compressed ) ) diff --git a/scrapfly/extraction_config.py b/scrapfly/extraction_config.py index 847e456..a63c49c 100644 --- a/scrapfly/extraction_config.py +++ b/scrapfly/extraction_config.py @@ -14,6 +14,7 @@ class CompressionFormat(Enum): Attributes: GZIP: gzip format. ZSTD: zstd format. + DEFLATE: deflate format. """ GZIP = "gzip" @@ -99,12 +100,22 @@ def __init__( if compression_foramt == CompressionFormat.GZIP.value: import gzip - self.body = gzip.compress(bytes(self.body, 'utf-8')) - else: - raise ExtractionConfigError( - f'Auto compression for {compression_foramt} format is not available. ' - f'You can manually compress to {compression_foramt} or choose the gzip format for auto compression.' - ) + self.body = gzip.compress(self.body.encode('utf-8')) + + elif compression_foramt == CompressionFormat.ZSTD.value: + try: + import zstandard as zstd + except ImportError: + raise ExtractionConfigError( + f'zstandard is not installed. You must run pip install zstandard' + f' to auto compress into zstd or use compression formats.' + ) + self.body = zstd.compress(self.body.encode('utf-8')) + + elif compression_foramt == CompressionFormat.DEFLATE.value: + import zlib + compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression + self.body = compressor.compress(self.body.encode('utf-8')) + compressor.flush() def to_api_params(self, key: str) -> Dict: params = { @@ -149,11 +160,16 @@ def to_dict(self) -> Dict: if compression_foramt == CompressionFormat.GZIP.value: import gzip self.body = gzip.decompress(self.body).decode('utf-8') - else: - raise ExtractionConfigError( - f'Auto decompression for {compression_foramt} format is not available. ' - f'You can manually decompress to {compression_foramt} or choose the gzip format for auto decompression.' - ) + + elif compression_foramt == CompressionFormat.ZSTD.value: + import zstandard as zstd + self.body = zstd.decompress(self.body).decode('utf-8') + + elif compression_foramt == CompressionFormat.DEFLATE.value: + import zlib + decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) + self.body = decompressor.decompress(self.body) + decompressor.flush() + self.body = self.body.decode('utf-8') return { 'body': self.body, From e258633ac1365fc865ad82937050e50e3c307b6c Mon Sep 17 00:00:00 2001 From: mazen-r Date: Thu, 23 Jan 2025 17:17:58 +0200 Subject: [PATCH 2/4] add auto compression detection support --- .../extract/passing_compressed_document.py | 5 +- scrapfly/extraction_config.py | 85 ++++++++++++++++--- 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/examples/extract/passing_compressed_document.py b/examples/extract/passing_compressed_document.py index 88bf1f6..80b83c3 100644 --- a/examples/extract/passing_compressed_document.py +++ b/examples/extract/passing_compressed_document.py @@ -19,11 +19,8 @@ content_type='text/html', charset='utf-8', extraction_model='review_list', - is_document_compressed=False, # specify that the sent document is not compressed to compress it document_compression_format=CompressionFormat.GZIP # specify that compression format - # If both is_document_compressed and document_compression_format are ignored, the raw HTML sould be sent - # If is_document_compressed is set to false and CompressionFormat is set, the SDK will automatically compress the document to the specified format - # is_document_compressed is set to false and CompressionFormat set to ZSTD or DEFLATE, the document passed to ExtractionConfig must be manually compressed + # If the body is not compressed, Scrapfly will automatically compress it based on the document_compression_format value ) ) diff --git a/scrapfly/extraction_config.py b/scrapfly/extraction_config.py index a63c49c..1fd4078 100644 --- a/scrapfly/extraction_config.py +++ b/scrapfly/extraction_config.py @@ -1,7 +1,7 @@ import json import warnings from enum import Enum -from typing import Optional, Dict +from typing import Optional, Dict, Union from urllib.parse import quote_plus from base64 import urlsafe_b64encode from .api_config import BaseApiConfig @@ -27,7 +27,7 @@ class ExtractionConfigError(Exception): class ExtractionConfig(BaseApiConfig): - body: str + body: Union[str, bytes] content_type: str url: Optional[str] = None charset: Optional[str] = None @@ -46,7 +46,7 @@ class ExtractionConfig(BaseApiConfig): def __init__( self, - body: str, + body: Union[str, bytes], content_type: str, url: Optional[str] = None, charset: Optional[str] = None, @@ -91,16 +91,30 @@ def __init__( self.raise_on_upstream_error = raise_on_upstream_error if self.document_compression_format is not None: + if self.is_document_compressed is None: - raise ExtractionConfigError( - 'When declaring compression format, your must declare the is_document_compressed parameter to compress the document or skip it.' - ) + compression_foramt = detect_compression_format(self.body) + + if compression_foramt == 'unknown': + self.is_document_compressed = False + + else: + if compression_foramt != self.document_compression_format.value: + raise ExtractionConfigError( + f'The detected compression format `{compression_foramt}` does not match declared format `{self.document_compression_format.value}`. ' + f'You must pass the compression format or disable compression.' + ) + self.is_document_compressed = True + if self.is_document_compressed is False: compression_foramt = CompressionFormat(self.document_compression_format).value if self.document_compression_format else None + + if isinstance(self.body, str) and compression_foramt: + self.body = self.body.encode('utf-8') if compression_foramt == CompressionFormat.GZIP.value: import gzip - self.body = gzip.compress(self.body.encode('utf-8')) + self.body = gzip.compress(self.body) elif compression_foramt == CompressionFormat.ZSTD.value: try: @@ -110,12 +124,12 @@ def __init__( f'zstandard is not installed. You must run pip install zstandard' f' to auto compress into zstd or use compression formats.' ) - self.body = zstd.compress(self.body.encode('utf-8')) + self.body = zstd.compress(self.body) elif compression_foramt == CompressionFormat.DEFLATE.value: import zlib compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression - self.body = compressor.compress(self.body.encode('utf-8')) + compressor.flush() + self.body = compressor.compress(self.body) + compressor.flush() def to_api_params(self, key: str) -> Dict: params = { @@ -159,20 +173,22 @@ def to_dict(self) -> Dict: if compression_foramt == CompressionFormat.GZIP.value: import gzip - self.body = gzip.decompress(self.body).decode('utf-8') + body = gzip.decompress(self.body) elif compression_foramt == CompressionFormat.ZSTD.value: import zstandard as zstd - self.body = zstd.decompress(self.body).decode('utf-8') + body = zstd.decompress(self.body) elif compression_foramt == CompressionFormat.DEFLATE.value: import zlib decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) - self.body = decompressor.decompress(self.body) + decompressor.flush() - self.body = self.body.decode('utf-8') + body = decompressor.decompress(self.body) + decompressor.flush() + + if isinstance(self.body, str) and compression_foramt: + body = self.body.decode('utf-8') return { - 'body': self.body, + 'body': body, # don't alter the already compressed body 'content_type': self.content_type, 'url': self.url, 'charset': self.charset, @@ -219,3 +235,44 @@ def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig': webhook=webhook, raise_on_upstream_error=raise_on_upstream_error ) + + +def detect_compression_format(data): + """ + Detects the compression type of the given data. + + Args: + data: The compressed data as bytes. + + Returns: + The name of the compression type ("gzip", "zstd", "deflate", "unknown"). + """ + + if len(data) < 2: + return 'unknown' + + # gzip + if data[0] == 0x1f and data[1] == 0x8b: + return 'gzip' + + # zstd + zstd_magic_numbers = [ + b'\x1e\xb5\x2f\xfd', # v0.1 + b'\x22\xb5\x2f\xfd', # v0.2 + b'\x23\xb5\x2f\xfd', # v0.3 + b'\x24\xb5\x2f\xfd', # v0.4 + b'\x25\xb5\x2f\xfd', # v0.5 + b'\x26\xb5\x2f\xfd', # v0.6 + b'\x27\xb5\x2f\xfd', # v0.7 + b'\x28\xb5\x2f\xfd', # v0.8 + ] + for magic in zstd_magic_numbers: + if data[:len(magic)] == magic: + return 'zstd' + + # deflate + if data[0] == 0x78: + if data[1] in (0x01, 0x5E, 0x9C, 0xDA): + return 'deflate' + + return 'unknown' \ No newline at end of file From d098ae373bf3fc620791a6ae5e79c53633663b0c Mon Sep 17 00:00:00 2001 From: mazen-r Date: Thu, 23 Jan 2025 19:56:31 +0200 Subject: [PATCH 3/4] update auto compression detection --- scrapfly/extraction_config.py | 44 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/scrapfly/extraction_config.py b/scrapfly/extraction_config.py index 1fd4078..58a2305 100644 --- a/scrapfly/extraction_config.py +++ b/scrapfly/extraction_config.py @@ -90,21 +90,19 @@ def __init__( self.webhook = webhook self.raise_on_upstream_error = raise_on_upstream_error - if self.document_compression_format is not None: - - if self.is_document_compressed is None: - compression_foramt = detect_compression_format(self.body) - - if compression_foramt == 'unknown': - self.is_document_compressed = False + if isinstance(body, bytes) or document_compression_format: + compression_format = detect_compression_format(body) - else: - if compression_foramt != self.document_compression_format.value: - raise ExtractionConfigError( - f'The detected compression format `{compression_foramt}` does not match declared format `{self.document_compression_format.value}`. ' - f'You must pass the compression format or disable compression.' - ) - self.is_document_compressed = True + if compression_format is not None: + self.is_document_compressed = True + + if self.document_compression_format and compression_format != self.document_compression_format.value: + raise ExtractionConfigError( + f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format.value}`. ' + f'You must pass the compression format or disable compression.' + ) + + self.document_compression_format = CompressionFormat(compression_format) if self.is_document_compressed is False: compression_foramt = CompressionFormat(self.document_compression_format).value if self.document_compression_format else None @@ -168,27 +166,29 @@ def to_dict(self) -> Dict: """ Export the ExtractionConfig instance to a plain dictionary. """ - if self.is_document_compressed is False and self.document_compression_format: + + if self.is_document_compressed is True: compression_foramt = CompressionFormat(self.document_compression_format).value if self.document_compression_format else None if compression_foramt == CompressionFormat.GZIP.value: import gzip - body = gzip.decompress(self.body) + self.body = gzip.decompress(self.body) elif compression_foramt == CompressionFormat.ZSTD.value: import zstandard as zstd - body = zstd.decompress(self.body) + self.body = zstd.decompress(self.body) elif compression_foramt == CompressionFormat.DEFLATE.value: import zlib decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) - body = decompressor.decompress(self.body) + decompressor.flush() + self.body = decompressor.decompress(self.body) + decompressor.flush() - if isinstance(self.body, str) and compression_foramt: - body = self.body.decode('utf-8') + if isinstance(self.body, bytes): + self.body = self.body.decode('utf-8') + self.is_document_compressed = False return { - 'body': body, # don't alter the already compressed body + 'body': self.body, 'content_type': self.content_type, 'url': self.url, 'charset': self.charset, @@ -275,4 +275,4 @@ def detect_compression_format(data): if data[1] in (0x01, 0x5E, 0x9C, 0xDA): return 'deflate' - return 'unknown' \ No newline at end of file + return None \ No newline at end of file From d9b8688252de5d9aff7e97d0ac13e236e4664a94 Mon Sep 17 00:00:00 2001 From: mazen-r Date: Fri, 24 Jan 2025 13:24:08 +0200 Subject: [PATCH 4/4] use enums directly --- scrapfly/extraction_config.py | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/scrapfly/extraction_config.py b/scrapfly/extraction_config.py index 58a2305..5aeb922 100644 --- a/scrapfly/extraction_config.py +++ b/scrapfly/extraction_config.py @@ -86,7 +86,7 @@ def __init__( self.extraction_prompt = extraction_prompt self.extraction_model = extraction_model self.is_document_compressed = is_document_compressed - self.document_compression_format = document_compression_format + self.document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None self.webhook = webhook self.raise_on_upstream_error = raise_on_upstream_error @@ -96,25 +96,28 @@ def __init__( if compression_format is not None: self.is_document_compressed = True - if self.document_compression_format and compression_format != self.document_compression_format.value: + if self.document_compression_format and compression_format != self.document_compression_format: raise ExtractionConfigError( - f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format.value}`. ' + f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format}`. ' f'You must pass the compression format or disable compression.' ) - self.document_compression_format = CompressionFormat(compression_format) + self.document_compression_format = compression_format + + else: + self.is_document_compressed = False if self.is_document_compressed is False: - compression_foramt = CompressionFormat(self.document_compression_format).value if self.document_compression_format else None + compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None if isinstance(self.body, str) and compression_foramt: self.body = self.body.encode('utf-8') - if compression_foramt == CompressionFormat.GZIP.value: + if compression_foramt == CompressionFormat.GZIP: import gzip self.body = gzip.compress(self.body) - elif compression_foramt == CompressionFormat.ZSTD.value: + elif compression_foramt == CompressionFormat.ZSTD: try: import zstandard as zstd except ImportError: @@ -124,7 +127,7 @@ def __init__( ) self.body = zstd.compress(self.body) - elif compression_foramt == CompressionFormat.DEFLATE.value: + elif compression_foramt == CompressionFormat.DEFLATE: import zlib compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression self.body = compressor.compress(self.body) + compressor.flush() @@ -168,17 +171,17 @@ def to_dict(self) -> Dict: """ if self.is_document_compressed is True: - compression_foramt = CompressionFormat(self.document_compression_format).value if self.document_compression_format else None + compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None - if compression_foramt == CompressionFormat.GZIP.value: + if compression_foramt == CompressionFormat.GZIP: import gzip self.body = gzip.decompress(self.body) - elif compression_foramt == CompressionFormat.ZSTD.value: + elif compression_foramt == CompressionFormat.ZSTD: import zstandard as zstd self.body = zstd.decompress(self.body) - elif compression_foramt == CompressionFormat.DEFLATE.value: + elif compression_foramt == CompressionFormat.DEFLATE: import zlib decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) self.body = decompressor.decompress(self.body) + decompressor.flush() @@ -237,7 +240,7 @@ def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig': ) -def detect_compression_format(data): +def detect_compression_format(data) -> Optional[CompressionFormat]: """ Detects the compression type of the given data. @@ -249,11 +252,11 @@ def detect_compression_format(data): """ if len(data) < 2: - return 'unknown' + return None # gzip if data[0] == 0x1f and data[1] == 0x8b: - return 'gzip' + return CompressionFormat.GZIP # zstd zstd_magic_numbers = [ @@ -268,11 +271,11 @@ def detect_compression_format(data): ] for magic in zstd_magic_numbers: if data[:len(magic)] == magic: - return 'zstd' + return CompressionFormat.ZSTD # deflate if data[0] == 0x78: if data[1] in (0x01, 0x5E, 0x9C, 0xDA): - return 'deflate' + return CompressionFormat.DEFLATE return None \ No newline at end of file