Rework and update example - Better handling for encoding issue
jjsaunier committed Oct 27, 2023
1 parent 7fbb8b1 commit 712b37a
Showing 19 changed files with 138 additions and 100 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ scrapfly_sdk.egg-info
.pypirc
*.pyc
venv
+examples/scrapy/demo/images
+!examples/scrapy/demo/images/.gitkeep
16 changes: 8 additions & 8 deletions examples/concurrency.py
@@ -10,14 +10,14 @@
logger.StreamHandler(stdout)

async def main():
-    results = await scrapfly.concurrent_scrape(scrape_configs=[
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True)
-    ])
+    scrape_configs = [
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything')
+    ]


-    print(results)
+    async for result in scrapfly.concurrent_scrape(scrape_configs):
+        print(result)

asyncio.run(main())
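
Note: after this rework, concurrent_scrape is consumed as an async generator, so results are handled one by one as each scrape finishes instead of being awaited as a single list. A minimal self-contained sketch of the new usage (the client construction with a placeholder key is an assumption here; in the example file it sits above the hunk shown):

import asyncio

from scrapfly import ScrapflyClient, ScrapeConfig


async def main():
    scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key
    scrape_configs = [ScrapeConfig(url='https://httpbin.dev/anything') for _ in range(4)]

    # results are yielded as each scrape completes, not returned all at once
    async for result in scrapfly.concurrent_scrape(scrape_configs):
        print(result)


asyncio.run(main())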
4 changes: 0 additions & 4 deletions examples/scrapy/bea/bea/settings.py

This file was deleted.

26 changes: 0 additions & 26 deletions examples/scrapy/bea/bea/spiders/bea.py

This file was deleted.

5 changes: 0 additions & 5 deletions examples/scrapy/bea/scrapy.cfg

This file was deleted.

Empty file.
5 changes: 0 additions & 5 deletions examples/scrapy/covid/covid/settings.py

This file was deleted.

Empty file.
30 changes: 0 additions & 30 deletions examples/scrapy/covid/covid/spiders/covid.py

This file was deleted.

5 changes: 0 additions & 5 deletions examples/scrapy/covid/scrapy.cfg

This file was deleted.

File renamed without changes.
5 changes: 5 additions & 0 deletions examples/scrapy/demo/demo/settings.py
@@ -0,0 +1,5 @@
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'
SCRAPFLY_API_KEY = '__API_KEY__'
CONCURRENT_REQUESTS = 2
IMAGES_STORE = "./images"
File renamed without changes.
71 changes: 71 additions & 0 deletions examples/scrapy/demo/demo/spiders/demo.py
@@ -0,0 +1,71 @@
from scrapy import Item, Field
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.python.failure import Failure

from scrapfly import ScrapeConfig
from scrapfly.errors import ScraperAPIError, ApiHttpServerError
from scrapfly.scrapy import ScrapflyScrapyRequest, ScrapflySpider, ScrapflyScrapyResponse


class Product(Item):

    name = Field()
    price = Field()
    description = Field()

    # scrapy.pipelines.images.ImagesPipeline
    image_urls = Field()
    images = Field()


class Demo(ScrapflySpider):
    name = "demo"

    allowed_domains = ["web-scraping.dev", "httpbin.dev"]
    start_urls = [
        ScrapeConfig("https://web-scraping.dev/product/1", render_js=True),
        ScrapeConfig("https://web-scraping.dev/product/2"),
        ScrapeConfig("https://web-scraping.dev/product/3"),
        ScrapeConfig("https://web-scraping.dev/product/4"),
        ScrapeConfig("https://web-scraping.dev/product/5", render_js=True),
        ScrapeConfig("https://httpbin.dev/status/403", asp=True, retry=False),  # it will fail on purpose
        ScrapeConfig("https://httpbin.dev/status/400"),  # it will fail on purpose - will fall into scrapy.spidermiddlewares.httperror.HttpError
        ScrapeConfig("https://httpbin.dev/status/404"),  # it will fail on purpose - will fall into scrapy.spidermiddlewares.httperror.HttpError
    ]

    def start_requests(self):
        for scrape_config in self.start_urls:
            yield ScrapflyScrapyRequest(scrape_config, callback=self.parse, errback=self.error_handler, dont_filter=True)

    def error_handler(self, failure:Failure):
        if failure.check(ScraperAPIError):  # The scrape errored
            error_code = failure.value.code  # https://scrapfly.io/docs/scrape-api/errors#web_scraping_api_error

            if error_code == "ERR::ASP::SHIELD_PROTECTION_FAILED":
                self.logger.warning("The url %s must be retried" % failure.request.url)
        elif failure.check(HttpError):  # The scrape succeeded but the target server returned a non-success http code >= 400
            response:ScrapflyScrapyResponse = failure.value.response

            if response.status == 404:
                self.logger.warning("The url %s returned a 404 http code - Page not found" % response.url)
            elif response.status == 500:
                raise CloseSpider(reason="The target server returned a 500 http code - Website down")

        elif failure.check(ApiHttpServerError):  # Generic API error: config error, quota reached, etc.
            self.logger.error(failure)
        else:
            self.logger.error(failure)

    def parse(self, response:ScrapflyScrapyResponse, **kwargs):
        item = Product()

        if response.status == 200:
            # make sure the url is absolute
            item['image_urls'] = [response.urljoin(response.css('img.product-img::attr(src)').get())]

        item['name'] = response.css('h3.product-title').get()
        item['price'] = response.css('span.product-price::text').get()
        item['description'] = response.css('p.product-description').get()

        yield item
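
Note: the Product item declares image_urls and images for Scrapy's standard images pipeline (scrapy.pipelines.images.ImagesPipeline, as the comment above hints), and the new settings.py sets IMAGES_STORE = "./images". For images to actually be downloaded, that pipeline normally also has to be enabled via ITEM_PIPELINES; this diff does not show where (or whether) that happens, so the following settings sketch is an assumption rather than part of the commit:

# demo/settings.py (hypothetical addition - only needed if the pipeline is not enabled elsewhere)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = "./images"  # already present in the settings shown above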
File renamed without changes.
5 changes: 5 additions & 0 deletions examples/scrapy/demo/scrapy.cfg
@@ -0,0 +1,5 @@
[settings]
default = demo.settings

[deploy]
project = demo
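
Note: with this scrapy.cfg in place, the demo spider would typically be run from examples/scrapy/demo with scrapy crawl demo, after replacing the __API_KEY__ placeholder in settings.py with a real Scrapfly API key.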
8 changes: 6 additions & 2 deletions scrapfly/api_response.py
@@ -1,3 +1,4 @@
+import base64
import re
import logging as logger
import shutil
@@ -104,7 +105,10 @@ def __call__(self, content: bytes) -> Union[str, Dict]:
        try:
            return self.content_loader(content)
        except Exception as e:
-            raise EncoderError(content=content.decode('utf-8')) from e
+            try:
+                raise EncoderError(content=content.decode('utf-8')) from e
+            except UnicodeError:
+                raise EncoderError(content=base64.b64encode(content).decode('utf-8')) from e


class ScrapeApiResponse:
@@ -364,7 +368,7 @@ def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
            response._content = self.scrape_result['content'].encode('utf-8')
        else:
            response._content = None

        response.headers.update(self.scrape_result['response_headers'])
        response.url = self.scrape_result['url']

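
Note: the encoder change above keeps the failed payload readable as UTF-8 when possible and falls back to a base64 representation when the bytes cannot be decoded. A standalone sketch of that fallback (the helper name is illustrative, not part of the SDK):

import base64


def error_payload(content: bytes) -> str:
    # Hypothetical helper mirroring the fallback above: prefer readable UTF-8,
    # otherwise return the raw bytes base64-encoded so the error can still carry them.
    try:
        return content.decode('utf-8')
    except UnicodeError:
        return base64.b64encode(content).decode('utf-8')


print(error_payload(b'{"ok": true}'))      # plain UTF-8 passes through unchanged
print(error_payload(b'\xff\xd8\xff\xe0'))  # binary bytes (e.g. an image) come back base64-encoded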
43 changes: 31 additions & 12 deletions scrapfly/errors.py
@@ -1,3 +1,4 @@
+import base64
from typing import Optional, Tuple
from requests import Request, Response

@@ -56,6 +57,9 @@ def __init__(self, content:str):
    def __str__(self) -> str:
        return self.content

+    def __repr__(self):
+        return "Invalid payload: %s" % self.content


class ExtraUsageForbidden(ScrapflyError):
    pass
@@ -70,15 +74,18 @@ def __init__(self, request:Request, response:Optional[Response]=None, **kwargs):

    def __str__(self) -> str:
        if isinstance(self, UpstreamHttpError):
-            text = "%s -- %s " % (self.api_response.scrape_result['status_code'], self.api_response.scrape_result['reason'])
+            text = f"Target website responded with {self.api_response.scrape_result['status_code']} - {self.api_response.scrape_result['reason']}"
        else:
-            text = "%s -- %s " % (self.response.status_code, self.response.reason)
+            text = f"{self.response.status_code} - {self.response.reason}"

        if isinstance(self, (ApiHttpClientError, ApiHttpServerError)):
            try:
                text += self.response.content.decode('utf-8')
            except UnicodeError:
-                text += str(self.response.content)
+                raise EncoderError(content=base64.b64encode(self.response.content).decode('utf-8'))
+        elif isinstance(self, ScraperAPIError):
+            print(self.api_response.error)
+            text += f" | {self.api_response.error['code']} - {self.api_response.error['message']} - {self.api_response.error['links']}"

        return text

@@ -103,6 +110,10 @@ class BadApiKeyError(ApiHttpClientError):
    pass


+class PaymentRequired(ApiHttpClientError):
+    pass


class TooManyRequest(ApiHttpClientError):
    pass

@@ -111,31 +122,35 @@ class ApiHttpServerError(ApiHttpClientError):
    pass


-class ScrapflyScrapeError(HttpError):
+class ScraperAPIError(HttpError):
    pass


-class ScrapflyProxyError(HttpError):
+class ScrapflyScrapeError(ScraperAPIError):
    pass


-class ScrapflyThrottleError(HttpError):
+class ScrapflyProxyError(ScraperAPIError):
    pass


-class ScrapflyAspError(HttpError):
+class ScrapflyThrottleError(ScraperAPIError):
    pass


-class ScrapflyScheduleError(HttpError):
+class ScrapflyAspError(ScraperAPIError):
    pass


-class ScrapflyWebhookError(HttpError):
+class ScrapflyScheduleError(ScraperAPIError):
    pass


-class ScrapflySessionError(HttpError):
+class ScrapflyWebhookError(ScraperAPIError):
    pass


+class ScrapflySessionError(ScraperAPIError):
+    pass


@@ -158,8 +173,10 @@ class ErrorFactory:
    }

    # Notable http error has own class for more convenience
+    # Only applicable for generic API error
    HTTP_STATUS_TO_ERROR = {
        401: BadApiKeyError,
+        402: PaymentRequired,
        429: TooManyRequest
    }

@@ -226,10 +243,12 @@ def create(api_response: 'ScrapeApiResponse'):
        if http_code >= 500:
            return ApiHttpServerError(**args)

-        if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR:
+        is_scraper_api_error = resource in ErrorFactory.RESOURCE_TO_ERROR

+        if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR and not is_scraper_api_error:
            return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)

-        if resource in ErrorFactory.RESOURCE_TO_ERROR:
+        if is_scraper_api_error:
            return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

        return ApiHttpClientError(**args)
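
Note: with ScraperAPIError inserted between HttpError and the individual Scrapfly*Error classes, every scrape-level failure (scrape, proxy, throttle, ASP, schedule, webhook, session) can now be caught with a single except clause, while generic API problems such as the new PaymentRequired (HTTP 402) remain ApiHttpClientError subclasses. A hedged sketch of client-side handling under those assumptions (scrapfly.scrape() is assumed to be the client's synchronous entry point; the error attributes follow the __str__ change above):

from scrapfly import ScrapflyClient, ScrapeConfig
from scrapfly.errors import ScraperAPIError, ApiHttpClientError

scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key

try:
    api_response = scrapfly.scrape(ScrapeConfig(url='https://httpbin.dev/status/403', asp=True, retry=False))
except ScraperAPIError as e:
    # any scrape-level error, e.g. ERR::ASP::SHIELD_PROTECTION_FAILED
    print(e.api_response.error['code'], e.api_response.error['message'])
except ApiHttpClientError as e:
    # generic API error: bad API key, payment required, quota reached, etc.
    print(e)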
13 changes: 10 additions & 3 deletions scrapfly/scrape_config.py
@@ -2,7 +2,7 @@
import json
import logging
from typing import Optional, List, Dict, Iterable, Union, Set
-from urllib.parse import urlencode, quote
+from urllib.parse import urlencode
from requests.structures import CaseInsensitiveDict


@@ -48,6 +48,7 @@ class ScrapeConfig:
    lang:Optional[List[str]] = None
    os:Optional[str] = None
    auto_scroll:Optional[bool] = None
+    cost_budget:Optional[int] = None

    def __init__(
        self,
@@ -83,7 +84,8 @@ def __init__(
        extract:Optional[Dict] = None,
        os:Optional[str] = None,
        lang:Optional[List[str]] = None,
-        auto_scroll:Optional[bool] = None
+        auto_scroll:Optional[bool] = None,
+        cost_budget:Optional[int] = None
    ):
        assert(type(url) is str)

@@ -127,6 +129,7 @@ def __init__(
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
+        self.cost_budget = cost_budget

        if cookies:
            _cookies = []
@@ -184,6 +187,9 @@ def to_api_params(self, key:str) -> Dict:
        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

+        if self.cost_budget is not None:
+            params['cost_budget'] = self.cost_budget

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

@@ -318,5 +324,6 @@ def from_exported_config(config:str) -> 'ScrapeConfig':
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            proxy_pool=data['proxy_pool'],
-            auto_scroll=data['auto_scroll']
+            auto_scroll=data['auto_scroll'],
+            cost_budget=data['cost_budget']
        )
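
Note: the new cost_budget option is forwarded to the API as a plain parameter (params['cost_budget']) exactly like the other ScrapeConfig fields; judging by the name it caps how much a single scrape may spend, but the precise semantics are defined by the Scrape API rather than by this diff. A minimal usage sketch (client construction and the budget value are illustrative):

from scrapfly import ScrapflyClient, ScrapeConfig

scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key

api_response = scrapfly.scrape(ScrapeConfig(
    url='https://web-scraping.dev/product/1',
    render_js=True,
    cost_budget=25,  # illustrative budget value
))
print(api_response.scrape_result['url'])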
