Rework and update example - Better handling for encoding issue
jjsaunier committed Oct 27, 2023
1 parent 7fbb8b1 commit 712b37a
Showing 19 changed files with 138 additions and 100 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ scrapfly_sdk.egg-info
.pypirc
*.pyc
venv
+examples/scrapy/demo/images
+!examples/scrapy/demo/images/.gitkeep
16 changes: 8 additions & 8 deletions examples/concurrency.py
@@ -10,14 +10,14 @@
logger.StreamHandler(stdout)

async def main():
-    results = await scrapfly.concurrent_scrape(scrape_configs=[
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True),
-        ScrapeConfig(url='http://httpbin.org/anything', render_js=True)
-    ])
+    scrape_configs = [
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything'),
+        ScrapeConfig(url='https://httpbin.dev/anything')
+    ]


-    print(results)
+    async for result in scrapfly.concurrent_scrape(scrape_configs):
+        print(result)

asyncio.run(main())
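
Note: after this rework, concurrent_scrape is consumed as an async generator, so results are handled one by one as each scrape finishes instead of being awaited as a single list. A minimal self-contained sketch of the new usage (the client construction with a placeholder key is an assumption here; in the example file it sits above the hunk shown):

import asyncio

from scrapfly import ScrapflyClient, ScrapeConfig


async def main():
    scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key
    scrape_configs = [ScrapeConfig(url='https://httpbin.dev/anything') for _ in range(4)]

    # results are yielded as each scrape completes, not returned all at once
    async for result in scrapfly.concurrent_scrape(scrape_configs):
        print(result)


asyncio.run(main())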
4 changes: 0 additions & 4 deletions examples/scrapy/bea/bea/settings.py

This file was deleted.

26 changes: 0 additions & 26 deletions examples/scrapy/bea/bea/spiders/bea.py

This file was deleted.

5 changes: 0 additions & 5 deletions examples/scrapy/bea/scrapy.cfg

This file was deleted.

Empty file.
5 changes: 0 additions & 5 deletions examples/scrapy/covid/covid/settings.py

This file was deleted.

Empty file.
30 changes: 0 additions & 30 deletions examples/scrapy/covid/covid/spiders/covid.py

This file was deleted.

5 changes: 0 additions & 5 deletions examples/scrapy/covid/scrapy.cfg

This file was deleted.

File renamed without changes.
5 changes: 5 additions & 0 deletions examples/scrapy/demo/demo/settings.py
@@ -0,0 +1,5 @@
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'
SCRAPFLY_API_KEY = '__API_KEY__'
CONCURRENT_REQUESTS = 2
IMAGES_STORE = "./images"
File renamed without changes.
71 changes: 71 additions & 0 deletions examples/scrapy/demo/demo/spiders/demo.py
@@ -0,0 +1,71 @@
from scrapy import Item, Field
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.python.failure import Failure

from scrapfly import ScrapeConfig
from scrapfly.errors import ScraperAPIError, ApiHttpServerError
from scrapfly.scrapy import ScrapflyScrapyRequest, ScrapflySpider, ScrapflyScrapyResponse


class Product(Item):

    name = Field()
    price = Field()
    description = Field()

    # scrapy.pipelines.images.ImagesPipeline
    image_urls = Field()
    images = Field()


class Demo(ScrapflySpider):
    name = "demo"

    allowed_domains = ["web-scraping.dev", "httpbin.dev"]
    start_urls = [
        ScrapeConfig("https://web-scraping.dev/product/1", render_js=True),
        ScrapeConfig("https://web-scraping.dev/product/2"),
        ScrapeConfig("https://web-scraping.dev/product/3"),
        ScrapeConfig("https://web-scraping.dev/product/4"),
        ScrapeConfig("https://web-scraping.dev/product/5", render_js=True),
        ScrapeConfig("https://httpbin.dev/status/403", asp=True, retry=False),  # it will fail on purpose
        ScrapeConfig("https://httpbin.dev/status/400"),  # it will fail on purpose - will fall into scrapy.spidermiddlewares.httperror.HttpError
        ScrapeConfig("https://httpbin.dev/status/404"),  # it will fail on purpose - will fall into scrapy.spidermiddlewares.httperror.HttpError
    ]

    def start_requests(self):
        for scrape_config in self.start_urls:
            yield ScrapflyScrapyRequest(scrape_config, callback=self.parse, errback=self.error_handler, dont_filter=True)

    def error_handler(self, failure:Failure):
        if failure.check(ScraperAPIError):  # The scrape errored
            error_code = failure.value.code  # https://scrapfly.io/docs/scrape-api/errors#web_scraping_api_error

            if error_code == "ERR::ASP::SHIELD_PROTECTION_FAILED":
                self.logger.warning("The url %s must be retried" % failure.request.url)
        elif failure.check(HttpError):  # The scrape succeeded but the target server returned a non-success http code >= 400
            response:ScrapflyScrapyResponse = failure.value.response

            if response.status == 404:
                self.logger.warning("The url %s returned a 404 http code - Page not found" % response.url)
            elif response.status == 500:
                raise CloseSpider(reason="The target server returned a 500 http code - Website down")

        elif failure.check(ApiHttpServerError):  # Generic API error: config error, quota reached, etc.
            self.logger.error(failure)
        else:
            self.logger.error(failure)

    def parse(self, response:ScrapflyScrapyResponse, **kwargs):
        item = Product()

        if response.status == 200:
            # make sure the url is absolute
            item['image_urls'] = [response.urljoin(response.css('img.product-img::attr(src)').get())]

        item['name'] = response.css('h3.product-title').get()
        item['price'] = response.css('span.product-price::text').get()
        item['description'] = response.css('p.product-description').get()

        yield item
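
Note: the Product item declares image_urls and images for Scrapy's standard images pipeline (scrapy.pipelines.images.ImagesPipeline, as the comment above hints), and the new settings.py sets IMAGES_STORE = "./images". For images to actually be downloaded, that pipeline normally also has to be enabled via ITEM_PIPELINES; this diff does not show where (or whether) that happens, so the following settings sketch is an assumption rather than part of the commit:

# demo/settings.py (hypothetical addition - only needed if the pipeline is not enabled elsewhere)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = "./images"  # already present in the settings shown above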
File renamed without changes.
5 changes: 5 additions & 0 deletions examples/scrapy/demo/scrapy.cfg
@@ -0,0 +1,5 @@
[settings]
default = demo.settings

[deploy]
project = demo
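
Note: with this scrapy.cfg in place, the demo spider would typically be run from examples/scrapy/demo with scrapy crawl demo, after replacing the __API_KEY__ placeholder in settings.py with a real Scrapfly API key.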
8 changes: 6 additions & 2 deletions scrapfly/api_response.py
@@ -1,3 +1,4 @@
+import base64
import re
import logging as logger
import shutil
@@ -104,7 +105,10 @@ def __call__(self, content: bytes) -> Union[str, Dict]:
        try:
            return self.content_loader(content)
        except Exception as e:
-            raise EncoderError(content=content.decode('utf-8')) from e
+            try:
+                raise EncoderError(content=content.decode('utf-8')) from e
+            except UnicodeError:
+                raise EncoderError(content=base64.b64encode(content).decode('utf-8')) from e


class ScrapeApiResponse:
@@ -364,7 +368,7 @@ def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
            response._content = self.scrape_result['content'].encode('utf-8')
        else:
            response._content = None

        response.headers.update(self.scrape_result['response_headers'])
        response.url = self.scrape_result['url']

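
Note: the encoder change above keeps the failed payload readable as UTF-8 when possible and falls back to a base64 representation when the bytes cannot be decoded. A standalone sketch of that fallback (the helper name is illustrative, not part of the SDK):

import base64


def error_payload(content: bytes) -> str:
    # Hypothetical helper mirroring the fallback above: prefer readable UTF-8,
    # otherwise return the raw bytes base64-encoded so the error can still carry them.
    try:
        return content.decode('utf-8')
    except UnicodeError:
        return base64.b64encode(content).decode('utf-8')


print(error_payload(b'{"ok": true}'))      # plain UTF-8 passes through unchanged
print(error_payload(b'\xff\xd8\xff\xe0'))  # binary bytes (e.g. an image) come back base64-encoded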
43 changes: 31 additions & 12 deletions scrapfly/errors.py
@@ -1,3 +1,4 @@
+import base64
from typing import Optional, Tuple
from requests import Request, Response

@@ -56,6 +57,9 @@ def __init__(self, content:str):
    def __str__(self) -> str:
        return self.content

+    def __repr__(self):
+        return "Invalid payload: %s" % self.content


class ExtraUsageForbidden(ScrapflyError):
    pass
@@ -70,15 +74,18 @@ def __init__(self, request:Request, response:Optional[Response]=None, **kwargs):

    def __str__(self) -> str:
        if isinstance(self, UpstreamHttpError):
-            text = "%s -- %s " % (self.api_response.scrape_result['status_code'], self.api_response.scrape_result['reason'])
+            text = f"Target website responded with {self.api_response.scrape_result['status_code']} - {self.api_response.scrape_result['reason']}"
        else:
-            text = "%s -- %s " % (self.response.status_code, self.response.reason)
+            text = f"{self.response.status_code} - {self.response.reason}"

        if isinstance(self, (ApiHttpClientError, ApiHttpServerError)):
            try:
                text += self.response.content.decode('utf-8')
            except UnicodeError:
-                text += str(self.response.content)
+                raise EncoderError(content=base64.b64encode(self.response.content).decode('utf-8'))
+        elif isinstance(self, ScraperAPIError):
+            print(self.api_response.error)
+            text += f" | {self.api_response.error['code']} - {self.api_response.error['message']} - {self.api_response.error['links']}"

        return text

@@ -103,6 +110,10 @@ class BadApiKeyError(ApiHttpClientError):
    pass


+class PaymentRequired(ApiHttpClientError):
+    pass


class TooManyRequest(ApiHttpClientError):
    pass

@@ -111,31 +122,35 @@ class ApiHttpServerError(ApiHttpClientError):
    pass


-class ScrapflyScrapeError(HttpError):
+class ScraperAPIError(HttpError):
    pass


-class ScrapflyProxyError(HttpError):
+class ScrapflyScrapeError(ScraperAPIError):
    pass


-class ScrapflyThrottleError(HttpError):
+class ScrapflyProxyError(ScraperAPIError):
    pass


-class ScrapflyAspError(HttpError):
+class ScrapflyThrottleError(ScraperAPIError):
    pass


-class ScrapflyScheduleError(HttpError):
+class ScrapflyAspError(ScraperAPIError):
    pass


-class ScrapflyWebhookError(HttpError):
+class ScrapflyScheduleError(ScraperAPIError):
    pass


-class ScrapflySessionError(HttpError):
+class ScrapflyWebhookError(ScraperAPIError):
    pass


+class ScrapflySessionError(ScraperAPIError):
+    pass


@@ -158,8 +173,10 @@ class ErrorFactory:
    }

    # Notable http error has own class for more convenience
+    # Only applicable for generic API error
    HTTP_STATUS_TO_ERROR = {
        401: BadApiKeyError,
+        402: PaymentRequired,
        429: TooManyRequest
    }

@@ -226,10 +243,12 @@ def create(api_response: 'ScrapeApiResponse'):
        if http_code >= 500:
            return ApiHttpServerError(**args)

-        if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR:
+        is_scraper_api_error = resource in ErrorFactory.RESOURCE_TO_ERROR

+        if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR and not is_scraper_api_error:
            return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)

-        if resource in ErrorFactory.RESOURCE_TO_ERROR:
+        if is_scraper_api_error:
            return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

        return ApiHttpClientError(**args)
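
Note: with ScraperAPIError inserted between HttpError and the individual Scrapfly*Error classes, every scrape-level failure (scrape, proxy, throttle, ASP, schedule, webhook, session) can now be caught with a single except clause, while generic API problems such as the new PaymentRequired (HTTP 402) remain ApiHttpClientError subclasses. A hedged sketch of client-side handling under those assumptions (scrapfly.scrape() is assumed to be the client's synchronous entry point; the error attributes follow the __str__ change above):

from scrapfly import ScrapflyClient, ScrapeConfig
from scrapfly.errors import ScraperAPIError, ApiHttpClientError

scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key

try:
    api_response = scrapfly.scrape(ScrapeConfig(url='https://httpbin.dev/status/403', asp=True, retry=False))
except ScraperAPIError as e:
    # any scrape-level error, e.g. ERR::ASP::SHIELD_PROTECTION_FAILED
    print(e.api_response.error['code'], e.api_response.error['message'])
except ApiHttpClientError as e:
    # generic API error: bad API key, payment required, quota reached, etc.
    print(e)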
13 changes: 10 additions & 3 deletions scrapfly/scrape_config.py
@@ -2,7 +2,7 @@
import json
import logging
from typing import Optional, List, Dict, Iterable, Union, Set
-from urllib.parse import urlencode, quote
+from urllib.parse import urlencode
from requests.structures import CaseInsensitiveDict


@@ -48,6 +48,7 @@ class ScrapeConfig:
    lang:Optional[List[str]] = None
    os:Optional[str] = None
    auto_scroll:Optional[bool] = None
+    cost_budget:Optional[int] = None

    def __init__(
        self,
@@ -83,7 +84,8 @@ def __init__(
        extract:Optional[Dict] = None,
        os:Optional[str] = None,
        lang:Optional[List[str]] = None,
-        auto_scroll:Optional[bool] = None
+        auto_scroll:Optional[bool] = None,
+        cost_budget:Optional[int] = None
    ):
        assert(type(url) is str)

@@ -127,6 +129,7 @@ def __init__(
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
+        self.cost_budget = cost_budget

        if cookies:
            _cookies = []
@@ -184,6 +187,9 @@ def to_api_params(self, key:str) -> Dict:
        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

+        if self.cost_budget is not None:
+            params['cost_budget'] = self.cost_budget

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

@@ -318,5 +324,6 @@ def from_exported_config(config:str) -> 'ScrapeConfig':
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            proxy_pool=data['proxy_pool'],
-            auto_scroll=data['auto_scroll']
+            auto_scroll=data['auto_scroll'],
+            cost_budget=data['cost_budget']
        )
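
Note: the new cost_budget option is forwarded to the API as a plain parameter (params['cost_budget']) exactly like the other ScrapeConfig fields; judging by the name it caps how much a single scrape may spend, but the precise semantics are defined by the Scrape API rather than by this diff. A minimal usage sketch (client construction and the budget value are illustrative):

from scrapfly import ScrapflyClient, ScrapeConfig

scrapfly = ScrapflyClient(key='__API_KEY__')  # assumed placeholder key

api_response = scrapfly.scrape(ScrapeConfig(
    url='https://web-scraping.dev/product/1',
    render_js=True,
    cost_budget=25,  # illustrative budget value
))
print(api_response.scrape_result['url'])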
