From f896a7d87d506ad877bcd53c7b3dfcb15a43908c Mon Sep 17 00:00:00 2001 From: ankitjavalkar Date: Fri, 2 Jun 2023 15:49:47 +0530 Subject: [PATCH] Replace example.com domains with Scrapinghub demo toscrape.com domains in the documentation --- docs/intro/advanced-tutorial.rst | 12 ++++++------ docs/rules-from-web-poet.rst | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index a882fd81..c8685cf2 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -62,7 +62,7 @@ Suppose we have the following Page Object: # Simulates clicking on a button that says "View All Images" response: web_poet.HttpResponse = await self.http.get( - f"https://api.example.com/v2/images?id={item['product_id']}" + f"https://api.toscrape.com/v2/images?id={item['product_id']}" ) item["images"] = response.css(".product-images img::attr(src)").getall() return item @@ -85,8 +85,8 @@ It can be directly used inside the spider as: def start_requests(self): for url in [ - "https://example.com/category/product/item?id=123", - "https://example.com/category/product/item?id=989", + "https://toscrape.com/category/product/item?id=123", + "https://toscrape.com/category/product/item?id=989", ]: yield scrapy.Request(url, callback=self.parse) @@ -128,7 +128,7 @@ This basically acts as a switch to update the behavior of the Page Object: # Simulates clicking on a button that says "View All Images" if self.page_params.get("enable_extracting_all_images") response: web_poet.HttpResponse = await self.http.get( - f"https://api.example.com/v2/images?id={item['product_id']}" + f"https://api.toscrape.com/v2/images?id={item['product_id']}" ) item["images"] = response.css(".product-images img::attr(src)").getall() @@ -157,8 +157,8 @@ Let's see it in action: } start_urls = [ - "https://example.com/category/product/item?id=123", - "https://example.com/category/product/item?id=989", + "https://toscrape.com/category/product/item?id=123", + "https://toscrape.com/category/product/item?id=989", ] def start_requests(self): diff --git a/docs/rules-from-web-poet.rst b/docs/rules-from-web-poet.rst index 7b4a3bbc..f2603877 100644 --- a/docs/rules-from-web-poet.rst +++ b/docs/rules-from-web-poet.rst @@ -80,7 +80,7 @@ And then override it for a particular domain using ``settings.py``: .. code-block:: python SCRAPY_POET_RULES = [ - ApplyRule("example.com", use=ISBNBookPage, instead_of=BookPage) + ApplyRule("toscrape.com", use=ISBNBookPage, instead_of=BookPage) ] This new Page Object gets the original ``BookPage`` as dependency and enrich @@ -211,7 +211,7 @@ Let's check out an example: name: str - @handle_urls("example.com") + @handle_urls("toscrape.com") @attrs.define class ProductPage(WebPage[Product]): @@ -225,7 +225,7 @@ Let's check out an example: def start_requests(self): yield scrapy.Request( - "https://example.com/products/some-product", self.parse + "https://toscrape.com/products/some-product", self.parse ) # We can directly ask for the item here instead of the page object. @@ -236,7 +236,7 @@ From this example, we can see that: * Spider callbacks can directly ask for items as dependencies. * The ``Product`` item instance directly comes from ``ProductPage``. - * This is made possible by the ``ApplyRule("example.com", use=ProductPage, + * This is made possible by the ``ApplyRule("toscrape.com", use=ProductPage, to_return=Product)`` instance created from the ``@handle_urls`` decorator on ``ProductPage``. @@ -248,7 +248,7 @@ From this example, we can see that: .. code-block:: python - @handle_urls("example.com") + @handle_urls("toscrape.com") @attrs.define class ProductPage(WebPage[Product]): product_image_page: ProductImagePage @@ -267,7 +267,7 @@ From this example, we can see that: def start_requests(self): yield scrapy.Request( - "https://example.com/products/some-product", self.parse + "https://toscrape.com/products/some-product", self.parse ) async def parse(self, response: DummyResponse, product_page: ProductPage):