In [None]:
## Set the notebook to auto-reload updated modules. Put this at the top of your Jupyter
#  notebooks and, if you import code from local .py files, you won't have to restart
#  the kernel when you make changes to that code.
%load_ext autoreload
%autoreload 2

---

# Book Scraper

This notebook scrapes the [books.toscrape.com site](https://books.toscrape.com).


---

## Imports

Import packages to use in the notebook.


In [None]:
from pathlib import Path

In [None]:
import httpx
from hishel import SyncSqliteStorage
from hishel.httpx import SyncCacheTransport

## Constants

Define constants for the notebook (values that are set at the beginning and don't change).

In [None]:
## Path to the SQLite database file used as the HTTP response cache
#  (used as the default cache_file for the helper functions below)
CACHE_PATH: str = ".cache/http.cache"

## Base URL for target site
BASE_URL: str = "https://books.toscrape.com"

## Functions

Re-usable code for the notebook to call.

In [None]:
def get_cache_transport(
    cache_file: str | Path = CACHE_PATH, ttl: int = 900
) -> SyncCacheTransport:
    """Return a Hishel SyncCacheTransport backed by a SQLite cache.

    Params:
        cache_file (str | Path): Path to a SQLite file where responses will be cached
            (default=CACHE_PATH). Missing parent directories are created.
        ttl (int): Time, in seconds, an object should live in the cache before being
            cleared/refreshed (default=900, which is 15 minutes).

    Returns:
        (SyncCacheTransport): A caching transport that wraps a plain httpx.HTTPTransport.
            Pass it as the `transport` of an httpx.Client.
    """
    ## Normalize once so the directory check and the storage both use the same path
    cache_path = Path(cache_file)

    ## Ensure cache parent directory exists. If it already exists, nothing will happen
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    ## Prepare a Hishel SQLite database to use as a cache
    storage = SyncSqliteStorage(
        database_path=str(cache_path),
        default_ttl=ttl,
        refresh_ttl_on_access=True,
    )

    ## Create the cache transport. Requests that are not served from the cache are
    #  passed on to the wrapped HTTPTransport
    return SyncCacheTransport(
        next_transport=httpx.HTTPTransport(),
        storage=storage,
    )

In [None]:
def get_client(
    use_cache: bool = False,
    cache_file: str | Path = CACHE_PATH,
    ttl: int = 900,
    timeout: float = 10.0,
) -> httpx.Client:
    """Return a reusable HTTPX client, optionally with a cache transport.

    Params:
        use_cache (bool): Whether to enable Hishel SQLite caching (default=False).
        cache_file (str | Path): Path to a SQLite file where responses will be cached.
            Only used when use_cache=True.
        ttl (int): Time an object should live in the cache before being cleared/refreshed
            (default=900, which is 15 minutes). Only used when use_cache=True.
        timeout (float): Timeout, in seconds, applied to requests made with the client
            (default=10.0).

    Returns:
        (httpx.Client): A client the caller is responsible for closing when finished.
    """
    ## Creates a cache transport for HTTPX if use_cache=True, otherwise value is None
    #  (passing transport=None makes HTTPX fall back to its default transport)
    transport: SyncCacheTransport | None = (
        get_cache_transport(cache_file, ttl) if use_cache else None
    )

    ## Create an HTTPX client with optional cache transport
    return httpx.Client(transport=transport, timeout=timeout)

## Scrape

In [None]:
## Create the HTTP client used for the requests in this notebook. Caching is enabled
#  (use_cache=True) so re-running cells doesn't repeatedly send live requests to the site
http_client: httpx.Client = get_client(use_cache=True)

In [None]:
## Do a HEAD request to check if the site is online (should get a 200 response).
#  raise_for_status() raises an exception for any non-successful response, e.g. 400, 404, etc
ping: httpx.Response = http_client.head(BASE_URL)
ping.raise_for_status()

## Show the status code as the cell's output
display(ping.status_code)

In [None]:
## Instead of calling http_client.get() directly, you can pre-build a Request object and
#  have the client send it later. Building it with http_client.build_request() (rather than
#  a bare httpx.Request) merges in the client's own defaults, e.g. its default headers
homepage_request: httpx.Request = http_client.build_request(method="GET", url=BASE_URL)
display(f"Homepage request URL: {homepage_request.url}")

In [None]:
## Instead of http_client.get(), use http_client.send() and give it the homepage_request object

homepage_res: httpx.Response = http_client.send(homepage_request)
## Raise an exception if the response was not successful
homepage_res.raise_for_status()

In [None]:
## Decode the raw response bytes into a string containing the page's HTML
#  (decoded explicitly as UTF-8; raises UnicodeDecodeError if the bytes are not valid UTF-8)
homepage_html: str = homepage_res.content.decode("utf-8")

## Cleanup

In [None]:
## Close the HTTP client once scraping is finished so it doesn't stay open for the
#  lifetime of the kernel. Re-run the client creation cell to get a fresh client
http_client.close()