## Encapsulated logic

In [19]:
from bs4 import BeautifulSoup

# Sample markup: a span with class "event" followed by the span we want.
soup = BeautifulSoup(
    """
        <div>
            <span class="event">Event</span>
            <span>party</span>
        </div>
    """,
    features="lxml",
)

# Bind the result up front so the name exists even when nothing matches
# (no <div>, or no "event" child inside any <div>).
party = None

for div in soup.find_all("div"):
    # recursive=False limits the search to direct children of this div.
    for event in div.find_all(class_="event", recursive=False):
        party = event.find_next_sibling("span", string="party")
        if party is not None:
            break

    # `break` above only leaves the inner loop. Stop the outer loop as well,
    # otherwise a later div could overwrite the match with None.
    if party is not None:
        break

party

<span>party</span>

In [20]:
from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector, TypeSelector

# Same sample markup as the plain BeautifulSoup version of this example.
soup = BeautifulSoup(
    """
        <div>
            <span class="event">Event</span>
            <span>party</span>
        </div>
    """,
    features="lxml",
)

# Python evaluates `+` before `>`, so the composite selector is
# div > (event + party_span); the sub-selectors are named to make that visible.
party_span = TypeSelector("span") & PatternSelector("party")
event_then_party = ClassSelector("event") + party_span
selector = TypeSelector("div") > event_then_party
selector.find(soup)

<span>party</span>

## Error handling

In [54]:
from bs4 import BeautifulSoup

# Markup that contains neither an "event" element nor a "party" span.
soup = BeautifulSoup(
    """
        <div>
            <span>No event here</span>
            <span>No party</span>
        </div>
    """,
    features="lxml",
)

# Bind the result on every path so `party` can never be undefined, or hold a
# stale value left behind by an earlier cell.
party = None
event = soup.find(class_="event")

if event is not None:
    # Name the tag explicitly: with only `string=` given, BeautifulSoup
    # searches for sibling text nodes instead of a sibling <span> whose text
    # is "party".
    party = event.find_next_sibling("span", string="party")
else:
    print("This needs to be handled explicitly every time.")

This needs to be handled explicitly every time.


In [58]:
from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector
from soupsavvy.exceptions import TagNotFoundException

# Markup that contains neither an "event" element nor a "party" span.
soup = BeautifulSoup(
    """
        <div>
            <span>No event here</span>
            <span>No party</span>
        </div>
    """,
    features="lxml",
)

selector = ClassSelector("event") + PatternSelector("party")

# Default lookup: nothing matches, so the result is None.
missing = selector.find(soup)
assert missing is None

# Strict lookup: the same miss surfaces as TagNotFoundException instead.
try:
    selector.find(soup, strict=True)
except TagNotFoundException as error:
    print(error)

Tag was not found in markup.


## Handling sets

In [17]:
from bs4 import BeautifulSoup

# Several spans share the text "Event"; only some of them are last children.
soup = BeautifulSoup(
    """
        <p class="special">Festival</p>
        <div>
            <span>Event</span>
            <span>Menu</span>
        </div>
        <div>
            <span>Menu</span>
        </div>
        <div>
            <span>Event</span>
        </div>
        <span>Event</span>
    """,
    features="lxml",
)

event_spans = soup.find_all("span", string="Event")
last_children = soup.select(":last-child")
special_elements = soup.find_all(class_="special")

# NOTE(review): the displayed set holds a single <span>Event</span> although
# the markup has several. Presumably bs4 tags compare and hash by their markup,
# so equal-looking elements collapse into one entry - confirm before relying
# on set arithmetic for element identity. Sets also drop document order.
event_last_children = set(event_spans) & set(last_children)
event_last_children | set(special_elements)

{<p class="special">Festival</p>, <span>Event</span>}

In [19]:
from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, PatternSelector, TypeSelector
from soupsavvy.selectors.css import LastChild

# Several spans share the text "Event"; only some of them are last children.
soup = BeautifulSoup(
    """
        <p class="special">Festival</p>
        <div>
            <span>Event</span>
            <span>Menu</span>
        </div>
        <div>
            <span>Menu</span>
        </div>
        <div>
            <span>Event</span>
        </div>
        <span>Event</span>
    """,
    features="lxml",
)

# First the three-way `&` combination, then `|` with the "special" class
# selector - the same grouping the parenthesised one-liner expresses.
event_last_child = PatternSelector("Event") & TypeSelector("span") & LastChild()
selector = event_last_child | ClassSelector("special")
selector.find_all(soup)


[<p class="special">Festival</p>, <span>Event</span>, <span>Event</span>]

## Applying operations

In [63]:
from datetime import datetime

from bs4 import BeautifulSoup

# Two elements with class "date" holding ISO-formatted dates.
soup = BeautifulSoup(
    """
        <p>Event</p>
        <span class="date">2023-10-30</span>
        <span class="date">2023-08-31</span>
    """,
    features="lxml",
)

date_elements = soup.find_all(class_="date")

# Extract the stripped text of each element and parse it, one step at a time.
dates = []
for element in date_elements:
    raw_date = element.get_text(strip=True)
    dates.append(datetime.strptime(raw_date, "%Y-%m-%d"))

dates

[datetime.datetime(2023, 10, 30, 0, 0), datetime.datetime(2023, 8, 31, 0, 0)]

In [4]:
from datetime import datetime

from bs4 import BeautifulSoup

from soupsavvy import ClassSelector
from soupsavvy.operations import Operation, Text

# Two elements with class "date" holding ISO-formatted dates.
soup = BeautifulSoup(
    """
        <p>Event</p>
        <span class="date">2023-10-30</span>
        <span class="date">2023-08-31</span>
    """,
    features="lxml",
)

# Name the final pipeline step so the chain below reads left to right:
# select by class, take the stripped text, then parse it.
parse_date = Operation(datetime.strptime, "%Y-%m-%d")
selector = ClassSelector("date") | Text(strip=True) | parse_date
selector.find_all(soup)

[datetime.datetime(2023, 10, 30, 0, 0), datetime.datetime(2023, 8, 31, 0, 0)]

## Structured information

In [26]:
from dataclasses import dataclass

from bs4 import BeautifulSoup


@dataclass
class Book:
    """Title and price of a single book.

    Attributes:
        title: Book title.
        price: Price as a whole number, without the currency sign.
    """

    title: str
    # Annotated as int to match the values actually stored: the price text is
    # converted with int() before a Book is constructed.
    price: int


text = """
    <div class="book">
        <p class="title">Animal Farm</p>
        <p class="price">100$</p>
    </div>
    <div class="book">
        <p class="title">Brave New World  </p>
        <p class="price">80$</p>
    </div>
"""
soup = BeautifulSoup(text, features="lxml")

books = []

for book_element in soup.find_all("div", class_="book"):
    # Title: required element, stripped text.
    title_element = book_element.find(class_="title")
    if title_element is None:
        raise ValueError("Title not found")
    title_text = title_element.get_text(strip=True)

    # Price: required element, "$" removed, converted to a whole number.
    price_element = book_element.find(class_="price")
    if price_element is None:
        raise ValueError("Price not found")
    price_text = price_element.get_text(strip=True)
    amount = int(price_text.replace("$", ""))

    books.append(Book(title_text, amount))

books

[Book(title='Animal Farm', price=100), Book(title='Brave New World', price=80)]

In [27]:
from bs4 import BeautifulSoup

from soupsavvy import ClassSelector, TypeSelector
from soupsavvy.models import BaseModel
from soupsavvy.operations import Operation, Text


class Book(BaseModel):
    """Declarative soupsavvy model of a book.

    `Book.find_all(soup)` is used in this cell to build the instances; the
    recorded output shows one `Book(title=..., price=...)` per "book" div.
    """

    # Combination of a "div" type selector and a "book" class selector.
    # NOTE(review): presumably this selects the element each Book is built
    # from, with the fields below searched inside it - confirm against the
    # soupsavvy model documentation.
    __scope__ = TypeSelector("div") & ClassSelector("book")

    # Element with class "title", piped through Text(strip=True).
    title = ClassSelector("title") | Text(strip=True)
    # Element with class "price" -> stripped text -> "$" stripped from both
    # ends -> int.
    price = (
        ClassSelector("price")
        | Text(strip=True)
        | Operation(lambda x: x.strip("$"))
        | Operation(int)
    )


# Two "book" divs; the second title carries trailing whitespace on purpose.
markup = """
    <div class="book">
        <p class="title">Animal Farm</p>
        <p class="price">100$</p>
    </div>
    <div class="book">
        <p class="title">Brave New World  </p>
        <p class="price">80$</p>
    </div>
"""
soup = BeautifulSoup(markup=markup, features="lxml")
Book.find_all(soup)

[Book(title='Animal Farm', price=100), Book(title='Brave New World', price=80)]