In [3]:
import nest_asyncio
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

# Patch asyncio (via nest_asyncio) before the asyncio.run() call at the bottom of this cell.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop that asyncio.run() would otherwise refuse to nest inside — confirm.
nest_asyncio.apply()

# Extraction schema handed to JsonCssExtractionStrategy in main():
# "baseSelector" picks the repeating container and each entry in "fields"
# names a value to read from inside that container.
schema = dict(
    name="Example Items",
    baseSelector="div.item",
    fields=[
        # Text content of the heading.
        dict(name="title", selector="h2", type="text"),
        # The href attribute of the anchor, not its text.
        dict(name="link", selector="a", type="attribute", attribute="href"),
    ],
)

# Inline HTML fixture fed to the crawler through a "raw://" URL in main().
# Built line by line; there is no trailing newline after the closing tag.
raw_html = "\n".join(
    [
        "<div class='item'>",
        "<h2>Item 1</h2>",
        "<a href='https://example.com/item1'>Link 1</a>",
        "</div>",
    ]
)

async def main():
    """Crawl the inline ``raw_html`` snippet and print the extracted items.

    The module-level ``raw_html`` is passed to the crawler as a ``raw://``
    URL, with caching disabled and ``JsonCssExtractionStrategy(schema)`` as
    the extraction strategy.

    On success, prints ``result.extracted_content``, a separator line, and
    the ``json.dumps`` encoding of that same content.
    On failure, prints ``"Crawl failed:"`` with ``result.error_message`` and
    returns early instead of printing an empty result.

    Returns:
        None. All results are written to stdout.
    """
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,
        extraction_strategy=JsonCssExtractionStrategy(schema),
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="raw://" + raw_html, config=run_config)

    # Bail out on a failed crawl, as the crypto-price cell's main() does,
    # rather than serialising a missing result.
    if not result.success:
        print("Crawl failed:", result.error_message)
        return

    # NOTE: extracted_content is already JSON text (see the first printed
    # block in the cell output), so json.dumps() yields a JSON string literal
    # containing that text, with quotes and newlines escaped, rather than a
    # re-serialised list. Use json.loads() to get Python objects instead.
    data = json.dumps(result.extracted_content)
    print(result.extracted_content)
    print("====================")
    print(data)

# Entry point: drive the main() coroutine to completion when this cell runs.
# The recorded output below shows the guard is true in the notebook kernel.
# NOTE(review): asyncio.run() here presumably depends on the
# nest_asyncio.apply() call earlier in this cell — confirm.
if __name__ == "__main__":
    asyncio.run(main())


[
    {
        "title": "Item 1",
        "link": "https://example.com/item1"
    }
]
"[\n    {\n        \"title\": \"Item 1\",\n        \"link\": \"https://example.com/item1\"\n    }\n]"


In [5]:
import asyncio
import nest_asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def main():
    """Extract coin names and prices from an inline HTML page and print them.

    Builds a small HTML document with two ``div.crypto-row`` entries and a
    CSS extraction schema that reads ``h2.coin-name`` and ``span.coin-price``
    from each row. The page is crawled through a ``raw://`` URL with the
    cache bypassed.

    Prints ``result.extracted_content`` when the crawl succeeds; otherwise
    prints ``"Crawl failed:"`` followed by ``result.error_message``.

    Returns:
        None. All results are written to stdout.
    """
    # Sample page; the literal is kept exactly as authored, leading
    # indentation included, because it is sent to the crawler verbatim.
    sample_page = """
    <html>
      <body>
        <div class='crypto-row'>
          <h2 class='coin-name'>Bitcoin</h2>
          <span class='coin-price'>$28,000</span>
        </div>
        <div class='crypto-row'>
          <h2 class='coin-name'>Ethereum</h2>
          <span class='coin-price'>$1,800</span>
        </div>
      </body>
    </html>
    """

    # Both fields are plain text reads relative to the row container.
    field_specs = [
        {"name": "coin_name", "selector": "h2.coin-name", "type": "text"},
        {"name": "price", "selector": "span.coin-price", "type": "text"},
    ]
    price_schema = {
        "name": "crypo Price",
        "baseSelector": "div.crypto-row",
        "fields": field_specs,
    }

    strategy = JsonCssExtractionStrategy(price_schema)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="raw://" + sample_page, config=run_config)

    if result.success:
        print(result.extracted_content)
    else:
        print("Crawl failed:", result.error_message)

# Entry point: drive the main() coroutine to completion when this cell runs.
if __name__ == "__main__":
    # This cell imports nest_asyncio but previously never applied it, so it
    # only worked if an earlier cell had already patched the event loop.
    # Applying it here makes the cell runnable on a fresh kernel;
    # NOTE(review): repeated apply() calls are expected to be harmless — confirm.
    nest_asyncio.apply()
    asyncio.run(main())


[
    {
        "coin_name": "Bitcoin",
        "price": "$28,000"
    },
    {
        "coin_name": "Ethereum",
        "price": "$1,800"
    }
]
