In [8]:
from crawl4ai import AsyncWebCrawler,CacheMode,CrawlerRunConfig,LLMConfig
import asyncio
import nest_asyncio
from pprint import pprint
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

# Patch asyncio so that asyncio.run() below can be used even when an event
# loop is already running (as is the case inside a notebook kernel).
nest_asyncio.apply()

async def main():
    """Generate a CSS extraction schema via an LLM and apply it to inline HTML.

    Asks ``JsonCssExtractionStrategy.generate_schema`` (configured with the
    ``ollama/llama3.2`` provider) to derive a schema from a small HTML
    snippet, prints that schema, then crawls the same snippet through a
    ``raw://`` URL using the schema and pretty-prints the parsed result.

    Raises:
        RuntimeError: If the crawl result carries no extracted content to
            parse.
    """
    html = """<div class='item'>
        <h2>Item 1</h2>
        <a href='https://example.com/item1'>Link 1</a>
    </div>"""

    schema = JsonCssExtractionStrategy.generate_schema(
        html,
        llm_config=LLMConfig(
            provider="ollama/llama3.2",
            api_token=None,
        ),
    )
    print("schema")
    pprint(schema)

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always re-process instead of reading the cache
        extraction_strategy=JsonCssExtractionStrategy(schema),
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=f"raw://{html}", config=config)

    # Without this guard a missing extraction would surface as an opaque
    # TypeError/JSONDecodeError from json.loads() instead of the crawler's
    # own error message.
    if not result.extracted_content:
        raise RuntimeError(
            "No extracted content to parse: "
            f"{result.error_message or 'unknown error'}"
        )

    data = json.loads(result.extracted_content)
    pprint(data)
        

# Entry point: run the async main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())




schema
{'baseFields': [{'attribute': 'href',
                 'name': 'data_href',
                 'type': 'attribute'}],
 'baseSelector': '.item',
 'fields': [{'name': 'title', 'selector': 'h2', 'type': 'text'},
            {'attribute': 'href',
             'name': 'link',
             'selector': 'a',
             'type': 'attribute'}],
 'name': 'Item List'}


[{'link': 'https://example.com/item1', 'title': 'Item 1'}]


In [None]:
import asyncio
import json
from pprint import pprint

import nest_asyncio
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def main():
    """Extract product data from inline HTML with an LLM-based strategy.

    Builds an ``LLMExtractionStrategy`` (provider ``gemini/gemini-2.5-flash``)
    with a custom instruction, crawls a hard-coded two-product HTML page via
    a ``raw://`` URL, then parses the extracted JSON and pretty-prints it.

    Raises:
        RuntimeError: If the crawl result carries no extracted content to
            parse.
    """
    # Test HTML (simulates an e-commerce page)
    html = """
    <html>
        <body>
            <div class='product-card'>
                <h2>電競筆電 - 高效能遊戲機</h2>
                <p>這款筆電配備最新的 RTX 4070 顯示卡，
                   搭配 Intel i9 處理器，適合專業遊戲玩家。</p>
                <div class='price-section'>
                    <span class='old-price'>原價 $1499.99</span>
                    <span class='new-price'>特價 $1299.99</span>
                </div>
                <a href='https://example.com/gaming-laptop'>查看詳情</a>
            </div>
            <div class='product-card'>
                <h2>無線滑鼠 - 人體工學設計</h2>
                <p>符合人體工學的無線滑鼠，電池續航力長達 3 個月。</p>
                <div class='price-section'>
                    <span class='new-price'>$29.99</span>
                </div>
                <a href='https://example.com/wireless-mouse'>查看詳情</a>
            </div>
        </body>
    </html>
    """

    custom_instruction = """
    請從網頁中擷取所有產品資訊，並以 JSON 格式返回。

    要求：
    1. 提取每個產品的以下資訊：
       - 產品名稱（從標題中提取）
       - 產品描述（簡短描述）
       - 原價（如果有的話，沒有則為 null）
       - 特價（當前售價）
       - 連結

    2. 返回的格式:
    {
        "products": [
            {
                "name": "產品名稱",
                "description": "產品描述",
                "original_price": "原價或 null",
                "current_price": "當前售價",
                "url": "產品連結"
            }
        ],
        "total_count": 產品數量
    }

    3. 價格請保留貨幣符號和金額
    4. 如果沒有原價，original_price 設為 null
    """

    # Build the LLM extraction strategy. Credentials are carried by
    # llm_config only; the strategy-level api_token argument is not passed.
    # NOTE(review): "xxxxxxxxx" looks like a redacted placeholder -- supply a
    # real key before running, and avoid committing it.
    strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="gemini/gemini-2.5-flash",
            api_token="xxxxxxxxx",
        ),
        instruction=custom_instruction,
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=f"raw://{html}", config=config)

        # Without this guard a missing extraction would surface as an opaque
        # TypeError/JSONDecodeError from json.loads() instead of the
        # crawler's own error message.
        if not result.extracted_content:
            raise RuntimeError(
                "No extracted content to parse: "
                f"{result.error_message or 'unknown error'}"
            )

        data = json.loads(result.extracted_content)
        pprint(data)

if __name__ == "__main__":
    # asyncio.run() refuses to start while an event loop is already running
    # (the situation inside a notebook kernel). Applying the nest_asyncio
    # patch here keeps this cell runnable on its own instead of relying on
    # an earlier cell having applied it.
    nest_asyncio.apply()
    asyncio.run(main())

[{'content': ['{"name": "電競筆電 - 高效能遊戲機", "description": "這款筆電配備最新的 RTX 4070 '
              '顯示卡， 搭配 Intel i9 處理器，適合專業遊戲玩家。", "original_price": "$1499.99", '
              '"current_price": "$1299.99", "url": '
              '"https://example.com/gaming-laptop"}'],
  'error': False,
  'index': 0,
  'tags': ['product']},
 {'content': ['{"name": "無線滑鼠 - 人體工學設計", "description": "符合人體工學的無線滑鼠，電池續航力長達 3 '
              '個月。", "original_price": null, "current_price": "$29.99", "url": '
              '"https://example.com/wireless-mouse"}'],
  'error': False,
  'index': 1,
  'tags': ['product']}]
