# Chapter 0 - 安装与设置

## Install

In [1]:
%%capture
!pip install -U crawl4ai
!pip install nest_asyncio

In [2]:
# Check crawl4ai version
import crawl4ai
# The version text is read from the __version__ attribute of crawl4ai.__version__
# (the recorded run below printed 0.6.3).
print(crawl4ai.__version__.__version__)

0.6.3


## Setup

In [3]:
%%capture
# Run the crawl4ai-setup shell command; %%capture keeps its console output out of the notebook.
!crawl4ai-setup

## Test

In [4]:
# Run the crawl4ai-doctor shell command; it prints the health-check log recorded below.
!crawl4ai-doctor

[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Running Crawl4AI health check[0m[36m...[0m[36m [0m
[1;36m[[0m[36mINIT[0m[1;36m][0m[36m...[0m[36m. → Crawl4AI [0m[1;36m0.6[0m[36m.[0m[1;36m3[0m[36m [0m
[1;36m[[0m[36mTEST[0m[1;36m][0m[36m...[0m[36m. ℹ Testing crawling capabilities[0m[36m...[0m[36m [0m
[1;36m[[0m[36mEXPORT[0m[1;36m][0m[36m.. ℹ Exporting media [0m[1;36m([0m[36mPDF/MHTML/screenshot[0m[1;36m)[0m[36m took [0m[1;36m0.[0m[36m89s [0m
[1;32m[[0m[32mFETCH[0m[1;32m][0m[32m...[0m[32m ↓ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;32m4.[0m[32m88s [0m
[1;32m[[0m[32mSCRAPE[0m[1;32m][0m[32m.. ◆ [0m[4;32mhttps://crawl4ai.com[0m[32m                                               [0m
[32m| [0m[32m✓[0m[32m | ⏱: [0m[1;33m0.[0m[33m02[0m[32ms [0m
[1;32m[[0m[32mCOMPLETE[0m[1;32m][0m[32m ● [0m[4;32mhttp

In [5]:
import asyncio # Python's standard library for asynchronous programming
import nest_asyncio # support library for nested async event loops
nest_asyncio.apply() # allow async operations to be used inside Jupyter

In [7]:
from playwright.async_api import async_playwright


async def test_browser():
    """Smoke-test Playwright: load example.com in headless Chromium and print its title."""
    async with async_playwright() as playwright:
        chromium = await playwright.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto('https://example.com')
        page_title = await tab.title()
        print(f'Title: {page_title}')
        await chromium.close()


# Drive the coroutine to completion and show the page title
asyncio.run(test_browser())

Title: Example Domain


## *Markdown Output Function

In [None]:
import os

# Directory that receives every Markdown file written by output_md
OUTPUT_PATH = 'outputs/markdown/'

def output_md(base_filename, md_str):
    """Write Markdown text under OUTPUT_PATH with its length embedded in the file name.

    For example, ``report.md`` holding 120 characters is saved as ``report(120).md``.

    Args:
        base_filename: File name (including extension) the output name is derived from.
        md_str: Markdown text to write.
    """
    # Make sure the destination directory exists
    os.makedirs(OUTPUT_PATH, exist_ok=True)

    # Slot the character count between the stem and the extension
    char_count = len(md_str)
    stem, suffix = os.path.splitext(base_filename)
    target = os.path.join(OUTPUT_PATH, "{}({}){}".format(stem, char_count, suffix))

    with open(target, mode='w', encoding='utf-8') as handle:
        handle.write(md_str)

    print("已保存到: " + target)



# Chapter 1 - 基础形态

## 1.1 - Basic Type

In [10]:
import asyncio  # async programming library
from crawl4ai import AsyncWebCrawler  # web crawling tool

# Crawl a page asynchronously and save its Markdown
async def main(output_filename):
    """Crawl a fixed article URL with default settings and save the Markdown.

    Prints the Markdown length and a 300-character preview, then hands the
    text to ``output_md``. If the crawl fails, the error is printed and
    nothing is saved.

    Args:
        output_filename: Base file name passed on to ``output_md``.
    """
    # The context manager closes the crawler and releases its resources on exit
    async with AsyncWebCrawler() as crawler:
        # Wait for the crawl to finish before continuing
        result = await crawler.arun("https://www.anthropic.com/news/agent-capabilities-api")

        # Stop early on a failed crawl instead of failing on missing Markdown
        if not result.success:
            print("Crawl failed:", result.error_message)
            return

        # Print the crawl result
        print("Markdown length:", len(result.markdown))
        print(result.markdown[:300])

        # Save to a .md file
        output_md(output_filename, result.markdown)

# Start the async program
asyncio.run(main('1_1_Basic.md'))

Markdown length: 11049
[Skip to main content](https://www.anthropic.com/news/agent-capabilities-api#main-content)[Skip to footer](https://www.anthropic.com/news/agent-capabilities-api#footer)
[](https://www.anthropic.com/)
  * Claude
  * API
  * Solutions
  * Research
  * Commitments
  * Learn
[News](https://www.anthropic
已保存到: outputs/markdown/1_1_Basic(11049).md


# Chapter 2 - 进阶形态

## 2.1 - Setting with BrowserConfig（浏览器配置）

BrowserConfig - 控制浏览器本身的行为和启动方式
- headless: 是否以无头模式运行, 还是显示完整界面
- user_agent: 设置用户代理来模拟不同浏览器
- proxy_config: 配置代理服务器等浏览器级别的设置
- text_mode: 禁用图片加载，只抓取文本内容

In [None]:
import asyncio  # async programming library
from crawl4ai import AsyncWebCrawler, BrowserConfig
# AsyncWebCrawler: asynchronous web crawler
# BrowserConfig: browser configuration

# Async entry point that runs the crawl
async def main(output_filename):
    """Crawl a fixed article URL with a custom BrowserConfig and save the Markdown.

    Prints the Markdown length and a 300-character preview, then hands the
    text to ``output_md``. If the crawl fails, the error is printed and
    nothing is saved.

    Args:
        output_filename: Base file name passed on to ``output_md``.
    """
    # Browser settings
    browser_config = BrowserConfig(
        headless=True,  # headless mode: no browser window is shown
        viewport_width=1280,  # window width
        viewport_height=720,  # window height
        user_agent='Chrome/114.0.0.0',  # browser identification string
        text_mode=True,  # disable image loading; may speed up text-only crawls
    )

    # The context manager handles crawler start-up and clean-up
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run the crawl
        result = await crawler.arun(
            url="https://www.anthropic.com/news/agent-capabilities-api",  # target URL
        )

        # Stop early on a failed crawl instead of failing on missing Markdown
        if not result.success:
            print("Crawl failed:", result.error_message)
            return

        # Show the crawl result
        print("Markdown length:", len(result.markdown))  # content length
        print(result.markdown[:300])  # preview of the first 300 characters

        output_md(output_filename, result.markdown)

# Start the async program
asyncio.run(main('2_1_BrowserConfig.md'))

## 2.2.0 - Setting with CrawlerRunConfig (爬虫运行配置)

CrawlerRunConfig - 控制每次具体爬取任务的执行方式
- word_count_threshold: 过滤掉过短的内容，比如导航菜单、按钮文字、简短标签
- extraction_strategy: 自定义抓取内容，需要定义json的schema
- cache_mode: 缓存策略, 是否使用缓存
- js_code: 模拟用户点击[Load More]等按钮
- screenshot: 在页面完全加载后自动截取网页截图
- pdf: 将整个网页转换为PDF文档
- [重要] markdown_generator: 默认DefaultMarkdownGenerator()

In [None]:
import asyncio  # async programming library
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# AsyncWebCrawler: asynchronous web crawler
# BrowserConfig: browser configuration
# CrawlerRunConfig: per-run crawl configuration
# CacheMode: cache behaviour control
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Async entry point that runs the crawl
async def main(output_filename):
    """Crawl a fixed article URL with browser and run configs, then save the Markdown.

    Prints the Markdown length and a 300-character preview, then hands the
    text to ``output_md``. If the crawl fails, the error is printed and
    nothing is saved.

    Args:
        output_filename: Base file name passed on to ``output_md``.
    """
    # Browser settings
    browser_config = BrowserConfig(
        headless=True,  # headless mode: no browser window is shown
        viewport_width=1280,  # window width
        viewport_height=720,  # window height
        user_agent='Chrome/114.0.0.0',  # browser identification string
        text_mode=True,  # disable image loading; may speed up text-only crawls
    )

    # Per-run crawl settings
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache to get fresh content
        markdown_generator=DefaultMarkdownGenerator(),
    )

    # The context manager handles crawler start-up and clean-up
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run the crawl
        result = await crawler.arun(
            url="https://www.anthropic.com/news/agent-capabilities-api",  # target URL
            config=run_config,  # run configuration
        )

        # Stop early on a failed crawl instead of failing on missing Markdown
        if not result.success:
            print("Crawl failed:", result.error_message)
            return

        # Show the crawl result
        print("Markdown length:", len(result.markdown))  # content length
        print(result.markdown[:300])  # preview of the first 300 characters

        output_md(output_filename, result.markdown)

# Start the async program
asyncio.run(main('2_2_0_RunConfig.md'))

### 2.2.1 + Content Filter: PruningContentFilter 示例

- **markdown_generator**: 核心功能，从网页生成干净、结构化的Markdown
    - DefaultMarkdownGenerator(默认且唯一)
        - 参数1: Content Filters
            - BM25ContentFilter  关键词过滤器
            - PruningContentFilter 内容精简过滤器
            - LLMContentFilter AI过滤器

In [None]:
# Import everything this cell uses so it does not depend on names left over
# from earlier cells.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def main(output_filename):
    """Crawl a fixed article URL with a PruningContentFilter and save both Markdown variants.

    The raw Markdown is saved with a ``_raw`` suffix and the filtered ("fit")
    Markdown with a ``_fit`` suffix, both via ``output_md``. If the crawl
    fails, the error is printed and nothing is saved.

    Args:
        output_filename: Base ``.md`` file name the two output names are derived from.
    """
    # Browser settings
    browser_config = BrowserConfig(
        headless=True,  # headless mode
        viewport_width=1280,  # window width
        viewport_height=720,  # window height
        user_agent='Chrome/114.0.0.0',  # browser identification string
        text_mode=True,
    )

    # Per-run crawl settings
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                # min_word_threshold=10,  # drop blocks with fewer than N words, as they are likely too short or useless (not recommended)
                threshold=0.76,  # fixed: the fixed threshold / dynamic: the initial threshold
                threshold_type="fixed",  # fixed
                # threshold_type="dynamic",  # dynamic
            ),
        ),
    )

    # Create the crawler and run it
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.anthropic.com/news/agent-capabilities-api",  # target URL
            config=run_config,  # run configuration
        )

        # Stop early on a failed crawl instead of failing on missing Markdown
        if not result.success:
            print("Crawl failed:", result.error_message)
            return

        markdown = result.markdown

        # Save the raw content
        print("Raw Markdown length:", len(markdown.raw_markdown))
        output_md(output_filename.replace('.md', '_raw.md'), markdown.raw_markdown)

        # Save the filtered content
        print("Fit Markdown length:", len(markdown.fit_markdown))
        output_md(output_filename.replace('.md', '_fit.md'), markdown.fit_markdown)

asyncio.run(main('2_2_1_RunConfig_ContentFilterPruning.md'))

### 2.2.2 + Options

- **markdown_generator**: 核心功能，从网页生成干净、结构化的Markdown
    - DefaultMarkdownGenerator(默认且唯一)
        - 参数1: Content Filters
            - BM25ContentFilter  关键词过滤器
            - PruningContentFilter 内容精简过滤器
            - LLMContentFilter AI过滤器
        - 参数2: Options
            - ignore_links (bool): 是否在最终markdown中移除所有超链接
            - ignore_images (bool): 移除所有 `![image]()` 图片引用
            - escape_html (bool): 将HTML实体转换为文本（默认通常为 True）
            - body_width (int): 在N个字符处换行。0 或 None 表示不换行
            - skip_internal_links (bool): 如果为 True，忽略 #localAnchors 或引用同一页面的内部链接
            - include_sup_sub (bool): 尝试以更易读的方式处理 <sup> / <sub> 标签

In [None]:
# Import everything this cell uses so it does not depend on names left over
# from earlier cells.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def main(output_filename):
    """Crawl a fixed article URL with a pruning filter plus generator options, then save both Markdown variants.

    The raw Markdown is saved with a ``_raw`` suffix and the filtered ("fit")
    Markdown with a ``_fit`` suffix, both via ``output_md``. If the crawl
    fails, the error is printed and nothing is saved.

    Args:
        output_filename: Base ``.md`` file name the two output names are derived from.
    """
    # Browser settings
    browser_config = BrowserConfig(
        headless=True,  # headless mode
        viewport_width=1280,  # window width
        viewport_height=720,  # window height
        user_agent='Chrome/114.0.0.0',  # browser identification string
        text_mode=True,
    )

    # Per-run crawl settings
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.DISABLED,  # disable the cache
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                # min_word_threshold=10,  # drop blocks with fewer than N words, as they are likely too short or useless (not recommended)
                threshold=0.76,  # fixed: the fixed threshold / dynamic: the initial threshold
                # threshold_type="fixed",  # fixed
                threshold_type="dynamic",  # dynamic
            ),
            options={
                "ignore_links": True,
                "ignore_images": True,
            },
        ),
    )

    # Create the crawler and run it
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.anthropic.com/news/agent-capabilities-api",  # target URL
            config=run_config,  # run configuration
        )

        # Stop early on a failed crawl instead of failing on missing Markdown
        if not result.success:
            print("Crawl failed:", result.error_message)
            return

        markdown = result.markdown

        # Save the raw content
        print("Raw Markdown length:", len(markdown.raw_markdown))
        output_md(output_filename.replace('.md', '_raw.md'), markdown.raw_markdown)

        # Save the filtered content
        print("Fit Markdown length:", len(markdown.fit_markdown))
        output_md(output_filename.replace('.md', '_fit.md'), markdown.fit_markdown)

asyncio.run(main('2_2_2_RunConfig_ContentFilterPruning_Options.md'))