In [None]:
pip install -U crawl4ai

In [None]:
# Install the package
!pip install -U crawl4ai

# (Optional) Install the pre-release version
!pip install crawl4ai --pre

# Run post-installation setup
!crawl4ai-setup

# Verify your installation
!crawl4ai-doctor


In [None]:
!crawl4ai-setup

In [None]:
!pip install nest_asyncio


In [64]:
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

async def main():
    browser_conf = BrowserConfig(headless=True)

    # 1) Extract article URLs from category page
    cat_schema = {
        "name": "articles",
        "baseSelector": "li.wp-block-post div.loop-card__content",
        "fields": [
            {
                "name": "category",
                "selector": ".loop-card__cat-group .loop-card__cat",
                "type": "text"
            },
            {
                "name": "url",
                "selector": "h3.loop-card__title a.loop-card__title-link",
                "type": "attribute",
                "attribute": "href"
            },
            {
                "name": "headline",
                "selector": "h3.loop-card__title a.loop-card__title-link",
                "type": "text"
            },
            {
                "name": "author",
                "selector": ".loop-card__meta .loop-card__author",
                "type": "text"
            },
            {
                "name": "date",
                "selector": ".loop-card__meta time",
                "type": "attribute",
                "attribute": "datetime"
            },
            {
                "name": "image_src",
                "selector": "figure.loop-card__figure img",
                "type": "attribute",
                "attribute": "src"
            }
        ]
    }
    cat_run_conf = CrawlerRunConfig(
        cache_mode = CacheMode.BYPASS,
        extraction_strategy = JsonCssExtractionStrategy(cat_schema)
    )

    async with AsyncWebCrawler(config=browser_conf) as crawler:
        result_cat = await crawler.arun(
            url = "https://techcrunch.com/category/startups/",
            config = cat_run_conf
        )

    if not result_cat.success:
        print("Category crawl failed:", result_cat.error_message)
        return

    article_list = json.loads(result_cat.extracted_content)
    print(f"Found {len(article_list)} articles")
    for article in article_list:
     url = article.get('url')
     if url:
         print(url)
     else:
         print("Skipping an article with a missing URL.")

    # Setup Markdown Generator for the article pages
    md_generator = DefaultMarkdownGenerator()

    # Directory to save markdown files
    out_dir = Path("articles_md")
    out_dir.mkdir(exist_ok=True)

    # 2) For each article URL → crawl and save markdown
    async with AsyncWebCrawler(config=browser_conf) as crawler:
        for ai in article_list:
            article_url = ai.get("url")
            if not article_url or not article_url.startswith(("http://", "https://")):
                print("Skipping invalid URL:", article_url)
                continue

            run_conf = CrawlerRunConfig(
                cache_mode = CacheMode.BYPASS,
                markdown_generator = md_generator
            )

            result_art = await crawler.arun(
                url = article_url,
                config = run_conf
            )

            if not result_art.success:
                print("Failed to crawl article:", article_url, "error:", result_art.error_message)
                continue

            md = result_art.markdown.raw_markdown
            # File name from URL (last segment)
            filename = article_url.rstrip("/").split("/")[-1] + ".md"
            filepath = out_dir / filename
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(md)
            print("Saved:", filepath)

if __name__ == "__main__":
    await main()
    


Found 37 articles
https://techcrunch.com/2025/10/21/open-source-agentic-startup-langchain-hits-1-25b-valuation/
https://techcrunch.com/2025/10/21/sesame-the-conversational-ai-startup-from-oculus-founders-raises-250m-and-launches-beta/
https://techcrunch.com/2025/10/21/sources-multimodal-ai-startup-fal-ai-already-raised-at-4b-valuation/
https://techcrunch.com/2025/10/21/as-the-browser-wars-heat-up-here-are-the-hottest-alternatives-to-chrome-and-safari-in-2025/
https://techcrunch.com/2025/10/21/a16z-backed-codi-launches-ai-agent-office-manager/
https://techcrunch.com/2025/10/21/the-full-techcrunch-disrupt-stage-revealed-where-the-future-of-tech-breaks-first/
https://techcrunch.com/2025/10/21/aura-introduces-a-499-e-ink-digital-photo-frame-that-lets-you-go-cordless/
https://techcrunch.com/2025/10/21/only-6-days-until-techcrunch-disrupt-2025-kicks-off-in-san-francisco-and-ticket-rates-increase/
https://techcrunch.com/2025/10/20/shin-starrs-robotic-food-truck-kitchen-will-serve-up-korean-bb

Saved: articles_md/open-source-agentic-startup-langchain-hits-1-25b-valuation.md


Saved: articles_md/sesame-the-conversational-ai-startup-from-oculus-founders-raises-250m-and-launches-beta.md


Saved: articles_md/sources-multimodal-ai-startup-fal-ai-already-raised-at-4b-valuation.md


Saved: articles_md/as-the-browser-wars-heat-up-here-are-the-hottest-alternatives-to-chrome-and-safari-in-2025.md


Saved: articles_md/a16z-backed-codi-launches-ai-agent-office-manager.md


Saved: articles_md/the-full-techcrunch-disrupt-stage-revealed-where-the-future-of-tech-breaks-first.md


Saved: articles_md/aura-introduces-a-499-e-ink-digital-photo-frame-that-lets-you-go-cordless.md


Saved: articles_md/only-6-days-until-techcrunch-disrupt-2025-kicks-off-in-san-francisco-and-ticket-rates-increase.md


Saved: articles_md/shin-starrs-robotic-food-truck-kitchen-will-serve-up-korean-bbq-at-techcrunch-disrupt-2025.md


Saved: articles_md/san-francisco-mayor-daniel-lurie-is-coming-to-techcrunch-disrupt-2025.md


Saved: articles_md/last-minute-ticket-deal-for-techcrunch-disrupt-2025-save-60-on-your-plus-one.md


Saved: articles_md/final-countdown-only-7-days-until-techcrunch-disrupt-2025-and-ticket-prices-increase.md


Saved: articles_md/this-top-vc-bet-close-to-20-of-his-fund-on-teenagers-heres-why.md


Saved: articles_md/should-ai-do-everything-openai-thinks-so.md


Saved: articles_md/from-sb-243-to-chatgpt-why-its-not-cool-to-be-cautious-about-ai.md


Saved: articles_md/tech-layoffs-2025-list.md


Saved: articles_md/together-we-make-disrupt-unforgettable-thank-you-to-our-sponsors.md


Saved: articles_md/cryptos-next-chapter-with-solanas-anatoly-yakovenko-at-techcrunch-disrupt-2025.md


Saved: articles_md/less-than-24-hours-to-spotlight-your-startup-at-techcrunch-disrupt-2025-in-front-of-10000-tech-leaders-and-vcs.md


Saved: articles_md/last-flash-sale-before-techcrunch-disrupt-2025-doors-open-save-up-to-624.md


Saved: articles_md/rent-a-cyber-friend-will-pay-you-to-talk-to-strangers-online-and-will-show-off-its-platform-at-techcrunch-disrupt-2025.md


Saved: articles_md/a-new-wave-of-social-media-apps-provide-hope-in-a-doomscrolling-world.md


Saved: articles_md/deel-hits-17-3b-valuation-after-raising-300m-from-big-name-vcs.md


Saved: articles_md/how-a-headphone-site-operator-built-loyalty-startup-lantern-to-solve-his-own-problems.md


Saved: articles_md/final-2-days-to-claim-your-exhibit-table-at-techcrunch-disrupt-2025.md


Saved: articles_md/general-intuition-lands-134m-seed-to-teach-agents-spatial-reasoning-using-video-game-clips.md


Saved: articles_md/only-48-hours-left-to-save-before-the-techcrunch-disrupt-2025-flash-sale-ends.md


Saved: articles_md/how-tiny-mubadala-backed-aaf-is-winning-vc-deals-in-some-of-the-hottest-startups.md


Saved: articles_md/indias-kuku-snags-85m-as-mobile-content-wars-intensify.md


Saved: articles_md/electric-aircraft-startup-beta-technologies-seeks-to-raise-825m-in-ipo.md
Skipping invalid URL: None
Skipping invalid URL: None
Skipping invalid URL: None
Skipping invalid URL: None
Skipping invalid URL: None
Skipping invalid URL: None
Skipping invalid URL: None
