In [37]:
import os
from firecrawl import FirecrawlApp
import time
import json
from urllib.parse import urlparse

# Initialize the FirecrawlApp with your API key.
# The key is read from the FIRECRAWL_API_KEY environment variable. The
# misspelled name FIREWCRAWL_API_KEY that this notebook used previously is
# still consulted as a fallback so environments that exported it keep working.
app = FirecrawlApp(
    api_key=os.getenv("FIRECRAWL_API_KEY") or os.getenv("FIREWCRAWL_API_KEY")
)

# URLs to scrape, organized by category.
# Every page shares the same documentation root, so each category is written
# as a list of page names that are expanded into absolute URLs.
_DOCS_ROOT = 'https://ibm.github.io/watsonx-ai-python-sdk/'

base_urls = [_DOCS_ROOT + 'base.html']

connection_urls = [_DOCS_ROOT + 'core_api.html']

foundation_models_urls = [
    _DOCS_ROOT + page
    for page in (
        'fm_embeddings.html',
        'fm_model_inference.html',
        'fm_ts_model_inference.html',
        'fm_model.html',
        'fm_deployments.html',
        'prompt_template_manager.html',
        'fm_helpers.html',
        'fm_text_extraction.html',
        'fm_schema.html',
        'fm_rerank.html',
        'rate_limit.html',
    )
]

foundation_models_extensions_urls = [
    _DOCS_ROOT + page
    for page in (
        'fm_extensions_langchain.html',
        'fm_extensions_llamaindex.html',
        'fm_extensions_rag.html',
    )
]

# Flat list of everything to scrape, in category order
all_urls = [
    *base_urls,
    *connection_urls,
    *foundation_models_urls,
    *foundation_models_extensions_urls,
]

# Create directory structure
def create_directories():
    """Make sure the output tree for the scraped docs exists.

    Creates ``scraped_docs`` plus one sub-directory per documentation
    category. Directories that already exist are left untouched.
    """
    root = "scraped_docs"
    subfolders = (
        "base",
        "connection",
        "foundation_models",
        "foundation_models_extensions",
    )

    # Root first, then each category folder beneath it
    for target in [root, *(f"{root}/{name}" for name in subfolders)]:
        os.makedirs(target, exist_ok=True)

    print("Created directory structure.")

def get_filename_from_url(url):
    """Derive the local markdown filename for a URL.

    The last component of the URL path is used as the filename, with a
    trailing ``.html`` replaced by ``.md``. Query strings and fragments are
    ignored because only the parsed path is considered.

    Args:
        url: The URL to derive a filename from.

    Returns:
        The filename as a string. When the URL path has no last component
        (empty path or a path ending in ``/``) ``'index.md'`` is returned so
        that callers never receive an empty filename.
    """
    filename = os.path.basename(urlparse(url).path)

    # A directory-style URL has no basename; an empty filename would make the
    # save step try to open the category directory itself.
    if not filename:
        return 'index.md'

    # Remove the .html extension and replace with .md
    if filename.endswith('.html'):
        filename = filename[:-5] + '.md'
    return filename

def determine_category(url):
    """Return the name of the category whose URL list contains ``url``.

    The category lists are consulted in a fixed order and the first match
    wins; a URL found in none of them is reported as ``"misc"``.
    """
    # Order matters: extensions are checked before the general
    # foundation-models list.
    lookup = (
        ("base", base_urls),
        ("connection", connection_urls),
        ("foundation_models_extensions", foundation_models_extensions_urls),
        ("foundation_models", foundation_models_urls),
    )

    for label, members in lookup:
        if url in members:
            return label

    return "misc"  # Default category

def scrape_url_with_retry(url, max_retries=3, delay=10):
    """Scrape a URL as markdown, retrying on errors or incomplete results.

    Args:
        url: The page to scrape.
        max_retries: Maximum number of scrape attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The value returned by ``app.scrape_url`` as soon as it is truthy and
        contains a ``'markdown'`` entry, or ``None`` when every attempt
        failed. Exceptions raised during an attempt are printed and treated
        as a failed attempt rather than propagated.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            print(f"Scraping {url} (Attempt {attempt + 1}/{max_retries})")
            scrape_status = app.scrape_url(
                url,
                params={'formats': ['markdown']}
            )

            # Check if the scrape was successful
            if scrape_status and 'markdown' in scrape_status:
                return scrape_status

            if is_last_attempt:
                print("Scrape incomplete.")
            else:
                print(f"Scrape incomplete, retrying in {delay} seconds...")
        except Exception as e:
            print(f"Error scraping {url}: {str(e)}")
            if not is_last_attempt:
                print(f"Retrying in {delay} seconds...")

        # Only wait when another attempt actually follows; sleeping after the
        # final attempt would just postpone the failure report.
        if not is_last_attempt:
            time.sleep(delay)

    print(f"Failed to scrape {url} after {max_retries} attempts")
    return None

def save_markdown(markdown_content, category, filename):
    """Write scraped markdown to ``scraped_docs/<category>/<filename>``.

    Args:
        markdown_content: The markdown text to write.
        category: Name of the sub-directory of ``scraped_docs`` to write into.
        filename: Name of the file to create inside that sub-directory.

    Returns:
        The path of the file that was written.
    """
    target_dir = os.path.join("scraped_docs", category)

    # determine_category() can return "misc", a folder create_directories()
    # never creates, so make sure the target folder exists before writing.
    os.makedirs(target_dir, exist_ok=True)

    file_path = os.path.join(target_dir, filename)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    print(f"Saved {file_path}")
    return file_path

def save_metadata(url_results):
    """Write a JSON summary of the scraping run to ``scraped_docs/metadata.json``.

    The summary holds the current local time, the number of URLs in
    ``all_urls``, how many entries of ``url_results`` have a truthy
    ``"success"`` value and how many do not, and the per-URL results
    themselves.
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    url_count = len(all_urls)
    succeeded = sum(1 for entry in url_results if entry["success"])
    failed = sum(1 for entry in url_results if not entry["success"])

    summary = {
        "scrape_time": stamp,
        "total_urls": url_count,
        "successful_scrapes": succeeded,
        "failed_scrapes": failed,
        "results": url_results,
    }

    with open("scraped_docs/metadata.json", 'w') as out:
        json.dump(summary, out, indent=2)

    print("Saved metadata to scraped_docs/metadata.json")

def main():
    """Scrape every URL in ``all_urls`` and store the results under ``scraped_docs``.

    For each URL the category and target filename are determined, the page is
    scraped with retries, and the returned markdown (if any) is written to
    disk. A per-URL result record is collected and passed to
    ``save_metadata`` at the end, followed by a printed summary that lists
    any URLs that could not be scraped.
    """
    create_directories()

    # Display warning about responsible scraping
    print("="*80)
    print("RESPONSIBLE SCRAPING NOTICE:")
    print("This script includes delays between requests to be respectful to the server.")
    print("Please use this tool responsibly and in accordance with the website's terms of service.")
    print("="*80)
    print()

    # Pause between consecutive requests to avoid hitting rate limits or
    # triggering denial of service protections.
    delay_seconds = 5

    url_results = []
    last_index = len(all_urls) - 1

    for index, url in enumerate(all_urls):
        category = determine_category(url)
        filename = get_filename_from_url(url)

        print(f"\nProcessing {url} (Category: {category}, Filename: {filename})")

        scrape_result = scrape_url_with_retry(url)

        # Start from a "failed" record and upgrade it once the file is saved
        result = {
            "url": url,
            "category": category,
            "filename": filename,
            "success": False,
            "file_path": None
        }

        if scrape_result and 'markdown' in scrape_result:
            markdown_content = scrape_result['markdown']
            file_path = save_markdown(markdown_content, category, filename)

            result["success"] = True
            result["file_path"] = file_path

        url_results.append(result)

        # No request follows the final URL, so waiting there would only
        # lengthen the run.
        if index < last_index:
            print(f"Waiting {delay_seconds} seconds before the next request...")
            time.sleep(delay_seconds)

    save_metadata(url_results)

    # Print summary
    failures = [r for r in url_results if not r["success"]]
    successful = len(url_results) - len(failures)
    print(f"\nScraping completed: {successful}/{len(all_urls)} URLs scraped successfully")

    # Print failures if any
    if failures:
        print("\nFailed to scrape the following URLs:")
        for failure in failures:
            print(f"- {failure['url']}")



In [38]:
# Run the full scrape: create the output folders, fetch every URL in
# all_urls, and write scraped_docs/metadata.json.
main()

Created directory structure.
RESPONSIBLE SCRAPING NOTICE:
This script includes delays between requests to be respectful to the server.
Please use this tool responsibly and in accordance with the website's terms of service.


Processing https://ibm.github.io/watsonx-ai-python-sdk/base.html (Category: base, Filename: base.md)
Scraping https://ibm.github.io/watsonx-ai-python-sdk/base.html (Attempt 1/3)
Saved scraped_docs/base/base.md
Waiting 5 seconds before the next request...

Processing https://ibm.github.io/watsonx-ai-python-sdk/core_api.html (Category: connection, Filename: core_api.md)
Scraping https://ibm.github.io/watsonx-ai-python-sdk/core_api.html (Attempt 1/3)
Saved scraped_docs/connection/core_api.md
Waiting 5 seconds before the next request...

Processing https://ibm.github.io/watsonx-ai-python-sdk/fm_embeddings.html (Category: foundation_models, Filename: fm_embeddings.md)
Scraping https://ibm.github.io/watsonx-ai-python-sdk/fm_embeddings.html (Attempt 1/3)
Saved scraped_doc