In [7]:
import os
import requests
from bs4 import BeautifulSoup
import html2text

# Base URL of the pandas API reference section of the docs (not the user guide)
BASE_URL = "https://pandas.pydata.org/docs/reference/"
# Landing page of that section; get_links() reads the sidebar links from it
MAIN_PAGE = BASE_URL + "index.html"

# Single text file that save_documentation() writes every scraped page into.
# NOTE(review): the name says "user_guide" but BASE_URL points at the API
# reference — confirm which one is intended.
OUTPUT_FILE = "pandas_user_guide.txt"

def get_links(timeout=30):
    """Fetch all documentation links from the main page.

    Args:
        timeout: Seconds to wait for the server before giving up on the
            request. Without a timeout ``requests`` may block forever.

    Returns:
        Absolute URLs of the pages linked from the sidebar, in document
        order and without duplicates. An empty list is returned when the
        main page cannot be fetched.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(MAIN_PAGE, timeout=timeout)
    except requests.RequestException as e:
        # Same best-effort policy as extract_text(): report and carry on.
        print(f"Failed to fetch the main page: {e}")
        return []
    if response.status_code != 200:
        print("Failed to fetch the main page.")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract all internal links from the sidebar
    links = []
    seen = set()
    for a in soup.select("ul.nav.bd-sidenav a.reference.internal"):
        href = a.get("href")
        if not href or href.startswith("http"):  # Ignore external links
            continue
        # Resolve relative to the page the link appears on so that hrefs
        # such as "../x.html" or "/docs/x.html" produce a valid URL;
        # plain "x.html" resolves to the same result as BASE_URL + href.
        url = urljoin(MAIN_PAGE, href)
        if url not in seen:  # a page listed twice is fetched only once
            seen.add(url)
            links.append(url)

    return links

def extract_text(url, timeout=30):
    """Fetch a URL and extract its text content.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds to wait for the server before giving up on the
            request. Without a timeout a single stalled page would block
            the whole scrape indefinitely.

    Returns:
        The page converted to plain text by html2text, with links and
        images omitted and surrounding whitespace stripped. An empty
        string is returned when the request fails, the status is not 200,
        or any error occurs while processing the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url}")
            return ""

        soup = BeautifulSoup(response.text, "html.parser")

        # Remove navigation, footers, and unnecessary elements
        for tag in soup(["nav", "header", "footer", "script", "style", "aside"]):
            tag.extract()

        # Convert HTML to readable text
        converter = html2text.HTML2Text()
        converter.ignore_links = True  # Remove links
        converter.ignore_images = True  # Remove images

        text = converter.handle(str(soup))
        return text.strip()

    except Exception as e:
        # Deliberately broad: one bad page (network error, timeout, parse
        # failure) must not abort the rest of the scrape.
        print(f"Error processing {url}: {e}")
        return ""

def save_documentation():
    """Download every linked page and write them all to OUTPUT_FILE.

    Each page is written as its own section: a banner made of two rules of
    80 '=' characters around the page URL, followed by the extracted text.
    Progress is printed for each page. Nothing is written when no links
    are found.
    """
    page_urls = get_links()
    if len(page_urls) == 0:
        print("No links found.")
        return

    total = len(page_urls)
    rule = "=" * 80

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        position = 0
        for page_url in page_urls:
            position += 1
            print(f"[{position}/{total}] Fetching: {page_url}")
            body = extract_text(page_url)
            section = f"\n\n{rule}\n{page_url}\n{rule}\n\n{body}\n\n"
            out.write(section)

    print(f"Documentation saved to {OUTPUT_FILE}")

# Run the scraper
#save_documentation()


In [8]:
# Run the scraper: fetch every linked page and write the combined text to OUTPUT_FILE
save_documentation()

[1/16] Fetching: https://pandas.pydata.org/docs/reference/io.html
[2/16] Fetching: https://pandas.pydata.org/docs/reference/general_functions.html
[3/16] Fetching: https://pandas.pydata.org/docs/reference/series.html
[4/16] Fetching: https://pandas.pydata.org/docs/reference/frame.html
[5/16] Fetching: https://pandas.pydata.org/docs/reference/arrays.html
[6/16] Fetching: https://pandas.pydata.org/docs/reference/indexing.html
[7/16] Fetching: https://pandas.pydata.org/docs/reference/offset_frequency.html
[8/16] Fetching: https://pandas.pydata.org/docs/reference/window.html
[9/16] Fetching: https://pandas.pydata.org/docs/reference/groupby.html
[10/16] Fetching: https://pandas.pydata.org/docs/reference/resampling.html
[11/16] Fetching: https://pandas.pydata.org/docs/reference/style.html
[12/16] Fetching: https://pandas.pydata.org/docs/reference/plotting.html
[13/16] Fetching: https://pandas.pydata.org/docs/reference/options.html
[14/16] Fetching: https://pandas.pydata.org/docs/reference/ex