In [1]:

"""
multipage_scraper.py

Usage examples:
    # Basic run (scrape up to 200 pages starting from the given URL)
    python multipage_scraper.py "https://www.w3schools.com/html/html_responsive.asp" --max-pages 200 --output json --out-file w3_pages.json

    # Save to all formats and limit to 50 pages with 1 second delay
    python multipage_scraper.py "https://www.w3schools.com/html/html_responsive.asp" --max-pages 50 --output all --delay 1.0 --out-file w3_all

Notes:
 - The scraper respects robots.txt by default.
 - It only follows links within the same domain and (optionally) same path prefix.
 - Be polite: set delay and don't hammer the server.
"""



'\nmultipage_scraper.py\n\nUsage examples:\n    # Basic run (scrape up to 200 pages starting from the given URL)\n    python multipage_scraper.py "https://www.w3schools.com/html/html_responsive.asp" --max-pages 200 --output json --out-file w3_pages.json\n\n    # Save to all formats and limit to 50 pages with 1 second delay\n    python multipage_scraper.py "https://www.w3schools.com/html/html_responsive.asp" --max-pages 50 --output all --delay 1.0 --out-file w3_all\n\nNotes:\n - The scraper respects robots.txt by default.\n - It only follows links within the same domain and (optionally) same path prefix.\n - Be polite: set delay and don\'t hammer the server.\n'

In [2]:
import argparse
import csv
import json
import time
import re
from urllib.parse import urlparse, urljoin
import xml.etree.ElementTree as ET
import os
import sys
from collections import deque

import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser


In [3]:

# ---------- CONFIG ----------
USER_AGENT = "MultipageScraper/1.0 (+https://yourdomain.example)"
DEFAULT_DELAY = 0.5  # seconds between requests
TIMEOUT = 15
# ---------------------------



In [4]:
def allowed_by_robots(start_url, user_agent=USER_AGENT):
    parsed = urlparse(start_url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    try:
        rp.set_url(robots_url)
        rp.read()
        return rp
    except Exception:
        # if we cannot fetch robots.txt, return a permissive parser (but still be polite)
        rp = RobotFileParser()
        rp.parse("")  # empty rules -> allow?
        return rp

def get_domain_and_prefix(start_url):
    p = urlparse(start_url)
    domain = f"{p.scheme}://{p.netloc}"
    prefix = p.path.rsplit("/", 1)[0]  # parent path (useful if you want to restrict)
    return domain, prefix

def is_internal_link(link, base_netloc):
    if not link:
        return False
    parsed = urlparse(link)
    if parsed.netloc == "" or parsed.netloc == base_netloc:
        return True
    return False

def normalize_url(href, base_url):
    if not href:
        return None
    href = href.strip()
    # ignore javascript:, mailto:, tel:
    if href.startswith("javascript:") or href.startswith("mailto:") or href.startswith("tel:") or href.startswith("#"):
        return None
    return urljoin(base_url, href.split("#")[0])  # remove fragment

def extract_main_text(soup):
    # Heuristics: prefer <article>, then largest <div> or <main>, then body text.
    article = soup.find("article")
    if article and len(article.get_text(strip=True)) > 100:
        return article.get_text(separator="\n", strip=True)
    main = soup.find("main")
    if main and len(main.get_text(strip=True)) > 100:
        return main.get_text(separator="\n", strip=True)

    # find the largest div by text length
    divs = soup.find_all("div")
    best = ""
    for d in divs:
        txt = d.get_text(separator="\n", strip=True)
        if len(txt) > len(best):
            best = txt
    if len(best) > 50:
        return best

    # fallback: full body text
    body = soup.body
    return body.get_text(separator="\n", strip=True) if body else soup.get_text(separator="\n", strip=True)

def fetch_page(session, url, user_agent=USER_AGENT):
    headers = {"User-Agent": user_agent}
    resp = session.get(url, headers=headers, timeout=TIMEOUT)
    resp.raise_for_status()
    return resp

def crawl(start_url, max_pages=200, delay=DEFAULT_DELAY, restrict_to_prefix=True, output_formats=("json",), out_file="output"):
    parsed_start = urlparse(start_url)
    base_domain = parsed_start.netloc
    base_origin = f"{parsed_start.scheme}://{parsed_start.netloc}"
    prefix_path = parsed_start.path
    if restrict_to_prefix:
        prefix_path = prefix_path.split("/", 2)[:2]  # keep safe prefix (approx)
    else:
        prefix_path = None

    rp = allowed_by_robots(base_origin)
    queue = deque([start_url])
    seen = set([start_url])
    results = []

    session = requests.Session()

    pages_crawled = 0
    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        # robots check
        try:
            can_fetch = rp.can_fetch(USER_AGENT, url)
        except Exception:
            can_fetch = True
        if not can_fetch:
            print(f"[robots.txt blocked] {url}", file=sys.stderr)
            continue

        try:
            resp = fetch_page(session, url)
        except Exception as e:
            print(f"[fetch error] {url} -> {e}", file=sys.stderr)
            continue

        soup = BeautifulSoup(resp.text, "html.parser")
        title = soup.title.string.strip() if soup.title and soup.title.string else ""
        meta_desc = ""
        md = soup.find("meta", attrs={"name": re.compile(r"description", re.I)})
        if md and md.get("content"):
            meta_desc = md.get("content").strip()
        else:
            og_desc = soup.find("meta", property="og:description")
            if og_desc and og_desc.get("content"):
                meta_desc = og_desc.get("content").strip()

        main_text = extract_main_text(soup)
        links = []
        for a in soup.find_all("a", href=True):
            href = a.get("href")
            nu = normalize_url(href, url)
            if not nu:
                continue
            parsed = urlparse(nu)
            if parsed.scheme not in ("http", "https"):
                continue
            # only internal links
            if parsed.netloc != base_domain:
                # still collect outbound links but don't queue them
                links.append({"url": nu, "internal": False})
                continue
            links.append({"url": nu, "internal": True})
            # optionally restrict to path prefix to avoid entire domain crawl
            if nu not in seen:
                if restrict_to_prefix:
                    # ensure starts with the same top-level path (optional)
                    if urlparse(nu).path.startswith(urlparse(start_url).path.split("/",2)[1] if "/" in urlparse(start_url).path[1:] else "/"):
                        queue.append(nu)
                        seen.add(nu)
                else:
                    queue.append(nu)
                    seen.add(nu)

        result = {
            "url": url,
            "status_code": resp.status_code,
            "title": title,
            "meta_description": meta_desc,
            "content": main_text,
            "num_words": len(main_text.split()),
            "links": links,
        }
        results.append(result)
        pages_crawled += 1
        print(f"[{pages_crawled}] {url} (words: {result['num_words']})")
        time.sleep(delay)

    # Save outputs
    os.makedirs("scrape_outputs", exist_ok=True)

    if "json" in output_formats or "all" in output_formats:
        json_path = f"scrape_outputs/{out_file}.json"
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"Saved JSON -> {json_path}")

    if "csv" in output_formats or "all" in output_formats:
        csv_path = f"scrape_outputs/{out_file}.csv"
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["url", "status_code", "title", "meta_description", "num_words", "content_snippet"])
            for r in results:
                snippet = (r["content"][:300] + "...") if len(r["content"]) > 300 else r["content"]
                writer.writerow([r["url"], r["status_code"], r["title"], r["meta_description"], r["num_words"], snippet])
        print(f"Saved CSV -> {csv_path}")

    if "xml" in output_formats or "all" in output_formats:
        xml_path = f"scrape_outputs/{out_file}.xml"
        root = ET.Element("pages")
        for r in results:
            p = ET.SubElement(root, "page")
            ET.SubElement(p, "url").text = r["url"]
            ET.SubElement(p, "status_code").text = str(r["status_code"])
            ET.SubElement(p, "title").text = r["title"] or ""
            ET.SubElement(p, "meta_description").text = r["meta_description"] or ""
            ET.SubElement(p, "num_words").text = str(r["num_words"])
            c = ET.SubElement(p, "content")
            c.text = r["content"]
        tree = ET.ElementTree(root)
        tree.write(xml_path, encoding="utf-8", xml_declaration=True)
        print(f"Saved XML -> {xml_path}")

    return results

def parse_args():
    p = argparse.ArgumentParser(description="Multipage scraper (requests + BeautifulSoup)")
    p.add_argument("start_url", help="Starting URL (e.g. https://www.example.com/some/page)")
    p.add_argument("--max-pages", type=int, default=200, help="Maximum pages to crawl")
    p.add_argument("--delay", type=float, default=DEFAULT_DELAY, help="Delay between requests in seconds")
    p.add_argument("--output", choices=["json", "csv", "xml", "all"], default="json", help="Output format(s)")
    p.add_argument("--out-file", default="output", help="Base filename (without extension) saved in scrape_outputs/")
    p.add_argument("--no-prefix-restrict", action="store_true", help="Don't restrict crawl to same path prefix (will crawl entire domain)")
    return p.parse_args()
if __name__ == "__main__":
    import sys
    
    # If running inside Jupyter/IPython, ignore unknown args
    if "ipykernel" in sys.modules:
        results = crawl(
            start_url="https://www.w3schools.com/html/html_responsive.asp",
            max_pages=10,
            delay=1.0,
            restrict_to_prefix=True,
            output_formats=("json", "csv"),
            out_file="w3_output"
        )
    else:
        args = parse_args()
        out_formats = (args.output,) if args.output != "all" else ("json", "csv", "xml")
        crawl(
            start_url=args.start_url,
            max_pages=args.max_pages,
            delay=args.delay,
            restrict_to_prefix=(not args.no_prefix_restrict),
            output_formats=out_formats,
            out_file=args.out_file,
        )


[1] https://www.w3schools.com/html/html_responsive.asp (words: 1209)
Saved JSON -> scrape_outputs/w3_output.json
Saved CSV -> scrape_outputs/w3_output.csv
