# Site Crawler
This notebook builds a site crawler that visits every page of a site and saves each page as a Markdown file.

If LangChain can be used for this — including fetching and extracting the content — we will use it. In short: an agentic web crawler.

In [2]:
# crawl_to_md.py
# Minimal, robust website -> Markdown crawler
# Usage:
#   python crawl_to_md.py https://example.com --out out_md --max-pages 1000 --delay 0.5
#   pip install -r requirements.txt

import argparse, os, re, time, hashlib, queue, sys, datetime
from urllib.parse import urljoin, urlparse, urldefrag
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import urllib.robotparser as robotparser
from xml.etree import ElementTree as ET

In [None]:
# ---------- Helpers ----------
def norm_url(seed_netloc, url):
    """Return a canonical form of *url* if it belongs to the crawled host.

    Normalisation steps:
      * the fragment is dropped;
      * only ``http`` and ``https`` URLs are accepted;
      * the host is compared case-insensitively and emitted in lower case;
      * the scheme's default port (``:80`` / ``:443``) is stripped;
      * runs of ``/`` in the path are collapsed, an empty path becomes ``/``;
      * query parameters starting with ``utm_``, ``gclid`` or ``fbclid`` are
        removed and the remaining ones are sorted.

    Args:
        seed_netloc: ``netloc`` of the seed URL (host, optionally with port).
        url: Absolute URL to normalise.

    Returns:
        The normalised URL string, or ``None`` when the URL is empty,
        relative, malformed, not http(s), or on a different host.
    """
    try:
        url, _ = urldefrag(url.strip())
        if not url:
            return None
        u = urlparse(url)
    except ValueError:
        # urllib raises ValueError for e.g. unbalanced IPv6 brackets.
        return None

    scheme = u.scheme.lower()
    if scheme not in ("http", "https"):
        return None

    default_port = ":80" if scheme == "http" else ":443"

    def canonical_netloc(netloc):
        # Lower-case only the host[:port] part; userinfo is case-sensitive.
        userinfo, at, hostport = netloc.rpartition("@")
        hostport = hostport.lower()
        if hostport.endswith(default_port):
            hostport = hostport[: -len(default_port)]
        return userinfo + at + hostport

    netloc = canonical_netloc(u.netloc)
    if netloc != canonical_netloc(seed_netloc):
        return None

    path = re.sub(r"/+", "/", u.path or "/")
    # Drop common tracking parameters; sort the rest so that parameter order
    # does not produce distinct URLs for the same page.
    kept = sorted(
        p
        for p in (u.query or "").split("&")
        if p and not p.lower().startswith(("utm_", "gclid", "fbclid"))
    )
    q = "&".join(kept)
    return f"{scheme}://{netloc}{path}" + (f"?{q}" if q else "")

def is_probably_html(resp):
    """Guess whether *resp* carries an HTML (or XHTML) document.

    The ``Content-Type`` header decides when it is present and non-empty.
    Only when it is missing or empty is the body inspected, and then the
    response counts as HTML if its text starts with ``<``.
    """
    content_type = resp.headers.get("Content-Type")
    content_type = content_type.lower() if content_type else ""

    if not content_type:
        # No usable header: fall back to sniffing the first character.
        return resp.text.startswith("<")

    return any(
        marker in content_type
        for marker in ("text/html", "application/xhtml")
    )

def slugify_path(url, max_slug_len=150):
    """Build a filesystem-safe Markdown filename for *url*.

    The URL path is turned into a slug (``/a/b/`` -> ``a-b``; an empty path
    becomes ``index``) with every character outside ``[a-zA-Z0-9-_/]``
    replaced by ``-``.  A 10-character SHA-1 prefix of the path and query is
    always appended, so URLs that differ only in their query string, or only
    beyond the truncated part of a long path, still get distinct names.

    Args:
        url: URL of the page being saved.
        max_slug_len: Maximum number of slug characters to keep.  Very long
            URL paths would otherwise exceed common filename length limits
            and make the write fail.  Pass ``None`` to disable truncation.

    Returns:
        A filename of the form ``<slug>-<hash>.md``.
    """
    u = urlparse(url)
    path = u.path.strip("/") or "index"
    path = re.sub(r"[^a-zA-Z0-9\-/_]", "-", path)
    path = re.sub(r"/+", "/", path)
    slug = path.replace("/", "-")[:max_slug_len]
    # The hash covers the untruncated path plus query, keeping names unique.
    h = hashlib.sha1((u.path + "?" + (u.query or "")).encode("utf-8")).hexdigest()[:10]
    return f"{slug}-{h}.md"

def front_matter(title, url):
    """Return a YAML front-matter block for a saved page.

    The block records the page title, its source URL and the UTC crawl time
    (ISO 8601 with a trailing ``Z``).  Title and URL are written as
    double-quoted YAML scalars, so backslashes and double quotes are escaped;
    whitespace runs in the title (including newlines, which are common inside
    ``<title>`` elements) are collapsed to single spaces so the value stays
    on one line.

    Args:
        title: Page title.
        url: URL the page was fetched from.

    Returns:
        The front-matter text, terminated by a blank line.
    """
    def yaml_quote(text):
        # Backslashes first, otherwise the ones added for quotes get doubled.
        return '"' + text.replace("\\", "\\\\").replace('"', '\\"') + '"'

    one_line_title = " ".join(str(title).split())
    # Timezone-aware "now" (utcnow() is deprecated); tzinfo is dropped again
    # so the rendered value keeps the "...Z" form rather than "+00:00".
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    ts = now_utc.isoformat() + "Z"
    return (
        "---\n"
        f"title: {yaml_quote(one_line_title)}\n"
        f"source: {yaml_quote(str(url))}\n"
        f"crawled_at: \"{ts}\"\n"
        "---\n\n"
    )

def extract_title(soup):
    """Pick a title for the parsed page *soup*.

    Preference order: the text of ``<title>`` (whitespace-stripped), then the
    text of the first ``<h1>``, then the literal ``"Untitled"``.
    """
    title_tag = soup.title
    if title_tag:
        title_text = title_tag.string
        if title_text:
            return title_text.strip()

    heading = soup.find("h1")
    if not heading:
        return "Untitled"
    return heading.get_text(strip=True)

def discover_links(base_url, soup, seed_netloc):
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        absu = urljoin(base_url, href)
        n = norm_url(seed_netloc, absu)
        if n:
            links.add(n)
    return links

def parse_sitemaps(seed, session):
    """Gather page URLs from the sitemaps advertised in the site's robots.txt.

    Fetches ``/robots.txt`` relative to *seed*, and for every line starting
    with ``Sitemap:`` (case-insensitive) hands the referenced URL to
    ``parse_sitemap_xml``.  This is best-effort: any error stops the scan and
    whatever was collected so far is returned.

    Args:
        seed: Seed URL of the crawl.
        session: HTTP session exposing ``get(url, timeout=...)``.

    Returns:
        A set of URLs found in the sitemaps (possibly empty).
    """
    found = set()
    try:
        robots_resp = session.get(urljoin(seed, "/robots.txt"), timeout=10)
        if robots_resp.status_code != 200:
            return found
        for raw_line in robots_resp.text.splitlines():
            if not raw_line.lower().startswith("sitemap:"):
                continue
            # Split on the first colon only; the sitemap URL contains colons too.
            _, _, sitemap_url = raw_line.partition(":")
            found |= parse_sitemap_xml(sitemap_url.strip(), session)
    except Exception:
        pass
    return found

def parse_sitemap_xml(url, session, _seen=None):
    """Return the page URLs listed in the sitemap at *url*.

    Handles both ``<urlset>`` documents (page URLs are collected) and
    ``<sitemapindex>`` documents (each referenced sitemap is parsed
    recursively).  Each sitemap URL is fetched at most once per top-level
    call, so an index that references itself, or two indexes that reference
    each other, cannot cause unbounded recursion and repeated requests.

    This is best-effort: a failed request, a non-200 status or unparsable XML
    yields whatever was collected up to that point.

    Args:
        url: URL of the sitemap document.
        session: HTTP session exposing ``get(url, timeout=...)``.
        _seen: Internal set of sitemap URLs already fetched; callers should
            leave this unset.

    Returns:
        A set of ``<loc>`` values for pages (possibly empty).
    """
    if _seen is None:
        _seen = set()
    out = set()
    if url in _seen:
        return out
    _seen.add(url)

    try:
        r = session.get(url, timeout=15)
        if r.status_code != 200:
            return out
        # NOTE(review): sitemap XML is untrusted input.  The Python docs warn
        # that xml.etree is not hardened against maliciously constructed
        # data; consider a hardened parser if crawling hostile sites.
        tree = ET.fromstring(r.content)
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        # urlset entries: actual page URLs
        for loc in tree.findall(".//sm:url/sm:loc", ns):
            if loc.text:
                out.add(loc.text.strip())
        # sitemapindex entries: nested sitemaps
        for sm in tree.findall(".//sm:sitemap/sm:loc", ns):
            if sm.text:
                out |= parse_sitemap_xml(sm.text.strip(), session, _seen)
    except Exception:
        # Deliberately best-effort: sitemaps are only an optional hint.
        pass
    return out

def allowed_by_robots(seed, ua):
    """Fetch the site's robots.txt and return a parser for it.

    The file is requested with the crawler's own User-Agent (*ua*) and a
    timeout, rather than through ``RobotFileParser.read()``, which sends
    urllib's default User-Agent and can block indefinitely.  HTTP error
    statuses are mapped the same way the standard library's ``read()`` does:
    401/403 disallow everything, other 4xx statuses allow everything.

    Any other failure (network error, 5xx, unsupported URL) leaves the parser
    without rules; per the standard library's ``can_fetch`` such a parser
    refuses every URL.

    Args:
        seed: Seed URL; robots.txt is resolved relative to it.
        ua: User-Agent string to send with the request.

    Returns:
        A ``RobotFileParser``; query it with ``can_fetch(user_agent, url)``.
    """
    import urllib.error
    import urllib.request

    rp = robotparser.RobotFileParser()
    try:
        robots_url = urljoin(seed, "/robots.txt")
        rp.set_url(robots_url)
        req = urllib.request.Request(robots_url, headers={"User-Agent": ua})
        with urllib.request.urlopen(req, timeout=15) as fh:
            raw = fh.read()
    except urllib.error.HTTPError as err:
        if err.code in (401, 403):
            rp.disallow_all = True
        elif 400 <= err.code < 500:
            rp.allow_all = True
        err.close()
    except Exception:
        # Best-effort: the rule-less parser returned below denies all URLs.
        pass
    else:
        rp.parse(raw.decode("utf-8", errors="replace").splitlines())
    return rp

def backoff_sleep(attempt, base_delay):
    """Sleep for ``base_delay * 2**attempt`` seconds, but never more than 10."""
    wait_seconds = base_delay * (2 ** attempt)
    if wait_seconds > 10.0:
        wait_seconds = 10.0
    time.sleep(wait_seconds)

In [None]:
# ---------- Main crawl ----------
def _fetch_html(session, url, timeout, retries=4):
    """GET *url* with retries; return the response only for a 200 HTML page.

    Connection errors and 429/5xx statuses are retried with exponential
    backoff.  Any other outcome (non-200 status, non-HTML body, retries
    exhausted) yields ``None``.
    """
    for attempt in range(retries):
        try:
            resp = session.get(url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            backoff_sleep(attempt, 1.0)
            continue
        if resp.status_code in (429, 500, 502, 503, 504):
            backoff_sleep(attempt, 1.0)
            continue
        if resp.status_code == 200 and is_probably_html(resp):
            return resp
        return None
    return None


def _soup_to_markdown(soup):
    """Remove non-content elements from *soup* in place and render it as Markdown."""
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    # Lightweight heuristic: drop navigation and footer chrome.
    for tag in soup.find_all(attrs={"role": "navigation"}):
        tag.decompose()
    for tag in soup.find_all(["nav", "footer"]):
        tag.decompose()
    body_html = str(soup.body or soup)
    # Images are skipped by default.
    return md(body_html, heading_style="ATX", strip=["img"])


def crawl(seed, out_dir, max_pages, delay, timeout, user_agent, include_sitemap):
    """Crawl one site breadth-first and save each HTML page as Markdown.

    Starting from *seed* (and, optionally, the URLs listed in the site's
    sitemaps), pages on the seed's host are fetched one at a time, their
    links are queued, and each page is written to *out_dir* as a Markdown
    file with YAML front matter.  A summary line is printed at the end.

    Pages are skipped when robots.txt disallows them, when the response is
    not a 200 HTML document, when a redirect leads off the seed's host, or
    when the final (post-redirect) URL has already been saved.  Off-host
    redirects and unexpected per-page errors are reported on stderr.

    Args:
        seed: URL to start from; its host defines the crawl scope.
        out_dir: Directory for the Markdown files (created if missing).
        max_pages: Stop after this many pages have been saved.
        delay: Seconds to wait before each request.
        timeout: Per-request timeout in seconds.
        user_agent: User-Agent header value, also used for robots.txt checks.
        include_sitemap: If true, pre-seed the queue from the site's sitemaps.

    Returns:
        None.
    """
    from collections import deque

    os.makedirs(out_dir, exist_ok=True)
    seed = seed.rstrip("/")
    seed_netloc = urlparse(seed).netloc
    robots = allowed_by_robots(seed, user_agent)

    pending = deque([seed])   # FIFO frontier
    enqueued = {seed}         # everything ever queued, to avoid re-queuing
    visited = set()           # URLs already taken off the frontier
    saved_urls = set()        # final URLs already written to disk
    saved = 0

    # The context manager closes the session's connections when done.
    with requests.Session() as session:
        session.headers.update({"User-Agent": user_agent})

        # optional: pre-seed with sitemap URLs
        if include_sitemap:
            for u in parse_sitemaps(seed, session):
                n = norm_url(seed_netloc, u)
                if n and n not in enqueued:
                    pending.append(n)
                    enqueued.add(n)

        while pending and saved < max_pages:
            url = pending.popleft()
            if url in visited:
                continue
            visited.add(url)

            if robots is not None and not robots.can_fetch(user_agent, url):
                continue

            # polite delay
            time.sleep(delay)

            try:
                resp = _fetch_html(session, url, timeout)
                if resp is None:
                    continue

                final_url = resp.url
                # A redirect may have left the site being crawled.
                if urlparse(final_url).netloc.lower() != seed_netloc.lower():
                    print(f"Skipping off-site redirect: {url} -> {final_url}", file=sys.stderr)
                    continue
                # Several queued URLs can redirect to one page; save it once.
                if final_url in saved_urls:
                    continue

                soup = BeautifulSoup(resp.text, "html.parser")

                # Extract links and the title before the tree is pruned, so
                # links inside nav/footer elements are still followed.
                for link in discover_links(final_url, soup, seed_netloc):
                    if link not in enqueued:
                        pending.append(link)
                        enqueued.add(link)
                title = extract_title(soup)
                markdown = _soup_to_markdown(soup)

                path = os.path.join(out_dir, slugify_path(final_url))
                with open(path, "w", encoding="utf-8") as f:
                    f.write(front_matter(title, final_url))
                    f.write(markdown.strip() + "\n")
                saved_urls.add(final_url)
                saved += 1
            except Exception as exc:
                # Skip malformed pages, but say so instead of failing silently.
                print(f"Skipping {url}: {exc!r}", file=sys.stderr)

    print(f"Done. Saved {saved} markdown files to {out_dir}. Visited {len(visited)} URLs.")

In [None]:
# if __name__ == "__main__":
#     ap = argparse.ArgumentParser(description="Crawl a website and save pages as Markdown.")
#     ap.add_argument("seed", help="Seed URL (e.g., https://example.com)")
#     ap.add_argument("--out", dest="out_dir", default="site_md", help="Output directory")
#     ap.add_argument("--max-pages", type=int, default=1000, help="Maximum pages to save")
#     ap.add_argument("--delay", type=float, default=0.5, help="Politeness delay between requests (seconds)")
#     ap.add_argument("--timeout", type=int, default=15, help="HTTP request timeout (seconds)")
#     ap.add_argument("--ua", dest="user_agent", default="SiteToMarkdownBot/1.0 (+https://example.com)", help="User-Agent string")
#     ap.add_argument("--no-sitemap", action="store_true", help="Do not pre-seed URLs from sitemap")
#     args = ap.parse_args()

#     crawl(
#         seed=args.seed,
#         out_dir=args.out_dir,
#         max_pages=args.max_pages,
#         delay=args.delay,
#         timeout=args.timeout,
#         user_agent=args.user_agent,
#         include_sitemap=(not args.no_sitemap),
#     )

In [None]:
# Run the crawler against the help site at help.itglue.kaseya.com,
# starting from its home page and writing Markdown files to ./out_md.
seed_url = "https://help.itglue.kaseya.com/help/Content/0-HOME/Home.htm"
out_dir = "out_md"

run_settings = {
    "seed": seed_url,
    "out_dir": out_dir,
    "max_pages": 100,               # stop after this many saved pages
    "delay": 0.5,                   # seconds between requests
    "timeout": 15,                  # per-request timeout in seconds
    "user_agent": "MyCrawlerBot/1.0",
    "include_sitemap": True,        # pre-seed the queue from sitemaps
}
crawl(**run_settings)