# JS-Aware Web Summarizer (Selenium + OpenAI)
Short, fast summaries from static or JS pages. Uses headless Chrome when needed.

**Notes**
- Keep `OPENAI_API_KEY` in env.
- Uses `requests` first; falls back to Selenium if the page seems JS-heavy or `--force-js` is passed.
- Caches raw HTML (reused on later runs) and saves each summary as JSON in `cache/`.
- CLI included.

In [None]:
# Minimal imports (install first run)
# !pip install requests selenium webdriver-manager beautifulsoup4 lxml python-dotenv readability-lxml tiktoken --quiet
import os, re, time, json, hashlib, pathlib, html
from dataclasses import dataclass
from typing import Optional, Tuple
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from readability import Document
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# ---- Config (edit as needed) ----
# Directory holding cached raw HTML and summary JSON; created on import if missing.
CACHE_DIR = pathlib.Path("cache")
CACHE_DIR.mkdir(exist_ok=True)
# Seconds: applied to the HTTP request and the Selenium page load (the CSS wait caps it at 15).
TIMEOUT = 20
# Passed to the API as the output-token limit; tighten for cheaper runs.
MAX_TOKENS = 1200
# Model name, overridable through the OPENAI_MODEL environment variable.
MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
# Sent with plain HTTP requests and handed to headless Chrome.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"

In [None]:
def _key(s: str) -> str:
    """Return a short cache key: the first 16 hex characters of SHA-256 of ``s``."""
    digest = hashlib.sha256(s.encode())
    return digest.hexdigest()[:16]

def cache_write(name: str, data: bytes):
    """Write ``data`` to ``CACHE_DIR / name`` and return the path that was written."""
    target = CACHE_DIR / name
    target.write_bytes(data)
    return target

def cache_read(name: str) -> Optional[bytes]:
    """Return the bytes stored at ``CACHE_DIR / name``, or ``None`` if no such entry exists."""
    target = CACHE_DIR / name
    if not target.exists():
        return None
    return target.read_bytes()

def strip_noise(html_text: str) -> str:
    """Return the visible text of ``html_text`` as one space-separated string.

    ``<script>``, ``<style>`` and ``<noscript>`` elements are removed before
    the text is collected.
    """
    parsed = BeautifulSoup(html_text, "lxml")
    for node in parsed.find_all(["script", "style", "noscript"]):
        node.extract()
    return parsed.get_text(separator=" ", strip=True)

def looks_js_heavy(html_text: str) -> bool:
    """Heuristically decide whether a page needs a real browser to render.

    True when the markup has more than 20 ``<script`` tags, or when fewer
    than 400 characters of visible text remain after stripping noise.
    """
    script_tags = re.findall(r'<script[\s>]', html_text, flags=re.I)
    if len(script_tags) > 20:
        return True
    # Only parse the document when the cheap script count was inconclusive.
    visible_text = strip_noise(html_text)
    return len(visible_text) < 400

def short(txt: str, n=9000) -> str:
    """Truncate ``txt`` to ``n`` characters, appending "…" only when something was cut."""
    if len(txt) <= n:
        return txt
    return txt[:n] + "…"

In [None]:
@dataclass
class FetchResult:
    """Outcome of fetching one page, whichever fetch path produced it."""

    # Address that was requested.
    url: str
    # How the HTML was obtained: "requests", "selenium" or "cache".
    method: str
    # Raw page markup.
    html: str
    # Visible text derived from ``html`` via ``strip_noise``.
    text: str
    # Wall-clock seconds spent fetching (0.0 for cache hits).
    elapsed: float

def fetch_static(url: str) -> FetchResult:
    """Fetch ``url`` with a plain HTTP GET and return the markup plus its visible text.

    Sends the configured User-Agent and applies ``TIMEOUT``. An HTTP error
    status raises via ``raise_for_status``.
    """
    started = time.time()
    response = requests.get(url, timeout=TIMEOUT, headers={"User-Agent": USER_AGENT})
    response.raise_for_status()
    markup = response.text
    # Stop the clock before text extraction so only the download is timed.
    took = time.time() - started
    return FetchResult(
        url=url,
        method="requests",
        html=markup,
        text=strip_noise(markup),
        elapsed=took,
    )

def fetch_selenium(url: str, wait_css: Optional[str]=None) -> FetchResult:
    """Render ``url`` in headless Chrome and return the resulting page source.

    Args:
        url: Page to load.
        wait_css: Optional CSS selector; when given, wait (up to
            ``min(TIMEOUT, 15)`` seconds) for a matching element to be
            present before reading the page source.

    Returns:
        A ``FetchResult`` with ``method="selenium"``. ``elapsed`` covers
        driver start-up, page load and the optional wait.

    Raises:
        Any exception raised while starting the driver, loading the page or
        waiting for ``wait_css`` propagates. Once the browser has started it
        is always quit.
    """
    # Function-scope import keeps the new dependency next to its only use.
    from selenium.webdriver.chrome.service import Service

    t0 = time.time()
    opts = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        f"--user-agent={USER_AGENT}",
    ):
        opts.add_argument(flag)
    # Selenium 4 takes the driver binary through a Service object. Passing the
    # path positionally binds it to `options` and raises TypeError on 4.10+.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)
    try:
        driver.set_page_load_timeout(TIMEOUT)
        driver.get(url)
        if wait_css:
            locator = (By.CSS_SELECTOR, wait_css)
            WebDriverWait(driver, min(TIMEOUT, 15)).until(
                EC.presence_of_element_located(locator)
            )
        html_text = driver.page_source
    finally:
        driver.quit()
    elapsed = time.time() - t0
    return FetchResult(url, "selenium", html_text, strip_noise(html_text), elapsed)

def smart_fetch(url: str, force_js=False, wait_css: Optional[str]=None) -> FetchResult:
    """Fetch ``url`` by the cheapest method that yields usable content.

    Order of attempts:
      1. A cached copy of the raw HTML, if one exists.
      2. A plain HTTP fetch (skipped when ``force_js`` is true), accepted if
         the page does not look JS-heavy.
      3. A headless-Chrome fetch.
      4. If Chrome fails, whatever the plain fetch produced; when
         ``force_js`` skipped the plain fetch, it is attempted now.

    Args:
        url: Page to fetch.
        force_js: Go straight to Selenium, skipping the plain fetch.
        wait_css: Optional CSS selector forwarded to ``fetch_selenium``.

    Returns:
        The ``FetchResult`` of the first attempt that succeeded. Its HTML is
        written to the cache before returning (except for cache hits).

    Raises:
        The Selenium error when both fetch methods fail without ``force_js``;
        the plain-fetch error when both fail with ``force_js``.
    """
    # cache first
    ck = f"raw_{_key(url)}.html"
    cached = cache_read(ck)
    if cached:
        html_text = cached.decode("utf-8", errors="ignore")
        return FetchResult(url, "cache", html_text, strip_noise(html_text), 0.0)

    # Track the static result so a later Selenium failure can fall back to
    # it instead of re-running the fetch that just failed.
    static: Optional[FetchResult] = None
    if not force_js:
        try:
            static = fetch_static(url)
        except Exception:
            # Best effort: a failed plain fetch just means we try the browser.
            static = None
        if static is not None and not looks_js_heavy(static.html):
            cache_write(ck, static.html.encode("utf-8", errors="ignore"))
            return static

    try:
        result = fetch_selenium(url, wait_css=wait_css)
    except Exception:
        if static is not None:
            # Static worked but looked JS-heavy; thin content beats none.
            result = static
        elif force_js:
            # Static was skipped, so it is still worth one attempt.
            result = fetch_static(url)
        else:
            # Static already failed too; surface the Selenium error.
            raise
    cache_write(ck, result.html.encode("utf-8", errors="ignore"))
    return result

In [None]:
def extract_main_content(html_text: str, url: str) -> Tuple[str, str]:
    """Return ``(title, main_text)`` extracted from ``html_text`` with readability.

    If readability fails for any reason, the title is empty and the text is
    the whole page with noise stripped. ``url`` is accepted but not used.
    """
    try:
        article = Document(html_text)
        heading = article.short_title()
        body_text = strip_noise(article.summary(html_partial=True))
    except Exception:
        # Deliberate best effort: fall back to the unfiltered page text.
        return "", strip_noise(html_text)
    return heading, body_text

In [None]:
import tiktoken

def approx_tokens(s: str, model: str="gpt-4o-mini") -> int:
    """Estimate how many tokens ``s`` occupies for ``model``.

    Uses the encoding tiktoken registers for ``model``, falling back to
    ``cl100k_base`` when tiktoken does not know the model. If tiktoken cannot
    be used at all, the estimate is ``len(s) // 4``.

    Args:
        s: Text to measure.
        model: Model name used to select the tokenizer.

    Returns:
        The token count, or the character-based estimate on failure.
    """
    try:
        try:
            enc = tiktoken.encoding_for_model(model)
        except KeyError:
            # Unknown model name: keep the previous default encoding.
            enc = tiktoken.get_encoding("cl100k_base")
        # Scraped text may contain special-token strings such as
        # "<|endoftext|>"; count them as ordinary text instead of raising.
        return len(enc.encode(s, disallowed_special=()))
    except Exception:
        # Best effort: roughly four characters per token.
        return len(s) // 4

def _extract_output_text(data) -> Optional[str]:
    """Return the text of a Responses API payload, or ``None`` if none is found."""
    try:
        parts = [
            part["text"]
            for item in data["output"]
            if item.get("type") == "message"
            for part in item.get("content", [])
            if part.get("type") == "output_text"
        ]
        if parts:
            return "".join(parts)
        # Fall back to the fixed position the payload was originally read from.
        return data["output"][0]["content"][0]["text"]
    except (KeyError, IndexError, TypeError, AttributeError):
        return None


def summarize(text: str, url: str, model: str = MODEL, max_tokens: int = MAX_TOKENS,
              timeout: float = 60.0) -> str:
    """Ask the OpenAI Responses API for a short plain-text summary of ``text``.

    Args:
        text: Page content to summarize.
        url: Source address, included in the prompt.
        model: Model name sent in the request.
        max_tokens: Sent as ``max_output_tokens``.
        timeout: Socket timeout in seconds for the HTTPS connection.

    Returns:
        The summary text. If the response holds no recognisable text (for
        example an error payload), the whole response is returned as
        indented JSON.

    Raises:
        RuntimeError: ``OPENAI_API_KEY`` is not set.
    """
    import http.client
    import json
    import os

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("Set OPENAI_API_KEY in env")
    system = "Be concise. 6 bullets max. Include 1 line TL;DR. Use plain text."
    user = f"Source: {url}\n\nContent:\n{text}"
    # Keep json.dumps' default ASCII escaping: http.client encodes a str body
    # as ISO-8859-1, which would fail on raw non-Latin-1 characters.
    body = json.dumps({
        "model": model,
        "input": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "max_output_tokens": max_tokens,
    })
    conn = http.client.HTTPSConnection("api.openai.com", timeout=timeout)
    try:
        conn.request("POST", "/v1/responses", body=body, headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        })
        resp = conn.getresponse()
        data = json.loads(resp.read())
    finally:
        # Release the socket even when the request or JSON parsing fails.
        conn.close()
    # The message is not guaranteed to be the first output item, so search for it.
    extracted = _extract_output_text(data)
    if extracted is not None:
        return extracted
    return json.dumps(data, indent=2)

In [None]:
def summarize_url(url: str, force_js=False, wait_css=None, max_chars=18000) -> dict:
    """Fetch ``url``, summarize its main content and save the result as JSON.

    The extracted text is cut to ``max_chars`` characters before it is sent
    for summarization. The returned dict (keys ``url``, ``title``,
    ``method``, ``elapsed_sec``, ``summary``) is also written to the cache
    directory as ``sum_<key>.json``.
    """
    page = smart_fetch(url, force_js=force_js, wait_css=wait_css)
    title, body = extract_main_content(page.html, url)
    summary = summarize(short(body, n=max_chars), url)
    result = {
        "url": url,
        "title": title or "(no title)",
        "method": page.method,
        "elapsed_sec": round(page.elapsed, 2),
        "summary": summary.strip(),
    }
    # Persist the summary next to the raw HTML.
    payload = json.dumps(result, ensure_ascii=False, indent=2).encode()
    cache_write(f"sum_{_key(url)}.json", payload)
    return result

In [None]:
def _cli(argv=None):
    """Command-line entry point: summarize one URL and report where it was saved.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    import argparse

    p = argparse.ArgumentParser(description="Short JS-aware web summarizer")
    p.add_argument("url", help="Page to summarize")
    p.add_argument("--force-js", action="store_true", help="Force Selenium")
    p.add_argument("--wait-css", default=None, help="CSS selector to wait for")
    p.add_argument("--print", action="store_true", help="Print summary to stdout")
    args = p.parse_args(argv)
    out = summarize_url(args.url, force_js=args.force_js, wait_css=args.wait_css)
    # Same file name summarize_url used when it saved the summary.
    path = CACHE_DIR / f"sum_{_key(args.url)}.json"
    print(f"Saved: {path}")
    if args.print:
        print("\n== Summary ==\n", out["summary"])

if __name__ == "__main__":
    # Example (uncomment to try):
    # print(summarize_url("https://example.com", force_js=False))
    import sys

    # Notebook cells also run with __name__ == "__main__", but there sys.argv
    # holds the kernel's arguments, so only start the CLI outside a kernel.
    if "ipykernel" not in sys.modules:
        _cli()

**Run**
```bash
# 1) Create venv; install deps
pip install -r requirements.txt
# or:
pip install requests selenium webdriver-manager beautifulsoup4 lxml readability-lxml tiktoken python-dotenv

# 2) Env
export OPENAI_API_KEY=***

# 3) Notebook
# Run cells; or use CLI from a .py export.

# 4) CLI (optional)
python js_summarizer.py "https://example.com" --force-js --print
```

**Make it yours**
- Add logo + short demo GIF.
- Include `samples/` with 2 static and 2 JS sites.
- Keep summaries <= 6 bullets; one TL;DR.