# Cell 1: Imports & configuration

In [1]:
import os
import json
import time
import requests
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import clear_output

# ── User-tunable settings ─────────────────────────────────────────────────

# Input: local CSV holding the domain list (fetch it yourself or let the
# download cell do it).
CSV_FILE = "majestic_million.csv"

# Per-stage JSON files that record crawl progress between runs.
STATE_FILES = dict(tos="state_tos.json", pp="state_pp.json")

# Crawl service endpoints, plus the API key, which is read from the
# X_API_KEY environment variable (None when the variable is unset).
TOS_ENDPOINT = "http://localhost:8080/api/v1/crawl-tos"
PP_ENDPOINT = "http://localhost:8080/api/v1/crawl-pp"
API_KEY = os.environ.get("X_API_KEY")

# Each stage stops once it has this many successful crawls.
TARGET_SUCCESSES = 100

# Network pacing.
TIMEOUT = 10  # seconds
DELAY = 0.1   # seconds between requests

# Cell 2: Download Data Set

In [2]:
# Download location of the Majestic Million list. Only used when CSV_FILE is
# not already present locally.
# NOTE(review): confirm this is still the current public URL.
CSV_URL = "https://downloads.majestic.com/majestic_million.csv"

if not os.path.exists(CSV_FILE):
    print("Downloading Majestic Million list…")
    # Stream into a temporary file and rename only after the transfer
    # completes, so an interrupted download never leaves a truncated CSV_FILE
    # that the existence check above would accept on the next run.
    tmp_file = CSV_FILE + ".part"
    with requests.get(CSV_URL, stream=True, timeout=TIMEOUT) as resp:
        resp.raise_for_status()
        with open(tmp_file, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 16):
                f.write(chunk)
    os.replace(tmp_file, CSV_FILE)
else:
    print("CSV already exists; skipping download.")

# Read the real header row and pull exactly GlobalRank + Domain, ordered by
# rank so the crawl proceeds from the top-ranked domain downwards.
df = (
    pd.read_csv(
        CSV_FILE,
        usecols=["GlobalRank", "Domain"],
        dtype={"GlobalRank": int, "Domain": str},
        low_memory=False,
    )
    .sort_values("GlobalRank")
    .reset_index(drop=True)
)

print(f"Loaded {len(df):,} domains.  First few:\n", df.head())

CSV already exists; skipping download.
Loaded 1,000,000 domains.  First few:
    GlobalRank         Domain
0           1     google.com
1           2   facebook.com
2           3    youtube.com
3           4    twitter.com
4           5  instagram.com


# Cell 3: Health check your service

In [3]:
# Probe the service before crawling: the crawl loop counts every request
# error as a failure, so a dead backend is better caught here.
health_url = "http://localhost:8080/api/test"
print("Checking service at", health_url, "…")

try:
    r = requests.get(health_url, timeout=TIMEOUT)
    r.raise_for_status()
    js = r.json()
    # Explicit check instead of a bare `assert`: an AssertionError carries no
    # message (the user would see "Health check failed: " with nothing after
    # it) and asserts are stripped when Python runs with -O.
    if js.get("status") != "ok":
        raise ValueError(f"unexpected health payload: {js!r}")
    print("✅ Service is up:", js.get("message", "<no message>"))
except Exception as e:
    # Stop the notebook here; every later cell depends on the service.
    raise SystemExit(f"❌ Health check failed: {e}") from e

Checking service at http://localhost:8080/api/test …
✅ Service is up: Server is running


# Cell 4: Define the crawl function

In [4]:
def _load_crawl_state(state_file):
    """Return the persisted crawl state, or a fresh zeroed state if none exists."""
    if os.path.exists(state_file):
        with open(state_file) as f:
            return json.load(f)
    return {"last_rank": 0, "success": 0, "fail": 0}


def _save_crawl_state(state_file, state):
    """Write the crawl state via a temp file so an interrupt cannot truncate it."""
    tmp_file = state_file + ".tmp"
    with open(tmp_file, "w") as f:
        json.dump(state, f)
    os.replace(tmp_file, state_file)


def crawl_stage(df, stage_name, endpoint, timeout=None):
    """Crawl domains in rank order until TARGET_SUCCESSES successes are reached.

    Progress (last attempted rank, success count, failure count) is written to
    ``STATE_FILES[stage_name]`` after every request, so calling this again
    resumes after the last attempted rank instead of starting over.

    Args:
        df: DataFrame with ``GlobalRank`` and ``Domain`` columns.
        stage_name: Key into ``STATE_FILES``; also used in progress labels.
        endpoint: URL that receives a POST with JSON body ``{"url": domain}``
            and an ``X-API-KEY`` header.
        timeout: Per-request timeout in seconds. Defaults to the notebook-wide
            ``TIMEOUT`` so a single hung connection cannot stall the crawl.

    Returns:
        None. Results are reported through printed output and the state file.
    """
    if timeout is None:
        timeout = TIMEOUT

    state_file = STATE_FILES[stage_name]
    state = _load_crawl_state(state_file)

    last_rank = state["last_rank"]
    succ      = state["success"]
    fail      = state["fail"]

    # display header
    clear_output(wait=True)
    print(f"[{stage_name.upper()}] Resuming at rank {last_rank}: "
          f"{succ}✅, {fail}❌")

    # Only domains ranked after the last attempted one remain to be crawled.
    to_crawl = df[df.GlobalRank > last_rank].sort_values("GlobalRank")
    pbar     = tqdm(to_crawl.itertuples(index=False), total=len(to_crawl),
                    desc=f"crawl-{stage_name}")

    try:
        for row in pbar:
            if succ >= TARGET_SUCCESSES:
                break

            rank, domain = row.GlobalRank, row.Domain
            try:
                resp = requests.post(
                    endpoint,
                    json={"url": domain},
                    headers={"X-API-KEY": API_KEY},
                    timeout=timeout,
                )
                resp.raise_for_status()
                payload = resp.json()
                # A non-dict JSON body has no "success" flag; count it as a failure.
                ok = isinstance(payload, dict) and bool(payload.get("success", False))
            except Exception:
                # Deliberate best-effort: any transport, HTTP or decoding error
                # counts as a failed crawl for this domain and the loop moves on.
                ok = False

            # update counts and state
            if ok:
                succ += 1
            else:
                fail += 1
            # int() keeps the rank JSON-serialisable even if it is a NumPy integer.
            state = {"last_rank": int(rank), "success": succ, "fail": fail}
            _save_crawl_state(state_file, state)

            # live summary
            clear_output(wait=True)
            pct = succ / (succ + fail) * 100 if (succ + fail) else 0
            print(f"[{stage_name.upper()}] Rank {rank}: {succ}✅, {fail}❌ ({pct:.2f}%)")

            time.sleep(DELAY)
    finally:
        # Close the progress bar even when the crawl is interrupted.
        pbar.close()

    print(f"🎉 [{stage_name.upper()}] Done: {succ} successes, {fail} failures.")

# Cell 5: Run both stages back-to-back, but only start the Privacy Policy (PP) stage after the ToS stage reaches TARGET_SUCCESSES

In [None]:
# — run ToS stage
crawl_stage(df, "tos", TOS_ENDPOINT)

# — check ToS state (read back from disk; the context manager closes the
#   file handle instead of leaving it to the garbage collector)
with open(STATE_FILES["tos"]) as f:
    tos_state = json.load(f)

# The Privacy Policy stage only runs once the ToS stage has hit its target.
if tos_state["success"] >= TARGET_SUCCESSES:
    crawl_stage(df, "pp", PP_ENDPOINT)
else:
    print(
      f"ℹ️  Skipping Privacy Policy crawl: "
      f"only {tos_state['success']} successes on ToS (target is {TARGET_SUCCESSES})"
    )

[PP] Rank 240: 99✅, 141❌ (41.25%)
