# Cell 1: Imports & configuration

In [1]:
import os, json, time, requests, pandas as pd
from tqdm.notebook import tqdm
from IPython.display import clear_output

# --- User configuration ---

# Local cache of the ranked domain list, and the URL it is downloaded from.
OUTPUT_CSV = "top-1m.csv"
TRNCO_URL = "https://tranco-list.eu/download_daily/KJNQW"

# Per-stage JSON files holding crawl progress (last rank, success/fail counts).
STATE_FILES = {
    "tos": "state_tos.json",
    "pp": "state_pp.json",
}

# Backend endpoints: one crawl endpoint per stage, plus a health probe.
TOS_ENDPOINT = "http://localhost:8080/api/v1/crawl-tos"
PP_ENDPOINT = "http://localhost:8080/api/v1/crawl-pp"
HEALTH_URL = "http://localhost:8080/api/test"

# API key is read from the environment; it is None when X_API_KEY is unset.
API_KEY = os.getenv("X_API_KEY")

# Stop a stage once this many successful crawls have been recorded.
TARGET_SUCCESSES = 12_500
# Seconds to sleep after each processed result.
DELAY = 0.1

# Cell 2: Download & unzip the Tranco list (skipped if top-1m.csv already exists), save it as top-1m.csv, then peek at the top 5

In [2]:
# Cell 2: Download & prepare Tranco list into OUTPUT_CSV, skip if exists
#
# Produces `df` with two columns: Rank (int) and Domain (str), sorted by Rank.
# If OUTPUT_CSV is already on disk it is loaded as-is; otherwise the list is
# downloaded from TRNCO_URL (zip archive or raw CSV), sorted, and cached.
if os.path.exists(OUTPUT_CSV):
    print(f"{OUTPUT_CSV} already exists; loading it.")
    df = pd.read_csv(
        OUTPUT_CSV,
        dtype={"Rank": int, "Domain": str},
        low_memory=False
    )
else:
    import io, zipfile

    print("Downloading Tranco list…")
    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    resp = requests.get(TRNCO_URL, timeout=60)
    resp.raise_for_status()
    data = resp.content

    # The downloaded file has no header row; both parse paths share these options.
    _raw_read_opts = dict(
        header=None,
        names=["Rank", "Domain"],
        usecols=[0, 1],
        dtype={"Rank": int, "Domain": str},
        low_memory=False,
    )

    # try to unzip first CSV, else treat as raw CSV
    try:
        with zipfile.ZipFile(io.BytesIO(data)) as z:
            csv_name = next(
                (n for n in z.namelist() if n.lower().endswith(".csv")),
                None,
            )
            if csv_name is None:
                # Fail with a readable message instead of a bare StopIteration.
                raise RuntimeError(
                    f"Downloaded archive from {TRNCO_URL} contains no .csv file"
                )
            with z.open(csv_name) as src:
                df = pd.read_csv(src, **_raw_read_opts)
    except zipfile.BadZipFile:
        df = pd.read_csv(io.BytesIO(data), **_raw_read_opts)

    # sort and persist
    df = df.sort_values("Rank").reset_index(drop=True)
    # Write to a temporary file and rename, so an interrupted write never leaves
    # a truncated OUTPUT_CSV that a later run would treat as a valid cache.
    _partial_csv = OUTPUT_CSV + ".part"
    df.to_csv(_partial_csv, index=False)
    os.replace(_partial_csv, OUTPUT_CSV)
    print(f"Saved {len(df):,} rows to {OUTPUT_CSV}")

# peek at top-5
print("\nFirst 5 entries:")
print(df.head(5).to_string(index=False))

top-1m.csv already exists; loading it.

First 5 entries:
 Rank           Domain
    1       google.com
    2    microsoft.com
    3          mail.ru
    4     facebook.com
    5 root-servers.net


# Cell 3: Health check your service

In [3]:
# Cell 3: Health check your backend
#
# GETs HEALTH_URL and reports the "status" field of its JSON body. Any failure
# (connection error, timeout, non-2xx response, non-JSON body) is re-raised as a
# RuntimeError so the notebook stops before the crawl starts.
print("Checking service at", HEALTH_URL, "…")
try:
    # A timeout keeps an unresponsive backend from blocking the cell forever.
    r = requests.get(HEALTH_URL, timeout=10)
    r.raise_for_status()
    status = r.json().get("status", "no status field")
except Exception as e:
    # Build one message string (two positional args would render as a tuple)
    # and chain the original exception so its traceback is kept.
    raise RuntimeError(f"❌ Health check failed: {e}") from e
print("✅ Service is up:", status)

Checking service at http://localhost:8080/api/test …
✅ Service is up: ok


# Cell 4: Define the crawl function (shows both TOS & PP progress)

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import clear_output

def crawl_stage(df, stage_name, endpoint, max_workers=10, request_timeout=120):
    """
    Crawl one stage by POSTing each pending domain to `endpoint`.

    Domains are submitted in batches of `max_workers` concurrent requests.
    Progress (last rank processed, success count, failure count) is loaded from
    STATE_FILES[stage_name] at start and written back after every completed
    batch, so an interrupted run resumes at the first unprocessed batch.
    Submission stops once TARGET_SUCCESSES successes are recorded; the check is
    made before each submission, so a batch already in flight is still counted
    and the final count can exceed the target by less than one batch.

    After each result, the cell output is cleared and two progress lines are
    printed: the opposite stage's snapshot (read once at start) followed by
    this stage's live counters, marked with a trailing "*".

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide "Rank" and "Domain" columns. Resuming assumes rows are in
        ascending Rank order: every row with Rank <= the saved last_rank is skipped.
    stage_name : str
        "tos" or "pp"; selects the state file and the progress labels.
    endpoint : str
        URL that receives a JSON body {"url": <domain>} with an X-API-Key header.
    max_workers : int, optional
        Thread-pool size and batch size (default 10).
    request_timeout : float, optional
        Seconds passed to requests.post as `timeout` (default 120); a request
        that times out is counted as a failure.

    Returns
    -------
    None
    """
    state_file = STATE_FILES[stage_name]
    opposite_stage = "pp" if stage_name == "tos" else "tos"
    own_label = "[PP] " if stage_name == "pp" else "[ToS]"
    opp_label = "[ToS]" if stage_name == "pp" else "[PP] "

    def load_state(path):
        # Read a state file, closing the handle; a missing file yields {}.
        if not os.path.exists(path):
            return {}
        with open(path) as fh:
            return json.load(fh)

    state = load_state(state_file)
    last_rank = state.get("last_rank", 0)
    succ = state.get("success", 0)
    fail = state.get("fail", 0)

    opp = load_state(STATE_FILES[opposite_stage])
    opp_last = opp.get("last_rank", 0)
    opp_succ = opp.get("success", 0)
    opp_fail = opp.get("fail", 0)

    def post_url(domain, rank):
        # One crawl request; any request/HTTP/parse error counts as a failure.
        try:
            resp = requests.post(
                endpoint,
                json={"url": domain},
                headers={"X-API-Key": API_KEY},
                timeout=request_timeout,
            )
            resp.raise_for_status()
            ok = bool(resp.json().get("success", False))
        except Exception:
            ok = False
        return (rank, domain, ok)

    def save_state():
        # Write to a temp file and rename, so a crash mid-write cannot leave a
        # truncated state file behind.
        tmp_path = state_file + ".tmp"
        with open(tmp_path, "w") as fh:
            json.dump({"last_rank": last_rank, "success": succ, "fail": fail}, fh)
        os.replace(tmp_path, state_file)

    def show_progress():
        clear_output(wait=True)
        print(f"{opp_label} Rank {opp_last}: {opp_succ}✅, {opp_fail}❌ ({opp_succ / max(1, opp_succ + opp_fail) * 100:.2f}%)")
        print(f"{own_label} Rank {last_rank}: {succ}✅, {fail}❌ ({succ / max(1, succ + fail) * 100:.2f}%) *")

    def drain(batch):
        # Collect every result of one batch, then persist the state once.
        nonlocal last_rank, succ, fail
        for future in as_completed(batch):
            rank, _domain, ok = future.result()
            if ok:
                succ += 1
            else:
                fail += 1
            # Futures finish out of order; track the highest rank, not the
            # most recently completed one.
            last_rank = max(last_rank, rank)
            show_progress()
            time.sleep(DELAY)
        # Persisting only after the whole batch is done keeps the file
        # consistent: every rank <= last_rank has been counted exactly once.
        save_state()

    # Rows at or below the saved rank were handled by a previous run.
    pending = df.loc[df["Rank"] > last_rank]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        batch = []
        for row in pending.itertuples(index=False):
            if succ >= TARGET_SUCCESSES:
                break

            batch.append(executor.submit(post_url, row.Domain, row.Rank))

            if len(batch) >= max_workers:
                drain(batch)
                batch = []

        if batch:
            drain(batch)

# Cell 5: Controlled runner (set mode below)

In [None]:
mode = "both"  # set to "tos", "pp", or "both"

# Helper to read state
def read_state(stage):
    """
    Return the saved progress dict for `stage` ("tos" or "pp").

    Reads the JSON file at STATE_FILES[stage]. When that file does not exist,
    returns zeroed counters: {"success": 0, "fail": 0, "last_rank": 0}.
    """
    path = STATE_FILES[stage]
    if os.path.exists(path):
        # Context manager closes the handle; a bare open() would leak it.
        with open(path) as fh:
            return json.load(fh)
    return {"success": 0, "fail": 0, "last_rank": 0}

# Run the stage(s) selected by `mode`.
# Reject anything unexpected up front so no crawl starts on a typo.
if mode not in ("tos", "pp", "both"):
    raise ValueError("❌ Invalid mode! Use 'tos', 'pp', or 'both'.")

# The ToS stage runs for both "tos" and "both".
if mode != "pp":
    crawl_stage(df, "tos", TOS_ENDPOINT)

if mode == "pp":
    crawl_stage(df, "pp", PP_ENDPOINT)
elif mode == "both":
    # In "both" mode the PP stage only starts once ToS has hit its target.
    tos_state = read_state("tos")
    if tos_state.get("success", 0) >= TARGET_SUCCESSES:
        crawl_stage(df, "pp", PP_ENDPOINT)
    else:
        print(f"\n⏩ Skipping PP crawl: only {tos_state['success']} ToS successes (target {TARGET_SUCCESSES}).")

[PP]  Rank 25194: 6250✅, 18944❌ (24.81%)
[ToS] Rank 39934: 10813✅, 31119❌ (25.79%) *
