# Cell 1: Imports & basic config

In [1]:
import os
import json
import time
import requests
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv

# Filenames
# STATE_FILE: JSON file holding crawl progress (last_rank, success_count, fail_count) between runs.
# CSV_FILE:   local copy of the Majestic Million list, read with pandas in Cell 3.
# CSV_URL:    where CSV_FILE is downloaded from when it does not exist locally.
STATE_FILE = "crawl_state.json"
CSV_FILE   = "majestic_million.csv"
CSV_URL    = "https://downloads.majestic.com/majestic_million.csv"

# Cell 2: Load environment variables

In [2]:
# Read settings from the local .env file into the process environment.
load_dotenv(".env")

# Numeric crawl settings; each one falls back to its default when the variable is unset.
MAX_RETRIES = int(os.environ.get("MAX_RETRIES", 3))
DELAY       = float(os.environ.get("DELAY_BETWEEN_REQUESTS", 1))
TIMEOUT     = int(os.environ.get("TIMEOUT", 10))
BATCH_SIZE  = int(os.environ.get("BATCH_SIZE", 100))

# Backend credentials and endpoints; these are None when the variable is not set.
PP_ENDPOINT  = os.environ.get("PP_ENDPOINT")
TOS_ENDPOINT = os.environ.get("TOS_ENDPOINT")
API_KEY      = os.environ.get("X_API_KEY")

# Cell 3: Download Majestic CSV if missing, then load only GlobalRank + Domain

In [3]:
# Cell 3: Download Majestic CSV if missing, then load GlobalRank & Domain from header
if not os.path.exists(CSV_FILE):
    print("Downloading Majestic Million list…")
    # Stream into a temporary file and rename it into place only after the
    # download has completed. Writing straight to CSV_FILE would leave a
    # truncated file behind on an interrupted download, and every later run
    # would then skip the download because the file "already exists".
    csv_tmp_path = CSV_FILE + ".part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(CSV_URL, stream=True, timeout=TIMEOUT) as resp:
            resp.raise_for_status()
            with open(csv_tmp_path, "wb") as f:
                # 1 MiB chunks: far fewer Python-level writes than 1 KiB chunks.
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(csv_tmp_path, CSV_FILE)
    finally:
        # Only present when the download failed part-way; clean it up.
        if os.path.exists(csv_tmp_path):
            os.remove(csv_tmp_path)
else:
    print("CSV already exists; skipping download.")

# Now read the real header row and pull exactly GlobalRank + Domain
df = pd.read_csv(
    CSV_FILE,
    usecols=["GlobalRank", "Domain"],
    dtype={"GlobalRank": int, "Domain": str},
    low_memory=False
)

# Make sure it’s sorted, with a clean 0..n-1 index
df = df.sort_values("GlobalRank").reset_index(drop=True)

print(f"Loaded {len(df):,} domains.  First few:\n", df.head())

CSV already exists; skipping download.
Loaded 1,000,000 domains.  First few:
    GlobalRank         Domain
0           1     google.com
1           2   facebook.com
2           3    youtube.com
3           4    twitter.com
4           5  instagram.com


# Cell 4: Initialize or resume state

In [4]:
# Cell 4: Init or resume state by GlobalRank
# `state` tracks the last processed GlobalRank plus running success/fail counters.
if os.path.exists(STATE_FILE):
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(STATE_FILE) as f:
        state = json.load(f)
    print(f"Resuming from GlobalRank > {state.get('last_rank', 0)}")
else:
    state = {"last_rank": 0, "success_count": 0, "fail_count": 0}
    print("Starting fresh run from rank 1.")

Starting fresh run from rank 1.


# Cell 5: Choose which endpoint to crawl

In [5]:
# Ask which backend endpoint this run should hit and remember a label for messages.
choice = input("Enter 1 to crawl TOS or 2 to crawl Privacy Policy: ").strip()
if choice == "1":
    endpoint = TOS_ENDPOINT
    label = "TOS"
elif choice == "2":
    endpoint = PP_ENDPOINT
    label = "Privacy Policy"
else:
    raise SystemExit("Invalid choice – restart and enter 1 or 2.")

# The endpoints come from environment variables and are None when unset.
# Stop here rather than letting the crawl loop record every domain as a
# failure (and advance the saved state) against a missing URL.
if not endpoint:
    raise SystemExit(
        f"No endpoint configured for {label} – "
        "set TOS_ENDPOINT / PP_ENDPOINT in .env and re-run the config cell."
    )

print(f"Crawling {label} via {endpoint}")

Enter 1 to crawl TOS or 2 to crawl Privacy Policy:  1


Crawling TOS via http://localhost:8080/api/v1/crawl-tos


# Cell 6: Health check – ensure your backend is up before crawling

In [6]:
import os
import requests
from dotenv import load_dotenv

# (Re)load your .env so TIMEOUT is defined here.
# This must be the same file the configuration cell loads (".env"); loading a
# different file could leave TIMEOUT out of sync with the rest of the notebook.
load_dotenv(".env")
TIMEOUT = int(os.getenv("TIMEOUT", 10))

# NOTE(review): host and port are hard-coded here; keep them in line with the
# TOS_ENDPOINT / PP_ENDPOINT values used for the crawl.
health_url = "http://localhost:8080/api/test"
headers = {"accept": "application/json"}

try:
    resp = requests.get(health_url, headers=headers, timeout=TIMEOUT)
    resp.raise_for_status()
    data = resp.json()
    # The service is considered healthy only when it reports status "ok".
    if data.get("status") == "ok":
        print(f"✅ Service is up: {data.get('message')}")
    else:
        raise ValueError(f"Unexpected response body: {data}")
except Exception as e:
    # Abort the run; chain the original exception so %tb shows the real cause.
    raise SystemExit(f"❌ Health check failed for {health_url}: {e}") from e

✅ Service is up: Server is running


# Cell 7: Crawl loop with live counters and progress display

In [7]:
import json
import requests
import time
from collections import deque
from tqdm.notebook import tqdm
from IPython.display import clear_output, display

# Crawl loop: POST each remaining domain to the chosen endpoint, persist
# progress after every request, and show a rolling log plus progress bar.


def _summary_line(last_rank, success, fail):
    """Return the one-line progress summary (ends with a newline).

    The success rate is reported as 0 when nothing has been attempted yet,
    so the summary never divides by zero.
    """
    attempted = success + fail
    pct = success / attempted * 100 if attempted else 0
    return (
        f"Stopped at GlobalRank {last_rank}: "
        f"{success} ✅, {fail} ❌ "
        f"({pct:.2f}% success)\n"
    )


def _save_state(state, path):
    """Write `state` as JSON to `path` via a temp file and rename.

    An interruption in the middle of the write then cannot leave a truncated,
    unparseable state file behind.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, path)


# Load or init state
if os.path.exists(STATE_FILE):
    with open(STATE_FILE) as f:
        state = json.load(f)
else:
    state = {"last_rank": 0, "success_count": 0, "fail_count": 0}

last_rank = state["last_rank"]
success   = state["success_count"]
fail      = state["fail_count"]

# Always show the current summary at the very top
print("\n" + _summary_line(last_rank, success, fail))

# ▶ Pick how many successes to stop after.
# NOTE: the target is compared against the cumulative success count loaded
# from the state file, not against successes made during this run only.
opts   = ["10", "100", "1000", "10000", "100000", "max"]
choice = input(f"Stop after how many successes {opts} [default 'max']: ").strip() or "max"
if choice not in opts:
    choice = "max"
target = float("inf") if choice == "max" else int(choice)

print(f"▶ Resuming from GlobalRank > {last_rank}, stopping after {target} successes.\n")

# Filter remaining domains
rem = df[df["GlobalRank"] > last_rank].sort_values("GlobalRank")
print(f"→ {len(rem):,} domains remaining to crawl.\n")

# Prepare progress bar and log buffer
pbar = tqdm(total=len(rem), desc="crawl-POST", unit="domain")
last_logs = deque(maxlen=10)

try:
    for _, row in rem.iterrows():
        if success >= target:
            break

        rank   = int(row["GlobalRank"])
        domain = row["Domain"]
        ok     = False
        status = None
        data   = {}

        # Single POST
        try:
            resp = requests.post(
                endpoint,
                json={"url": domain},
                headers={"X-API-KEY": API_KEY, "Content-Type": "application/json"},
                # Without a timeout a stalled backend blocks the notebook
                # forever. Raise TIMEOUT in .env if crawls legitimately take
                # longer than the configured value.
                timeout=TIMEOUT,
            )
            status = resp.status_code
            resp.raise_for_status()
            body = resp.json()
            # Only a JSON object can carry the "success" flag; anything else
            # is treated as a failed crawl instead of crashing the log line.
            if isinstance(body, dict):
                data = body
            ok = data.get("success") is True
        except Exception as e:
            # Keep the HTTP status code when we got one. A Response is falsy
            # for 4xx/5xx, so an `and/or` truthiness trick would discard the
            # code exactly when it matters. Fall back to the error text only
            # when no response was received at all.
            if status is None:
                status = str(e)

        # Record this log line
        log_line = f"[{'✔' if ok else '✘'}] Rank={rank}, Domain={domain}, HTTP={status}, success={data.get('success')}"
        last_logs.append(log_line)

        # Update counters
        if ok:
            success += 1
        else:
            fail += 1

        # Persist state after every request so an interrupted run can resume
        last_rank = rank
        state.update({
            "last_rank":     last_rank,
            "success_count": success,
            "fail_count":    fail
        })
        _save_state(state, STATE_FILE)

        # Update progress bar
        pbar.update(1)
        pct = success / (success + fail) * 100 if (success + fail) else 0
        pbar.set_postfix({"✔": success, "✘": fail, "rate": f"{pct:.1f}%"})

        # Live display: clear and show only the last 10 logs + progress bar
        clear_output(wait=True)
        # Reprint summary at top
        print("\n" + _summary_line(last_rank, success, fail))
        for line in last_logs:
            print(line)
        display(pbar)

        time.sleep(DELAY)
finally:
    # Close the bar even when the loop is interrupted or raises.
    pbar.close()

# Final summary (again)
clear_output(wait=False)
print("Final summary:")
print(_summary_line(last_rank, success, fail))
print("Last 10 log lines:")
for line in last_logs:
    print(line)

Final summary:
Stopped at GlobalRank 16: 10 ✅, 6 ❌ (62.50% success)

Last 10 log lines:
[✔] Rank=7, Domain=microsoft.com, HTTP=200, success=True
[✔] Rank=8, Domain=apple.com, HTTP=200, success=True
[✘] Rank=9, Domain=googletagmanager.com, HTTP=200, success=False
[✔] Rank=10, Domain=wikipedia.org, HTTP=200, success=True
[✘] Rank=11, Domain=github.com, HTTP=200, success=False
[✔] Rank=12, Domain=wordpress.org, HTTP=200, success=True
[✘] Rank=13, Domain=youtu.be, HTTP=200, success=False
[✔] Rank=14, Domain=en.wikipedia.org, HTTP=200, success=True
[✘] Rank=15, Domain=play.google.com, HTTP=200, success=False
[✔] Rank=16, Domain=pinterest.com, HTTP=200, success=True
