In [None]:
import os
import time
from datetime import datetime
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException

# ===== CONFIG =====
INPUT_FILE = "links.xlsx"
SHEET_NAME = "Links"
COLUMN_LETTER = "URL"
OUTPUT_FILE = "website_status.xlsx"

PAGE_LOAD_TIMEOUT = 40        # page load timeout (seconds)
CONTENT_WAIT_TIMEOUT = 25     # wait for body presence or title (seconds)
HEADLESS = False              # set True to run without visible browser
MIN_DOM_ELEMENTS = 6          # if body has >= this many child elements -> consider as content
MIN_TEXT_CHARACTERS = 30      # minimal count of alphanumeric chars in body to be "textful"
# ===================


def setup_driver():
    opts = Options()
    if HEADLESS:
        opts.add_argument("--headless")
    driver = webdriver.Firefox(options=opts)
    driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
    return driver


def load_urls():
    df = pd.read_excel(INPUT_FILE, sheet_name=SHEET_NAME)
    if COLUMN_LETTER in df.columns:
        urls = df[COLUMN_LETTER].dropna().astype(str).str.strip().tolist()
    else:
        urls = df.iloc[:, 4].dropna().astype(str).str.strip().tolist()
    return urls


def looks_like_verification(text: str) -> bool:
    """Detect common verification/anti-bot / waiting-room triggers."""
    if not text:
        return False
    lowered = text.lower()
    triggers = [
        "just a moment", "checking your browser", "enable javascript",
        "access denied", "verify you are human", "are you human",
        "please wait", "verifying", "cloudflare", "service temporarily unavailable",
        "403", "404", "503", "error"
    ]
    return any(t in lowered for t in triggers)


def has_enough_text(body_text: str) -> bool:
    """Count alphanumeric characters (a crude but reliable 'meaningful' text check)."""
    if not body_text:
        return False
    # keep only letters+digits, count them
    import re
    chars = re.sub(r'[^A-Za-z0-9]', '', body_text)
    return len(chars) >= MIN_TEXT_CHARACTERS


def count_dom_children(driver) -> int:
    """Return number of descendant elements under <body> (fast indicator of a real page)."""
    try:
        elems = driver.find_elements(By.CSS_SELECTOR, "body *")
        return len(elems)
    except Exception:
        return 0


def count_images(driver) -> int:
    try:
        imgs = driver.find_elements(By.TAG_NAME, "img")
        return len(imgs)
    except Exception:
        return 0


def check_website(driver, url):
    """Return (status, info). Status in {WORKING, VERIFICATION_BLOCKED, NO_CONTENT, DOWN, ERROR}"""
    try:
        driver.get(url)
    except WebDriverException as e:
        return "DOWN", f"WebDriver Error on get(): {str(e)[:200]}"
    except Exception as e:
        return "ERROR", f"Exception on get(): {str(e)[:200]}"

    # Wait until either a non-empty title appears or body exists (whichever first)
    try:
        WebDriverWait(driver, CONTENT_WAIT_TIMEOUT).until(
            lambda d: (d.title and d.title.strip()) or d.find_elements(By.TAG_NAME, "body")
        )
    except TimeoutException:
        # continue — maybe partial load; we'll still inspect what's available
        pass
    except Exception as e:
        return "ERROR", f"Exception while waiting: {str(e)[:200]}"

    # Gather signals
    try:
        title = driver.title.strip() if driver.title else ""
    except Exception:
        title = ""

    try:
        body_el = driver.find_element(By.TAG_NAME, "body")
        body_text = body_el.text.strip() if body_el and body_el.text else ""
    except Exception:
        body_text = ""

    dom_count = count_dom_children(driver)
    img_count = count_images(driver)

    # Check for obvious verification or "please wait" pages
    if looks_like_verification(title) or looks_like_verification(body_text):
        # However, if page also has many DOM elements or many images, treat as WORKING fallback
        if dom_count >= MIN_DOM_ELEMENTS or img_count >= 1:
            return "WORKING", title or f"Content-heavy verification-like page (dom:{dom_count}, imgs:{img_count})"
        return "VERIFICATION_BLOCKED", "Detected anti-bot / verification / waiting-room content"

    # Primary positivity checks (any of these -> WORKING)
    if title:
        # If the title contains common error words, treat specially
        if any(k in title.lower() for k in ("404", "503", "error", "not found", "forbidden")):
            # still allow if page has many DOM elements or decent text
            if dom_count >= MIN_DOM_ELEMENTS or has_enough_text(body_text) or img_count >= 1:
                return "WORKING", title
            return "NO_CONTENT", f"Errorish title: {title}"
        return "WORKING", title

    # If no title but body looks meaningful
    if has_enough_text(body_text):
        return "WORKING", "No title, but body text detected"

    # If few text but many DOM elements or images (JS-heavy pages)
    if dom_count >= MIN_DOM_ELEMENTS or img_count >= 1:
        return "WORKING", f"No title/text but DOM elements: {dom_count}, images: {img_count}"

    # Otherwise treat as no content
    return "NO_CONTENT", "No meaningful title, text, images, or DOM elements detected"


def save_progress(results):
    # append-safe save: overwrite with the current results
    pd.DataFrame(results).to_excel(OUTPUT_FILE, index=False)


def main():
    urls = load_urls()
    if not urls:
        print("No URLs found in the input sheet/column.")
        return

    # remove previous output so it's always 'fresh'
    if os.path.exists(OUTPUT_FILE):
        try:
            os.remove(OUTPUT_FILE)
        except Exception:
            pass

    driver = setup_driver()
    results = []

    try:
        for idx, url in enumerate(urls, start=1):
            print(f"[{idx}/{len(urls)}] Checking → {url}")
            start = time.time()
            status, info = check_website(driver, url)
            duration = round(time.time() - start, 2)
            checked_at = datetime.utcnow().isoformat(timespec="seconds")

            print(f"   → {status} | {info} | {duration}s")

            results.append({
                "URL": url,
                "STATUS": status,
                "INFO_OR_TITLE": info,
                "CHECKED_AT_UTC": checked_at,
                "LOAD_TIME_SEC": duration,
                "DOM_CHILD_COUNT": count_dom_children(driver),
                "IMAGE_COUNT": count_images(driver)
            })

            # Save progress each iteration so you can stop anytime
            try:
                save_progress(results)
            except Exception as e:
                print("Warning: failed to save progress:", e)

    finally:
        try:
            driver.quit()
        except Exception:
            pass

    print(f"\n✔ Completed. Latest results saved to {OUTPUT_FILE}")


if __name__ == "__main__":
    main()


In [2]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

# ================= CONFIG ==================
INPUT_FILE = "links.xlsx"
SHEET_NAME = "Links"
COLUMN_NAME = "URL"
OUTPUT_FILE = "final_results.xlsx"

HTTP_TIMEOUT = 8
SELENIUM_TIMEOUT = 30
HEADLESS = False  # True → run hidden

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive"
}

CF_SIGNS = [
    "checking your browser",
    "please stand by",
    "verify you are human",
    "security check",
    "attention required",
    "cf-browser-verification",
    "cloudflare"
]
# ===========================================


def load_urls():
    df = pd.read_excel(INPUT_FILE, sheet_name=SHEET_NAME)
    return df[COLUMN_NAME].dropna().astype(str).tolist()


def http_check(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
        soup = BeautifulSoup(r.text, "html.parser")
        title = soup.title.text.strip() if soup.title else ""
        return r.status_code, title
    except Exception:
        return None, ""


def need_selenium(status, title):
    if status is None or status >= 400:
        return True
    if not title:
        return True
    if any(x in title.lower() for x in CF_SIGNS):
        return True
    return False


def setup_driver():
    opts = Options()
    if HEADLESS:
        opts.add_argument("--headless")

    opts.set_preference("dom.webdriver.enabled", False)
    opts.set_preference("useAutomationExtension", False)

    driver = webdriver.Firefox(options=opts)
    driver.set_page_load_timeout(SELENIUM_TIMEOUT)
    return driver


def selenium_check(driver, url):
    try:
        driver.get(url)
    except WebDriverException as e:
        return "DOWN", f"Selenium error: {str(e)[:80]}"

    time.sleep(3)

    def get_title_and_body():
        title = driver.title.strip() if driver.title else ""
        try:
            body_text = driver.find_element(By.TAG_NAME, "body").text.strip()
        except:
            body_text = ""
        return title, body_text

    title, body_text = get_title_and_body()
    page = driver.page_source.lower()

    # Instant mark: Challenge before content
    if any(x in page for x in CF_SIGNS) and not (len(body_text) > 100 or title):
        print("  - Security challenge immediately → skipping")
        return "BLOCKED_CF", title or "Blocked by Cloudflare"

    # Proper content detected first
    if driver.title and len(body_text) > 100:
        print("  - Content loaded → storing result")
        return "WORKING", driver.title

    # Retry small wait
    timeout = time.time() + 10
    while time.time() < timeout:
        time.sleep(2)
        title, body_text = get_title_and_body()
        page = driver.page_source.lower()

        if driver.title and len(body_text) > 100:
            return "WORKING", driver.title

        if any(x in page for x in CF_SIGNS):
            print("  - No content + challenge → skipping")
            return "BLOCKED_CF", title or "Blocked by Cloudflare"

    return "NO_CONTENT", title or "No useful content"


def save_results(results):
    pd.DataFrame(results).to_excel(OUTPUT_FILE, index=False)


def main():
    urls = load_urls()

    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    driver = setup_driver()
    results = []
    total = len(urls)

    for i, url in enumerate(urls, start=1):
        print(f"\n[{i}/{total}] Checking: {url}")
        start = time.time()
        timestamp = datetime.utcnow().isoformat(timespec="seconds")

        http_code, title = http_check(url)

        if need_selenium(http_code, title):
            print("  → Using browser due to possible block/protection…")
            status, title = selenium_check(driver, url)
        else:
            status = "WORKING"

        duration = round(time.time() - start, 2)

        print(f"  → FINAL: {status} | HTTP: {http_code} | Title: {title[:60]} | Time: {duration}s")

        results.append({
            "URL": url,
            "STATUS": status,
            "HTTP_CODE": http_code,   # <-- Added
            "TITLE": title,
            "CHECKED_AT_UTC": timestamp,
            "TIME_SEC": duration
        })

        save_results(results)

    driver.quit()
    print(f"\n✓ Done → Results saved: {OUTPUT_FILE}")


if __name__ == "__main__":
    main()


[1/2289] Checking: https://votodigital.onpe.gob.pe/


  timestamp = datetime.utcnow().isoformat(timespec="seconds")


  → FINAL: WORKING | HTTP: 200 | Title: Portal Digital de la STVD | Time: 1.34s

[2/2289] Checking: https://www.valimised.ee/en/previous-elections
  → FINAL: WORKING | HTTP: 200 | Title: Previous elections | Elections in Estonia | Time: 1.37s

[3/2289] Checking: https://www.valimised.ee/ru/arkhiv-proshedshikh-vyborov
  → FINAL: WORKING | HTTP: 200 | Title: Архив прошедших выборов | Выборы в Эстонии | Time: 1.34s

[4/2289] Checking: https://www.valimised.ee/et/toimunud-valimiste-arhiiv
  → FINAL: WORKING | HTTP: 200 | Title: Toimunud valimiste arhiiv | Valimised Eestis | Time: 1.52s

[5/2289] Checking: https://www.voteaza.md/ru/chasto-zadavaemye-voprosy/#1529386641918-0e3927fa-67ff
  → Using browser due to possible block/protection…
  - Content loaded → storing result
  → FINAL: WORKING | HTTP: None | Title: Часто задаваемые вопросы - VOTEAZA.MD | Time: 12.2s

[6/2289] Checking: https://www.vrk.lt/en/balsavimas-rinkimu-diena-2024-sei
  → Using browser due to possible block/protection…
 

KeyboardInterrupt: 