In [None]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, JavascriptException
from selenium.webdriver.support.ui import WebDriverWait

# ================ CONFIG ====================
# Input workbook: URLs are read from column COLUMN_NAME of sheet SHEET_NAME.
INPUT_FILE = "links.xlsx"
SHEET_NAME = "Links"
COLUMN_NAME = "URL"
# Results workbook; main() deletes any existing copy and rewrites it as it goes.
OUTPUT_FILE = "final_results.xlsx"

HTTP_TIMEOUT = 8             # timeout (seconds) passed to SESSION.get in http_check
SELENIUM_PAGE_TIMEOUT = 30   # page load timeout for driver.get
SELENIUM_TITLE_WAIT = 8      # total seconds to poll for JS-updated title/body when needed
HEADLESS = False             # True -> setup_driver() adds the "--headless" argument

# Request headers installed on the shared session below.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive"
}

# Lowercase substrings searched for in lowercased titles / page source to
# decide whether a page looks like a challenge or block page.
CF_SIGNS = [
    "checking your browser",
    "please stand by",
    "verify you are human",
    "security check",
    "attention required",
    "cf-browser-verification",
    "cloudflare"
]
# ============================================

# Reuse session for connection pooling
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

def load_urls():
    """Read the list of URLs to check from the input workbook.

    Returns:
        list: the non-null values of column ``COLUMN_NAME`` on sheet
        ``SHEET_NAME`` of ``INPUT_FILE``, each converted to ``str``.
    """
    sheet = pd.read_excel(INPUT_FILE, sheet_name=SHEET_NAME)
    url_column = sheet[COLUMN_NAME]
    non_empty = url_column.dropna()
    return non_empty.astype(str).tolist()

def _extract_title_from_html(text):
    """Return the best available page title found in an HTML document.

    Candidates are tried in order and the first non-blank one wins:
    the ``<title>`` element, the ``og:title`` / ``twitter:title`` meta tags
    (matched on either the ``property`` or the ``name`` attribute), and
    finally the first ``<h1>``.

    Args:
        text: HTML markup as a string.

    Returns:
        The stripped title text, or ``""`` when no candidate has any text.
    """
    soup = BeautifulSoup(text, "html.parser")

    # get_text() rather than .string: .string is None whenever <title> holds
    # more than one child node (a comment, stray markup), which would discard
    # a perfectly usable title.
    if soup.title:
        title_text = soup.title.get_text().strip()
        if title_text:
            return title_text

    # fallback: og:title / twitter:title
    for key in ("og:title", "twitter:title"):
        tag = soup.find("meta", property=key) or soup.find("meta", attrs={"name": key})
        content = (tag.get("content") or "").strip() if tag else ""
        # A whitespace-only content attribute must not end the search early.
        if content:
            return content

    # fallback: first H1
    h1 = soup.find("h1")
    if h1:
        heading = h1.get_text(" ", strip=True)
        if heading:
            return heading
    return ""

def http_check(url):
    """Fetch ``url`` over plain HTTP and extract a title from the response.

    Returns:
        tuple: ``(status_code, title)`` on success. Any exception raised
        while requesting or parsing yields ``(None, "")`` so the caller can
        fall back to the browser check.
    """
    try:
        response = SESSION.get(url, timeout=HTTP_TIMEOUT)
        html = response.text or ""
        page_title = _extract_title_from_html(html)
    except Exception:
        return None, ""
    else:
        return response.status_code, page_title

def need_selenium(status, title):
    """Decide whether the plain HTTP result needs a browser re-check.

    A re-check is requested when the request failed (``status`` is None),
    the integer status is 400 or above, no title was found, or the title
    contains one of the ``CF_SIGNS`` markers (case-insensitive).

    Returns:
        bool: True when the URL should be re-checked in the browser.
    """
    request_failed = status is None
    error_status = isinstance(status, int) and status >= 400
    if request_failed or error_status or not title:
        return True

    lowered_title = title.lower()
    for marker in CF_SIGNS:
        if marker in lowered_title:
            return True
    return False

def setup_driver():
    """Create a Firefox WebDriver configured for the link checks.

    Adds the ``--headless`` argument when ``HEADLESS`` is set, turns off two
    automation-related preferences, and applies ``SELENIUM_PAGE_TIMEOUT`` as
    the page-load timeout.

    Returns:
        The started ``webdriver.Firefox`` instance.
    """
    firefox_options = Options()
    if HEADLESS:
        firefox_options.add_argument("--headless")

    # make webdriver less obvious
    for pref_name in ("dom.webdriver.enabled", "useAutomationExtension"):
        firefox_options.set_preference(pref_name, False)

    # normal load strategy, bounded by the page-load timeout
    browser = webdriver.Firefox(options=firefox_options)
    browser.set_page_load_timeout(SELENIUM_PAGE_TIMEOUT)
    return browser

def _js_get_title(driver):
    """Return the current ``document.title`` of the page, stripped.

    Args:
        driver: WebDriver whose current page is inspected.

    Returns:
        The stripped title, or ``""`` when the title is empty or the script
        could not be executed.
    """
    try:
        return (driver.execute_script("return document.title || ''") or "").strip()
    except WebDriverException:
        # Catch the base class (JavascriptException is a subclass): an
        # unexpected alert, a closed window or a script timeout must not
        # abort the whole run while probing a single page.
        return ""

def _js_get_body_snippet(driver, chars=2000):
    """Return the first ``chars`` characters of the page's body text, stripped.

    The script reads ``document.body.innerText`` and falls back to
    ``textContent`` when that is empty.

    Args:
        driver: WebDriver whose current page is inspected.
        chars: Maximum number of characters requested from the page.

    Returns:
        The stripped text snippet, or ``""`` when there is no body text or
        the script could not be executed.
    """
    try:
        return (driver.execute_script(
            "return (document.body && (document.body.innerText || document.body.textContent) || '').substring(0, arguments[0]);",
            chars
        ) or "").strip()
    except WebDriverException:
        # Catch the base class (JavascriptException is a subclass) so that
        # alerts, closed windows or script timeouts degrade to "no text"
        # instead of aborting the run.
        return ""

def selenium_check(driver, url):
    """Load ``url`` in the browser and classify what was rendered.

    The page is probed immediately after ``driver.get`` returns and then
    every 0.5 s for up to ``SELENIUM_TITLE_WAIT`` seconds, which covers
    single-page apps and titles that are set late by JavaScript.

    Args:
        driver: A started WebDriver.
        url: Address to load.

    Returns:
        tuple: ``(status, title)`` where status is one of

        * ``"DOWN"``       - ``driver.get`` raised; title holds the error text.
        * ``"BLOCKED_CF"`` - challenge markers found and either nothing was
          rendered, or the title itself still matches a ``CF_SIGNS`` marker
          when the wait expires.
        * ``"WORKING"``    - a title was obtained (ideally with body text).
        * ``"NO_CONTENT"`` - no title appeared within the wait window.
    """
    try:
        driver.get(url)
    except WebDriverException as e:
        return "DOWN", f"Selenium error: {str(e)[:200]}"

    title = ""
    title_is_challenge = False
    # monotonic clock: the deadline is unaffected by system clock changes
    deadline = time.monotonic() + SELENIUM_TITLE_WAIT

    while True:
        title = _js_get_title(driver)
        body_snip = _js_get_body_snippet(driver)
        has_body = len(body_snip) > 100

        lowered_title = title.lower()
        title_is_challenge = any(sig in lowered_title for sig in CF_SIGNS)

        if not title and not has_body:
            # Nothing usable rendered: only now pay for the full page source
            # and look for challenge markers in it.
            try:
                page_src = driver.page_source.lower()
            except Exception:
                # best-effort probe; treat an unreadable source as empty
                page_src = ""
            if any(sig in page_src for sig in CF_SIGNS):
                return "BLOCKED_CF", "Blocked by Cloudflare"

        # A block page ("Attention Required! | Cloudflare") has both a title
        # and body text, so the title must not itself look like a challenge.
        if title and has_body and not title_is_challenge:
            return "WORKING", title

        if time.monotonic() >= deadline:
            break
        time.sleep(0.5)

    # The challenge never cleared within the wait window.
    if title_is_challenge:
        return "BLOCKED_CF", title

    # If title exists but body small, still accept title as best-effort (many pages only set title)
    if title:
        return "WORKING", title

    return "NO_CONTENT", "No useful content"

def save_results(results):
    """Write every collected result row to ``OUTPUT_FILE`` as an Excel sheet.

    Args:
        results: Rows to write; passed straight to ``pd.DataFrame``.
    """
    table = pd.DataFrame(results)
    table.to_excel(OUTPUT_FILE, index=False)

def main():
    """Check every URL from the input workbook and write the results.

    Each URL is first fetched over plain HTTP. When that result looks
    unreliable (see ``need_selenium``) the URL is re-checked in a Firefox
    instance that is started lazily on first use. The results file is
    rewritten after every URL so partial progress survives an interruption.
    """
    # Local import: the file-level import only brings in the datetime class.
    from datetime import timezone

    urls = load_urls()
    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    driver = None  # started lazily, only if some URL needs the browser
    results = []
    total = len(urls)

    try:
        for i, url in enumerate(urls, start=1):
            print(f"\n[{i}/{total}] Checking: {url}")
            start = time.time()
            # datetime.utcnow() is deprecated; take an aware UTC "now" and drop
            # tzinfo so the stored string keeps its original naive format.
            timestamp = (
                datetime.now(timezone.utc)
                .replace(tzinfo=None)
                .isoformat(timespec="seconds")
            )

            http_code, title = http_check(url)
            status = "WORKING"

            if need_selenium(http_code, title):
                if driver is None:
                    try:
                        driver = setup_driver()
                    except Exception as e:
                        print("  ! Failed to start Selenium driver:", e)
                        # fallback: record HTTP result (best-effort) and continue;
                        # the next URL that needs the browser will try again
                        status = "NO_BROWSER"

                if driver is not None:
                    print("  → Using browser due to possible block/protection…")
                    status, title = selenium_check(driver, url)

            duration = round(time.time() - start, 2)
            print(f"  → FINAL: {status} | HTTP: {http_code} | Title: {str(title)[:120]} | Time: {duration}s")

            results.append({
                "URL": url,
                "STATUS": status,
                "HTTP_CODE": http_code,
                "TITLE": title,
                "CHECKED_AT_UTC": timestamp,
                "TIME_SEC": duration
            })
            save_results(results)
    finally:
        # Always shut the browser down, even when the loop is interrupted or
        # raises, so no Firefox process is left behind.
        if driver is not None:
            driver.quit()

    print(f"\n✓ Done → Results saved: {OUTPUT_FILE}")

# Entry point: run the link check when this code is executed directly.
if __name__ == "__main__":
    main()


[1/2273] Checking: https://votodigital.onpe.gob.pe/


  timestamp = datetime.utcnow().isoformat(timespec="seconds")


  → FINAL: WORKING | HTTP: 200 | Title: Portal Digital de la STVD | Time: 1.12s

[2/2273] Checking: https://www.valimised.ee/en/previous-elections
  → FINAL: WORKING | HTTP: 200 | Title: Previous elections | Elections in Estonia | Time: 1.12s

[3/2273] Checking: https://www.valimised.ee/ru/arkhiv-proshedshikh-vyborov
  → FINAL: WORKING | HTTP: 200 | Title: Архив прошедших выборов | Выборы в Эстонии | Time: 0.43s

[4/2273] Checking: https://www.valimised.ee/et/toimunud-valimiste-arhiiv
  → FINAL: WORKING | HTTP: 200 | Title: Toimunud valimiste arhiiv | Valimised Eestis | Time: 0.42s

[5/2273] Checking: https://www.voteaza.md/ru/chasto-zadavaemye-voprosy/#1529386641918-0e3927fa-67ff
  → Using browser due to possible block/protection…
  → FINAL: WORKING | HTTP: None | Title: Часто задаваемые вопросы - VOTEAZA.MD | Time: 14.87s

[6/2273] Checking: https://www.vrk.lt/en/balsavimas-rinkimu-diena-2024-sei
  → Using browser due to possible block/protection…
  → FINAL: WORKING | HTTP: 403 | Tit

In [None]:
###### Finalized
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

# ================= CONFIG ==================
# Input workbook: URLs are read from column COLUMN_NAME of sheet SHEET_NAME.
INPUT_FILE = "links.xlsx"
SHEET_NAME = "Links"
COLUMN_NAME = "URL"
# Results workbook; main() deletes any existing copy and rewrites it as it goes.
OUTPUT_FILE = "final_results-remaining.xlsx"

HTTP_TIMEOUT = 8       # timeout (seconds) passed to requests.get in http_check
SELENIUM_TIMEOUT = 30  # page-load timeout (seconds) applied in setup_driver
HEADLESS = False  # True → run hidden

# Request headers sent with every plain HTTP check.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/",
    "Connection": "keep-alive"
}

# Lowercase substrings searched for in lowercased titles / page source to
# decide whether a page looks like a challenge or block page.
CF_SIGNS = [
    "checking your browser",
    "please stand by",
    "verify you are human",
    "security check",
    "attention required",
    "cf-browser-verification",
    "cloudflare"
]
# ===========================================


def load_urls():
    """Return the URLs listed in the input workbook.

    Reads sheet ``SHEET_NAME`` of ``INPUT_FILE``, takes column
    ``COLUMN_NAME``, drops null cells and converts the rest to ``str``.
    """
    workbook_sheet = pd.read_excel(INPUT_FILE, sheet_name=SHEET_NAME)
    links = workbook_sheet[COLUMN_NAME].dropna()
    links_as_text = links.astype(str)
    return links_as_text.tolist()


def http_check(url):
    """Fetch ``url`` over plain HTTP and read its ``<title>``.

    Returns:
        tuple: ``(status_code, title)``, where title is the stripped text of
        the ``<title>`` element or ``""`` when there is none. Any exception
        during the request or parsing yields ``(None, "")``.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
        parsed = BeautifulSoup(response.text, "html.parser")
        if parsed.title:
            page_title = parsed.title.text.strip()
        else:
            page_title = ""
        return response.status_code, page_title
    except Exception:
        return None, ""


def need_selenium(status, title):
    """Decide whether the plain HTTP result needs a browser re-check.

    Returns True when the request failed (``status`` is None), the status is
    400 or above, the title is empty, or the lowercased title contains one
    of the ``CF_SIGNS`` markers; otherwise False.
    """
    if status is None:
        return True
    if status >= 400 or not title:
        return True

    lowered_title = title.lower()
    for marker in CF_SIGNS:
        if marker in lowered_title:
            return True
    return False


def setup_driver():
    """Start a Firefox WebDriver for the browser-based checks.

    Adds ``--headless`` when ``HEADLESS`` is set, turns off two
    automation-related preferences, and applies ``SELENIUM_TIMEOUT`` as the
    page-load timeout.

    Returns:
        The started ``webdriver.Firefox`` instance.
    """
    firefox_options = Options()
    if HEADLESS:
        firefox_options.add_argument("--headless")

    for pref_name in ("dom.webdriver.enabled", "useAutomationExtension"):
        firefox_options.set_preference(pref_name, False)

    browser = webdriver.Firefox(options=firefox_options)
    browser.set_page_load_timeout(SELENIUM_TIMEOUT)
    return browser


def selenium_check(driver, url):
    """Load ``url`` in the browser and classify what was rendered.

    After loading, the function waits 3 s, inspects the page once, and then
    re-inspects it every 2 s for up to 10 s.

    Args:
        driver: A started WebDriver.
        url: Address to load.

    Returns:
        tuple: ``(status, title)`` where status is one of

        * ``"DOWN"``       - ``driver.get`` raised; title holds the error text.
        * ``"BLOCKED_CF"`` - a ``CF_SIGNS`` marker is in the page source and
          no usable content was found.
        * ``"WORKING"``    - a title and more than 100 chars of body text.
        * ``"NO_CONTENT"`` - nothing usable appeared within the retry window.
    """
    try:
        driver.get(url)
    except WebDriverException as e:
        return "DOWN", f"Selenium error: {str(e)[:80]}"

    time.sleep(3)

    def get_title_and_body():
        # Read the title once per probe; each access is a WebDriver round trip.
        title = (driver.title or "").strip()
        try:
            body_text = driver.find_element(By.TAG_NAME, "body").text.strip()
        except WebDriverException:
            # Narrowed from a bare except, which also swallowed
            # KeyboardInterrupt / SystemExit. A missing <body> raises
            # NoSuchElementException, a WebDriverException subclass.
            body_text = ""
        return title, body_text

    title, body_text = get_title_and_body()
    page = driver.page_source.lower()

    # Instant mark: Challenge before content
    if any(x in page for x in CF_SIGNS) and not (len(body_text) > 100 or title):
        print("  - Security challenge immediately → skipping")
        return "BLOCKED_CF", title or "Blocked by Cloudflare"

    # Proper content detected first
    if title and len(body_text) > 100:
        print("  - Content loaded → storing result")
        # return the stripped title already fetched, like every other path
        return "WORKING", title

    # Retry small wait
    timeout = time.time() + 10
    while time.time() < timeout:
        time.sleep(2)
        title, body_text = get_title_and_body()
        page = driver.page_source.lower()

        if title and len(body_text) > 100:
            return "WORKING", title

        if any(x in page for x in CF_SIGNS):
            print("  - No content + challenge → skipping")
            return "BLOCKED_CF", title or "Blocked by Cloudflare"

    return "NO_CONTENT", title or "No useful content"


def save_results(results):
    """Write the collected result rows to ``OUTPUT_FILE`` without the index.

    Args:
        results: Rows to write; passed straight to ``pd.DataFrame``.
    """
    frame = pd.DataFrame(results)
    frame.to_excel(OUTPUT_FILE, index=False)


def main():
    """Check every URL from the input workbook and write the results.

    A Firefox instance is started up front. Each URL is fetched over plain
    HTTP first and re-checked in the browser when ``need_selenium`` says the
    HTTP result is unreliable. The results file is rewritten after every URL
    so partial progress survives an interruption.
    """
    # Local import: the file-level import only brings in the datetime class.
    from datetime import timezone

    urls = load_urls()

    if os.path.exists(OUTPUT_FILE):
        os.remove(OUTPUT_FILE)

    driver = setup_driver()
    results = []
    total = len(urls)

    try:
        for i, url in enumerate(urls, start=1):
            print(f"\n[{i}/{total}] Checking: {url}")
            start = time.time()
            # datetime.utcnow() is deprecated; take an aware UTC "now" and drop
            # tzinfo so the stored string keeps its original naive format.
            timestamp = (
                datetime.now(timezone.utc)
                .replace(tzinfo=None)
                .isoformat(timespec="seconds")
            )

            http_code, title = http_check(url)

            if need_selenium(http_code, title):
                print("  → Using browser due to possible block/protection…")
                status, title = selenium_check(driver, url)
            else:
                status = "WORKING"

            duration = round(time.time() - start, 2)

            print(f"  → FINAL: {status} | HTTP: {http_code} | Title: {title[:60]} | Time: {duration}s")

            results.append({
                "URL": url,
                "STATUS": status,
                "HTTP_CODE": http_code,
                "TITLE": title,
                "CHECKED_AT_UTC": timestamp,
                "TIME_SEC": duration
            })

            save_results(results)
    finally:
        # Always shut the browser down, even when the loop is interrupted or
        # raises, so no Firefox process is left behind.
        driver.quit()

    print(f"\n✓ Done → Results saved: {OUTPUT_FILE}")


# Entry point: run the link check when this code is executed directly.
if __name__ == "__main__":
    main()