In [None]:
# Install Chrome + Selenium
!apt-get update -qq
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -y -f install
!pip -q install --upgrade "selenium>=4.20.0" boto3



W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
(Reading database ... 126756 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (140.0.7339.185-1) over (140.0.7339.185-1) ...
Setting up google-chrome-stable (140.0.7339.185-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...


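## INSERT KEY
Fill in the AWS credentials below before running. Do not save or share the notebook with real keys in it; prefer Colab secrets or environment variables.
AWS_ACCESS_KEY_ID     = 
AWS_SECRET_ACCESS_KEY = 
AWS_SESSION_TOKEN     = ""  # only if using temporary creds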

In [None]:
# --- One cell to run everything in Colab ---
# --- Python code ---
import os, time, tempfile, shutil, socket, subprocess
from contextlib import suppress

from google.colab import drive
# Mount Google Drive at /content/drive; DOWNLOAD_DIR below lives under this
# mount point, so the mount has to happen before the directory is created.
drive.mount('/content/drive')

# Change this to where you want PDFs in Drive
DOWNLOAD_DIR = "/content/drive/MyDrive/municode_downloads"
# Create the target folder up front; exist_ok makes re-running the cell harmless.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, StaleElementReferenceException,
    InvalidSessionIdException, WebDriverException
)

# ---------- Helpers ----------
# Fresh temporary directory created once per run of this cell. make_driver()
# hands it to Chrome as --user-data-dir and cleanup_driver() deletes it.
TMP_PROFILE = tempfile.mkdtemp(prefix="selenium-profile-")

def _free_port():
    s = socket.socket()
    s.bind(("", 0))
    port = s.getsockname()[1]
    s.close()
    return port

def make_driver(headless=True):
    """Create a Chrome WebDriver that saves downloads into DOWNLOAD_DIR.

    Chrome is started from /usr/bin/google-chrome with TMP_PROFILE as its
    user-data directory and a remote-debugging port obtained from
    _free_port().

    Args:
        headless: when true, pass --headless=new to Chrome.

    Returns:
        The webdriver.Chrome instance built from these options.
    """
    opts = webdriver.ChromeOptions()
    opts.binary_location = "/usr/bin/google-chrome"

    # Profile-related switches come first, then the optional headless switch,
    # then the remaining switches -- the same order they were always passed in.
    flags = [
        f"--user-data-dir={TMP_PROFILE}",
        "--profile-directory=Default",
        "--no-first-run",
        "--no-default-browser-check",
    ]
    if headless:
        flags.append("--headless=new")
    flags.extend([
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        f"--remote-debugging-port={_free_port()}",
    ])
    for flag in flags:
        opts.add_argument(flag)

    # Send downloads straight to the Drive folder without a save prompt.
    download_prefs = {
        "download.default_directory": os.path.abspath(DOWNLOAD_DIR),
        "download.prompt_for_download": False,
        "safebrowsing.enabled": True,
    }
    opts.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=opts)

def cleanup_driver(driver):
    """Quit the driver, kill leftover browser processes, delete the profile dir.

    Every step is best-effort: errors are suppressed so that cleanup never
    raises, which matters because this runs from a ``finally`` block.

    Args:
        driver: object exposing ``quit()``. If it has a ``_tmp_profile_dir``
            attribute, that directory is removed; otherwise the module-level
            TMP_PROFILE is used.
    """
    with suppress(Exception):
        driver.quit()
    tmpdir = getattr(driver, "_tmp_profile_dir", TMP_PROFILE)

    # pkill is called directly with an argument list, so the profile path is
    # never parsed by a shell. Each call has its own suppress block: if one
    # fails (e.g. pkill is not installed) the remaining ones still run.
    # check=False because pkill exits non-zero when nothing matched.
    for pattern in (tmpdir, "chrome", "chromedriver"):
        with suppress(Exception):
            subprocess.run(["pkill", "-f", pattern], check=False)

    if not os.path.isdir(tmpdir):
        return

    # Retry the delete for up to 20 attempts, pausing 0.3 s between them.
    for _ in range(20):
        try:
            shutil.rmtree(tmpdir)
            return
        except Exception:
            time.sleep(0.3)

    # All attempts failed: say so instead of giving up silently.
    print(f"Warning: could not remove temporary profile directory {tmpdir}")
def is_county_level(name: str, url: str) -> bool:
    """Tell whether a listing looks like a county-level entry.

    The check is case-insensitive. A listing matches when its name contains
    " county", " parish" or " borough" (with the leading space), or when its
    URL contains "/county", "/parish" or "/borough". Missing (None or empty)
    values are treated as empty strings.

    Args:
        name: link text of the listing.
        url: link target of the listing.

    Returns:
        True if either the name or the URL matches, otherwise False.
    """
    lowered_name = name.lower() if name else ""
    lowered_url = url.lower() if url else ""

    for term in (" county", " parish", " borough"):
        if term in lowered_name:
            return True
    for fragment in ("/county", "/parish", "/borough"):
        if fragment in lowered_url:
            return True
    return False

# ---------- Scraper ----------
# Walk the Georgia index on library.municode.com, open every county-like
# entry, click each visible PDF download button, then wait for Chrome to
# finish writing the files into DOWNLOAD_DIR. URLs that could not be
# processed are written to failed_urls.txt in the same folder.
driver = None
try:
    driver = make_driver(headless=True)
    # cleanup_driver() reads this attribute to know which directory to delete.
    driver._tmp_profile_dir = TMP_PROFILE
    wait = WebDriverWait(driver, 20)

    print("Navigating to the main Georgia page...")
    driver.get("https://library.municode.com/ga")

    # Reduce the index links to plain (name, href) tuples before navigating
    # away from the index page.
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li a.index-link")))
    links = [(el.text.strip(), el.get_attribute("href")) for el in elements if el.get_attribute("href")]
    county_links = [(n,u) for (n,u) in links if is_county_level(n,u)]
    print(f"Found {len(links)} total; {len(county_links)} county-like entries.")

    failed_urls = []
    for idx, (name, url) in enumerate(county_links):
        print(f"\n--- Processing: {name} | {url} ---")
        try:
            driver.get(url)
            try:
                all_buttons = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "button.btn-pdf-download"))
                )
                visible_buttons = [b for b in all_buttons if b.is_displayed()]
            except TimeoutException:
                # No download button appeared within the wait timeout.
                visible_buttons = []
            print(f"  Found {len(visible_buttons)} visible download button(s).")
            if not visible_buttons:
                continue

            for i in range(len(visible_buttons)):
                # Look the buttons up again on every pass instead of reusing
                # visible_buttons (presumably because the page can re-render
                # after a click -- confirm against the site).
                current_visible = [b for b in driver.find_elements(By.CSS_SELECTOR, "button.btn-pdf-download") if b.is_displayed()]
                if i >= len(current_visible):
                    break
                print(f"  -> Clicking visible button #{i+1}...")
                current_visible[i].click()
                modal_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.get-pdf-download-btn")))
                modal_btn.click()
                print("     Download triggered.")
                # Fixed pause after each triggered download before moving on.
                time.sleep(5)

        except (TimeoutException, StaleElementReferenceException) as e:
            print(f"  -> Skipping ({e})")
            failed_urls.append(url)
        except InvalidSessionIdException:
            # The browser session is gone, so no later page can be loaded.
            # Record this URL and every URL that was not attempted yet, so
            # failed_urls.txt is a complete list of what still has to be run.
            print("  -> SESSION DIED.")
            unprocessed = [u for _, u in county_links[idx:]]
            failed_urls.extend(unprocessed)
            print(f"     {len(unprocessed)} URL(s) left unprocessed and recorded as failed.")
            break
        except WebDriverException as e:
            print(f"  -> WebDriver error: {e}")
            failed_urls.append(url)
        except Exception as e:
            print(f"  -> Unknown error: {e}")
            failed_urls.append(url)

    print("\n--- Waiting for remaining downloads ---")
    start = time.time()
    # Files ending in .crdownload are Chrome's partial downloads; poll every
    # 5 s until none are left, giving up after 900 s.
    while any(f.endswith(".crdownload") for f in os.listdir(DOWNLOAD_DIR)):
        if time.time() - start > 900:
            print("!!! Timeout waiting for downloads")
            break
        time.sleep(5)

    if failed_urls:
        failed_path = os.path.join(DOWNLOAD_DIR, "failed_urls.txt")
        with open(failed_path,"w") as f: f.write("\n".join(failed_urls))
        print(f"Saved {len(failed_urls)} failed URLs to {failed_path}")

finally:
    # Always tear the browser down, including when an error escaped above.
    if driver:
        cleanup_driver(driver)

print(f"All files saved in Google Drive: {DOWNLOAD_DIR}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Navigating to the main Georgia page...
Found 395 total; 127 county-like entries.

--- Processing: Athens-Clarke County | https://library.municode.com/ga/athens-clarke_county ---
  Found 0 visible download button(s).

--- Processing: Augusta-Richmond County | https://library.municode.com/ga/augusta-richmond_county ---
  Found 1 visible download button(s).
  -> Clicking visible button #1...
     Download triggered.

--- Processing: Baldwin County | https://library.municode.com/ga/baldwin_county ---
  Found 1 visible download button(s).
  -> Clicking visible button #1...
     Download triggered.

--- Processing: Banks County | https://library.municode.com/ga/banks_county ---
  Found 1 visible download button(s).
  -> Clicking visible button #1...
     Download triggered.

--- Processing: Barrow County | https://library.municode.com/ga/barrow_county ---
  Found 1