In [None]:
# --- Colab: Upload all PDFs from a Drive folder to S3 by county (GA) ---

!pip -q install boto3

import os, re, unicodedata, pathlib
from typing import Optional, Tuple
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

# 1) Mount Google Drive (you'll be prompted to authorize)
from google.colab import drive
drive.mount("/content/drive")

# 2) === EDIT THESE ===
# When True, upload_pdf only prints what it would upload; nothing is sent to S3.
DRY_RUN = False

# Google Drive folder that contains your PDF files
DRIVE_FOLDER = "/content/drive/MyDrive/municode_downloads_GA"  # <-- change this

# AWS creds (TEMPORARY KEYS RECOMMENDED in Colab)
# NOTE: never save or share this notebook with real keys filled in.
AWS_REGION = "us-east-1"
_AWS_CREDENTIALS = {
    "AWS_ACCESS_KEY_ID": "",      # <-- put your key
    "AWS_SECRET_ACCESS_KEY": "",  # <-- put your secret
    "AWS_SESSION_TOKEN": "",      # <-- only needed with temporary (STS) keys
}
# Export only the values that were actually filled in, so blank placeholders
# never overwrite credentials that are already present in the environment.
for _env_name, _env_value in _AWS_CREDENTIALS.items():
    if _env_value:
        os.environ[_env_name] = _env_value

# S3 target info
STATE_ABBR  = "ga"  # canonical two-letter code used in the prefix and in object names
BUCKET      = "berkeley-capstone-unbarred-2.0-data"
# Derived from STATE_ABBR so the prefix and the object names cannot drift apart.
BASE_PREFIX = f"env=prod/zone=raw/state={STATE_ABBR}"

# ================== Helpers (GA version) ==================

def slugify(s: str) -> str:
    """
    Turn arbitrary text into a lowercase, hyphen-separated ASCII slug.

    Accented letters are reduced to their base letter (NFKD decomposition
    with the combining marks dropped). Every run of characters outside
    [a-z0-9] acts as a separator, so the result never has leading,
    trailing, or doubled hyphens. Returns '' when nothing usable remains.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    without_marks = "".join(
        filter(lambda ch: not unicodedata.combining(ch), decomposed)
    )
    # Keeping only the alphanumeric runs and joining them with '-' is the same
    # as replacing separator runs with '-' and trimming the edges.
    tokens = re.findall(r"[a-z0-9]+", without_marks.lower())
    return "-".join(tokens)

def parse_county_from_filename(filename: str) -> Optional[str]:
    """
    Extract a county slug from a Georgia ordinance PDF filename.

    Accepts flexible GA filenames, e.g.:
      - "Fulton County, GA Code of Ordinances.pdf"
      - "DeKalb County, Georgia.PDF"
      - "St. Marys County, Ga.pdf"
      - "Macon-Bibb County, GA Code of Ordinances, Part A.pdf"

    Only the final path component is inspected, so a full path may be passed.

    Returns the slug with a '-county' suffix ('fulton-county',
    'dekalb-county', ...), or None when the name does not follow the
    '<name> County[,] <GA|Ga.|Georgia> ...' shape or when the county part
    contains nothing that survives slugification.
    """
    name = pathlib.Path(filename).name
    base = re.sub(r"\.[Pp][Dd][Ff]$", "", name).strip()

    # Normalize small punctuation/spacing variants
    base = base.replace("_", " ")
    base = re.sub(r"\s{2,}", " ", base)

    # GA variants after 'County' (case-insensitive), with ANY trailing text allowed
    ga_suffix = r"(?:GA|Ga\.?|Georgia)"

    # Capture everything up to 'County' as the county name; require the state marker AFTER 'County'
    m = re.search(
        rf"""(?ix)
            ^\s*
            (?P<county>.+?)      # county words (e.g., 'Macon-Bibb', 'DeKalb', 'St. Marys')
            \s*County\b          # literal 'County'
            \s*,?\s*             # optional comma/space
            {ga_suffix}\b        # one of: GA | Ga | Ga. | Georgia
            .*                   # allow any trailing text (e.g., 'Code of Ordinances')
            $""",
        base,
    )
    if not m:
        return None

    county_words = m.group("county").strip()
    if not county_words:
        return None

    # A county part made only of punctuation/symbols slugifies to '', which
    # would otherwise produce the bogus slug '-county' (and a bogus S3 key).
    name_slug = slugify(county_words)
    if not name_slug:
        return None

    return name_slug + "-county"

def dest_key_for(county_slug: str) -> str:
    """
    Return the S3 object key under which a county's PDF is stored.

    Layout:      '<BASE_PREFIX>/county=<county-slug>/<object-name>'
    Object name: '<county-slug>-<state>.pdf', e.g. 'fulton-county-ga.pdf'
    """
    partition = f"county={county_slug}"
    object_name = f"{county_slug}-{STATE_ABBR}.pdf"
    return f"{BASE_PREFIX}/{partition}/{object_name}"

# ================== S3 client and upload loop (uses GA helpers) ==================
# botocore retry settings: "standard" mode with max_attempts=10.
cfg = Config(retries={"max_attempts": 10, "mode": "standard"})
s3  = boto3.client("s3", region_name=AWS_REGION, config=cfg)

def upload_pdf(local_path: str, key: str) -> bool:
    """
    Upload a local PDF to S3 with AES256 server-side encryption.

    When DRY_RUN is true, nothing is sent: the planned upload is printed and
    True is returned.

    Args:
        local_path: Path of the PDF on the local (Drive-mounted) filesystem.
        key: Destination object key inside BUCKET.

    Returns:
        True if the upload succeeded (or was a dry run), False if it failed.
        Failures are printed with an [ERROR] prefix rather than raised, so one
        bad file does not abort the rest of the batch.
    """
    # Imported locally so this function brings its own exception types:
    # upload_file() re-raises a failed transfer as S3UploadFailedError, which
    # is NOT a ClientError, and credential/endpoint problems surface as
    # BotoCoreError subclasses.
    from boto3.exceptions import S3UploadFailedError
    from botocore.exceptions import BotoCoreError, ClientError

    if DRY_RUN:
        print(f"[DRY-RUN] Would upload: {local_path} -> s3://{BUCKET}/{key}")
        return True

    try:
        s3.upload_file(
            Filename=local_path,
            Bucket=BUCKET,
            Key=key,
            ExtraArgs={"ServerSideEncryption": "AES256", "ContentType": "application/pdf"},
        )
    except (ClientError, BotoCoreError, S3UploadFailedError, OSError) as e:
        # OSError covers local read problems (e.g. a file that vanished from Drive).
        print(f"[ERROR] {local_path}: {e}")
        return False

    print(f"[OK] {local_path} -> s3://{BUCKET}/{key}")
    return True

# Find all PDFs under DRIVE_FOLDER (recursively), whatever the case of the
# extension (.pdf, .PDF, .Pdf, ...). Sorting makes the run order reproducible.
pdf_paths = sorted(
    str(p)
    for p in pathlib.Path(DRIVE_FOLDER).rglob("*")
    if p.is_file() and p.suffix.lower() == ".pdf"
)

if not pdf_paths:
    raise SystemExit(f"No PDFs found under: {DRIVE_FOLDER}")

print(f"Found {len(pdf_paths)} PDF(s) under {DRIVE_FOLDER}")

uploaded, skipped, failed = 0, 0, 0

# Pass 1: resolve every file to its S3 key, grouping files that share a key.
sources_by_key = {}
for local_path in pdf_paths:
    county_slug = parse_county_from_filename(local_path)
    if not county_slug:
        print(f"[SKIP] Filename doesn't look like '<County> County, GA(EORGIA) ... .pdf': {local_path}")
        skipped += 1
        continue
    sources_by_key.setdefault(dest_key_for(county_slug), []).append(local_path)

# Pass 2: upload one file per key. Several files for the same county (e.g.
# "Part A" / "Part B") map to the same object; uploading them all would
# silently overwrite each other. The last file in sorted order is the one
# uploaded (the same object a plain sequential run would leave in S3) and
# the others are reported instead of being lost without a trace.
for key, sources in sources_by_key.items():
    *shadowed, winner = sources
    for shadowed_path in shadowed:
        print(
            f"[SKIP] Key collision on s3://{BUCKET}/{key}: "
            f"{shadowed_path} not uploaded, superseded by {winner}"
        )
        skipped += 1

    if upload_pdf(winner, key):
        uploaded += 1
    else:
        failed += 1

print(f"\nDone. Uploaded: {uploaded}, Skipped: {skipped}, Failed: {failed}")
print(f"S3 base prefix: s3://{BUCKET}/{BASE_PREFIX}/")




[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m109.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Found 110 PDF(s) under /content/drive/MyDrive/municode_downloads_GA
[OK] /content/drive/MyDrive/municode_downloads_GA/Augusta-Richmond County, GA Code of Ordinances.pdf -> s3://berkeley-capstone-unbarred-2.0-data/env=prod/zone=raw/state=ga/county=augusta-richmond-county/augusta-richmond-county-ga.pdf
[OK] /content/drive/MyDrive/municode_downloads_GA/Baldwin County, GA Code of Ordinances.pdf -> s3://berkeley-capstone-unbarred-2.0-data/env=prod/zone=raw/state=ga/county=baldwin-county/baldwin-county-ga.pdf
[OK] /content/drive/MyDrive/municode_downloads_GA/Banks County, GA Code of Ordinances.pdf -> s3://ber