In [1]:
# Cell 1: Imports & .env loading
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
import os
import random

# Pull the variables defined in .env into the process environment
load_dotenv()

# Connection string for the Azure storage that holds the filings data.
# NOTE(review): despite the variable name, no container called "a5filings" shows
# up in service.list_containers() -- presumably it is the account name; confirm.
A5FILINGS_CONN_STR = os.environ.get("A5FILINGS_CONN_STR")
if A5FILINGS_CONN_STR is None or A5FILINGS_CONN_STR == "":
    # Fail here rather than with a less obvious SDK error further down
    raise RuntimeError("Missing A5FILINGS_CONN_STR—make sure .env is present and loaded")

# Account-level client; later cells derive container and blob clients from it
service = BlobServiceClient.from_connection_string(A5FILINGS_CONN_STR)


In [2]:
# Sanity check: show the names of the containers this connection can see
print(list(container.name for container in service.list_containers()))


['analyses', 'originals', 'processed']


In [3]:
# Cell 2: Helper to sample & download .htm files from a blob container (e.g. "processed")
def download_random_htm_files(
    service: "BlobServiceClient",
    container_name: str,
    prefix: str,
    k: int,
    local_dir: str,
    seed=None,
):
    """
    Download a random sample of .htm blobs from one container into a local folder.

    - service: BlobServiceClient used to obtain the container client
    - container_name: container to read from, e.g. "processed"
    - prefix: blob-name prefix that restricts the listing; "" lists the whole container
    - k: number of random .htm files to download (capped at the number found)
    - local_dir: local path to save them into (created if it does not exist)
    - seed: optional seed for a reproducible sample; when None the sample is
      drawn from the module-level `random` state, so `random.seed(...)` in the
      calling cell is honoured

    Each blob is written to local_dir under its blob name with '/' replaced by
    '_'. Progress is printed; nothing is returned.

    Raises ValueError if k is negative, and FileNotFoundError if no blob whose
    name ends in .htm (case-insensitive) is found under the prefix.
    """
    # Reject a bad k before paying for a full blob listing
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    container_client = service.get_container_client(container_name)

    # 1. Enumerate all .htm blobs under prefix
    print(f"Listing blobs under '{container_name}/{prefix}' …")
    all_blobs = container_client.list_blobs(name_starts_with=prefix)
    # Sorted so that a seeded draw does not depend on the order of the listing
    htm_paths = sorted(b.name for b in all_blobs if b.name.lower().endswith('.htm'))
    if not htm_paths:
        raise FileNotFoundError(f"No .htm files found under '{prefix}' in container '{container_name}'")
    print(f"Found {len(htm_paths)} .htm files.")

    # 2. Pick k at random (a private generator when seeded, the global one otherwise)
    rng = random if seed is None else random.Random(seed)
    sampled = rng.sample(htm_paths, min(k, len(htm_paths)))
    print(f"Downloading {len(sampled)} randomly selected files…")

    # 3. Ensure local directory exists
    os.makedirs(local_dir, exist_ok=True)

    # 4. Download each one
    for blob_path in sampled:
        blob_client = container_client.get_blob_client(blob_path)
        data = blob_client.download_blob().readall()

        # Flatten the blob path so every file lands directly in local_dir.
        # NOTE(review): two blob names that differ only by '/' vs '_' would map
        # to the same local file and overwrite each other.
        fname = blob_path.replace('/', '_')
        out_path = os.path.join(local_dir, fname)
        with open(out_path, 'wb') as f:
            f.write(data)
        print(f"  • Saved {blob_path} → {out_path}")

    print("Done.")


In [None]:
# Cell 3: Run the sampler
CONTAINER_NAME = "processed"          # this container actually exists (see the list_containers() output)
PREFIX         = ""                   # root of that container
K              = 24                   # number of .htm files to download
LOCAL_DIR      = "../data/raw/a5filings_test"
SEED           = 42                   # fixed so a re-run picks the same files; None = fresh sample each run

# The helper samples via the `random` module, so seeding the global generator
# here makes Restart & Run All download the same set of files.
random.seed(SEED)

download_random_htm_files(
    service=service,
    container_name=CONTAINER_NAME,
    prefix=PREFIX,
    k=K,
    local_dir=LOCAL_DIR
)

Listing blobs under 'processed/' …
Found 943 .htm files.
Downloading 24 randomly selected files…
  • Saved 1683471/0000894189-25-001902/glaciersharessummaryprospe.htm → ./data/raw/a5filings_test/1683471_0000894189-25-001902_glaciersharessummaryprospe.htm
  • Saved 1924868/0001999371-25-001875/zipp-497k_022425.htm → ./data/raw/a5filings_test/1924868_0001999371-25-001875_zipp-497k_022425.htm
  • Saved 1924868/0001999371-25-000042/wdte-497k_010225.htm → ./data/raw/a5filings_test/1924868_0001999371-25-000042_wdte-497k_010225.htm
  • Saved 1587982/0001213900-25-038394/ea0238218-04_497k.htm → ./data/raw/a5filings_test/1587982_0001213900-25-038394_ea0238218-04_497k.htm
  • Saved 1689873/0001641172-25-008520/form497k.htm → ./data/raw/a5filings_test/1689873_0001641172-25-008520_form497k.htm
  • Saved 1924868/0001999371-25-006884/rsst-497k_053025.htm → ./data/raw/a5filings_test/1924868_0001999371-25-006884_rsst-497k_053025.htm
  • Saved 1689873/0001641172-25-008505/form497k.htm → ./data/raw/a5fi