In [0]:
%pip install beautifulsoup4==4.14.3

In [0]:
%run ../get_user

In [0]:
# Look up the e-mail address of the user running this notebook and derive
# the username from it via the helper loaded by the %run cell above.
current_user_rows = spark.sql("SELECT current_user()").collect()
user_email = current_user_rows[0][0]
username = get_username_from_email(user_email)
print(username)

In [0]:
# Notebook-wide configuration.
# NOTE(review): dataset_bucket_name is not referenced by any cell visible in
# this notebook; presumably the bucket holding the source datasets - confirm.
dataset_bucket_name = "revodata-databricks-geospatial"
# Catalog and schema used by the cells below to build the per-user volume
# names, the source table name and the output volume path.
catalog_name = "geospatial"
schema_name = "zoetermeer"

In [0]:
# Drop this user's two volumes (if they exist) so the notebook recreates them below.
for volume_prefix in ("vaststellingsbesluit", "verwijzing"):
    spark.sql(f"DROP VOLUME IF EXISTS {catalog_name}.{schema_name}.{volume_prefix}_{username}")

In [0]:
# Create this user's two volumes unless they already exist.
for volume_prefix in ("vaststellingsbesluit", "verwijzing"):
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.{volume_prefix}_{username}")

In [0]:
import hashlib
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


# Silence urllib3 warnings: the downloader below creates its session with TLS
# certificate verification switched off, for which urllib3 emits an
# InsecureRequestWarning on HTTPS requests.
requests.packages.urllib3.disable_warnings()


class PDFDownloader:
    """Downloads PDFs from landing pages listed in a Spark DataFrame.

    For every row of ``df`` the landing page stored in ``url_column`` is
    fetched, every ``<a href>`` pointing at a PDF is collected, and each PDF
    is written to ``<volume_path>/<fid>/<filename>``. Files that already
    exist on disk are not downloaded again.

    Args:
        df: Spark DataFrame containing a ``fid`` column and the column named
            by ``url_column``.
        volume_path: Root directory the PDFs are written to.
        url_column: Name of the column holding the landing-page URL.
        verify_ssl: Whether TLS certificates are verified. Defaults to
            ``False`` to keep the historical behaviour of this class.
    """

    def __init__(self, df, volume_path: str, url_column: str = "base_url",
                 verify_ssl: bool = False):
        self.df = df
        self.volume_path = volume_path
        self.url_column = url_column
        self.verify_ssl = verify_ssl
        self.session = self._create_session()

    # Return annotation is quoted so it is not evaluated at definition time.
    def _create_session(self) -> "requests.Session":
        """Build the HTTP session shared by all requests of this downloader."""
        session = requests.Session()
        # SECURITY: with verify_ssl=False (the default) certificates are not
        # checked, so responses could be tampered with in transit. Pass
        # verify_ssl=True whenever the target hosts have valid certificates.
        session.verify = self.verify_ssl
        session.headers.update({"User-Agent": "Mozilla/5.0 (compatible)"})
        return session

    # ----------------------------------------------------------------------
    # Fetch landing page & parse PDF links
    # ----------------------------------------------------------------------
    def _fetch_pdf_links_from_landing_page(self, landing_url: str) -> List[str]:
        """Download the landing page and return the PDF URLs it links to.

        Returns an empty list (after printing the error) when the page cannot
        be fetched or answers with an HTTP error status.
        """
        try:
            resp = self.session.get(landing_url, timeout=20)
            resp.raise_for_status()
        except Exception as e:
            print(f"Error fetching landing page {landing_url}: {e}")
            return []

        return self._extract_pdf_urls_from_html(resp.text, landing_url)

    @staticmethod
    def _is_pdf_href(href: str) -> bool:
        """Return True when ``href`` points at a PDF.

        Accepts hrefs whose raw text ends in ``.pdf`` as well as hrefs whose
        URL *path* ends in ``.pdf``, so links carrying a query string or a
        fragment (``plan.pdf?v=2``, ``plan.pdf#page=3``) are recognised too.
        """
        lowered = href.lower()
        if lowered.endswith(".pdf"):
            return True
        try:
            return urlparse(lowered).path.endswith(".pdf")
        except ValueError:
            # urlparse rejects some malformed URLs; such a link is not a PDF link.
            return False

    def _extract_pdf_urls_from_html(self, html: str, base_url: str) -> List[str]:
        """Return the distinct absolute PDF URLs linked from ``html``.

        Relative links are resolved against ``base_url``. The result keeps
        the order in which the links first appear in the document.
        """
        soup = BeautifulSoup(html, "html.parser")
        # A dict is used as an insertion-ordered set.
        pdf_urls = {}

        for link in soup.find_all("a", href=True):
            href = link["href"].strip()
            if not self._is_pdf_href(href):
                continue
            try:
                pdf_urls[urljoin(base_url, href)] = None
            except ValueError:
                # A malformed link must not abort processing of the whole page.
                continue

        return list(pdf_urls)

    # ----------------------------------------------------------------------

    def _get_filename(self, pdf_url: str) -> str:
        """Return the file name a PDF URL is stored under.

        Uses the last path segment when it ends in ``.pdf``; otherwise falls
        back to ``doc_<digest>.pdf``. The digest is derived with SHA-256
        rather than the built-in ``hash()``, whose value for strings changes
        between interpreter processes, so the same URL maps to the same file
        name on every run and the already-exists check keeps working.
        """
        filename = os.path.basename(urlparse(pdf_url).path)
        if not filename or not filename.lower().endswith(".pdf"):
            digest = hashlib.sha256(pdf_url.encode("utf-8")).hexdigest()[:12]
            filename = f"doc_{digest}.pdf"
        return filename

    def _download_pdf(self, pdf_url: str, fid: str):
        """Download one PDF into ``<volume_path>/<fid>/``.

        Returns:
            ``(success, message)``. ``success`` is True when the file was
            downloaded or already existed, False on any error.
        """
        filename = self._get_filename(pdf_url)
        folder = os.path.join(self.volume_path, str(fid))
        file_path = os.path.join(folder, filename)

        try:
            os.makedirs(folder, exist_ok=True)

            # Skip if file already exists
            if os.path.exists(file_path):
                return True, f"Already exists: {filename}"

            resp = self.session.get(pdf_url, timeout=30)
            resp.raise_for_status()

            with open(file_path, "wb") as f:
                f.write(resp.content)

            return True, f"Downloaded: {filename}"
        except Exception as e:
            return False, f"Error downloading {filename}: {str(e)}"

    # ----------------------------------------------------------------------
    # Process one landing page -> extract all PDFs -> download them
    # ----------------------------------------------------------------------
    def _process_record(self, args: Tuple):
        """Handle one DataFrame row.

        Args:
            args: ``(row, index, total)`` where ``index``/``total`` are only
                used for the progress message.

        Returns:
            A list of ``(fid, success, message)`` tuples, one per PDF, or a
            single failure tuple when the row has no URL or no PDF is found.
        """
        row, index, total = args
        landing_page_url = getattr(row, self.url_column)
        fid = row.fid

        if not landing_page_url:
            return [(fid, False, "Missing landing page URL")]

        pdf_urls = self._fetch_pdf_links_from_landing_page(landing_page_url)

        if not pdf_urls:
            return [(fid, False, "No PDFs found on landing page")]

        results = []
        for pdf_url in pdf_urls:
            success, msg = self._download_pdf(pdf_url, fid)
            results.append((fid, success, msg))
            print(f"{index}/{total} - FID={fid}: {msg}")

        return results

    # ----------------------------------------------------------------------

    def run(self, max_workers: int = 10):
        """Download the PDFs of every landing page in the DataFrame.

        Rows with a null URL or null ``fid`` are ignored. Landing pages are
        processed concurrently by up to ``max_workers`` threads and a summary
        is printed at the end. The HTTP session is closed when the method
        exits, including on the early return and on errors.
        """
        try:
            # Collect rows from given DataFrame
            records = (
                self.df.select(self.url_column, "fid")
                .filter(self.df[self.url_column].isNotNull())
                .filter(self.df.fid.isNotNull())
                .collect()
            )

            print(f"Found {len(records)} landing pages")

            if not records:
                return

            os.makedirs(self.volume_path, exist_ok=True)

            success_count = 0
            error_count = 0
            start = time.time()

            # ThreadPoolExecutor rejects max_workers <= 0, so clamp to at least 1.
            workers = max(1, min(max_workers, len(records)))

            with ThreadPoolExecutor(max_workers=workers) as executor:
                futures = [
                    executor.submit(self._process_record, (row, i, len(records)))
                    for i, row in enumerate(records, start=1)
                ]

                for future in as_completed(futures):
                    try:
                        for _fid, ok, _msg in future.result():
                            if ok:
                                success_count += 1
                            else:
                                error_count += 1
                    except Exception as e:
                        error_count += 1
                        print("Unexpected error:", e)

            print("\n========== SUMMARY ==========")
            print("Successful downloads:", success_count)
            print("Failed downloads:", error_count)
            print("Output folder:", self.volume_path)
            print("Total time:", round(time.time() - start, 1), "seconds")
            print("=============================")
        finally:
            self.session.close()

In [0]:
from pyspark.sql.functions import split, regexp_replace, expr, col, array_distinct, element_at

SOURCE_TABLE: str = f"{catalog_name}.{schema_name}.bestemmingsplangebied_{username}"

df = spark.read.table(SOURCE_TABLE)

# Spark SQL expression: split verwijzingNaarTekst on commas and replace the
# trailing path segment of every entry with '/'.
directory_urls = expr("""
            transform(
                split(verwijzingNaarTekst, ','),   -- split into list
                x -> regexp_replace(x, '/[^/]+$', '/')  -- remove last segment (the HTML file)
            )
        """)

# base_url is the first of the de-duplicated entries (element_at is 1-based).
first_distinct_directory_url = element_at(array_distinct(directory_urls), 1)
df_with_base_url = df.withColumn("base_url", first_distinct_directory_url)

display(df_with_base_url)

In [0]:
VOLUME_PATH = f"/Volumes/{catalog_name}/{schema_name}/verwijzing_{username}"
# No `main` function is defined in this notebook, so the downloader class is
# instantiated and run directly; the arguments are the ones previously passed
# to `main` and match PDFDownloader's constructor.
PDFDownloader(df_with_base_url, VOLUME_PATH, url_column="base_url").run()