<a href="https://colab.research.google.com/github/sondreskarsten/Bard/blob/main/Download_PDFs_from_Brreg_no.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://gemini.google.com/app/83be1bdb73e13fa9

In [1]:
!pip install playwright
!playwright install --with-deps chromium

Collecting playwright
  Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl (45.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.52.0 pyee-13.0.0
Installing dependencies...
Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubun

In [2]:
# Step 1: Run these installations in separate Colab cells (only need to run once per session)
#!pip install playwright nest_asyncio
#!playwright install --with-deps chromium

# Step 2: Import libraries and apply async patch
import asyncio
import os
import nest_asyncio
from playwright.async_api import async_playwright, expect, Playwright, Page, Browser

nest_asyncio.apply()

# Step 3: Define the main automation function
async def download_brreg_reports(
    entity_id: str = "964163006",
    years_to_download=None,
    download_dir: str = "/content/brreg_downloads",
):
    """
    Download annual reports (aarsregnskap) for one entity from brreg.no.

    Opens the entity page in headless Chromium, clicks the
    'Vis alle opplysninger' button, and then clicks one download link per
    year, saving each file into ``download_dir``.

    Args:
        entity_id: Organisation number used in the page URL and in the
            ``data-testid`` of each download link.
        years_to_download: Iterable of years to fetch. The first entry is
            used to detect that the report links have been revealed, so it
            should be a year that exists on the page. When ``None``, the
            years are read from the page by collecting every element whose
            ``data-testid`` starts with ``download-aarsregnskap-<entity_id>-``.
            NOTE(review): the original hard-coded year list was missing from
            the source; discovery assumes every such test id ends in a year
            -- confirm against the live page.
        download_dir: Directory the files are saved into; created if missing.
            The default is a writable location in Colab.

    Returns:
        List of paths that were saved successfully. It is empty when no years
        were requested, and contains whatever was saved before the failure
        when the automation aborts part-way.

    Errors raised during the automation are printed together with a
    traceback rather than propagated, so a notebook cell finishes either way.
    """
    saved_paths = []

    if years_to_download is not None:
        # Materialise once so generators can be indexed and iterated below.
        years_to_download = list(years_to_download)
        if not years_to_download:
            print("No years requested; nothing to download.")
            return saved_paths

    test_id_prefix = f"download-aarsregnskap-{entity_id}-"
    url = f"https://virksomhet.brreg.no/nb/oppslag/enheter/{entity_id}"

    try:
        async with async_playwright() as playwright:
            # Launch headless Chromium browser
            browser = await playwright.chromium.launch(headless=True)
            print("Chromium browser launched successfully (headless).")
            try:
                # Create a new page (tab)
                page = await browser.new_page()
                print("New page created.")

                # Ensure download directory exists
                os.makedirs(download_dir, exist_ok=True)
                print(f"Download directory ensured at: {download_dir}")

                # Navigate to the page
                print(f"Navigating to {url}...")
                await page.goto(url, timeout=60000)  # Increased timeout for initial load
                print("Navigation complete.")
                await page.wait_for_load_state('domcontentloaded')  # Wait for DOM ready

                # Locate and click the "Vis alle opplysninger" button
                print("Locating and clicking 'Vis alle opplysninger' button...")
                reveal_button_locator = page.get_by_role('button', name='Vis alle opplysninger')
                await expect(reveal_button_locator).to_be_visible(timeout=15000)
                await reveal_button_locator.click()
                print("Clicked 'Vis alle opplysninger'.")

                # Wait until at least one report link has been revealed
                print("Waiting for content reveal...")
                if years_to_download is None:
                    report_links = page.locator(f'[data-testid^="{test_id_prefix}"]')
                    await expect(report_links.first).to_be_visible(timeout=15000)
                    years_to_download = await _discover_report_years(report_links, test_id_prefix)
                    print(f"Discovered years on page: {years_to_download}")
                else:
                    first_link_locator = page.get_by_test_id(f"{test_id_prefix}{years_to_download[0]}")
                    await expect(first_link_locator).to_be_visible(timeout=15000)
                print("Content revealed.")

                # Loop through the years and download files
                for year in years_to_download:
                    saved_path = await _download_report_for_year(page, test_id_prefix, year, download_dir)
                    if saved_path is not None:
                        saved_paths.append(saved_path)

                    # Small pause between downloads (optional, adjust if needed)
                    await page.wait_for_timeout(1000)

                print("\nAll specified years processed.")
            finally:
                # Close the browser while the Playwright driver is still
                # running; once the `async with` block exits the driver is
                # stopped and the browser can no longer be closed cleanly.
                await browser.close()
                print("Browser closed.")

    except Exception as e:
        print(f"\nAn error occurred during the automation process: {e}")
        import traceback
        traceback.print_exc()

    return saved_paths


async def _discover_report_years(report_links, test_id_prefix):
    """Return the distinct test-id suffixes of ``report_links``, in page order."""
    test_ids = await report_links.evaluate_all(
        "elements => elements.map(element => element.getAttribute('data-testid'))"
    )
    suffixes = (test_id[len(test_id_prefix):] for test_id in test_ids if test_id)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(suffix for suffix in suffixes if suffix))


async def _download_report_for_year(page, test_id_prefix, year, download_dir):
    """Click the download link for ``year`` and save the file; return its path or None."""
    print(f"\n--- Processing year: {year} ---")
    test_id = f"{test_id_prefix}{year}"
    link_locator = page.get_by_test_id(test_id)

    # Verify link is visible before attempting download
    await expect(link_locator).to_be_visible(timeout=5000)
    print(f"Located download link for {year} using data-testid: {test_id}")

    # Start waiting for download and click the link
    print(f"Initiating download for {year}...")
    async with page.expect_download(timeout=60000) as download_info:  # Wait up to 60s for download start
        await link_locator.click()

    download = await download_info.value
    suggested_filename = download.suggested_filename
    print(f"Download started. Suggested filename: {suggested_filename}")

    save_path = os.path.join(
        download_dir,
        f"brreg_arsregnskap_{year}_{_safe_report_filename(suggested_filename, year)}",
    )

    try:
        await download.save_as(save_path)
    except Exception as e:
        print(f"Error saving download for year {year}: {e}")
        # Attempt to log temporary path if saving failed
        try:
            temp_path = await download.path()
            print(f"Download temporary path was: {temp_path}")
        except Exception as path_e:
            print(f"Could not retrieve temporary path: {path_e}")
        return None

    print(f"Successfully saved {year} report to: {save_path}")
    # Optional: Verify file size
    if os.path.exists(save_path):
        print(f"File size: {os.path.getsize(save_path)} bytes")
    else:
        print(f"Warning: File not found after saving at {save_path}")
    return save_path


def _safe_report_filename(suggested_filename, year):
    """Keep only alphanumerics, '.', '_' and '-'; fall back to a per-year name if nothing is left."""
    kept = "".join(ch for ch in suggested_filename if ch.isalnum() or ch in "._-")
    return kept or f"brreg_report_{year}.pdf"

# Step 4: Kick off the download when this cell/file is run as the entry point
if __name__ == "__main__":
    print("Starting Brreg report download automation...")
    # Build the coroutine first, then hand it to asyncio.run(); in Colab this
    # relies on the nest_asyncio.apply() call made earlier in the file.
    report_job = download_brreg_reports()
    asyncio.run(report_job)
    print("Automation script finished.")

Browser launched successfully.
Browser closed.
