In [1]:
# Basic notebook to fetch the mhtml of a page and add it to example data

In [1]:
import json
import os
import urllib.parse
import uuid

from playwright.async_api import async_playwright

In [2]:
# Setup a playwright instance. We need to do this to access the dev tool functions
p = await async_playwright().__aenter__()
browser = await p.chromium.launch(headless=False)
context = await browser.new_context(viewport={"width": 1280, "height": 1280})
page = await context.new_page()
client = await context.new_cdp_session(page)
page.set_default_navigation_timeout(50000)

In [3]:
# Go to your desired URL
desired_url = "https://gpssa.gov.ae/en/Pages/LawsandRegulations.aspx#/"
await page.goto(desired_url)

<Response url='https://gpssa.gov.ae/en/Pages/LawsandRegulations.aspx' request=<Request url='https://gpssa.gov.ae/en/Pages/LawsandRegulations.aspx' method='GET'>>

In [12]:
def get_file_path(url: str) -> str:
    parsed_url = urllib.parse.urlparse(url)
    domain = parsed_url.netloc
    domain = domain.replace("www.", "").replace(".com", "")
    return f"./{domain}.mhtml"

In [49]:
# Capture page content as MHTML
result = await client.send("Page.captureSnapshot", {"format": "mhtml"})
mhtml = result["data"]

# Create example values
# values = {
#     "id": str(uuid.uuid4()),
#     "url": desired_url,
#     "source": "mhtml",
#     "domain": "UPDATE",
#     "subdomain": "UPDATE",
#     "type": "fetch",
#     "goal": {},
#     "evals": [
#       {
#         "type": "json_match",
#         "expected": {}
#       }
#     ]
#   }
values = {
    "id": str(uuid.uuid4()),
    "url": page.url,
    "source": "mhtml",
    "category": "government",
    "subcategory": "download",
    "type": "links_fetch",
    "goal": 'Fetch all of the links to the documents on the current page. Return as a list of objects of the following schema:\n{"name":\n\t{"type": "string", "description": "The name of the document. Note that may not always be the link text. Use the element that best captures the name"}\n, "url":\n\t{"type": "string"}\n}',
    "evals": [{"type": "json_match", "expected": []}],
}

# Write MHTML content to the specified file
folder_path = f"./static/{values['id']}"
os.makedirs(folder_path, exist_ok=True)
file_path = os.path.join(folder_path, "index.mhtml")
with open(file_path, "w") as f:
    f.write(mhtml)

# Print the path to the console
print(f"Page saved as MHTML at: {file_path}")
file_path = os.path.join(folder_path, "index.mhtml")
with open(file_path, "w") as f:
    f.write(mhtml)

# Print the path to the console
print(f"Page saved as MHTML at: {file_path}")

json_file_path = "./static/examples.json"
with open(json_file_path, "r") as json_file:
    data = json.load(json_file)

data.append(values)

with open(json_file_path, "w") as json_file:
    json.dump(data, json_file, indent=4)
    json_file.write("\n")
# print(json.dumps(data, indent=4))

Page saved as MHTML at: ./static/caa0a351-24a8-40c3-9c62-134d52003f49/index.mhtml
Page saved as MHTML at: ./static/caa0a351-24a8-40c3-9c62-134d52003f49/index.mhtml


In [40]:
import os
import shutil

"""
Utility to delete extra mhtml folders
"""
# Read in the examples.json and fetch all of the ids
ids = []
path = "./static/examples.json"
with open(path, "r") as json_file:
    data = json.load(json_file)
    ids += [item["id"] for item in data]
path = "./static/test_examples.json"
with open(path, "r") as json_file:
    data = json.load(json_file)
    ids += [item["id"] for item in data]

print(ids)
print(len(ids))

# Iterate through the folders
base_folder = "./static/"
for folder in os.listdir(base_folder):
    folder_path = os.path.join(base_folder, folder)
    if folder not in ids and os.path.isdir(folder_path):
        # If there is an id that is not present and the path is a directory, delete that folder
        shutil.rmtree(folder_path)

['c5e7ec4b-ad83-49a5-b056-a37ddf974a64', '8366209e-e5fb-4f64-85e0-79cd90985b59', '6152ba56-e62b-42d1-beec-6ecac1dfec35', 'd86e052f-3c14-4b4d-95e7-c26d59b771f7', '4310caaa-bf1b-4704-9362-dbf5b961972d', 'ad142573-4dcb-4c1e-9a4d-6fa1fa818026', '69a755f1-33db-4f77-9b8e-0e90cf7c7188', 'f8e6ac9d-584d-4321-9451-7ebe0af3b6cb', '766abf53-e055-4e4d-ac42-531e74606846', '9a10af79-08e2-4d73-a2a1-675bb97126bb', '77d6efb9-2581-4f5f-a681-1b94696d9044', '5db440aa-f155-4297-bae4-769c8c808acd', 'cc29229c-30f6-4a03-a4b6-2755ed0cca14', '6535b75f-be1e-4f68-aedd-f3ff167d98ca', '0880ea91-7c5e-415c-9d2f-00457f110ee8', 'a266e560-aa88-49fe-b3c0-2731163b1921', 'db0872df-fd27-4fff-bce8-3513f8e1164d', '574db794-8055-4fe6-bd16-5f9ee6eccbb4', 'a4e1ac68-35bc-44a6-9e5f-53785df53bf2', 'c0028644-9bfa-470e-9efd-d8473b42e313', '111ebd24-8617-48cb-b963-eb692bf1e53a', '270c2ebd-96c0-4bab-a24c-2c38a2df20aa', 'ed7b1cf6-5be4-40e7-ad00-89a67d1c8e40', '5df3d301-09f0-4ece-946d-c24952069ab7', '18b0b51e-196c-4765-bd70-aecff53f7e4d',

In [16]:
from playwright.async_api import Page
from typing import List, Dict, Any


async def scrape(page: Page) -> List[Dict[str, Any]]:
    link_elements = await page.query_selector_all("ul.nav.navbar-nav li a.subSubButton")

    document_links: List[Dict[str, Any]] = []
    for link in link_elements:
        name = await link.get_attribute("title")
        url = await link.get_attribute("href")

        if name and url:
            document_links.append({"name": name, "url": url})

    return document_links


print(json.dumps(await scrape(page), indent=2))

[
  {
    "name": "Link Assembleia como \u00f3rg\u00e3o de soberania",
    "url": "/Parlamento/Paginas/assembleia-como-orgao-soberania.aspx"
  },
  {
    "name": "Link Funcionamento",
    "url": "/Parlamento/Paginas/funcionamento.aspx"
  },
  {
    "name": "Link Compet\u00eancia",
    "url": "/Parlamento/Paginas/Competencias.aspx"
  },
  {
    "name": "Link Organiza\u00e7\u00e3o",
    "url": "/Parlamento/Paginas/Organizacao.aspx"
  },
  {
    "name": "Link Plen\u00e1rio",
    "url": "/DeputadoGP/Paginas/Plenario.aspx"
  },
  {
    "name": "Link Presidente",
    "url": "/sites/PARXVL/Presidente/Paginas/default.aspx"
  },
  {
    "name": "Link Mesa",
    "url": "/DeputadoGP/Paginas/Mesa.aspx"
  },
  {
    "name": "Link Confer\u00eancia de L\u00edderes",
    "url": "/DeputadoGP/Paginas/ConferenciaLideres.aspx"
  },
  {
    "name": "Link Comiss\u00f5es",
    "url": "/sites/COM"
  },
  {
    "name": "Link Comiss\u00e3o Permanente",
    "url": "/DeputadoGP/Paginas/ComissaoPermanente.aspx"
  

In [19]:
await page.evaluate("window.scrollBy(0, window.innerHeight);")

In [27]:
await page.mouse.wheel(0, 500)

In [31]:
page.viewport_size["height"]
await page.evaluate(
    "Math.max(document.body.scrollHeight, document.documentElement.scrollHeight,document.body.offsetHeight, document.documentElement.offsetHeight,document.body.clientHeight, document.documentElement.clientHeight);"
)

735

In [32]:
await page.evaluate("async () => {window.scrollBy(0, document.body.scrollHeight);}")