# In [None]:
from bs4 import BeautifulSoup
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def setup_browser(language="en"):
    """Build a Chrome WebDriver configured with headless-mode flags.

    Args:
        language: Value stored under Chrome's ``intl.accept_languages``
            preference (defaults to ``"en"``).

    Returns:
        A newly started ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_options = Options()

    # Command-line switches passed to Chrome at start-up.
    for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    # Preferred content language, set through Chrome's preference store.
    preferences = {'intl.accept_languages': language}
    chrome_options.add_experimental_option('prefs', preferences)

    return webdriver.Chrome(options=chrome_options)

def crawl_website(url, language="en", wait_seconds=10):
    """Load *url* in a fresh browser and return the rendered page source.

    Args:
        url: Address of the page to load.
        language: Preferred language forwarded to ``setup_browser``.
        wait_seconds: How long to pause after navigation before reading the
            page source, giving client-side JavaScript time to run. The
            default of 10 matches the previous hard-coded delay and may need
            tuning per site.

    Returns:
        The value of the driver's ``page_source`` after the pause.

    Raises:
        Whatever the driver raises while starting, navigating, or reading
        the page; the browser is still shut down in that case.
    """
    driver = setup_browser(language)
    try:
        driver.get(url)

        # NOTE: a fixed sleep is used on purpose; the original author
        # observed that driver.implicitly_wait(30) was ignored on Mac.
        time.sleep(wait_seconds)

        return driver.page_source
    finally:
        # Always release the browser process, even if navigation failed.
        driver.quit()

def convert_tr_to_json(tr_list):
    """Serialize table rows into a JSON array of records.

    Each row is expected to expose ``find_all`` (as BeautifulSoup tags do).
    Rows that do not contain exactly two ``<td>`` cells are skipped. For the
    remaining rows a record with these keys is produced:

    * ``name``  -- stripped text of the first cell.
    * ``title`` -- stripped text of the second cell.
    * ``url``   -- the ``href`` of the second cell's link when that cell has
      exactly one ``<a>`` whose ``href`` does not contain ``javascript:``;
      otherwise ``None``.
    * ``references`` -- empty when ``url`` was taken from a single plain
      link; otherwise the stripped text of every ``<a>`` in the second cell.

    Args:
        tr_list: Iterable of row tags.

    Returns:
        A JSON string (indented by two spaces) containing the list of records.
    """
    records = []

    for row in tr_list:
        cells = row.find_all('td')
        if len(cells) != 2:
            continue

        name_cell, title_cell = cells
        links = title_cell.find_all('a')

        # A lone, non-JavaScript link is the record's URL; anything else
        # (no links, several links, or a JavaScript link) is a reference list.
        has_plain_link = (
            len(links) == 1
            and 'javascript:' not in links[0].get('href', '')
        )
        if has_plain_link:
            link_target = links[0].get('href')
            link_texts = []
        else:
            link_target = None
            link_texts = [link.get_text(strip=True) for link in links]

        records.append({
            'name': name_cell.get_text(strip=True),
            'title': title_cell.get_text(strip=True),
            'url': link_target,
            'references': link_texts,
        })

    return json.dumps(records, indent=2)

# Fetch the rendered page, asking the site for Italian content.
url = "https://www.fedlex.admin.ch/it/cc/internal-law/73"
content = crawl_website(url, language="it")

# Build a parse tree from the returned HTML.
soup = BeautifulSoup(content, "html.parser")


def _name_starts_with_73(value):
    """Attribute filter: truthy only for a present value beginning with '73'."""
    return value and value.startswith('73')


# Select every <tr> whose 'name' attribute satisfies the filter above.
tr_tags = soup.find_all('tr', attrs={'name': _name_starts_with_73})

# Dump the raw matching rows first, then their JSON representation.
for tr in tr_tags:
    print(tr)

print(convert_tr_to_json(tr_tags))
