In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time

In [2]:
# Root of the EUR-Lex site; result links are resolved against it.
BASE_URL = "https://eur-lex.europa.eu"

# Search-results URL for the "Energy" legislation-in-force query. The query
# string is spelled out parameter by parameter so each part is easy to spot.
# NOTE(review): `qid` looks like a saved-query id and may expire -- confirm.
ENERGY_QUERY_URL = (
    f"{BASE_URL}/search.html"
    "?name=browse-by%3Alegislation-in-force"
    "&type=named"
    "&qid=1698155004044"
    "&CC_1_CODED=12"
)

# How many times a failed/empty request is repeated before giving up.
N_RETRIES = 3

In [3]:
def extract_total_documents(eur_lex_query_url, timeout=30):
    """
    Read the total number of matching documents from a EUR-Lex results page.

    The page is fetched and parsed; starting from the first <strong> element
    that has a string, the text of the second following <strong> sibling is
    converted to an integer and returned.

    Parameters:
        eur_lex_query_url: URL of the search-results page to fetch.
        timeout: seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The document count as an int, or None when the page could not be
        fetched or the expected elements/number could not be found.
    """
    # FIXME: Unreliable, sometimes returns 402, although most of the time it is 522?
    try:
        response = requests.get(eur_lex_query_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Network-level failures are reported the same way as parse failures
        # so that callers only have to deal with a None result.
        print(f"Could not find total documents. Error: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        first_strong = soup.find('strong', string=True, recursive=True)
        total_element = first_strong.find_next_sibling('strong').find_next_sibling('strong')
        return int(total_element.string)
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError: one of the lookups returned None.
        # TypeError:      the final element has no single string (.string is None).
        # ValueError:     the text is not a plain integer.
        print(f"Could not find total documents. Error: {e}")
        return None

In [4]:
def extract_links_from_results(url, timeout=30):
    """
    Collect the document links listed on one search-results page.

    Parameters:
        url: URL of the results page to fetch.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A non-empty list of absolute document URLs, or None when the request
        failed, the page reports that the maximum number of pages was
        exceeded, or no result links were found.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urljoin

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Returning None lets the caller's retry loop handle transient errors
        # instead of aborting the whole scrape.
        print(f"Could not fetch results page. Error: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Check if page number exceeds the maximum
    warning = soup.find('div', {'class': 'alert alert-warning'})
    if warning and 'maximum number of pages' in warning.text:
        return None

    # Resolve every title link against the site root. urljoin handles the
    # relative "./..." form as well as root-relative or absolute hrefs,
    # rather than blindly cutting the first two characters off the href.
    new_links = [
        urljoin(f"{BASE_URL}/", link['href'])
        for div in soup.find_all('div', {'class': 'SearchResult'})
        for link in div.find_all('a', {'class': 'title'})
        if link.get('href')  # skip anchors that carry no href
    ]

    if len(new_links) == 0:
        return None

    return new_links

## Determine the total number of documents for validation

In [5]:
# The count extraction is known to sometimes yield 402 instead of the real
# total (see the FIXME in extract_total_documents), and it yields None when
# the page could not be parsed. Both cases are treated as "try again".
SUSPICIOUS_TOTAL = 402

n_documents = extract_total_documents(ENERGY_QUERY_URL)

for attempt in range(N_RETRIES):
    # Comparing None with an int would raise TypeError, so test for it first.
    if n_documents is not None and n_documents > SUSPICIOUS_TOTAL:
        break
    # Something is wrong, try again
    print(f"Trying again, attempt {attempt+1}")
    n_documents = extract_total_documents(ENERGY_QUERY_URL)

print(f'Total number of documents matching "Energy" query: {n_documents}')

Total number of documents matching "Energy" query: 522


## Scrape the links

In [6]:
links = []
faulty_html = ""  # not used in this cell; kept in case a later cell reads it
pbar = tqdm(total=n_documents, desc="Scraping results for links")

# Walk the result pages until one yields no links even after retrying.
# The try/finally guarantees the progress bar is closed if a request raises.
page_number = 1
try:
    while True:
        url = f"{ENERGY_QUERY_URL}&page={page_number}"
        pbar.set_postfix_str(f"{url}, attempt 1")

        new_links = extract_links_from_results(url)
        retry = 0
        # At most N_RETRIES additional attempts, matching the retry count
        # used when determining the total number of documents.
        while not new_links and retry < N_RETRIES:
            time.sleep(0.25)  # brief pause so retries do not hammer the server
            pbar.set_postfix_str(f"{url}, attempt {retry+2}")
            new_links = extract_links_from_results(url)
            retry += 1

        # Still nothing: either the last page was passed or the page keeps
        # failing. Both cases end the scrape.
        if not new_links:
            break

        links.extend(new_links)
        page_number += 1
        pbar.update(len(new_links))
        time.sleep(0.25)
finally:
    pbar.close()

df = pd.DataFrame(links, columns=['link'])

Scraping results for links: 100%|██████████| 522/522 [02:40<00:00,  3.25it/s, https://eur-lex.europa.eu/search.html?name=browse-by%3Alegislation-in-force&type=named&qid=1698155004044&CC_1_CODED=12&page=54, attempt 4]


In [8]:
# Sanity check: the number of scraped links should equal the total reported
# by the search page.
counts_agree = len(df) == n_documents
if not counts_agree:
    print(f"WARNING: Number of documents ({n_documents}) does not match the number of links ({len(df)})")

## Derive each document's info-page URL, HTML-page URL and CELEX ID

In [None]:
def get_document_info_page(url):
    """
    Build the English "ALL" info-page URL from a search-result link.

    'AUTO' in the URL is replaced by 'EN/ALL' and everything from the first
    '&' onwards is dropped. A URL without any '&' is returned in full.

    Example:
    https://eur-lex.europa.eu/legal-content/AUTO/?uri=CELEX:31983H0230&qid=1698155004044&rid=460 -> https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:31983H0230
    """

    url = url.replace('AUTO', 'EN/ALL')
    # split() keeps the whole string when there is no '&'; slicing with
    # str.find() would cut off the last character in that case (find -> -1).
    return url.split('&', 1)[0]

In [None]:
def get_document_html_page(url):
    """
    Build the English HTML full-text URL from a search-result link.

    'AUTO' in the URL is replaced by 'EN/TXT/HTML' and everything from the
    first '&' onwards is dropped. A URL without any '&' is returned in full.

    Example:
    https://eur-lex.europa.eu/legal-content/AUTO/?uri=CELEX:31983H0230&qid=1698155004044&rid=460 -> https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:31983H0230
    """

    url = url.replace('AUTO', 'EN/TXT/HTML')
    # split() keeps the whole string when there is no '&'; slicing with
    # str.find() would cut off the last character in that case (find -> -1).
    return url.split('&', 1)[0]

In [None]:
def get_document_CELEX_id(info_page_url):
    """
    Return the text after the last ':' of an info-page URL.

    Example:
    https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:32023L1791 -> 32023L1791

    A string without any ':' is returned unchanged.
    """

    _head, _colon, celex_id = info_page_url.rpartition(':')
    return celex_id

In [None]:
# Derived columns as (new column, source column, transform). Order matters:
# CELEX_id is computed from info_page_url, which is created first.
for new_column, source_column, transform in (
    ('info_page_url', 'link', get_document_info_page),
    ('html_page_url', 'link', get_document_html_page),
    ('CELEX_id', 'info_page_url', get_document_CELEX_id),
):
    df[new_column] = df[source_column].apply(transform)

In [None]:
# Some documents apparently appear more than once in the results; keep only
# the first row for each info-page URL.
is_repeated = df.duplicated(subset=['info_page_url'])
df = df[~is_repeated]
df.describe()

Unnamed: 0,link,info_page_url
count,508,508
unique,508,508
top,https://eur-lex.europa.eu/legal-content/AUTO/?...,https://eur-lex.europa.eu/legal-content/EN/ALL...
freq,1,1


In [7]:
# Write the collected links to disk, leaving out the DataFrame index.
output_csv = 'eur_lex_links.csv'
df.to_csv(output_csv, index=False)

Unnamed: 0,link
0,https://eur-lex.europa.eu/legal-content/AUTO/?...
1,https://eur-lex.europa.eu/legal-content/AUTO/?...
2,https://eur-lex.europa.eu/legal-content/AUTO/?...
3,https://eur-lex.europa.eu/legal-content/AUTO/?...
4,https://eur-lex.europa.eu/legal-content/AUTO/?...
...,...
517,https://eur-lex.europa.eu/legal-content/AUTO/?...
518,https://eur-lex.europa.eu/legal-content/AUTO/?...
519,https://eur-lex.europa.eu/legal-content/AUTO/?...
520,https://eur-lex.europa.eu/legal-content/AUTO/?...
