In [44]:
import json
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [45]:
# Path of the CSV file that lists the documents to scrape (loaded with pd.read_csv).
LINKS_FILE = 'eur_lex_links.csv'

In [46]:
# Load the link table; the download loop reads its 'info_page_url',
# 'html_page_url' and 'CELEX_id' columns.
df = pd.read_csv(LINKS_FILE)
# Last expression of the cell, so the summary is rendered as the cell output.
df.describe()

Unnamed: 0,link,info_page_url,html_page_url,CELEX_id
count,508,508,508,508
unique,508,508,508,508
top,https://eur-lex.europa.eu/legal-content/AUTO/?...,https://eur-lex.europa.eu/legal-content/EN/ALL...,https://eur-lex.europa.eu/legal-content/EN/TXT...,21959A1006(02)
freq,1,1,1,1


In [47]:
def get_document_content_html(html_page_url, timeout=30):
    """Download a page and return its body as text.

    Args:
        html_page_url: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without one, a stalled connection can block forever.

    Returns:
        The response text, or ``None`` when the status code is not 200
        (the status code is printed in that case).

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    response = requests.get(html_page_url, timeout=timeout)

    if response.status_code != 200:
        print('Error: status code {}'.format(response.status_code))
        return None

    return response.text

In [48]:
def get_dates_metadata(html):
    """Extract the label/value pairs of the dates section of an info page.

    The section is the ``dl`` with class ``NMetadata`` inside the ``div``
    with id ``PPDates_Contents``. ``dt`` labels and ``dd`` values are paired
    in document order.

    Args:
        html: HTML source of the info page.

    Returns:
        A dict mapping each label (colons removed) to its value text, cut at
        the first ``;``. A repeated label gets a numeric suffix: the second
        occurrence of ``X`` is stored as ``X 1``, the third as ``X 2``, and
        so on. An empty dict is returned when the section is not present.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find the div tag with id 'PPDates_Contents'
    dates_tag = soup.find('div', {'id': 'PPDates_Contents'})
    if dates_tag is None:
        # find() returns None when nothing matches; dereferencing it would
        # raise AttributeError.
        return {}

    # Find the dl tag with class 'NMetadata' within the div
    metadata_tag = dates_tag.find('dl', {'class': 'NMetadata'})
    if metadata_tag is None:
        return {}

    dates_dict = {}
    key_counts = {}

    # Iterate over the dt and dd tags to scrape metadata
    for dt, dd in zip(metadata_tag.find_all('dt'), metadata_tag.find_all('dd')):
        key = dt.get_text(strip=True).replace(":", "")

        # If the key is already present, append a number to it
        if key in key_counts:
            key_counts[key] += 1
            key = f'{key} {key_counts[key]}'
        else:
            key_counts[key] = 0

        # Keep only the part before the first ';' (the whole text if none)
        dates_dict[key] = dd.get_text(strip=True).split(';', 1)[0]

    return dates_dict

In [49]:
def get_miscellaneous_information_metadata(html):
    """Extract the label/value pairs of the miscellaneous-information section.

    The section is the ``dl`` with class ``NMetadata`` inside the ``div``
    with id ``PPMisc_Contents``. ``dt`` labels and ``dd`` values are paired
    in document order.

    Args:
        html: HTML source of the info page.

    Returns:
        A dict mapping each label (colons removed) to its value. When the
        ``dd`` contains ``span`` tags with ``lang='en'``, the value is their
        texts joined with ``', '``; otherwise it is the full text of the
        ``dd``. A repeated label gets a numeric suffix (``X``, ``X 1``,
        ``X 2``, ...). An empty dict is returned when the section is not
        present.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find the div tag with id 'PPMisc_Contents'
    misc_tag = soup.find('div', {'id': 'PPMisc_Contents'})
    if misc_tag is None:
        # find() returns None when nothing matches; dereferencing it would
        # raise AttributeError.
        return {}

    # Find the dl tag with class 'NMetadata' within the div
    metadata_tag = misc_tag.find('dl', {'class': 'NMetadata'})
    if metadata_tag is None:
        return {}

    metadata_dict = {}
    key_counts = {}

    # Iterate over the dt and dd tags to scrape metadata
    for dt, dd in zip(metadata_tag.find_all('dt'), metadata_tag.find_all('dd')):
        key = dt.get_text(strip=True).replace(":", "")

        # If the key is already present, append a number to it
        if key in key_counts:
            key_counts[key] += 1
            key = f'{key} {key_counts[key]}'
        else:
            key_counts[key] = 0

        # Prefer the English spans; fall back to the raw text of the dd
        spans = dd.find_all('span', lang='en')
        if spans:
            value = ', '.join(span.get_text(strip=True) for span in spans)
        else:
            value = dd.get_text(strip=True)

        metadata_dict[key] = value

    return metadata_dict

In [50]:
def get_classifications_metadata(html):
    """Extract the classification section of an info page.

    The section is the ``dl`` with class ``NMetadata`` inside the ``div``
    with id ``PPClass_Contents``. ``dt`` labels and ``dd`` values are paired
    in document order.

    Args:
        html: HTML source of the info page.

    Returns:
        A dict keyed by the ``dt`` label (colons removed). For the label
        ``"Directory code"`` the value is a dict with ``"code"`` (first line
        of the first ``li``, stripped; omitted when the ``dd`` has no ``li``)
        and ``"level 1"``, ``"level 2"``, ... taken from the ``lang='en'``
        spans in order. For every other label the value is the list of texts
        of the first ``lang='en'`` span in each ``li``. An empty dict is
        returned when the section is not present.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find the div tag with id 'PPClass_Contents'
    class_tag = soup.find('div', {'id': 'PPClass_Contents'})
    if class_tag is None:
        # find() returns None when nothing matches; dereferencing it would
        # raise AttributeError.
        return {}

    # Find the dl tag with class 'NMetadata' within the div
    metadata_tag = class_tag.find('dl', {'class': 'NMetadata'})
    if metadata_tag is None:
        return {}

    classification_dict = {}

    # Iterate over the dt and dd tags to scrape metadata
    for dt, dd in zip(metadata_tag.find_all('dt'), metadata_tag.find_all('dd')):
        major_key = dt.get_text(strip=True).replace(":", "")

        if major_key == "Directory code":
            # Build a fresh dict per entry so that a repeated "Directory code"
            # label cannot inherit stale "level N" keys from an earlier one.
            code_dict = {}

            first_item = dd.find('li')
            if first_item is not None:
                code_dict["code"] = first_item.get_text().split('\n')[0].strip()

            for idx, level in enumerate(dd.find_all('span', lang='en'), start=1):
                code_dict[f"level {idx}"] = level.get_text(strip=True)

            classification_dict[major_key] = code_dict
        else:
            # One entry per li that has an English span
            minor_keys = []
            for li in dd.find_all('li'):
                span = li.find('span', lang='en')
                if span is not None:
                    minor_keys.append(span.get_text(strip=True))
            classification_dict[major_key] = minor_keys

    return classification_dict

In [51]:
def get_metadata(info_page_url, timeout=30):
    """Fetch an info page and collect its metadata sections.

    Args:
        info_page_url: URL of the info page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without one, a stalled connection can block forever.

    Returns:
        A dict with the keys ``'Dates'``, ``'Misc'`` and ``'Classification'``
        holding the results of ``get_dates_metadata``,
        ``get_miscellaneous_information_metadata`` and
        ``get_classifications_metadata`` for the page, or ``None`` when the
        status code is not 200 (the status code is printed in that case).

    Raises:
        requests.RequestException: propagated from ``requests.get`` on
            connection errors or when the timeout expires.
    """
    response = requests.get(info_page_url, timeout=timeout)

    if response.status_code != 200:
        print('Error: status code {}'.format(response.status_code))
        return None

    page_html = response.text
    return {
        'Dates': get_dates_metadata(page_html),
        'Misc': get_miscellaneous_information_metadata(page_html),
        'Classification': get_classifications_metadata(page_html),
    }

In [52]:
def get_document(info_page_url, html_page_url):
    """Collect the metadata and the HTML content of one document.

    Args:
        info_page_url: URL passed to ``get_metadata``.
        html_page_url: URL passed to ``get_document_content_html``.

    Returns:
        The metadata dict with an extra ``'html'`` key holding the result of
        ``get_document_content_html`` (which may be ``None``), or ``None``
        when ``get_metadata`` returned ``None``.
    """
    metadata = get_metadata(info_page_url)
    if metadata is None:
        # get_metadata reports a failed request with None; assigning a key
        # on it would raise TypeError, so pass the failure on to the caller.
        return None

    metadata['html'] = get_document_content_html(html_page_url)
    return metadata

In [53]:
# Create a download directory and get the data for each document.
# Store each document in a separate json file named <CELEX_id>.json.
DOWNLOAD_DIR = 'eur_lex_data'
REQUEST_DELAY_SECONDS = 0.5  # pause after each downloaded document

# exist_ok avoids the separate existence check
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# CELEX ids whose download failed, so they can be inspected or retried
failed_celex_ids = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    target_path = os.path.join(DOWNLOAD_DIR, f'{row["CELEX_id"]}.json')

    # Re-running the cell only fetches documents that are not on disk yet
    if os.path.exists(target_path):
        continue

    try:
        document = get_document(row['info_page_url'], row['html_page_url'])
    except requests.RequestException as error:
        # One unreachable page must not abort the whole run
        print('Error: request failed for {}: {}'.format(row['CELEX_id'], error))
        document = None

    if document is None:
        # Do not write a 'null' file for a document that could not be fetched
        failed_celex_ids.append(row['CELEX_id'])
    else:
        # Write to a temporary file first so that an interrupted run never
        # leaves a truncated <CELEX_id>.json that a later run would skip
        tmp_path = target_path + '.tmp'
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(document, f)
        os.replace(tmp_path, target_path)

    time.sleep(REQUEST_DELAY_SECONDS)

if failed_celex_ids:
    print('Failed to download {} document(s): {}'.format(len(failed_celex_ids), failed_celex_ids))

100%|██████████| 508/508 [18:21<00:00,  2.17s/it]
