In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Listing page of PBL's circular-economy publications.
url = "https://www.pbl.nl/en/topics/circular-economy/publications"

# Without a timeout requests can wait indefinitely on a stalled connection,
# and without raise_for_status() an HTTP error page would be parsed as if it
# were the listing, silently yielding an empty result.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the link target of every publication anchor on the page.
# href=True skips anchors that carry the class but have no href attribute,
# which would otherwise raise a KeyError.
publications = [
    link['href']
    for link in soup.find_all("a", class_="node-link", href=True)
]

publications

['/en/publications/integral-circular-economy-report-2023-assessment-for-the-netherlands',
 '/en/publications/plastic-futures-and-their-co2-emissions',
 '/en/publications/addressing-international-impacts-of-the-dutch-circular-economy-transition',
 '/en/publications/circular-economy-progress-report-2022',
 '/en/publications/possible-objectives-for-a-circular-economy',
 '/en/publications/potential-effects-of-dutch-circular-economy-strategies-on-low-and-middle-income-countries',
 '/en/publications/extended-producer-responsibility',
 '/en/publications/potential-effects-of-dutch-circular-economy-policies-on-low-and-middle-income-countries-the-case-of-electrical-and',
 '/en/publications/international-workshop-on-targets-for-a-circular-economy-summary',
 '/en/publications/outline-of-the-circular-economy']

In [4]:
from urllib.parse import urljoin

from tqdm import tqdm

url = 'https://www.pbl.nl/'

# Exact class-attribute strings of the <div> elements that are looked up on
# each publication page for the body text and the introduction.
_CONTENT_CLASS = 'field field-name-field-paragraphs field-type-paragraphs field-label-hidden field--view-mode-full'
_INTRODUCTION_CLASS = 'field field-name-field-introduction-text field-type-text-long field-label-hidden field--view-mode-full'


def _stripped_text(tag):
    """Return the whitespace-stripped text of ``tag``, or None if ``tag`` is None."""
    return tag.text.strip() if tag is not None else None


def _parse_publication(page_soup):
    """Extract title, date, introduction and content from a parsed page.

    Args:
        page_soup: BeautifulSoup object for one publication page.

    Returns:
        dict with the keys 'title', 'date', 'introduction' and 'content'.
        A field whose element (or, for the title, whose ``content``
        attribute) is missing from the page is set to None.
    """
    title_tag = page_soup.find('meta', property='og:title')
    return {
        # .get() yields None when the meta tag has no content attribute.
        'title': title_tag.get('content') if title_tag is not None else None,
        'date': _stripped_text(page_soup.find('span', class_='date-display-single')),
        'introduction': _stripped_text(page_soup.find('div', class_=_INTRODUCTION_CLASS)),
        'content': _stripped_text(page_soup.find('div', class_=_CONTENT_CLASS)),
    }


article_dicts = []

for pub in tqdm(publications):
    # urljoin avoids the double slash that plain concatenation of the base
    # URL (ending in '/') and the link (starting with '/') produces.
    # The timeout keeps one stalled page from blocking the whole loop.
    response = requests.get(urljoin(url, pub), timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Missing elements are handled explicitly inside the parser instead of
    # with bare ``except:`` clauses, which would also swallow
    # KeyboardInterrupt and hide genuine bugs.
    article_dicts.append(_parse_publication(soup))

100%|██████████| 10/10 [00:05<00:00,  1.93it/s]


In [5]:
# Turn the list of per-publication dicts into a table, one row per dict.
df = pd.DataFrame(data=article_dicts)
# Matches references such as "Figure 3" or "figure 12" (case-insensitive).
# Compiled once here instead of on every row.
_FIGURE_PATTERN = re.compile(r'\bFigure \d+\b', flags=re.IGNORECASE)


def contains_figure_text(row):
    """Tell whether a row's introduction or content refers to a numbered figure.

    Args:
        row: mapping-like object (e.g. a DataFrame row or a dict) with the
            keys 'introduction' and 'content'.

    Returns:
        True if either field is a string containing a "Figure <number>"
        reference, otherwise False.
    """
    # Only real strings are searched: missing values (None / NaN) are skipped
    # rather than being turned into the text "None"/"nan", and each field is
    # checked on its own so a match cannot span the two fields.
    fields = (row['introduction'], row['content'])
    return any(
        _FIGURE_PATTERN.search(text) is not None
        for text in fields
        if isinstance(text, str)
    )

# Boolean mask, aligned with df's index: True where the row's text refers to
# a numbered figure. Built row by row so that an empty frame still yields a
# boolean Series.
mentions_figure = pd.Series(
    [contains_figure_text(row) for _, row in df.iterrows()],
    index=df.index,
    dtype=bool,
)

# Keep only the rows without figure references. Filtering with the mask
# directly means no temporary column has to be added and dropped again, and
# .copy() gives an independent frame rather than a slice of the original.
df = df.loc[~mentions_figure].copy()

In [6]:
# Write df to a CSV file in the working directory, leaving out the row index.
df.to_csv(path_or_buf="pbl_data.csv", index=False)

In [7]:
# Read the file back and display it to check what was written.
pd.read_csv(filepath_or_buffer='pbl_data.csv')

Unnamed: 0,title,date,introduction,content
0,Plastic futures and their CO2 emissions,07-12-2022,A circular bioeconomy can dramatically rein in...,
1,Addressing international impacts of the Dutch ...,09-05-2022,The Netherlands has the ambition to achieve a ...,The policy brief is written at the request of ...
2,Circular Economy Progress Report 2022,14-04-2022,This Circular Economy Progress Report 2022 pro...,Update of certain indicators\nPBL will publish...
3,Possible objectives for a circular economy,17-12-2021,At the request of the Dutch Ministry of Infras...,This is the English version of the summary of ...
4,Potential effects of Dutch circular economy st...,18-11-2021,The Netherlands has the ambition to achieve a ...,Trade data was used to visualise trade in cott...
5,Extended Producer Responsibility,09-07-2021,Extended producer responsibility (EPR) is a wi...,Effects of EPR are determined by its design\nE...
6,Potential effects of Dutch circular economy st...,19-02-2021,The Netherlands has the ambition to achieve a ...,Current impacts of transboundary trade\nAround...
7,International Workshop on Targets for a Circul...,24-02-2020,In preparation for The Annual World Economic F...,The workshop was attended by more than 30 lead...
8,Outline of the circular economy,29-05-2019,"In many countries, governments are looking for...",
