In [49]:
import os
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Setup Chrome
chrome_options = Options()
chrome_options.add_argument("--headless")  # optional: run without UI
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

url = "https://ourworldindata.org/data"

try:
    # Load the page first; without this the browser stays on a blank page and
    # the wait below can only time out.
    driver.get(url)

    # Wait until the links with class "chart-hit" are present
    wait = WebDriverWait(driver, 15)
    elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a.chart-hit")))

    # Extract hrefs while the browser session is still alive. Anchors without
    # an href are skipped and duplicates are dropped (first occurrence wins)
    # so that no chart is processed twice downstream.
    hrefs = (el.get_attribute("href") for el in elements)
    links = list(dict.fromkeys(href for href in hrefs if href))
finally:
    # Always shut the browser down, even if loading or waiting failed.
    driver.quit()

print("Found links:")
for link in links:
    print(link)

Found links:
https://ourworldindata.org/grapher/share-of-population-in-extreme-poverty
https://ourworldindata.org/grapher/gdp-per-capita-maddison-project-database
https://ourworldindata.org/grapher/distribution-of-population-poverty-thresholds
https://ourworldindata.org/grapher/human-development-index
https://ourworldindata.org/grapher/prevalence-of-undernourishment
https://ourworldindata.org/grapher/daily-per-capita-caloric-supply
https://ourworldindata.org/grapher/share-of-the-labor-force-employed-in-agriculture
https://ourworldindata.org/grapher/meat-consumption-vs-gdp-per-capita
https://ourworldindata.org/grapher/population-with-un-projections
https://ourworldindata.org/grapher/population
https://ourworldindata.org/grapher/children-born-per-woman
https://ourworldindata.org/grapher/median-age
https://ourworldindata.org/grapher/co-emissions-per-capita
https://ourworldindata.org/grapher/per-capita-energy-use
https://ourworldindata.org/grapher/global-energy-substitution
https://ourworl

In [65]:
def get_dataset_information(url, display_results=True, timeout=30):
    """Download a chart's dataset (CSV) and its metadata (JSON).

    The dataset is requested from ``<url>.csv?csvType=full`` and the metadata
    from ``<url>.metadata.json``.

    Parameters
    ----------
    url : str
        Chart URL, e.g. ``https://ourworldindata.org/grapher/<slug>``. A query
        string, fragment or trailing slash on the URL is ignored.
    display_results : bool, default True
        If true, show the first rows of the dataset and print the metadata keys.
    timeout : float, default 30
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    tuple
        ``(data, metadata, filename)``: the parsed CSV as a DataFrame, the
        decoded JSON metadata, and the last path segment of the URL.

    Raises
    ------
    requests.HTTPError
        If either request returns an error status (raised by
        ``raise_for_status``), so that an error page is never parsed as data.
    """
    # Build both request URLs from the bare chart URL; appending ".csv" after an
    # existing query string or fragment would produce a malformed URL.
    base_url = url.split('#', 1)[0].split('?', 1)[0].rstrip('/')
    filename = base_url.rsplit('/', 1)[-1]

    dataset_url = base_url + ".csv?csvType=full"
    print("Fetching dataset from:", dataset_url)
    data_request = requests.get(dataset_url, timeout=timeout)
    data_request.raise_for_status()
    data = pd.read_csv(StringIO(data_request.text))

    metadata_url = base_url + '.metadata.json'
    print("Fetching metadata from:", metadata_url)
    metadata_request = requests.get(metadata_url, timeout=timeout)
    metadata_request.raise_for_status()
    metadata = metadata_request.json()

    if display_results:
        display(data.head())
        print("Metadata keys:", metadata.keys())

    return data, metadata, filename

# Smoke-test the downloader on a single chart before crawling every link.
test_data, test_metadata, test_filename = get_dataset_information(
    'https://ourworldindata.org/grapher/research-spending-gdp'
)

Fetching dataset from: https://ourworldindata.org/grapher/research-spending-gdp.csv?csvType=full
Fetching metadata from: https://ourworldindata.org/grapher/research-spending-gdp.metadata.json


Unnamed: 0,Entity,Code,Year,Research and development expenditure (% of GDP)
0,Albania,ALB,2007,0.08757
1,Albania,ALB,2008,0.15412
2,Algeria,DZA,2001,0.21219
3,Algeria,DZA,2002,0.33807
4,Algeria,DZA,2003,0.18122


Metadata keys: dict_keys(['chart', 'columns', 'dateDownloaded'])


In [66]:
def write_dataset_information(data, metadata, base_filename, path_prefix="../data/exploration-datasets/"):
    """Save a dataset as CSV and return a one-row summary of it.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to write to ``<path_prefix>/<base_filename>.csv`` (no index).
    metadata : dict
        Metadata dictionary. The keys read here are ``'chart'``, ``'columns'``
        and ``'dateDownloaded'``; any of them may be missing or empty.
    base_filename : str
        File name without extension.
    path_prefix : str, default "../data/exploration-datasets/"
        Output directory; created if it does not exist.

    Returns
    -------
    dict
        Keys ``filename``, ``title``, ``description``, ``source``,
        ``num_rows``, ``url`` and ``date_downloaded``. Values that the metadata
        does not provide are ``None``.
    """
    if path_prefix:
        os.makedirs(path_prefix, exist_ok=True)
    data.to_csv(os.path.join(path_prefix, f"{base_filename}.csv"), index=False)

    # Only the first column entry is consulted for the description/citation.
    # Tolerate metadata with no (or empty) 'columns' instead of raising.
    columns = metadata.get('columns') or {}
    col_data = next(iter(columns.values()), None) or {}
    chart_data = metadata.get('chart') or {}

    dataset_information = {
        'filename': f"{base_filename}.csv",
        'title': chart_data.get('title'),
        'description': col_data.get('descriptionShort'),
        # Prefer the chart-level citation, then fall back to the column's.
        'source': chart_data.get('citation') or col_data.get('citationShort') or col_data.get('citationLong'),
        'num_rows': data.shape[0],
        'url': chart_data.get('originalChartUrl'),
        'date_downloaded': metadata.get('dateDownloaded'),
    }

    return dataset_information

# Persist the sample dataset and keep its summary record for inspection.
data_set_info = write_dataset_information(
    data=test_data,
    metadata=test_metadata,
    base_filename=test_filename,
)

In [69]:
# Download every chart found on the overview page, save each dataset as CSV,
# and collect one summary record per dataset.
overview_columns = ['filename', 'title', 'description', 'source', 'num_rows', 'url', 'date_downloaded']
dataset_records = []

for link in links:
    try:
        data, metadata, filename = get_dataset_information(link, display_results=False)
        data_set_info = write_dataset_information(data, metadata, filename)
        dataset_records.append(data_set_info)
    except Exception as e:
        # Best-effort crawl: report the failing link and carry on with the rest.
        print(f"Error processing {link}: {e}")
    finally:
        # Be polite and avoid overwhelming the server. This runs after failures
        # too, so a run of errors does not turn into rapid-fire requests.
        time.sleep(1.5)

# Build the frame once from the collected records rather than concatenating
# inside the loop; `columns` fixes the column order and also yields an empty,
# correctly-shaped frame when nothing was downloaded.
datasets = pd.DataFrame(dataset_records, columns=overview_columns)

# The directory may not exist yet if every download above failed.
os.makedirs("../data/exploration-datasets", exist_ok=True)
datasets.to_csv("../data/exploration-datasets/datasets_overview.csv", index=False)

Fetching dataset from: https://ourworldindata.org/grapher/share-of-population-in-extreme-poverty.csv?csvType=full
Fetching metadata from: https://ourworldindata.org/grapher/share-of-population-in-extreme-poverty.metadata.json
Fetching dataset from: https://ourworldindata.org/grapher/gdp-per-capita-maddison-project-database.csv?csvType=full
Fetching metadata from: https://ourworldindata.org/grapher/gdp-per-capita-maddison-project-database.metadata.json
Fetching dataset from: https://ourworldindata.org/grapher/distribution-of-population-poverty-thresholds.csv?csvType=full
Fetching metadata from: https://ourworldindata.org/grapher/distribution-of-population-poverty-thresholds.metadata.json
Fetching dataset from: https://ourworldindata.org/grapher/human-development-index.csv?csvType=full
Fetching metadata from: https://ourworldindata.org/grapher/human-development-index.metadata.json
Fetching dataset from: https://ourworldindata.org/grapher/prevalence-of-undernourishment.csv?csvType=full
Fe

In [70]:
# Show only a bounded preview of the overview table instead of the whole frame.
datasets.head(10)

Unnamed: 0,filename,title,description,source,num_rows,url,date_downloaded
0,share-of-population-in-extreme-poverty.csv,Share of population living in extreme poverty,Percentage of population living in households ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/share-of-po...,2025-09-21
1,gdp-per-capita-maddison-project-database.csv,GDP per capita,Average economic output per person in a countr...,Bolt and van Zanden – Maddison Project Databas...,21586,https://ourworldindata.org/grapher/gdp-per-cap...,2025-09-21
2,distribution-of-population-poverty-thresholds.csv,Distribution of population between different p...,Number of people living in households with an ...,World Bank Poverty and Inequality Platform (2025),2743,https://ourworldindata.org/grapher/distributio...,2025-09-21
3,human-development-index.csv,Human Development Index,The Human Development Index (HDI) is a summary...,"UNDP, Human Development Report (2025)",6683,https://ourworldindata.org/grapher/human-devel...,2025-09-21
4,prevalence-of-undernourishment.csv,Share of people who are undernourished,Share of the population whose daily food intak...,Food and Agriculture Organization of the Unite...,4683,https://ourworldindata.org/grapher/prevalence-...,2025-09-21
5,daily-per-capita-caloric-supply.csv,Daily supply of calories per person,,Food and Agriculture Organization of the Unite...,13112,https://ourworldindata.org/grapher/daily-per-c...,2025-09-21
6,share-of-the-labor-force-employed-in-agricultu...,Share of the labor force employed in agriculture,,Our World in Data based on International Labor...,6849,https://ourworldindata.org/grapher/share-of-th...,2025-09-21
7,meat-consumption-vs-gdp-per-capita.csv,Meat supply vs. GDP per capita,Quantity that is available for consumption at ...,Food and Agriculture Organization of the Unite...,14525,https://ourworldindata.org/grapher/meat-consum...,2025-09-21
8,population-with-un-projections.csv,Population,"De facto total population in a country, area o...","UN, World Population Prospects (2024)",38656,https://ourworldindata.org/grapher/population-...,2025-09-21
9,population.csv,Population,"Population by country, available from 10,000 B...",HYDE (2023); Gapminder (2022); UN WPP (2024),58890,https://ourworldindata.org/grapher/population,2025-09-21
