In [1]:
import os
import datetime
import requests
import zipfile
import pandas as pd
import geopandas as gpd
import h3
from shapely.geometry import Point, shape
from multiprocessing import Pool
from bs4 import BeautifulSoup

In [2]:
def scrape_urls(base_url, timeout=30):
    """
    Collect the download links of the CBS 100 m grid archives from a webpage.

    Only anchors whose ``href`` contains ``'cbs_vk100'`` are kept. Each link is
    resolved against ``base_url`` so that relative hrefs become absolute URLs
    that can be passed straight to ``requests.get``; hrefs that are already
    absolute are returned unchanged.

    Args:
        base_url (str): The URL of the webpage to scrape.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        list: Absolute URLs found on the webpage, in document order. An empty
        list is returned (and the error printed) when the page cannot be
        fetched or parsed.
    """
    from urllib.parse import urljoin

    try:
        response = requests.get(base_url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return [
            urljoin(base_url, a['href'])
            for a in soup.find_all('a', href=True)
            if 'cbs_vk100' in a['href']
        ]
    except Exception as e:
        # Best-effort: the caller treats "no URLs" as "nothing to download".
        print(f"Failed to scrape {base_url} due to {e}")
        return []

In [3]:
def clip_gpkg(gpkg_path, shapefile_path, output_path, epsg_code=4326):
    """
    Clip the contents of a GeoPackage with the geometries of a shapefile.

    Both layers are reprojected to ``epsg_code`` before clipping, and the
    clipped result is written to ``output_path`` as GeoJSON. Any error is
    printed rather than raised, and nothing is returned.

    Args:
        gpkg_path (str): Path to the GeoPackage to clip.
        shapefile_path (str): Path to the shapefile used as the clip mask.
        output_path (str): Destination path of the clipped GeoJSON.
        epsg_code (int): EPSG code both layers are reprojected to.
            Default is 4326 (WGS84).
    """
    try:
        # Read and reproject the source first, then the mask (same order as
        # before so the first failing step is reported identically).
        source_layer = gpd.read_file(gpkg_path).to_crs(epsg=epsg_code)
        mask_layer = gpd.read_file(shapefile_path).to_crs(epsg=epsg_code)
        gpd.clip(source_layer, mask_layer).to_file(output_path, driver='GeoJSON')
    except Exception as err:
        print(f"Error clipping GeoPackage: {err}")

In [4]:
def download_and_extract(url, year, download_path, extract_path, timeout=60, chunk_size=1 << 20):
    """
    Download a ZIP file from a URL and extract its contents.

    The archive is streamed to ``<download_path>.part`` in chunks instead of
    being held in memory, extracted from there, and only renamed to
    ``download_path`` once both steps succeeded. A file at ``download_path``
    therefore always denotes a complete, successfully extracted archive, which
    makes an "already downloaded" check on that path reliable.

    Args:
        url (str): The URL of the ZIP file.
        year (int): The year of the data (not used by this function; kept for
            interface compatibility).
        download_path (str): Path to save the downloaded ZIP file.
        extract_path (str): Path to extract the contents of the ZIP file.
        timeout (float): Seconds to wait for the server (connect / each read)
            before giving up.
        chunk_size (int): Number of bytes written to disk per chunk.

    Returns:
        bool: True if the file was successfully downloaded and extracted, False otherwise.
    """
    partial_path = f"{download_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        with zipfile.ZipFile(partial_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        # Publish the archive under its final name only after full success.
        os.replace(partial_path, download_path)
        return True
    except Exception as e:
        print(f"Error downloading or extracting file: {e}")
        return False
    finally:
        # On failure, drop the incomplete/corrupt file so a retry starts clean.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass

In [5]:
def process_feature(feature, resolution):
    """
    Map a single feature to the H3 cell that contains its centroid.

    Works with both h3-py 4.x (``latlng_to_cell``) and h3-py 3.x
    (``geo_to_h3``); the 3.x name no longer exists in 4.x, where calling it
    would make every feature fail.

    Args:
        feature (tuple): ``(value, geometry)`` - the attribute value of the row
            and its geometry. The geometry must expose ``.centroid`` with
            ``x`` (longitude) and ``y`` (latitude) attributes.
        resolution (int): H3 resolution (0-15).

    Returns:
        tuple: ``(h3_cell, value)``, or ``(None, None)`` if the feature could
        not be processed (the error is printed).
    """
    try:
        aantal_inwoners, geometry = feature
        centroid = geometry.centroid
        # Prefer the h3 v4 name, fall back to the v3 name.
        to_cell = getattr(h3, 'latlng_to_cell', None) or h3.geo_to_h3
        # H3 expects (lat, lng), i.e. (y, x).
        h3_cell = to_cell(centroid.y, centroid.x, resolution)
        return h3_cell, aantal_inwoners
    except Exception as e:
        print(f"Error processing feature: {e}")
        return None, None

In [6]:
def process_geojson(clip_gdf, resolution, output_dir):
    """
    Aggregate every attribute column of a GeoJSON file to H3 cells.

    The file is read and reprojected to EPSG:4326, each feature is assigned to
    the H3 cell of its centroid (via ``process_feature``), and for every
    non-geometry column the values are summed per cell and written to
    ``<output_dir>/<column>_h3.csv`` with the columns ``hex9`` and ``value``.
    Negative values in numeric columns are replaced by 0 before summing.

    The cell of a feature depends only on its geometry, so the cells are
    computed once and reused for all columns. A failure in one column is
    printed and does not prevent the remaining columns from being written.

    Args:
        clip_gdf (str): URL or path to the GeoJSON file.
        resolution (int): H3 resolution (0-15).
        output_dir (str): Directory to save the resulting CSV files; created
            if it does not exist.
    """
    try:
        gdf = gpd.read_file(clip_gdf)
        gdf = gdf.to_crs(epsg=4326)

        if gdf.empty:
            print(f"No features found in {clip_gdf}; nothing to aggregate")
            return

        os.makedirs(output_dir, exist_ok=True)

        # process_feature takes (value, geometry); the value is irrelevant for
        # the cell lookup, so a placeholder is passed.
        # NOTE(review): with the 'spawn' start method (Windows/macOS) worker
        # processes must be able to import process_feature - confirm this
        # works when the function is defined inside a notebook.
        tasks = [((None, geometry), resolution) for geometry in gdf['geometry']]
        with Pool(processes=os.cpu_count()) as pool:
            results = pool.starmap(process_feature, tasks)
        # Features that failed have cell None; groupby drops those rows.
        h3_cells = [cell for cell, _ in results]

        for column in gdf.columns:
            if column == 'geometry':
                continue
            try:
                values = gdf[column]
                # Check if column data is numeric
                if pd.api.types.is_numeric_dtype(values):
                    # Replace negative values with 0
                    values = values.clip(lower=0)

                df = pd.DataFrame({'hex9': h3_cells, 'value': values.to_numpy()})

                # Group by 'hex9' and sum 'value'
                df = df.groupby('hex9')['value'].sum().reset_index()

                output_filepath = os.path.join(output_dir, f'{column}_h3.csv')
                df.to_csv(output_filepath, index=False)
            except Exception as e:
                print(f"Error processing column {column}: {e}")
    except Exception as e:
        print(f"Error processing data: {e}")


In [7]:
def main():
    """
    Orchestrate the pipeline: scrape, download, clip and aggregate to H3.

    Starting two years before the current calendar year and going back eleven
    years, the first year for which a download link is found is processed:
    the archive is downloaded and extracted (unless ``./downloads/cbs_<year>.zip``
    already exists), clipped to ``../shapefiles/zh_poly.shp`` and aggregated to
    H3 resolution 9 into ``./outputs``. Processing stops after that year.

    If the download or the clip fails, nothing is aggregated, so that a
    ``./clipped.geojson`` left over from an earlier run is never mistaken for
    fresh data.
    """
    current_year = datetime.datetime.now().year - 1
    base_url = 'https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische-data/kaart-van-100-meter-bij-100-meter-met-statistieken#:~:text=In%20deze%20kaart%20met%20vierkanten,en%20nabijheid%20van%20voorzieningen%20samengesteld.'

    # Create separate directories for downloads, extracts, and outputs
    download_dir = './downloads'
    extract_dir = './extracts'
    output_dir = './outputs'
    clipped_path = './clipped.geojson'
    os.makedirs(download_dir, exist_ok=True)
    os.makedirs(extract_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    urls = scrape_urls(base_url)
    for year in range(current_year - 1, current_year - 12, -1):
        download_path = os.path.join(download_dir, f'cbs_{year}.zip')
        extract_path = os.path.join(extract_dir, f'cbs_{year}')
        gpkg_path = os.path.join(extract_path, f'cbs_vk100_{year}_v1.gpkg')

        # Match on the dataset stem rather than the bare year: a bare year can
        # also occur elsewhere in a link (e.g. as a publication year) and would
        # then select the archive of a different data year.
        # NOTE(review): assumes the links contain 'cbs_vk100_<year>', like the
        # GeoPackage file name used above - confirm against the live page.
        url = next((u for u in urls if f'cbs_vk100_{year}' in u), None)
        if not url:
            print(f"No URL available for {year}")
            continue

        needs_clip = not os.path.exists(clipped_path)
        # Check if the file already exists
        if not os.path.exists(download_path):
            if not download_and_extract(url, year, download_path, extract_path):
                print(f"Download failed for {year}; nothing processed")
                return
            needs_clip = True

        if needs_clip:
            # Remove a stale result first so a failed clip is detectable.
            if os.path.exists(clipped_path):
                os.remove(clipped_path)
            clip_gpkg(gpkg_path, '../shapefiles/zh_poly.shp', clipped_path)
            if not os.path.exists(clipped_path):
                print(f"Clipping failed for {year}; nothing processed")
                return

        process_geojson(clipped_path, 9, output_dir)
        break

In [8]:
# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()