# Housing and building stock - municipal dataset update

## Setup

In [298]:
# internal modules
import csv
import os
import sys
import importlib

# external modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import xlwings as xw
import yaml
from pathlib import Path
import src.load_data_manager
from src.load_data_manager import LoadDataManager
from typing import List

# Reload
importlib.reload(src.load_data_manager)

<module 'src.load_data_manager' from '/Users/koenvanbemmelen/work/etdataset/pipelines/src/load_data_manager.py'>

## General

#### Parameters

Before we start we need to specify which datasets we want to create or update. This can be done with the `data.csv` file in the `config` directory. Here you can specify the geo ID, parent dataset and name for each region.

Also, make sure to specify the parent dataset and the year in the cell below.

In [None]:
# Select the parent data set. Make sure to use the geo ID. It should be existing in the Dataset Manager.
parent = "nl"

# Specify the year
year_etm = 2019
year = 2023 

# Specify the CSV-separator (presumably either "," or ";")
sep=","

# Either specify the municipalities by using the data.csv file in the config directory.
# Make sure to specify the right separator in the pd.read_csv() function.
path = Path("config", f"municipalities_{year}.csv") # First move this file to pipelines/config. It is moved to the Jupyter notebooks/Archive (pre-2023 dataset update) folder. 
municipalities = pd.read_csv(path, sep=sep)['geo_id'].to_list()

# Preview municipality geo IDs
municipalities

['GM1680',
 'GM0358',
 'GM0197',
 'GM0059',
 'GM0482',
 'GM0613',
 'GM0361',
 'GM0141',
 'GM0034',
 'GM0484',
 'GM1723',
 'GM1959',
 'GM0060',
 'GM0307',
 'GM0362',
 'GM0363',
 'GM0200',
 'GM0202',
 'GM0106',
 'GM0743',
 'GM0744',
 'GM0308',
 'GM0489',
 'GM0203',
 'GM0888',
 'GM1954',
 'GM0889',
 'GM1945',
 'GM1724',
 'GM0893',
 'GM0373',
 'GM0748',
 'GM1859',
 'GM1721',
 'GM0753',
 'GM0209',
 'GM0375',
 'GM0310',
 'GM1728',
 'GM0376',
 'GM0377',
 'GM1901',
 'GM0755',
 'GM1681',
 'GM0147',
 'GM0654',
 'GM0757',
 'GM0758',
 'GM1876',
 'GM0213',
 'GM0899',
 'GM0312',
 'GM0313',
 'GM0214',
 'GM0502',
 'GM0383',
 'GM0109',
 'GM1706',
 'GM0216',
 'GM0148',
 'GM1891',
 'GM0503',
 'GM0762',
 'GM0150',
 'GM0384',
 'GM1980',
 'GM1774',
 'GM0221',
 'GM0222',
 'GM0766',
 'GM0505',
 'GM0498',
 'GM1719',
 'GM0303',
 'GM0225',
 'GM0226',
 'GM1711',
 'GM0385',
 'GM0228',
 'GM0317',
 'GM1979',
 'GM0770',
 'GM1903',
 'GM0772',
 'GM0230',
 'GM0114',
 'GM0388',
 'GM0153',
 'GM0232',
 'GM0233',
 'GM0777',

Specify input file paths and output file paths

In [300]:
load_path = Path("data")

#### Helper functions

In [301]:
# Define helper variables for calculating percentages
PERC = 100

Below you can find the classification of the housing types that PBL uses:

Woningtype (W)

**1** | vrijstaand <br>
**2** | 2-onder-1 kap <br>
**3** | rijwoning hoekwoning <br>
**4** | rijwoning tussenwoning <br>
**5** | appartementen t/m 4 (meergezinswoningen t/m 4 verdiepingen) <br>
**6** | appartementen 5>= (meergezinswoningen 5 of meer verdiepingen)
In the ETM we bundle "2-onder-1 kap" and "rijwoning hoekwoning" to the same category ("hoekhuis"). Same goes "appartementen t/m 4" and "appartementen 5>="; we consider both "appartementen".

In [302]:
# Define method for the classification of housing types
def classify_housing_type(housing_type):
    if housing_type == 1:
        return "Vrijstaand huis"
    elif housing_type == 2 or housing_type == 3:
        return "Hoekhuis"
    elif housing_type == 4:
        return "Rijtjeshuis"
    elif housing_type == 5 or housing_type == 6:
        return "Appartement"

Same goes for the construction year ranges:

Bouwperiode (B)

**0** | Tot en met 1929 <br>
**1** | 1930 t/m 1945 <br>
**2** | 1946 t/m 1964 <br>
**3** | 1965 t/m 1974 <br>
**4** | 1975 t/m 1991 <br>
**5** | 1992 t/m 1995 <br>
**6** | 1996 t/m 1999 <br>
**7** | 2000 t/m 2005 <br>
**8** | 2006 t/m 2010 <br>
**9** | 2011 t/m 2014 <br>
**10** | 2015 t/m 2020 <br>
**11** | 2021 en later

In [303]:
# Define method for the classification of building years
def classify_year(year):
    if year < 1945:
        return "< 1945"
    elif 1945 <= year < 1965:
        return "1945 - 1964"
    elif 1965 <= year < 1985:
        return "1965 - 1984"
    elif 1985 <= year < 2005:
        return "1985 - 2004"
    else:
        return ">= 2005"

The method below maps energy labels to useful heat demand in kWh/m2. This mapping is based on the BENG 2 norm and takes the average of the upper and lower bound between two labels. See the image below.

![image](config/energielabel.jpg)

Source: https://www.lente-akkoord.nl/nieuws/planning-regeling-energielabels

In [304]:
# Define method for the classification of energylabels
def classify_label(label):
    if label == "A" or label == "A+" or label == "A++" or label == "A2+" or label == "A+++" or label == "A3+" or label == "A++++" or label == "A4+" or label == "A+++++" or label == "A5+":
        return 118
    elif label == "B":
        return 175
    elif label == "C":
        return 220
    elif label == "D":
        return 270
    elif label == "E":
        return 313
    elif label == "F":
        return 358
    elif label == "G":
        return 403

## ETLocal template

### Extract

This is handled by the **ETLocalDataModule** which provides a unified interface for all data operations.

The module loads the following data sources:
- The ETLocal template (ETLocal_template_empty.csv)

In [305]:
# Initialize the data manager
data_manager = LoadDataManager(sep=sep)

# Load template filtered for buildings
df_template_local = data_manager.load_template()

# Create a filter the ETLocal keys that are relevant for the households and buildings building stock and insulation level categories
filter_housing_stock_and_insulation = (slice(None), 'households', 'households_housing_stock')

df_template_local

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM1680,agriculture,agriculture_heat_chp,agriculture_chp_engine_network_gas_dispatchable_demand,TJ,,
GM1680,agriculture,agriculture_heat_chp,agriculture_chp_engine_biogas_demand,TJ,,
GM1680,agriculture,agriculture_heat_chp,agriculture_chp_wood_pellets_demand,TJ,,
GM1680,agriculture,agriculture_energy_demand,agriculture_final_demand_electricity_demand,TJ,,
GM1680,agriculture,agriculture_energy_demand,agriculture_final_demand_network_gas_demand,TJ,,
...,...,...,...,...,...,...
GM0193,industry,,input_percentage_of_kerosene_industry_final_demand_crude_oil_non_energetic,%,,
GM0193,industry,,input_percentage_of_bio_kerosene_industry_final_demand_crude_oil_non_energetic,%,,
GM0193,industry,,input_percentage_of_lpg_industry_final_demand_crude_oil_non_energetic,%,,
GM0193,industry,,input_percentage_of_bio_oil_industry_final_demand_crude_oil_non_energetic,%,,


## Households

### (one-time functionality) Merging PBL csvs into dataset

This section can be skipped unless you want to reconstruct the `pbl_referentieverbruiken_all_data_from_vivet.csv` file.

The PBL referentieverbruiken data can be found on the VIVET server: https://dataportaal.pbl.nl/VIVET/Referentieverbruik_warmte/Gemeentebestanden_XLS. 

This data, however, consists of 355 Excels (11,5 GB total). The relevant data (tab 'Resultaten gemeente') is calculated upon opening the Excel using resource-intensive formules. The script `gemeente_excel_processor.py` extracts the data from the Excels into 355 separate csv files.

This section merges these csv files into a single csv that can be imported in the [Extract](#extract) section.

#### Combine municipal csv files into one dataframe

This subsection imports all municipal csv files and combines them into one large csv.

In [379]:
# We need the list of 2020 municipalities for the PBL Referentieverbruiken data
path = Path("data", "raw", "municipal_geo_ids_2020.csv")
municipalities_2020 = pd.read_csv(path, sep=sep).iloc[:,0].tolist()
municipalities_2020

['GM1680',
 'GM0358',
 'GM0197',
 'GM0059',
 'GM0482',
 'GM0613',
 'GM0361',
 'GM0141',
 'GM0034',
 'GM0484',
 'GM1723',
 'GM1959',
 'GM0060',
 'GM0307',
 'GM0362',
 'GM0363',
 'GM0200',
 'GM0003',
 'GM0202',
 'GM0106',
 'GM0743',
 'GM0744',
 'GM0308',
 'GM0489',
 'GM0203',
 'GM0888',
 'GM1954',
 'GM0370',
 'GM0889',
 'GM1945',
 'GM1724',
 'GM0893',
 'GM0373',
 'GM0748',
 'GM1859',
 'GM1721',
 'GM0753',
 'GM0209',
 'GM0375',
 'GM0310',
 'GM1728',
 'GM0376',
 'GM0377',
 'GM1901',
 'GM0755',
 'GM1681',
 'GM0147',
 'GM0654',
 'GM0756',
 'GM0757',
 'GM0758',
 'GM0501',
 'GM1876',
 'GM0213',
 'GM0899',
 'GM0312',
 'GM0313',
 'GM0214',
 'GM0502',
 'GM0383',
 'GM0109',
 'GM1706',
 'GM1684',
 'GM0216',
 'GM0148',
 'GM1891',
 'GM0503',
 'GM0010',
 'GM0762',
 'GM0150',
 'GM0384',
 'GM1774',
 'GM0221',
 'GM0222',
 'GM0766',
 'GM0505',
 'GM0498',
 'GM1719',
 'GM0303',
 'GM0225',
 'GM0226',
 'GM1711',
 'GM0385',
 'GM0228',
 'GM0317',
 'GM0770',
 'GM1903',
 'GM0772',
 'GM0230',
 'GM0114',
 'GM0388',

In [381]:
def create_combined_header(df_header: pd.DataFrame) -> List[str]:
        """
        Combineer de eerste twee rijen van de header tot 'Categorie/Parameter' format
        
        Args:
            df_header: DataFrame met de header rijen
            
        Returns:
            List van gecombineerde header namen
        """
        combined_headers = []
        current_category = ""  # Houdt de huidige categorie bij voor doorvoering
        
        for col_idx in range(len(df_header.columns)):
            category = str(df_header.iloc[0, col_idx]) if not pd.isna(df_header.iloc[0, col_idx]) else ""
            parameter = str(df_header.iloc[1, col_idx]) if not pd.isna(df_header.iloc[1, col_idx]) else ""
            
            # Schoon lege waarden op
            category = category.strip() if category != "nan" else ""
            parameter = parameter.strip() if parameter != "nan" else ""
            
            # Update current_category als we een nieuwe categorie hebben
            if category:
                current_category = category
            
            # Combineer tot 'Categorie/Parameter'
            if current_category and parameter:
                # Gebruik current_category (kan van vorige kolom zijn)
                combined_header = f"{current_category}/{parameter}"
            elif parameter:  # Parameter zonder categorie
                combined_header = parameter
            elif current_category:  # Categorie zonder parameter
                combined_header = current_category
            else:  # Beide leeg
                combined_header = f"Column_{col_idx + 1}"
                
            combined_headers.append(combined_header)
            
        return combined_headers

Some csv files were created by saving a copy of the Excel file, tab 'Resultaten gemeente' via Excel. Those files have a ';' or `,` separator. The script below converts these csv files into the same format as the others: first 18 columns, separator ',', proper heading.

This script has become obsolete now that all PBL Excel files have been processed.

In [None]:
# municipalities_with_raw_csv_semicolon_sep = [
#     # "GM0363",
#     # "GM0014",
#     # "GM0034",
#     # "GM0153",
#     # "GM0268",
#     # "GM0599",
#     # "GM0518",
# ]

# municipalities_with_raw_csv_comma_sep = [
#     # "GM0307",
#     # "GM0344",
#     # "GM0345",
#     # "GM0356",
#     # "GM0392",
#     # "GM0400",
#     # "GM0772",
#     # "GM0983",
#     # "GM0995",
#     # "GM1904",
#     # "GM1904",
#     # "GM0202",
#     # "GM0479",
#     # "GM0758",
#     # "GM0855",
#     # "GM0796"
# ]

# municipalities_with_raw_csv_only = municipalities_with_raw_csv_semicolon_sep + municipalities_with_raw_csv_comma_sep

# # define maximum number of columns to be extracted from the dataframe
# max_cols = 18

# for municipality in municipalities_with_raw_csv_only:
#     # Import municipality csv without headers
#     print(f"Processing municipality {municipality}...") # DEBUG
#     # define separator
#     if municipality in municipalities_with_raw_csv_semicolon_sep:
#         separator = ";"
#     else:
#         separator = ","

#     path_gemeente = Path("csv_output", "raw csv files", f"gemeente_{municipality}.csv")
#     df_gemeente = pd.read_csv(path_gemeente, sep=separator, header=None)
#     df_gemeente = df_gemeente.iloc[:, :max_cols]  # Limit to max_cols columns

#     # Extract header rows (first four)
#     df_header = df_gemeente.iloc[:4].copy()
#     # Create combined header where the first two rows are combined into 'Category/Parameter' format
#     combined_headers = create_combined_header(df_header.iloc[:, :max_cols])
#     print(combined_headers)  # DEBUG

#     # Set the combined headers as the dataframe columns
#     df_gemeente.columns = combined_headers
#     # Drop the first four header rows from the dataframe
#     df_gemeente = df_gemeente.iloc[4:].reset_index(drop=True)

#     # Write to file
#     export_path = Path("csv_output", f"gemeente_extracted_{municipality}.csv")
#     df_gemeente.to_csv(export_path, sep=sep, index=False)

Now we combine all separate csv files into one large dataframe.

In [382]:
df_pbl_raw = {}

total_number_of_empty_rows = 0
total_rows = 0

# Import the municipal csv files
for municipality in municipalities_2020:
    print(f"Processing municipality {municipality}...") # DEBUG
    path_municipal_csvs = Path("csv_output", f"gemeente_extracted_{municipality}.csv")
    df_municipality = pd.read_csv(path_municipal_csvs, sep=sep)

    # Check if there are any empty entries in column 'Functionele vraag/ruimteverwarming'
    # If so, count the number of rows and print the share of empty entries
    if df_municipality['Functionele vraag/ruimteverwarming'].isnull().any():
        empty_count = df_municipality['Functionele vraag/ruimteverwarming'].isnull().sum()
        total_number_of_empty_rows += empty_count
        total_count = df_municipality.shape[0]
        total_rows += total_count
        # print(f"Warning: Municipality {municipality} has empty entries in 'Functionele vraag/ruimteverwarming' column.") # DEBUG
        # print(f"Total number of empty entries: {empty_count}") # DEBUG
        # print(f"Share of empty entries: {empty_count / total_count:.2%}") # DEBUG

    df_pbl_raw[municipality] = df_municipality

print(f"Total number of households with missing ruimteverwarming data: {total_number_of_empty_rows}")
print(f"Total number of households: {total_rows}")

# Preview the first dataframe in the dictionary
print(f"Previewing dataframe for municipality {municipalities_2020[0]}:")
df_pbl_raw[municipalities_2020[0]].head()

Processing municipality GM1680...
Processing municipality GM0358...
Processing municipality GM0197...
Processing municipality GM0059...
Processing municipality GM0482...
Processing municipality GM0613...
Processing municipality GM0361...
Processing municipality GM0141...
Processing municipality GM0034...
Processing municipality GM0484...
Processing municipality GM1723...
Processing municipality GM1959...
Processing municipality GM0060...
Processing municipality GM0307...
Processing municipality GM0362...
Processing municipality GM0363...
Processing municipality GM0200...
Processing municipality GM0003...
Processing municipality GM0202...
Processing municipality GM0106...
Processing municipality GM0743...
Processing municipality GM0744...
Processing municipality GM0308...
Processing municipality GM0489...
Processing municipality GM0203...
Processing municipality GM0888...
Processing municipality GM1954...
Processing municipality GM0370...
Processing municipality GM0889...
Processing mun

Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,1935,x,0,1,594,0.591685,6.326814,162.935795,169.854294
1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,1995,x,0,0,240,0.407233,5.939613,70.359398,76.706244
2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,1890,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957
3,'1680010000000004','9468ES_73',2,1680,168000,'BU16800000','2 onder 1 kap_1975 - 1991_parthuur',2,4,1985,C,20181022,1,119,0.333673,4.742337,49.197408,54.273419
4,'1680010000000005','9462RR_7',2,1680,168014,'BU16801400','Vrijstaand_1975 - 1991_koop',1,4,1978,x,0,0,159,0.297287,4.527797,61.370166,66.19525


#### Process 2019-2023 municipality changes

We now process the changes in the municipalities between 2019 and 2023.

The most difficult one is the partition of the gemeente Haaren and subsequent merger with four other municipalities ([link](https://www.cbs.nl/nl-nl/onze-diensten/methoden/classificaties/overig/gemeentelijke-indelingen-per-jaar/indeling-per-jaar/gemeentelijke-indeling-op-1-januari-2021)):

| Opgeheven gemeente (Code) | Opgeheven gemeente (Naam) | Ontvangende gemeente (Code) | Ontvangende gemeente (Naam) | Provincie |
| --- | --- | --- | --- | --- |
| 788 | Haaren | 824 | Oisterwijk | NB. |
| 788 | Haaren | 865 | Vught | NB. |
| 788 | Haaren | 757 | Boxtel | NB. |
| 788 | Haaren¹ | 855 | Tilburg¹ | NB. |

Unfortunately all of these municipalities have received a chunk of the residences or inhabitants of the municipality of Haaren, so we can't really assign the entire municipality to only one of the other. Let's load the data on how Haaren was divvied up.

In [383]:
# Load raw/'CBS opgeheven gemeente Haaren 2021.xlsx'
path = Path("Specific for 2023 dataset update", "CBS opgeheven gemeente Haaren 2021.xlsx")
wb_haaren = xw.Book(str(path))
ws_haaren = wb_haaren.sheets[0]
# Read the data from the sheet into a DataFrame
df_haaren_raw = pd.DataFrame(ws_haaren.used_range.value)
# set the first row as the header
df_haaren_raw.columns = df_haaren_raw.iloc[0]
# drop the first row
df_haaren_raw = df_haaren_raw[1:]
# Close the Excel workbook
wb_haaren.close()
# Preview the data
df_haaren_raw.head()

Unnamed: 0,opgeheven gemeente (code),opgeheven gemeente (naam),provincie,nog bestaande gemeente (code),nog bestaande gemeente (naam),provincie.1,ontvangen inwoners,ontvangen woningen,ontvangen km2 land
1,GM0788,Haaren,NB,GM0757,Boxtel,NB,2149.0,918.0,5.35
2,GM0788,Haaren,NB,GM0824,Oisterwijk,NB,5827.0,2412.0,16.27
3,GM0788,Haaren,NB,GM0855,Tilburg,NB,1442.0,588.0,9.78
4,GM0788,Haaren,NB,GM0865,Vught,NB,4900.0,2085.0,26.3


We base the distribution key on the number of residences that each of the municipalities received. That means we distribute the rows of the GM0788 Haaren dataframe over the other four, simply starting at the top.

In [384]:
# Calculate distribution ratios based on housing counts
total_houses = df_haaren_raw['ontvangen woningen'].sum()
print(f"Total houses from GM0788: {total_houses}")

# Create distribution dictionary
distribution_ratios = {}
for _, row in df_haaren_raw.iterrows():
    gm_code = row['nog bestaande gemeente (code)']
    houses = row['ontvangen woningen']
    ratio = houses / total_houses
    distribution_ratios[gm_code] = ratio
    print(f"{gm_code}: {houses} houses ({ratio:.4f})")

print(f"\nDistribution ratios: {distribution_ratios}")
print(f"Sum of ratios: {sum(distribution_ratios.values()):.4f}")

Total houses from GM0788: 6003.0
GM0757: 918.0 houses (0.1529)
GM0824: 2412.0 houses (0.4018)
GM0855: 588.0 houses (0.0980)
GM0865: 2085.0 houses (0.3473)

Distribution ratios: {'GM0757': 0.15292353823088456, 'GM0824': 0.4017991004497751, 'GM0855': 0.09795102448775612, 'GM0865': 0.3473263368315842}
Sum of ratios: 1.0000


In [398]:
df_pbl_transformed = df_pbl_raw.copy()

# Redistribute GM0788 rows to receiving municipalities
if 'GM0788' in list(df_pbl_raw.keys()):
    no_municipalities = len(df_pbl_transformed.keys())
    print(f"df_pbl_transformed has {no_municipalities} municipalities")
    print("Found GM0788 in df_pbl_raw. Starting row redistribution...")
    
    # Get the original GM0788 data
    gm0788_data = df_pbl_transformed['GM0788'].copy()
    total_rows = len(gm0788_data)
    print(f"Total rows (houses) in GM0788: {total_rows}")
    
    # Calculate number of rows for each receiving municipality
    row_allocation = {}
    start_idx = 0
    
    for i, (gm_code, ratio) in enumerate(distribution_ratios.items()):
        if i == len(distribution_ratios) - 1:  # Last municipality gets remaining rows
            num_rows = total_rows - start_idx
        else:
            num_rows = int(total_rows * ratio)
        
        row_allocation[gm_code] = {
            'start': start_idx,
            'end': start_idx + num_rows,
            'count': num_rows
        }
        
        print(f"{gm_code}: {num_rows} rows ({ratio:.4f}) - indices {start_idx} to {start_idx + num_rows - 1}")
        start_idx += num_rows
    
    # Redistribute the rows
    for gm_code, allocation in row_allocation.items():
        start_idx = allocation['start']
        end_idx = allocation['end']
        rows_to_move = gm0788_data.iloc[start_idx:end_idx]
        
        if gm_code in df_pbl_transformed.keys():
            # Update the Regio/gemeente column to the number only (remove leading 'GM')
            print(f"Column 'Regio/gemeente' before update: {rows_to_move['Regio/gemeente'].unique()}") # DEBUG
            rows_to_move['Regio/gemeente'] = int(gm_code[2:])
            print(f"Column 'Regio/gemeente' after update: {rows_to_move['Regio/gemeente'].unique()}") # DEBUG

            # Append to existing municipality data
            df_pbl_transformed[gm_code] = pd.concat([df_pbl_transformed[gm_code], rows_to_move], ignore_index=True)
            print(f"Added {len(rows_to_move)} rows to existing {gm_code}")
        else:
            # Create new dataframe for municipality
            df_pbl_transformed[gm_code] = rows_to_move.reset_index(drop=True)
            print(f"Created new dataframe {gm_code} with {len(rows_to_move)} rows")

    # Remove the original GM0788 column
    df_pbl_transformed.pop('GM0788')
    print("\nRemoved GM0788 dataframe from df_pbl_transformed")
    
    print(f"\nRow redistribution complete. df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")
else:
    print("GM0788 not found in df_pbl_transformed columns")

# preview data for one of the receiving municipalities
df_pbl_transformed['GM0757'].head()

df_pbl_transformed has 355 municipalities
Found GM0788 in df_pbl_raw. Starting row redistribution...
Total rows (houses) in GM0788: 5994
GM0757: 916 rows (0.1529) - indices 0 to 915
GM0824: 2408 rows (0.4018) - indices 916 to 3323
GM0855: 587 rows (0.0980) - indices 3324 to 3910
GM0865: 2083 rows (0.3473) - indices 3911 to 5993
Column 'Regio/gemeente' before update: [788]
Column 'Regio/gemeente' after update: [757]
Added 916 rows to existing GM0757
Column 'Regio/gemeente' before update: [788]
Column 'Regio/gemeente' after update: [824]
Added 2408 rows to existing GM0824
Column 'Regio/gemeente' before update: [788]
Column 'Regio/gemeente' after update: [855]
Added 587 rows to existing GM0855
Column 'Regio/gemeente' before update: [788]
Column 'Regio/gemeente' after update: [865]
Added 2083 rows to existing GM0865

Removed GM0788 dataframe from df_pbl_transformed

Row redistribution complete. df_pbl_transformed now has 354 municipalities


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rows_to_move['Regio/gemeente'] = int(gm_code[2:])


Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
0,'0757010001447209','5282XR_25',2,757,75700,'BU07570002','rijwoning hoek_1965 - 1974_wooncorp',3,3,1966,B,20130427,2,97,0.368541,5.133491,31.761737,37.263769
1,'0757010001447211','5282XR_27',2,757,75700,'BU07570002','rijwoning tussen_1965 - 1974_wooncorp',4,3,1966,B,20130427,2,97,0.370137,5.14038,28.017945,33.528462
2,'0757010001447213','5282XS_28',2,757,75700,'BU07570002','rijwoning hoek_1965 - 1974_wooncorp',3,3,1965,E,20170110,2,126,0.336126,4.689711,45.374234,50.400071
3,'0757010001447215','5282XR_29',2,757,75700,'BU07570002','rijwoning tussen_1965 - 1974_wooncorp',4,3,1966,B,20130427,2,97,0.370137,5.14038,28.017945,33.528462
4,'0757010001447217','5282XR_31',2,757,75700,'BU07570002','rijwoning tussen_1965 - 1974_wooncorp',4,3,1966,B,20130427,2,97,0.370137,5.14038,28.017945,33.528462


We now proceed by calculating the merged municipalities between 2019-2023, starting in 2021.

2021 ([link](https://www.cbs.nl/nl-nl/onze-diensten/methoden/classificaties/overig/gemeentelijke-indelingen-per-jaar/indeling-per-jaar/gemeentelijke-indeling-op-1-januari-2021))
| Opgeheven gemeente (Code) | Opgeheven gemeente (Naam) | Nieuwgevormde gemeente (Code) | Nieuwgevormde gemeente (Naam) | Provincie |
| --- | --- | --- | --- | --- |
| 0003 | Appingedam | 1979 | Eemsdelta | Gr. |
| 0010 | Delfzijl | 1979 | Eemsdelta | Gr. |
| 0024 | Loppersum | 1979 | Eemsdelta | Gr. |



In [400]:
# Merge GM0003, GM0010, and GM0024 into GM1979 (Eemsdelta)
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0003', 'GM0010', 'GM0024']):
    no_municipalities = len(df_pbl_transformed.keys())
    print(f"df_pbl_transformed has {no_municipalities} municipalities")
    print("Merging GM0003 (Appingedam), GM0010 (Delfzijl), and GM0024 (Loppersum) into GM1979 (Eemsdelta)...")
    
    # Get the data from the three source municipalities
    gm0003_data = df_pbl_transformed['GM0003']
    gm0010_data = df_pbl_transformed['GM0010'] 
    gm0024_data = df_pbl_transformed['GM0024']
    
    print(f"GM0003 rows: {len(gm0003_data)}")
    print(f"GM0010 rows: {len(gm0010_data)}")
    print(f"GM0024 rows: {len(gm0024_data)}")
    
    # Update the Regio/gemeente column to GM1979
    gm0003_data['Regio/gemeente'] = 1979
    gm0010_data['Regio/gemeente'] = 1979
    gm0024_data['Regio/gemeente'] = 1979

    # Combine the three municipalities' data
    merged_data = pd.concat([gm0003_data, gm0010_data, gm0024_data], ignore_index=True)
    
    # Create the new GM1979 entry
    df_pbl_transformed['GM1979'] = merged_data
    
    # Remove the original three municipalities
    df_pbl_transformed.pop('GM0003')
    df_pbl_transformed.pop('GM0010')  
    df_pbl_transformed.pop('GM0024')
    
    print(f"Successfully merged into GM1979 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")
    
    # Show sample of merged data
    print("\nSample of merged GM1979 data:")
    print(merged_data.head())
    
else:
    missing_municipalities = [gm for gm in ['GM0003', 'GM0010', 'GM0024'] if gm not in df_pbl_transformed.keys()]
    print(f"Not all required municipalities found in df_pbl_transformed. Missing: {missing_municipalities}")

Not all required municipalities found in df_pbl_transformed. Missing: ['GM0003', 'GM0010', 'GM0024']


Continue with the mergers from 2022

2022 ([link](https://www.cbs.nl/nl-nl/onze-diensten/methoden/classificaties/overig/gemeentelijke-indelingen-per-jaar/indeling-per-jaar/gemeentelijke-indeling-op-1-januari-2022))
| Op te heffen gemeente (Code) | Op te heffen gemeente (Naam) | Nieuwe gemeente (Code) | Nieuwe gemeente (Naam) | Provincie |
| --- | --- | --- | --- | --- |
| 0370 | Beemster | 439 | Purmerend | NH. |
| 0439 | Purmerend | 439 | Purmerend | NH. |
| 0398 | Heerhugowaard | 1980 | Dijk en Waard | NH. |
| 0416 | Langedijk | 1980 | Dijk en Waard | NH. |
| 1685 | Landerd | 1991 | Maashorst | NB. |
| 0856 | Uden | 1991 | Maashorst | NB. |
| 0756 | Boxmeer | 1982 | Land van Cuijk | NB. |
| 1684 | Cuijk | 1982 | Land van Cuijk | NB. |
| 0786 | Grave | 1982 | Land van Cuijk | NB. |
| 0815 | Mill en Sint Hubert | 1982 | Land van Cuijk | NB. |
| 1702 | Sint Anthonis | 1982 | Land van Cuijk | NB. |

In [401]:
# Merge 2022 municipalities

# 1. Merge GM0398 (Heerhugowaard) and GM0416 (Langedijk) into GM1980 (Dijk en Waard)
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0398', 'GM0416']):
    no_municipalities = len(df_pbl_transformed.keys())
    print(f"df_pbl_transformed has {no_municipalities} municipalities")
    print("Merging GM0398 (Heerhugowaard) and GM0416 (Langedijk) into GM1980 (Dijk en Waard)...")
    
    # Get the data from the two source municipalities
    gm0398_data = df_pbl_transformed['GM0398']
    gm0416_data = df_pbl_transformed['GM0416']
    
    print(f"GM0398 rows: {len(gm0398_data)}")
    print(f"GM0416 rows: {len(gm0416_data)}")

    # Update the Regio/gemeente column to GM1980
    gm0398_data['Regio/gemeente'] = 1980
    gm0416_data['Regio/gemeente'] = 1980
    
    # Combine the two municipalities' data
    merged_data = pd.concat([gm0398_data, gm0416_data], ignore_index=True)
    
    # Create the new GM1980 entry
    df_pbl_transformed['GM1980'] = merged_data
    
    # Remove the original two municipalities
    df_pbl_transformed.pop('GM0398')
    df_pbl_transformed.pop('GM0416')
    
    print(f"Successfully merged into GM1980 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")

# 2. Merge GM1685 (Landerd) and GM0856 (Uden) into GM1991 (Maashorst)
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM1685', 'GM0856']):
    print("Merging GM1685 (Landerd) and GM0856 (Uden) into GM1991 (Maashorst)...")
    
    # Get the data from the two source municipalities
    gm1685_data = df_pbl_transformed['GM1685']
    gm0856_data = df_pbl_transformed['GM0856']
    
    print(f"GM1685 rows: {len(gm1685_data)}")
    print(f"GM0856 rows: {len(gm0856_data)}")

    # Update the Regio/gemeente column to GM1991
    gm1685_data['Regio/gemeente'] = 1991
    gm0856_data['Regio/gemeente'] = 1991
    
    # Combine the two municipalities' data
    merged_data = pd.concat([gm1685_data, gm0856_data], ignore_index=True)
    
    # Create the new GM1991 entry
    df_pbl_transformed['GM1991'] = merged_data
    
    # Remove the original two municipalities
    df_pbl_transformed.pop('GM1685')
    df_pbl_transformed.pop('GM0856')
    
    print(f"Successfully merged into GM1991 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")

# 3. Merge GM0756 (Boxmeer), GM1684 (Cuijk), GM0786 (Grave), GM0815 (Mill en Sint Hubert), and GM1702 (Sint Anthonis) into GM1982 (Land van Cuijk)
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0756', 'GM1684', 'GM0786', 'GM0815', 'GM1702']):
    print("Merging GM0756 (Boxmeer), GM1684 (Cuijk), GM0786 (Grave), GM0815 (Mill en Sint Hubert), and GM1702 (Sint Anthonis) into GM1982 (Land van Cuijk)...")
    
    # Get the data from the five source municipalities
    gm0756_data = df_pbl_transformed['GM0756']
    gm1684_data = df_pbl_transformed['GM1684']
    gm0786_data = df_pbl_transformed['GM0786']
    gm0815_data = df_pbl_transformed['GM0815']
    gm1702_data = df_pbl_transformed['GM1702']
    
    print(f"GM0756 rows: {len(gm0756_data)}")
    print(f"GM1684 rows: {len(gm1684_data)}")
    print(f"GM0786 rows: {len(gm0786_data)}")
    print(f"GM0815 rows: {len(gm0815_data)}")
    print(f"GM1702 rows: {len(gm1702_data)}")

    # Update the Regio/gemeente column to GM1982
    gm0756_data['Regio/gemeente'] = 1982
    gm1684_data['Regio/gemeente'] = 1982
    gm0786_data['Regio/gemeente'] = 1982
    gm0815_data['Regio/gemeente'] = 1982
    gm1702_data['Regio/gemeente'] = 1982
    
    # Combine the five municipalities' data
    merged_data = pd.concat([gm0756_data, gm1684_data, gm0786_data, gm0815_data, gm1702_data], ignore_index=True)
    
    # Create the new GM1982 entry
    df_pbl_transformed['GM1982'] = merged_data
    
    # Remove the original five municipalities
    df_pbl_transformed.pop('GM0756')
    df_pbl_transformed.pop('GM1684')
    df_pbl_transformed.pop('GM0786')
    df_pbl_transformed.pop('GM0815')
    df_pbl_transformed.pop('GM1702')
    
    print(f"Successfully merged into GM1982 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")

# 4. Merge GM0370 (Beemster) with GM0439 (Purmerend) - GM0439 remains as the code
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0370', 'GM0439']):
    print("Merging GM0370 (Beemster) with GM0439 (Purmerend)...")
    
    # Get the data from the two source municipalities
    gm0370_data = df_pbl_transformed['GM0370']
    gm0439_data = df_pbl_transformed['GM0439']
    
    print(f"GM0370 rows: {len(gm0370_data)}")
    print(f"GM0439 rows: {len(gm0439_data)}")

    # Update the Regio/gemeente column to GM0439
    gm0370_data['Regio/gemeente'] = 439
    
    # Combine the two municipalities' data (GM0439 keeps its code)
    merged_data = pd.concat([gm0370_data, gm0439_data], ignore_index=True)
    
    # Update the existing GM0439 entry with merged data
    df_pbl_transformed['GM0439'] = merged_data
    
    # Remove the original GM0370 municipality
    df_pbl_transformed.pop('GM0370')
    
    print(f"Successfully merged into GM0439 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")

else:
    missing_municipalities = []
    for merger in [['GM0398', 'GM0416'], ['GM1685', 'GM0856'], ['GM0756', 'GM1684', 'GM0786', 'GM0815', 'GM1702'], ['GM0370', 'GM0439']]:
        missing = [gm for gm in merger if gm not in df_pbl_transformed.keys()]
        if missing:
            missing_municipalities.extend(missing)
    
    if missing_municipalities:
        print(f"Not all required municipalities found for 2022 mergers. Missing: {list(set(missing_municipalities))}")

# Preview the transformed data for one of the new municipalities
df_pbl_transformed['GM1979'].head()

df_pbl_transformed has 352 municipalities
Merging GM0398 (Heerhugowaard) and GM0416 (Langedijk) into GM1980 (Dijk en Waard)...
GM0398 rows: 24425
GM0416 rows: 11768
Successfully merged into GM1980 with 36193 total rows
df_pbl_transformed now has 351 municipalities
Merging GM1685 (Landerd) and GM0856 (Uden) into GM1991 (Maashorst)...
GM1685 rows: 6439
GM0856 rows: 19078
Successfully merged into GM1991 with 25517 total rows
df_pbl_transformed now has 350 municipalities
Merging GM0756 (Boxmeer), GM1684 (Cuijk), GM0786 (Grave), GM0815 (Mill en Sint Hubert), and GM1702 (Sint Anthonis) into GM1982 (Land van Cuijk)...
GM0756 rows: 12712
GM1684 rows: 11315
GM0786 rows: 5484
GM0815 rows: 4649
GM1702 rows: 4871
Successfully merged into GM1982 with 39031 total rows
df_pbl_transformed now has 346 municipalities
Merging GM0370 (Beemster) with GM0439 (Purmerend)...
GM0370 rows: 4034
GM0439 rows: 37114
Successfully merged into GM0439 with 41148 total rows
df_pbl_transformed now has 345 municipalities

Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
0,'0003010000125985','9901KB_16',1,1979,300,'BU00030000','meergezins: laag en midden_2000 - 2005_wooncorp',5,7,2002,B,20181031,2,69,0.148951,3.247479,18.971059,22.36749
1,'0003010000125986','9901KB_20',1,1979,300,'BU00030000','meergezins: laag en midden_2000 - 2005_wooncorp',5,7,2002,C,20181031,2,69,0.148951,3.247479,21.414984,24.811414
2,'0003010000125991','9901AD_15',2,1979,300,'BU00030000','rijwoning tussen_voor 1930_koop',4,0,1925,G,20181030,0,66,0.377568,5.122997,30.164553,35.665118
3,'0003010000125992','9901AD_6',3,1979,300,'BU00030000','2 onder 1 kap_1946 - 1964_koop',2,2,1950,x,0,0,153,0.411708,5.781394,52.965646,59.158748
4,'0003010000125994','9901AD_10_a',2,1979,300,'BU00030000','Vrijstaand_1930 - 1945_parthuur',1,1,1930,x,0,1,45,0.35737,5.040676,46.693591,52.091637


And finally those from 2023. That includes the merger of Weesp with Amsterdam that took place on 24-03-2022.

2023 ([link](https://www.cbs.nl/nl-nl/onze-diensten/methoden/classificaties/overig/gemeentelijke-indelingen-per-jaar/indeling-per-jaar/gemeentelijke-indeling-op-1-januari-2023))
| Op te heffen gemeente (Code) | Op te heffen gemeente (Naam) | Nieuw te vormen gemeente (Code) | Nieuw te vormen gemeente (Naam) | Provincie |
| --- | --- | --- | --- | --- |
| 0501 | Brielle | 1992 | Voorne aan Zee | ZH. |
| 0530 | Hellevoetsluis | 1992 | Voorne aan Zee | ZH. |
| 0614 | Westvoorne | 1992 | Voorne aan Zee | ZH. |
| 0457 | Weesp | 0363 | Amsterdam | NH. |

In [402]:
# Merge 2023 municipalities

# Merge GM0501 (Brielle), GM0530 (Hellevoetsluis), and GM0614 (Westvoorne) into GM1992 (Voorne aan Zee)
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0501', 'GM0530', 'GM0614']):
    no_municipalities = len(df_pbl_transformed.keys())
    print(f"df_pbl_transformed has {no_municipalities} municipalities")
    print("Merging GM0501 (Brielle), GM0530 (Hellevoetsluis), and GM0614 (Westvoorne) into GM1992 (Voorne aan Zee)...")
    
    # Get the data from the three source municipalities
    gm0501_data = df_pbl_transformed['GM0501']
    gm0530_data = df_pbl_transformed['GM0530']
    gm0614_data = df_pbl_transformed['GM0614']
    
    print(f"GM0501 rows: {len(gm0501_data)}")
    print(f"GM0530 rows: {len(gm0530_data)}")
    print(f"GM0614 rows: {len(gm0614_data)}")

    # Update the Regio/gemeente column to GM1992
    gm0501_data['Regio/gemeente'] = 1992
    gm0530_data['Regio/gemeente'] = 1992
    gm0614_data['Regio/gemeente'] = 1992
    
    # Combine the three municipalities' data
    merged_data = pd.concat([gm0501_data, gm0530_data, gm0614_data], ignore_index=True)
    
    # Create the new GM1992 entry
    df_pbl_transformed['GM1992'] = merged_data
    
    # Remove the original three municipalities
    df_pbl_transformed.pop('GM0501')
    df_pbl_transformed.pop('GM0530')
    df_pbl_transformed.pop('GM0614')
    
    print(f"Successfully merged into GM1992 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")
    
    # Show sample of merged data
    print("\nSample of merged GM1992 data:")
    print(merged_data.head())
    
else:
    missing_municipalities = [gm for gm in ['GM0501', 'GM0530', 'GM0614'] if gm not in df_pbl_transformed.keys()]
    print(f"Not all required municipalities found for 2023 mergers. Missing: {missing_municipalities}")

# Merge Weesp with Amsterdam
# Merge GM0457 (Weesp) with GM0363 (Amsterdam) - GM0363 remains as the code
if all(gm_code in df_pbl_transformed.keys() for gm_code in ['GM0457', 'GM0363']):
    print("Merging GM0457 (Weesp) with GM0363 (Amsterdam)...")
    
    # Get the data from the two source municipalities
    gm0457_data = df_pbl_transformed['GM0457']
    gm0363_data = df_pbl_transformed['GM0363']
    
    print(f"GM0457 rows: {len(gm0457_data)}")
    print(f"GM0363 rows: {len(gm0363_data)}")

    # Update the Regio/gemeente column to GM0363
    gm0457_data['Regio/gemeente'] = 363
    
    # Combine the two municipalities' data (GM0363 keeps its code)
    merged_data = pd.concat([gm0457_data, gm0363_data], ignore_index=True)
    
    # Update the existing GM0363 entry with merged data
    df_pbl_transformed['GM0363'] = merged_data
    
    # Remove the original GM0457 municipality
    df_pbl_transformed.pop('GM0457')
    
    print(f"Successfully merged into GM0363 with {len(merged_data)} total rows")
    print(f"df_pbl_transformed now has {len(df_pbl_transformed.keys())} municipalities")
else:
    missing_municipalities = [gm for gm in ['GM0457', 'GM0363'] if gm not in df_pbl_transformed.keys()]
    print(f"Not all required municipalities found for Weesp-Amsterdam merger. Missing: {missing_municipalities}")

# Preview the transformed data for one of the new municipalities
df_pbl_transformed['GM1992'].head()

df_pbl_transformed has 345 municipalities
Merging GM0501 (Brielle), GM0530 (Hellevoetsluis), and GM0614 (Westvoorne) into GM1992 (Voorne aan Zee)...
GM0501 rows: 8099
GM0530 rows: 18057
GM0614 rows: 6936
Successfully merged into GM1992 with 33092 total rows
df_pbl_transformed now has 343 municipalities

Sample of merged GM1992 data:
        Woning/vbo_id Adres/Postcode_huisnummer  \
0  '0501010001998102'               '3232HE_19'   
1  '0501010001998103'               '3237AP_23'   
2  '0501010001998104'               '3232VT_50'   
3  '0501010001998105'               '3232PD_74'   
4  '0501010001998106'               '3237AL_17'   

   Aantal bewoners/Aantal bewoners  Regio/gemeente  Regio/wijk  \
0                                3            1992       50100   
1                                3            1992       50101   
2                                2            1992       50100   
3                                3            1992       50100   
4                           

Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
0,'0501010001998102','3232HE_19',3,1992,50100,'BU05010006','rijwoning hoek_1992 - 1995_koop',3,5,1992,B,20190720,0,124,0.428785,6.291071,32.429816,39.149671
1,'0501010001998103','3237AP_23',3,1992,50101,'BU05010100','rijwoning tussen_1965 - 1974_koop',4,3,1970,C,20170929,0,113,0.435614,6.204778,31.396987,38.037379
2,'0501010001998104','3232VT_50',2,1992,50100,'BU05010003','rijwoning tussen_1965 - 1974_wooncorp',4,3,1967,C,20211001,2,104,0.322547,4.581434,28.317611,33.221593
3,'0501010001998105','3232PD_74',3,1992,50100,'BU05010006','rijwoning tussen_1992 - 1995_koop',4,5,1995,B,20200118,0,105,0.41689,6.238424,25.008407,31.663721
4,'0501010001998106','3237AL_17',3,1992,50101,'BU05010100','rijwoning hoek_1965 - 1974_koop',3,3,1969,C,20150130,0,120,0.43914,6.219441,37.222765,43.881346


In [None]:
# Check: are the municipalities in df_pbl_transformed the same as the 2023 list of municipalities?
# This means checking the keys of the dictionary df_pbl_transformed
print("Checking if df_pbl_transformed contains all 2023 municipalities...")
muns = list(df_pbl_transformed.keys())
# Calculate non-overlap between muns and municipalities
non_overlap_1 = set(municipalities) - set(muns)
non_overlap_2 = set(muns) - set(municipalities)
non_overlap = list(non_overlap_1.union(non_overlap_2))
print(f"Municipalities not in either list: {non_overlap}")

Checking if df_pbl_transformed contains all 2023 municipalities...
Municipalities not in either list: []


In [None]:
# # open first municipality in df_pbl_transformed
# print(f"Showing data for first municipality in df_pbl_transformed: {municipalities[0]}")
# first_municipality = df_pbl_transformed[municipalities[0]]
# first_municipality

Finally we concatenate the dictionary of dataframes into one big dataframe so we can export it in the next section.

In [413]:
# Load df_pbl_indexed and store it in a dict of dataframes per municipality
dfs = []
for municipality in municipalities:
    print(f"Processing {municipality}")
    df_municipality = df_pbl_transformed[municipality]

    # Add a column 'geo_id' that converts the number Regio/gemeente to the format GMXXXX including possible leading zeros
    df_municipality['geo_id'] = df_municipality['Regio/gemeente'].apply(lambda x: f"GM{int(x):04d}")
    
    # Append dataframe to the list
    dfs.append(df_municipality)

# Concatenate list of dataframes to one big dataframe
print("Concatenating dataframes...")
df_pbl_all_data_2023 = pd.concat(dfs,ignore_index=True)

# # Preview dataframe
df_pbl_all_data_2023

Processing GM1680
Processing GM0358
Processing GM0197
Processing GM0059
Processing GM0482
Processing GM0613
Processing GM0361
Processing GM0141
Processing GM0034
Processing GM0484
Processing GM1723
Processing GM1959
Processing GM0060
Processing GM0307
Processing GM0362
Processing GM0363
Processing GM0200
Processing GM0202
Processing GM0106
Processing GM0743
Processing GM0744
Processing GM0308
Processing GM0489
Processing GM0203
Processing GM0888
Processing GM1954
Processing GM0889
Processing GM1945
Processing GM1724
Processing GM0893
Processing GM0373
Processing GM0748
Processing GM1859
Processing GM1721
Processing GM0753
Processing GM0209
Processing GM0375
Processing GM0310
Processing GM1728
Processing GM0376
Processing GM0377
Processing GM1901
Processing GM0755
Processing GM1681
Processing GM0147
Processing GM0654
Processing GM0757
Processing GM0758
Processing GM1876
Processing GM0213
Processing GM0899
Processing GM0312
Processing GM0313
Processing GM0214
Processing GM0502
Processing

Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal,geo_id
0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,1935,x,0,1,594,0.591685,6.326814,162.935795,169.854294,GM1680
1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,1995,x,0,0,240,0.407233,5.939613,70.359398,76.706244,GM1680
2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,1890,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957,GM1680
3,'1680010000000004','9468ES_73',2,1680,168000,'BU16800000','2 onder 1 kap_1975 - 1991_parthuur',2,4,1985,C,20181022,1,119,0.333673,4.742337,49.197408,54.273419,GM1680
4,'1680010000000005','9462RR_7',2,1680,168014,'BU16801400','Vrijstaand_1975 - 1991_koop',1,4,1978,x,0,0,159,0.297287,4.527797,61.370166,66.19525,GM1680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7932436,'0193010000105649','8011XA_10_A',2,193,19310,'BU01931020','rijwoning tussen_voor 1930_koop',4,0,1901,x,0,0,59,0.373432,5.106137,30.591981,36.07155,GM0193
7932437,'0193010000105687','8025AR_10',3,193,19312,'BU01931200','Vrijstaand_1992 - 1995_koop',1,5,1995,B,20201221,0,233,0.430559,6.044307,70.959288,77.434153,GM0193
7932438,'0193010000105923','8042CV_31',2,193,19321,'BU01932100','Vrijstaand_1975 - 1991_koop',1,4,1987,x,0,0,136,0.300839,4.584817,50.66984,55.555495,GM0193
7932439,'0193010000105959','8012CE_196',1,193,19313,'BU01931320','meergezins: laag en midden_2006 - 2010_parthuur',5,8,2007,A,20201220,1,42,0.145302,3.222681,12.740837,16.10882,GM0193


Let's check if the column geo_id now contains all 2023 municipal ids.

In [415]:
geo_ids = df_pbl_all_data_2023['geo_id'].unique().tolist()
print(f"Number of unique geo_ids in filtered data: {len(geo_ids)}")

# Check: are the municipalities in geo_ids the same as the 2023 list of municipalities?
print("Checking if column 'geo_id' contains all 2023 municipalities...")
# Calculate non-overlap between geo_ids and municipalities
non_overlap_1 = set(municipalities) - set(geo_ids)
non_overlap_2 = set(geo_ids) - set(municipalities)
non_overlap = list(non_overlap_1.union(non_overlap_2))
print(f"Municipalities not in either list: {non_overlap}")

Number of unique geo_ids in filtered data: 342
Checking if column 'geo_id' contains all 2023 municipalities...
Municipalities not in either list: []


#### Filtering the data

Let's check for NaN rows in Functionele vraag/ruimteverwarming and Woningkenmerken/oppervlakte, which are the two main data columns of interest

In [416]:
# count number of rows in df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'] that are NaN
num_nan = df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'].isna().sum()
total_rows = df_pbl_all_data_2023.shape[0]
print(f"Number of NaN entries in 'Functionele vraag/ruimteverwarming': {num_nan} out of {total_rows} total rows ({(num_nan/total_rows)*100:.2f}%)")

# Do the same for Woningkenmerken/oppervlakte
num_nan = df_pbl_all_data_2023['Woningkenmerken/oppervlakte'].isna().sum()
total_rows = df_pbl_all_data_2023.shape[0]
print(f"Number of NaN entries in 'Woningkenmerken/oppervlakte': {num_nan} out of {total_rows} total rows ({(num_nan/total_rows)*100:.2f}%)")

Number of NaN entries in 'Functionele vraag/ruimteverwarming': 501 out of 7932441 total rows (0.01%)
Number of NaN entries in 'Woningkenmerken/oppervlakte': 0 out of 7932441 total rows (0.00%)


Some Functionele vraag/ruimteverwarming values use a `,` decimal separator instead of a `.`. Let's fix that and make sure all values are read the same way.

In [417]:
# Convert values in 'Functionele vraag/ruimteverwarming' with a `,` decimal separator instead of a `.` to float
df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'] = df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'].astype(str).str.replace(',', '.')
df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'] = pd.to_numeric(df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'], errors='coerce') # Convert to numeric, coercing errors to NaN

# count number of rows in df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'] that are NaN
num_nan = df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'].isna().sum()
total_rows = df_pbl_all_data_2023.shape[0]
print(f"After conversion: Number of NaN entries in 'Functionele vraag/ruimteverwarming': {num_nan} out of {total_rows} total rows ({(num_nan/total_rows)*100:.2f}%)")

After conversion: Number of NaN entries in 'Functionele vraag/ruimteverwarming': 501 out of 7932441 total rows (0.01%)


We remove the NaN rows in Functionele vraag/ruimteverwarming

In [418]:
# Filter for non-NaN entries in 'Functionele vraag/ruimteverwarming'
df_pbl_all_data_2023_filtered = df_pbl_all_data_2023[df_pbl_all_data_2023['Functionele vraag/ruimteverwarming'].notna()]

# count number of rows in df_pbl_all_data_2023['Functionele vraag/ruimteverwarming']
total_rows = df_pbl_all_data_2023_filtered.shape[0]
print(f"After filtering: Total rows in df_pbl_all_data_2023: {total_rows}")

After filtering: Total rows in df_pbl_all_data_2023: 7931940


#### Export to file

Summary:
* PBL referentieverbruiken data retrieved from the Excel files misses 7,934,572 - 7,932,441 = 2,131 rows (0.02%) compared to the previously used csv we received from PBL in 2023 (Dropbox: `pbl_all_data_2023.csv`);
* 501 out of these rows (<0.01%) had no data for Functionele vraag/ruimteverwarming.

We now export the resulting dataframe to a csv

In [419]:
df_pbl_all_data_2023_filtered.to_csv(Path("data", "raw","pbl_referentieverbruiken_all_data_from_vivet.csv"), sep=sep, index=True)

### Extract

**Note 25 September 2025**: The dataset for Almere seems to be missing some data as a result of extracting the VIVET Excel files. Compared to the faulty 2023 csvs we received from PBL, for example, the number of `present_number_of_detached_houses_2005_present` is lowered by ~22% (544 houses). 

That seems like something we ought to fix eventually. Currently we don't have time.

In [420]:
extract_path = Path("data", "raw", "pbl_referentieverbruiken_all_data_from_vivet.csv")
df_raw_pbl = pd.read_csv(extract_path, sep=sep)

# set the column geo_id as the index
df_raw_pbl.set_index("geo_id", inplace=True)

# preview
df_raw_pbl.head(10)

  df_raw_pbl = pd.read_csv(extract_path, sep=sep)


Unnamed: 0_level_0,Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GM1680,0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,1935,x,0,1,594,0.591685,6.326814,162.935795,169.854294
GM1680,1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,1995,x,0,0,240,0.407233,5.939613,70.359398,76.706244
GM1680,2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,1890,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957
GM1680,3,'1680010000000004','9468ES_73',2,1680,168000,'BU16800000','2 onder 1 kap_1975 - 1991_parthuur',2,4,1985,C,20181022,1,119,0.333673,4.742337,49.197408,54.273419
GM1680,4,'1680010000000005','9462RR_7',2,1680,168014,'BU16801400','Vrijstaand_1975 - 1991_koop',1,4,1978,x,0,0,159,0.297287,4.527797,61.370166,66.19525
GM1680,5,'1680010000000007','9468EZ_10',2,1680,168000,'BU16800000','rijwoning hoek_1965 - 1974_wooncorp',3,3,1973,C,20180711,2,84,0.39561,5.250335,39.80482,45.450765
GM1680,6,'1680010000000009','9658PH_59',2,1680,168006,'BU16800600','Vrijstaand_1965 - 1974_koop',1,3,1965,D,20181024,0,152,0.275806,4.426657,61.348057,66.050521
GM1680,7,'1680010000000010','9462PK_29',2,1680,168014,'BU16801400','Vrijstaand_1930 - 1945_koop',1,1,1935,x,0,0,80,0.38056,5.127882,55.815792,61.324235
GM1680,8,'1680010000000011','9451GA_5',3,1680,168019,'BU16801900','rijwoning tussen_1946 - 1964_koop',4,2,1964,D,20180716,0,108,0.507538,6.50386,43.806633,50.818031
GM1680,9,'1680010000000012','9659PA_24',2,1680,168012,'BU16801200','Vrijstaand_voor 1930_koop',1,0,1888,x,0,0,137,0.338688,4.561201,56.544337,61.444226


In [421]:
df_raw_pbl

Unnamed: 0_level_0,Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GM1680,0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,1935,x,0,1,594,0.591685,6.326814,162.935795,169.854294
GM1680,1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,1995,x,0,0,240,0.407233,5.939613,70.359398,76.706244
GM1680,2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,1890,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957
GM1680,3,'1680010000000004','9468ES_73',2,1680,168000,'BU16800000','2 onder 1 kap_1975 - 1991_parthuur',2,4,1985,C,20181022,1,119,0.333673,4.742337,49.197408,54.273419
GM1680,4,'1680010000000005','9462RR_7',2,1680,168014,'BU16801400','Vrijstaand_1975 - 1991_koop',1,4,1978,x,0,0,159,0.297287,4.527797,61.370166,66.19525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GM0193,7932436,'0193010000105649','8011XA_10_A',2,193,19310,'BU01931020','rijwoning tussen_voor 1930_koop',4,0,1901,x,0,0,59,0.373432,5.106137,30.591981,36.07155
GM0193,7932437,'0193010000105687','8025AR_10',3,193,19312,'BU01931200','Vrijstaand_1992 - 1995_koop',1,5,1995,B,20201221,0,233,0.430559,6.044307,70.959288,77.434153
GM0193,7932438,'0193010000105923','8042CV_31',2,193,19321,'BU01932100','Vrijstaand_1975 - 1991_koop',1,4,1987,x,0,0,136,0.300839,4.584817,50.669840,55.555495
GM0193,7932439,'0193010000105959','8012CE_196',1,193,19313,'BU01931320','meergezins: laag en midden_2006 - 2010_parthuur',5,8,2007,A,20201220,1,42,0.145302,3.222681,12.740837,16.10882


### Transform

#### Clean and preprocess

The PBL Referentieverbruiken dataset contains data up to 2020. Let's limit the build years to 2023 to be sure.

In [422]:
# Keep the buildings that were built in 2023 or before.
df_cleaned_pbl = df_raw_pbl[df_raw_pbl['Woningkenmerken/bouwjaar'] <= 2023].copy()

# Preview data
df_cleaned_pbl.head(3)

Unnamed: 0_level_0,Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
GM1680,0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,1935,x,0,1,594,0.591685,6.326814,162.935795,169.854294
GM1680,1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,1995,x,0,0,240,0.407233,5.939613,70.359398,76.706244
GM1680,2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,1890,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957


In [423]:
# Convert the columns Woningkenmerken/oppervlakte and Functionele vraag/ruimteverwarming to numeric, forcing errors to NaN
df_cleaned_pbl['Woningkenmerken/oppervlakte'] = pd.to_numeric(df_cleaned_pbl['Woningkenmerken/oppervlakte'], errors='coerce')
df_cleaned_pbl['Functionele vraag/ruimteverwarming'] = pd.to_numeric(df_cleaned_pbl['Functionele vraag/ruimteverwarming'], errors='coerce')

# Show all rows where Functionele vraag/ruimteverwarming is NaN
df_raw_pbl[df_cleaned_pbl['Functionele vraag/ruimteverwarming'].isna()]

# Check for NaN values in the two columns
# nan_oppervlakte = df_cleaned_pbl['Woningkenmerken/oppervlakte'].isna().sum()
# nan_ruimteverwarming = df_cleaned_pbl['Functionele vraag/ruimteverwarming'].isna().sum()
# total_rows = df_cleaned_pbl.shape[0]
# print(f"NaN values in 'Woningkenmerken/oppervlakte': {nan_oppervlakte} out of {total_rows} ({(nan_oppervlakte/total_rows)*100:.2f}%)")
# print(f"NaN values in 'Functionele vraag/ruimteverwarming': {nan_ruimteverwarming} out of {total_rows} ({(nan_ruimteverwarming/total_rows)*100:.2f}%)")

Unnamed: 0_level_0,Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,Woningkenmerken/bouwjaar,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1


We then add new columns that include the ETM housing type and ETM build period

In [424]:
df_cleaned_pbl['Woningtype ETM'] = df_cleaned_pbl['Woningkenmerken/woningtype'].apply(classify_housing_type)
df_cleaned_pbl['Bouwjaarklasse ETM'] = df_cleaned_pbl['Woningkenmerken/bouwjaar'].apply(classify_year)
# Preview data
df_cleaned_pbl.head(3)

Unnamed: 0_level_0,Unnamed: 0,Woning/vbo_id,Adres/Postcode_huisnummer,Aantal bewoners/Aantal bewoners,Regio/gemeente,Regio/wijk,Regio/buurtcode,Woningkenmerken/Kenmerken,Woningkenmerken/woningtype,Woningkenmerken/bouwperiode,...,Woningkenmerken/schillabel,Woningkenmerken/labeldatum,Woningkenmerken/eigendom,Woningkenmerken/oppervlakte,Functionele vraag/koken,Functionele vraag/warm tapwater,Functionele vraag/ruimteverwarming,Functionele vraag/Totaal,Woningtype ETM,Bouwjaarklasse ETM
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GM1680,0,'1680010000000001','9461GC_6',4,1680,168017,'BU16801700','2 onder 1 kap_1930 - 1945_parthuur',2,1,...,x,0,1,594,0.591685,6.326814,162.935795,169.854294,Hoekhuis,< 1945
GM1680,1,'1680010000000002','9461HE_3',3,1680,168017,'BU16801700','Vrijstaand_1992 - 1995_koop',1,5,...,x,0,0,240,0.407233,5.939613,70.359398,76.706244,Vrijstaand huis,1985 - 2004
GM1680,2,'1680010000000003','9463TA_1',2,1680,168001,'BU16800109','Vrijstaand_voor 1930_koop',1,0,...,D,20200910,0,110,0.351695,4.650108,51.558154,56.559957,Vrijstaand huis,< 1945


We now select the columns we are interested in

* Woningkenmerken/oppervlakte
* Functionele vraag/ruimteverwarming
* Woningtype ETM
* Bouwjaarklasse ETM
* geo_id

In [425]:
# Specify the columns to keep
columns_to_keep = [
    'Woningkenmerken/oppervlakte',
    'Functionele vraag/ruimteverwarming',
    'Woningtype ETM',
    'Bouwjaarklasse ETM',
    'geo_id'
]

df_housing_stock_pbl = df_cleaned_pbl.reset_index()[columns_to_keep].copy()
# df_housing_stock_pbl.set_index('geo_id', inplace=True)

# Preview data
df_housing_stock_pbl

Unnamed: 0,Woningkenmerken/oppervlakte,Functionele vraag/ruimteverwarming,Woningtype ETM,Bouwjaarklasse ETM,geo_id
0,594,162.935795,Hoekhuis,< 1945,GM1680
1,240,70.359398,Vrijstaand huis,1985 - 2004,GM1680
2,110,51.558154,Vrijstaand huis,< 1945,GM1680
3,119,49.197408,Hoekhuis,1985 - 2004,GM1680
4,159,61.370166,Vrijstaand huis,1965 - 1984,GM1680
...,...,...,...,...,...
7931935,59,30.591981,Rijtjeshuis,< 1945,GM0193
7931936,233,70.959288,Vrijstaand huis,1985 - 2004,GM0193
7931937,136,50.669840,Vrijstaand huis,1985 - 2004,GM0193
7931938,42,12.740837,Appartement,>= 2005,GM0193


In [426]:
list(df_housing_stock_pbl['geo_id'].unique())

['GM1680',
 'GM0358',
 'GM0197',
 'GM0059',
 'GM0482',
 'GM0613',
 'GM0361',
 'GM0141',
 'GM0034',
 'GM0484',
 'GM1723',
 'GM1959',
 'GM0060',
 'GM0307',
 'GM0362',
 'GM0363',
 'GM0200',
 'GM0202',
 'GM0106',
 'GM0743',
 'GM0744',
 'GM0308',
 'GM0489',
 'GM0203',
 'GM0888',
 'GM1954',
 'GM0889',
 'GM1945',
 'GM1724',
 'GM0893',
 'GM0373',
 'GM0748',
 'GM1859',
 'GM1721',
 'GM0753',
 'GM0209',
 'GM0375',
 'GM0310',
 'GM1728',
 'GM0376',
 'GM0377',
 'GM1901',
 'GM0755',
 'GM1681',
 'GM0147',
 'GM0654',
 'GM0757',
 'GM0758',
 'GM1876',
 'GM0213',
 'GM0899',
 'GM0312',
 'GM0313',
 'GM0214',
 'GM0502',
 'GM0383',
 'GM0109',
 'GM1706',
 'GM0216',
 'GM0148',
 'GM1891',
 'GM0503',
 'GM0762',
 'GM0150',
 'GM0384',
 'GM1980',
 'GM1774',
 'GM0221',
 'GM0222',
 'GM0766',
 'GM0505',
 'GM0498',
 'GM1719',
 'GM0303',
 'GM0225',
 'GM0226',
 'GM1711',
 'GM0385',
 'GM0228',
 'GM0317',
 'GM1979',
 'GM0770',
 'GM1903',
 'GM0772',
 'GM0230',
 'GM0114',
 'GM0388',
 'GM0153',
 'GM0232',
 'GM0233',
 'GM0777',

#### Calculate ETM parameters

In this section we calculate the three relevant parameters for the ETM for each housing type - build year combination:
* number of households (#) 
* typical useful heat demand (kWh/m2)
* useful heat demand share (factor)
NOTE: the useful heat demand shares have a `%` unit in the etlocal template but actually need to sum up to 1.

##### Municipal level

We start by grouping the data by 
* geo_id
* Woningtype ETM
* Bouwjaarklasse ETM

In [427]:
groups = [
    'geo_id',
    'Woningtype ETM',
    'Bouwjaarklasse ETM'
]

filter = [
    'geo_id',
    'Woningtype ETM',
    'Bouwjaarklasse ETM',
    'Woningkenmerken/oppervlakte',
    'Functionele vraag/ruimteverwarming'
]

df_housing_stock_pbl_by_groups = df_housing_stock_pbl.reset_index().loc[:, filter].groupby(by=groups).sum()

# Add a column Aantal woningen (#) by counting the number of Woningkenmerken/oppervlakte entries per group
df_housing_stock_pbl_by_groups['Aantal woningen (#)'] = df_housing_stock_pbl.reset_index().loc[:, filter].groupby(by=groups)['Woningkenmerken/oppervlakte'].count()

# Preview
df_housing_stock_pbl_by_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Woningkenmerken/oppervlakte,Functionele vraag/ruimteverwarming,Aantal woningen (#)
geo_id,Woningtype ETM,Bouwjaarklasse ETM,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GM0014,Appartement,1945 - 1964,981436,361422.900000,13043
GM0014,Appartement,1965 - 1984,1299003,410307.300000,16823
GM0014,Appartement,1985 - 2004,692134,181735.900000,7854
GM0014,Appartement,< 1945,1779442,718240.700000,22091
GM0014,Appartement,>= 2005,750450,161749.400000,10909
...,...,...,...,...,...
GM1992,Vrijstaand huis,1945 - 1964,109328,41100.749841,673
GM1992,Vrijstaand huis,1965 - 1984,166346,56182.520755,892
GM1992,Vrijstaand huis,1985 - 2004,232555,66993.683409,1255
GM1992,Vrijstaand huis,< 1945,289770,97443.044699,1568


Now calculate the ETLocal keys for each municipality:
- Useful demand share (% of useful demand per woningcategory and bouwjaarklasse)
- typical heat demand (kWh/m2)

and the average heat demand (GJ/woning) for general analysis purposes.

In [428]:
# Add columns to grouped dataframe
GJ_to_kWh = 1.0 / 3.6e-3

df_housing_stock_pbl_by_groups_transformed = df_housing_stock_pbl_by_groups.copy()

df_housing_stock_pbl_by_groups_transformed['Functionele vraag ruimteverwarming (aandeel van totaal)'] = float('nan')
df_housing_stock_pbl_by_groups_transformed['Gemiddelde netto warmtevraag (kWh/m2)'] = float('nan')
df_housing_stock_pbl_by_groups_transformed['Gemiddeld warmteverbruik (GJ/woning)'] = float('nan')

# Loop over all municipalities, housing types, and construction periods to calculate the new columns
for municipality in municipalities:
    for housing_type in ['Appartement', 'Hoekhuis', 'Vrijstaand huis', 'Rijtjeshuis']:
        for construction_period in ['< 1945', '1945 - 1964', '1965 - 1984', '1985 - 2004', '>= 2005']:
           
            # Calculate share of heating demand per building year range for each housing type
            # Use the last item (Rijtjeshuis - >= 2005) to force the sum to 1.0
            try:
                if housing_type == 'Rijtjeshuis' and construction_period == '>= 2005':
                    second_to_last_sum = df_housing_stock_pbl_by_groups_transformed.loc[(municipality, slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum().round(5)
                    print(f"Second to last sum for {municipality}: {second_to_last_sum}")  # DEBUG
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Functionele vraag ruimteverwarming (aandeel van totaal)'] = 1.0 - second_to_last_sum
                    total_sum = df_housing_stock_pbl_by_groups_transformed.loc[(municipality, slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum().round(5)
                    print(f"Total sum for {municipality} after adjustment: {total_sum}")  # DEBUG
                else:
                    # calculate share rounded to 5 decimal places
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Functionele vraag ruimteverwarming (aandeel van totaal)'] = (
                        df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] / 
                        df_housing_stock_pbl_by_groups_transformed.loc[(municipality, slice(None), slice(None)), 'Functionele vraag/ruimteverwarming'].sum()
                    ).round(5)
            except:
                pass

            # Calculate net heating demand by dividing the Functionele vraag/ruimteverwarming by the total surface for the given housing type / construction period combination 
            try:
                df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Gemiddelde netto warmtevraag (kWh/m2)'] = (
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] * GJ_to_kWh / 
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Woningkenmerken/oppervlakte']
                )
            except:
                pass

            try:
                df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Gemiddeld warmteverbruik (GJ/woning)'] = (
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] / 
                    df_housing_stock_pbl_by_groups_transformed.loc[(municipality, housing_type, construction_period), 'Aantal woningen (#)']
                )
            except:
                pass

# preview
df_housing_stock_pbl_by_groups_transformed

Second to last sum for GM1680: 0.99876
Total sum for GM1680 after adjustment: 1.0
Second to last sum for GM0358: 0.91562
Total sum for GM0358 after adjustment: 1.0
Second to last sum for GM0197: 0.9956
Total sum for GM0197 after adjustment: 1.0
Second to last sum for GM0059: 0.9987
Total sum for GM0059 after adjustment: 1.0
Second to last sum for GM0482: 0.96998
Total sum for GM0482 after adjustment: 1.0
Second to last sum for GM0613: 0.93195
Total sum for GM0613 after adjustment: 1.0
Second to last sum for GM0361: 0.98698
Total sum for GM0361 after adjustment: 1.0
Second to last sum for GM0141: 0.98714
Total sum for GM0141 after adjustment: 1.0
Second to last sum for GM0034: 0.95003
Total sum for GM0034 after adjustment: 1.0
Second to last sum for GM0484: 0.97743
Total sum for GM0484 after adjustment: 1.0
Second to last sum for GM1723: 0.98298
Total sum for GM1723 after adjustment: 1.0
Second to last sum for GM1959: 0.9835
Total sum for GM1959 after adjustment: 1.0
Second to last sum 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Woningkenmerken/oppervlakte,Functionele vraag/ruimteverwarming,Aantal woningen (#),Functionele vraag ruimteverwarming (aandeel van totaal),Gemiddelde netto warmtevraag (kWh/m2),Gemiddeld warmteverbruik (GJ/woning)
geo_id,Woningtype ETM,Bouwjaarklasse ETM,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GM0014,Appartement,1945 - 1964,981436,361422.900000,13043,0.09364,102.294240,27.710105
GM0014,Appartement,1965 - 1984,1299003,410307.300000,16823,0.10630,87.739790,24.389663
GM0014,Appartement,1985 - 2004,692134,181735.900000,7854,0.04708,72.937024,23.139279
GM0014,Appartement,< 1945,1779442,718240.700000,22091,0.18608,112.120151,32.512820
GM0014,Appartement,>= 2005,750450,161749.400000,10909,0.04191,59.871262,14.827152
...,...,...,...,...,...,...,...,...
GM1992,Vrijstaand huis,1945 - 1964,109328,41100.749841,673,0.03680,104.427731,61.070951
GM1992,Vrijstaand huis,1965 - 1984,166346,56182.520755,892,0.05030,93.818041,62.984889
GM1992,Vrijstaand huis,1985 - 2004,232555,66993.683409,1255,0.05998,80.021313,53.381421
GM1992,Vrijstaand huis,< 1945,289770,97443.044699,1568,0.08724,93.410334,62.144799


In [429]:
# Validate the functional heating demand shares sum to 1 per housing type
for municipality in municipalities:
    try:
        total_share = df_housing_stock_pbl_by_groups_transformed.loc[(municipality, slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum()
        print(f"Total share for {municipality}: {total_share}")
        # assert np.isclose(total_share, 1.0), f"Total share for {municipality} does not sum to 1 but to {total_share}"
    except:
        pass

Total share for GM1680: 1.0
Total share for GM0358: 1.0
Total share for GM0197: 1.0
Total share for GM0059: 1.0
Total share for GM0482: 1.0
Total share for GM0613: 1.0
Total share for GM0361: 1.0
Total share for GM0141: 1.0
Total share for GM0034: 1.0
Total share for GM0484: 1.0
Total share for GM1723: 1.0000000000000002
Total share for GM1959: 1.0
Total share for GM0060: 1.0
Total share for GM0307: 1.0
Total share for GM0362: 1.0
Total share for GM0363: 0.9999999999999999
Total share for GM0200: 1.0
Total share for GM0202: 1.0
Total share for GM0106: 1.0
Total share for GM0743: 1.0
Total share for GM0744: 1.0
Total share for GM0308: 1.0000000000000002
Total share for GM0489: 1.0
Total share for GM0203: 1.0
Total share for GM0888: 0.9999999999999999
Total share for GM1954: 1.0
Total share for GM0889: 1.0
Total share for GM1945: 1.0
Total share for GM1724: 1.0
Total share for GM0893: 0.9999999999999999
Total share for GM0373: 1.0
Total share for GM0748: 1.0
Total share for GM1859: 0.999

##### National level


For the national data we do the same operations as above except we also sum for all geo ids.

Group by
* Woningtype ETM
* Bouwjaarklasse ETM

In [430]:
groups = [
    'Woningtype ETM',
    'Bouwjaarklasse ETM'
]

filter = [
    'Woningtype ETM',
    'Bouwjaarklasse ETM',
    'Woningkenmerken/oppervlakte',
    'Functionele vraag/ruimteverwarming'
]

df_housing_stock_pbl_by_groups_nl = df_housing_stock_pbl.reset_index().loc[:, filter].groupby(by=groups).sum()

# Add a column Aantal woningen (#) by counting the number of Woningkenmerken/oppervlakte entries per group
df_housing_stock_pbl_by_groups_nl['Aantal woningen (#)'] = df_housing_stock_pbl.reset_index().loc[:, filter].groupby(by=groups)['Woningkenmerken/oppervlakte'].count()

# Preview
df_housing_stock_pbl_by_groups_nl

Unnamed: 0_level_0,Unnamed: 1_level_0,Woningkenmerken/oppervlakte,Functionele vraag/ruimteverwarming,Aantal woningen (#)
Woningtype ETM,Bouwjaarklasse ETM,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Appartement,1945 - 1964,27877208,10210840.0,389149
Appartement,1965 - 1984,56533027,17513940.0,764098
Appartement,1985 - 2004,49760753,12414340.0,619418
Appartement,< 1945,44384324,16197800.0,540306
Appartement,>= 2005,40635710,7929159.0,487283
Hoekhuis,1945 - 1964,34059905,12830810.0,298688
Hoekhuis,1965 - 1984,71076426,23485750.0,565545
Hoekhuis,1985 - 2004,51070078,13949350.0,379666
Hoekhuis,< 1945,42295031,15752700.0,303172
Hoekhuis,>= 2005,26281732,5257993.0,178343


Now calculate the ETLocal keys for each municipality:
- Useful demand share (% of useful demand per woningcategory and bouwjaarklasse)
- typical heat demand (kWh/m2)

and the average heat demand (GJ/woning) for general analysis purposes.

In [465]:
# Add columns to grouped dataframe
GJ_to_kWh = 1.0 / 3.6e-3

df_housing_stock_pbl_by_groups_transformed_nl = df_housing_stock_pbl_by_groups_nl.copy()

df_housing_stock_pbl_by_groups_transformed_nl['Functionele vraag ruimteverwarming (aandeel van totaal)'] = float('nan')
df_housing_stock_pbl_by_groups_transformed_nl['Gemiddelde netto warmtevraag (kWh/m2)'] = float('nan')
df_housing_stock_pbl_by_groups_transformed_nl['Gemiddeld warmteverbruik (GJ/woning)'] = float('nan')

for housing_type in ['Appartement', 'Hoekhuis', 'Vrijstaand huis', 'Rijtjeshuis']:
    for construction_period in ['< 1945', '1945 - 1964', '1965 - 1984', '1985 - 2004', '>= 2005']:
        
        # Calculate share of heating demand per building year range for each housing type
        # Use the last item (Rijtjeshuis - >= 2005) to force the sum to 1.0
        try:
            if housing_type == 'Rijtjeshuis' and construction_period == '>= 2005':
                second_to_last_sum = df_housing_stock_pbl_by_groups_transformed_nl.loc[(slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum().round(5)
                print(f"Second to last sum for {housing_type}: {second_to_last_sum}")  # DEBUG
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Functionele vraag ruimteverwarming (aandeel van totaal)'] = 1.0 - second_to_last_sum
                total_sum = df_housing_stock_pbl_by_groups_transformed_nl.loc[(slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum().round(5)
                print(f"Total sum for {housing_type} after adjustment: {total_sum}")  # DEBUG
            else:
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Functionele vraag ruimteverwarming (aandeel van totaal)'] = (
                    df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] / 
                    df_housing_stock_pbl_by_groups_transformed_nl.loc[(slice(None), slice(None)), 'Functionele vraag/ruimteverwarming'].sum()
                ).round(5)
        except:
            pass

        # Calculate net heating demand by dividing the Functionele vraag/ruimteverwarming by the total surface for the given housing type / construction period combination 
        try:
            df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Gemiddelde netto warmtevraag (kWh/m2)'] = (
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] * GJ_to_kWh / 
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Woningkenmerken/oppervlakte']
            )
        except:
            pass

        try:
            df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Gemiddeld warmteverbruik (GJ/woning)'] = (
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Functionele vraag/ruimteverwarming'] / 
                df_housing_stock_pbl_by_groups_transformed_nl.loc[(housing_type, construction_period), 'Aantal woningen (#)']
            )
        except:
            pass

# preview
df_housing_stock_pbl_by_groups_transformed_nl

Second to last sum for Rijtjeshuis: 0.97879
Total sum for Rijtjeshuis after adjustment: 1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Woningkenmerken/oppervlakte,Functionele vraag/ruimteverwarming,Aantal woningen (#),Functionele vraag ruimteverwarming (aandeel van totaal),Gemiddelde netto warmtevraag (kWh/m2),Gemiddeld warmteverbruik (GJ/woning)
Woningtype ETM,Bouwjaarklasse ETM,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Appartement,1945 - 1964,27877208,10210840.0,389149,0.03731,101.744214,26.238897
Appartement,1965 - 1984,56533027,17513940.0,764098,0.06399,86.05562,22.921072
Appartement,1985 - 2004,49760753,12414340.0,619418,0.04536,69.300132,20.041937
Appartement,< 1945,44384324,16197800.0,540306,0.05918,101.373379,29.978938
Appartement,>= 2005,40635710,7929159.0,487283,0.02897,54.202185,16.272186
Hoekhuis,1945 - 1964,34059905,12830810.0,298688,0.04688,104.642535,42.957244
Hoekhuis,1965 - 1984,71076426,23485750.0,565545,0.08581,91.785973,41.527638
Hoekhuis,1985 - 2004,51070078,13949350.0,379666,0.05096,75.872615,36.741118
Hoekhuis,< 1945,42295031,15752700.0,303172,0.05755,103.45779,51.959619
Hoekhuis,>= 2005,26281732,5257993.0,178343,0.01921,55.572959,29.482475


In [466]:
# Validate the functional heating demand shares sum to 1 per housing type
try:
    total_share = df_housing_stock_pbl_by_groups_transformed_nl.loc[(slice(None), slice(None)), 'Functionele vraag ruimteverwarming (aandeel van totaal)'].sum()
    print(f"Total share for NL: {total_share}")
    assert np.isclose(total_share, 1.0), f"Total share for NL does not sum to 1 but to {total_share}"
except:
    pass

Total share for NL: 0.9999999999999998


### Load

#### Municipal data: load to template

Map ETLocal keys to the indices and columns of the `df_housing_stock_by_groups_transformed` dataframe

In [433]:
mapping_households = {
    'Housing stock': {
        # Number of apartments
        'present_number_of_apartments_before_1945': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_apartments_1945_1964': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_apartments_1965_1984': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_apartments_1985_2004': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_apartments_2005_present': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Aantal woningen (#)'},

        # Number of detached houses
        'present_number_of_detached_houses_before_1945': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_detached_houses_1945_1964': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_detached_houses_1965_1984': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_detached_houses_1985_2004': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_detached_houses_2005_present': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Aantal woningen (#)'},

        # Number of semi-detached houses
        'present_number_of_semi_detached_houses_before_1945': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_semi_detached_houses_1945_1964': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_semi_detached_houses_1965_1984': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_semi_detached_houses_1985_2004': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_semi_detached_houses_2005_present': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Aantal woningen (#)'},

        # Number of terraced houses
        'present_number_of_terraced_houses_before_1945': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_terraced_houses_1945_1964': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_terraced_houses_1965_1984': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_terraced_houses_1985_2004': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Aantal woningen (#)'},
        'present_number_of_terraced_houses_2005_present': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Aantal woningen (#)'},
    },
    'Insulation level': {
        # Typical useful demand for space heating - apartments
        'typical_useful_demand_for_space_heating_apartments_before_1945': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_apartments_1945_1964': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_apartments_1965_1984': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_apartments_1985_2004': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_apartments_2005_present': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},

        # Typical useful demand for space heating - detached houses
        'typical_useful_demand_for_space_heating_detached_houses_before_1945': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_detached_houses_1945_1964': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_detached_houses_1965_1984': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_detached_houses_1985_2004': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_detached_houses_2005_present': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},

        # Typical useful demand for space heating - semi-detached houses
        'typical_useful_demand_for_space_heating_semi_detached_houses_before_1945': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_semi_detached_houses_1945_1964': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_semi_detached_houses_1965_1984': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_semi_detached_houses_1985_2004': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_semi_detached_houses_2005_present': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},

        # Typical useful demand for space heating - terraced houses
        'typical_useful_demand_for_space_heating_terraced_houses_before_1945': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_terraced_houses_1945_1964': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_terraced_houses_1965_1984': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_terraced_houses_1985_2004': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
        'typical_useful_demand_for_space_heating_terraced_houses_2005_present': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Gemiddelde netto warmtevraag (kWh/m2)'},
    },
    'Useful heat demand': {
        # Share of useful heat demand - apartments
        'present_share_of_apartments_before_1945_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_apartments_1945_1964_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_apartments_1965_1984_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_apartments_1985_2004_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_apartments_2005_present_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Appartement', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},

        # Share of useful heat demand - detached houses
        'present_share_of_detached_houses_before_1945_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_detached_houses_1945_1964_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_detached_houses_1965_1984_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_detached_houses_1985_2004_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_detached_houses_2005_present_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Vrijstaand huis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},

        # Share of useful heat demand - semi-detached houses
        'present_share_of_semi_detached_houses_before_1945_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_semi_detached_houses_1945_1964_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_semi_detached_houses_1965_1984_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_semi_detached_houses_1985_2004_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_semi_detached_houses_2005_present_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Hoekhuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},

        # Share of useful heat demand - terraced houses
        'present_share_of_terraced_houses_before_1945_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '< 1945', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_terraced_houses_1945_1964_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1945 - 1964', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_terraced_houses_1965_1984_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1965 - 1984', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_terraced_houses_1985_2004_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '1985 - 2004', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
        'present_share_of_terraced_houses_2005_present_in_useful_demand_for_space_heating': {'Woningtype ETM': 'Rijtjeshuis', 'Bouwjaarklasse ETM': '>= 2005', 'Categorie': 'Functionele vraag ruimteverwarming (aandeel van totaal)'},
    }
}

Add **housing** stock and insulation level values to the (ETLocal) dataset manager template

In [434]:
# Commit messages for number of households, insulation level, and useful heat demand
commit_messages_households = {
    'Housing stock': "The housing stock is calculated using the PBL Referentieverbruiken dataset (2023).",
    'Insulation level': "The typical useful demand for space heating is calculated using the PBL Referentieverbruiken dataset (2023). For each ETM housing type - construction period combination the total Functionele vraag/ruimteverwarming is divided by the total Woningkenmerken/oppervlakte.",
    'Useful heat demand': "The useful heat demand per housing type and construction year is calculated based on the 'Functionele vraag/ruimteverwarming' data from the PBL Referentieverbruiken dataset (2023)."
}

In [435]:
for municipality in municipalities:
    for category, etlocal_keys in mapping_households.items(): # Housing stock, insulation level, useful heat demand
        commit_message = commit_messages_households[category]
        for etlocal_key, source_values in etlocal_keys.items():
            woningtype_etm = source_values['Woningtype ETM']
            # print(f"woningtype_etm: {woningtype_etm}") # DEBUG
            bouwjaarklasse_etm = source_values['Bouwjaarklasse ETM']
            # print(f"bouwjaarklasse_etm: {bouwjaarklasse_etm}") # DEBUG
            housing_stock_by_groups_col_name = source_values['Categorie']
            # print(f"category: {category}, etlocal_key: {etlocal_key}") # DEBUG
            try:
                # print(f"housing_stock_by_groups_col_name: {housing_stock_by_groups_col_name}") # DEBUG
                value = df_housing_stock_pbl_by_groups_transformed.loc[(municipality, woningtype_etm, bouwjaarklasse_etm), housing_stock_by_groups_col_name]
                df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'value'] = value
                df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'commit'] = commit_message
            except KeyError:
                # print(f"Municipality: {municipality}, ETLocal Key: {etlocal_key} not found in DataFrame.") # DEBUG
                df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'value'] = 0
                df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'commit'] = f"No data available for {etlocal_key} in {municipality}. Fallback value set to 0."


In [436]:
# Preview the filtered template for GM0363 Amsterdam
df_template_local.loc[('GM0363', 'households', 'households_housing_stock', slice(None)),'value']

geo_id  group       subgroup                  key                                                                             
GM0363  households  households_housing_stock  number_of_inhabitants                                                                        NaN
                                              residences_roof_surface_available_for_pv                                                     NaN
                                              present_number_of_apartments_before_1945                                            173573.00000
                                              present_number_of_apartments_1945_1964                                               39210.00000
                                              present_number_of_apartments_1965_1984                                               64454.00000
                                                                                                                                      ...     
               

#### National data: load to file

In [None]:
df_housing_stock_pbl_by_groups_transformed_nl.to_csv(Path("data", "processed","housing_stock_by_groups_nl2023_based_on_pbl_vivet_data.csv"), sep=sep)

: 

## Buildings

### Extract

#### Verrijkte BAG 1.0 data

In [None]:
# Specify the path to the raw data file
path = Path("data", "raw", "TNO-2023-P10648_vbobestand.csv")

# Extract the data and turn it into a dataframe
df_raw_bag_tno = pd.read_csv(path, header=[0], sep=",", low_memory=False, encoding='latin1')

In [None]:
# Preview data
df_raw_bag_tno

Unnamed: 0,vboid,vbo_ligt_binnen_x_panden,vboid_x,vboid_binnen_ander_pand,vbo_opp_m2,vbo_opp_cor_m2,vbo_opp_m2_x,pandid,bouwjaar,aantal_vbo_in_dit_pand,...,label,isso_nen,ei_origineel,label_origineel,gf_nta_epa,pand_label_keus,warmtenet,warmtenet_pbl1,warmtenet_pbl,in_ubouwpand
0,1.930100e+14,1.0,v0193010000030663_1,0,548.0,548.0,548.0,193100000029998,1977,1,...,,,,,,F,,0,,1
1,1.600100e+14,1.0,v0160010000051304_1,0,42.0,42.0,42.0,160100001392420,1989,1,...,,,,,,E,,0,,1
2,3.920100e+14,1.0,v0392010000003208_1,0,2982.0,2982.0,2982.0,392100000061537,1993,2,...,,,,,,D,,0,,1
3,6.320100e+14,1.0,v0632010000007072_1,0,1198.0,1198.0,1198.0,632100000015217,1950,1,...,1.0,NTA,,A,Kantoorfunctie,A,nee,0,,1
4,3.630100e+14,1.0,v0363010000810818_1,0,1.0,1.0,1.0,363100012075402,1971,1,...,,,,,,G,,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614033,2.940100e+14,1.0,v0294010000414631_1,0,80.0,80.0,80.0,294100000413386,1997,1,...,,,,,,C,,0,,1
2614034,3.630100e+14,1.0,v0363010001168516_1,0,77.0,77.0,77.0,363100012238489,2010,120,...,,,,,,A,,1,A'dam Noord en West,1
2614035,6.370100e+14,1.0,v0637010000265529_1,0,80.0,80.0,80.0,637100000157907,1988,76,...,,,,,,C,,0,,1
2614036,6.540100e+14,1.0,v0654010000044162_1,0,73.0,73.0,73.0,654100000096180,2016,8,...,,,,,,A2+,,0,,1


#### Verrijkte BAG 2.0

The BAG 2.0 data from 2025 is currently not used for reasons of time. By email TNO has informed us that the Verrijke BAG 2.0 contains data aggregated to the 'pand' level rather than 'verblijfsobject' level. This dataset is therefore only ~1 mln lines instead of the ~2.6 of the Verrijkte BAG 1.0. 


The data does seem to be useful.
See this link: https://energy.nl/publications/verrijkte-bag-2-0/
And this link for definition of 'pand' and 'verblijfsobject': https://catalogus.kadaster.nl/bag/nl/page/Pand

In [None]:
# path = Path("data", "raw", "VerrijkteBAG_publiek_250625.csv")
# df_raw_bag_publiek = pd.read_csv(path, header=[0], sep=";")

# # Preview data
# df_raw_bag_publiek.head(5)

In [None]:
# df_raw_bag_publiek

In [None]:
# path = Path("data", "raw", "VerrijkteBAG_indicatieve_labels_250625.csv")
# df_raw_bag_indicatieve_labels = pd.read_csv(path, header=[0], sep=";")

# # Preview data
# df_raw_bag_indicatieve_labels.head(5)

In [None]:
# # Merge df_raw_bag_publiek and df_raw_bag_indicatieve_labels on the 'identificatie' column
# df_bag_merged = pd.merge(df_raw_bag_publiek, df_raw_bag_indicatieve_labels, on='identificatie', how='inner')
# df_bag_merged

### Transform
The raw data should be filtered, cleaned and enriched before we can use it.

#### Cleaning and preprocessing the data

First, we need to drop the buildings with the BAG use functions "woon" and "industrie".

In [438]:
# Drop rows for which the value is equal to 1 in the columns 'f1woon' and 'f9industrie'
df_cleaned_bag_tno = df_raw_bag_tno[(df_raw_bag_tno['f1woon'] != 1) & (df_raw_bag_tno['f9industrie'] != 1)]

# Preview data
df_cleaned_bag_tno

Unnamed: 0,vboid,vbo_ligt_binnen_x_panden,vboid_x,vboid_binnen_ander_pand,vbo_opp_m2,vbo_opp_cor_m2,vbo_opp_m2_x,pandid,bouwjaar,aantal_vbo_in_dit_pand,...,label,isso_nen,ei_origineel,label_origineel,gf_nta_epa,pand_label_keus,warmtenet,warmtenet_pbl1,warmtenet_pbl,in_ubouwpand
0,1.930100e+14,1.0,v0193010000030663_1,0,548.0,548.0,548.0,193100000029998,1977,1,...,,,,,,F,,0,,1
1,1.600100e+14,1.0,v0160010000051304_1,0,42.0,42.0,42.0,160100001392420,1989,1,...,,,,,,E,,0,,1
4,3.630100e+14,1.0,v0363010000810818_1,0,1.0,1.0,1.0,363100012075402,1971,1,...,,,,,,G,,0,,1
5,5.460100e+14,3.0,v0546010000074033_3,1,350.0,350.0,117.0,546100000036201,1915,1,...,,,,,,G,,0,,1
6,6.270100e+14,2.0,v0627010000029453_2,1,6116.0,6116.0,3058.0,627100000005741,2009,1,...,1.0,ISSO,0.77,A,Winkelfunctie,A2+,nee,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614029,4.701000e+13,1.0,v0047010000311630_1,0,36.0,36.0,36.0,47100000264426,1875,2,...,,,,,,G,,0,,1
2614031,6.260100e+14,1.0,v0626010000009793_1,0,17.0,17.0,17.0,626100000007527,1971,11,...,,,,,,G,,0,,1
2614032,6.320100e+14,1.0,v0632010000003623_1,0,15.0,15.0,15.0,632100000020160,1957,7,...,,,,,,G,,0,,1
2614033,2.940100e+14,1.0,v0294010000414631_1,0,80.0,80.0,80.0,294100000413386,1997,1,...,,,,,,C,,0,,1


It appears that the merger of GM0501 (Brielle), GM0530 (Hellevoetsluis), and GM0614 (Westvoorne) into GM1992 (Voorne aan Zee) from 2023 has not been processed in the Verrijkte BAG, because the constituent municipalities still occur. Strangely enough the merger of Weesp (GM0457) with Amsterdam _has_ been processed.

We therefore need to manually fix this.

In [439]:
# Set the gemeente_id columns to GM1992
# for GM0501 (Brielle), GM0530 (Hellevoetsluis), and GM0614 (Westvoorne) 
# and set the gemeentenaam to 'Voorne aan Zee' for these municipalities
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeente_id'] == 'GM0501', 'gemeente_id'] = 'GM1992'
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeente_id'] == 'GM0530', 'gemeente_id'] = 'GM1992'
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeente_id'] == 'GM0614', 'gemeente_id'] = 'GM1992'
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeentenaam'] == 'Brielle', 'gemeentenaam'] = 'Voorne aan Zee'
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeentenaam'] == 'Hellevoetsluis', 'gemeentenaam'] = 'Voorne aan Zee'
df_cleaned_bag_tno.loc[df_cleaned_bag_tno['gemeentenaam'] == 'Westvoorne', 'gemeentenaam'] = 'Voorne aan Zee'

# Preview data
df_cleaned_bag_tno


Unnamed: 0,vboid,vbo_ligt_binnen_x_panden,vboid_x,vboid_binnen_ander_pand,vbo_opp_m2,vbo_opp_cor_m2,vbo_opp_m2_x,pandid,bouwjaar,aantal_vbo_in_dit_pand,...,label,isso_nen,ei_origineel,label_origineel,gf_nta_epa,pand_label_keus,warmtenet,warmtenet_pbl1,warmtenet_pbl,in_ubouwpand
0,1.930100e+14,1.0,v0193010000030663_1,0,548.0,548.0,548.0,193100000029998,1977,1,...,,,,,,F,,0,,1
1,1.600100e+14,1.0,v0160010000051304_1,0,42.0,42.0,42.0,160100001392420,1989,1,...,,,,,,E,,0,,1
4,3.630100e+14,1.0,v0363010000810818_1,0,1.0,1.0,1.0,363100012075402,1971,1,...,,,,,,G,,0,,1
5,5.460100e+14,3.0,v0546010000074033_3,1,350.0,350.0,117.0,546100000036201,1915,1,...,,,,,,G,,0,,1
6,6.270100e+14,2.0,v0627010000029453_2,1,6116.0,6116.0,3058.0,627100000005741,2009,1,...,1.0,ISSO,0.77,A,Winkelfunctie,A2+,nee,0,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2614029,4.701000e+13,1.0,v0047010000311630_1,0,36.0,36.0,36.0,47100000264426,1875,2,...,,,,,,G,,0,,1
2614031,6.260100e+14,1.0,v0626010000009793_1,0,17.0,17.0,17.0,626100000007527,1971,11,...,,,,,,G,,0,,1
2614032,6.320100e+14,1.0,v0632010000003623_1,0,15.0,15.0,15.0,632100000020160,1957,7,...,,,,,,G,,0,,1
2614033,2.940100e+14,1.0,v0294010000414631_1,0,80.0,80.0,80.0,294100000413386,1997,1,...,,,,,,C,,0,,1


Next, we need to keep buildings up to the maximum available year (2022) for the 2023 municipal dataset update.

In [440]:
# Keep the rows for which the building year is less or equal to 2022
df_cleaned_bag_tno = df_cleaned_bag_tno[df_cleaned_bag_tno['bouwjaar'] <= 2022]

# Check if this went well
df_cleaned_bag_tno['bouwjaar'].max()

np.int64(2022)

The dataset still has a lot of columns we're not interested in. Let's specify the ones we want to keep and filter for those:
* vboid
* vbo_opp_m2
* bouwjaar
* gemeentenaam
* gemeente_id
* pand_label_keus

In [441]:
# Specify the columns to keep
columns_to_keep = [
    'vboid',
    'vbo_opp_m2',
    'bouwjaar',
    'gemeentenaam',
    'gemeente_id',
    'pand_label_keus'
]

df_filtered_bag_tno = df_cleaned_bag_tno[columns_to_keep]

# Preview data
df_filtered_bag_tno.head(3)

Unnamed: 0,vboid,vbo_opp_m2,bouwjaar,gemeentenaam,gemeente_id,pand_label_keus
0,193010000000000.0,548.0,1977,Zwolle,GM0193,F
1,160010000000000.0,42.0,1989,Hardenberg,GM0160,E
4,363010000000000.0,1.0,1971,Amsterdam,GM0363,G


#### Analyzing the data

Let's take a quick look at the surface area distribution of the data

In [442]:
# Analyseer oppervlakte distributie in de dataset
print("Oppervlakte distributie:")
print("=" * 50)

# Statistieken van oppervlaktes
print(f"Minimum oppervlakte: {df_cleaned_bag_tno['vbo_opp_m2'].min()} m2")
print(f"Maximum oppervlakte: {df_cleaned_bag_tno['vbo_opp_m2'].max()} m2")
print(f"Gemiddelde oppervlakte: {df_cleaned_bag_tno['vbo_opp_m2'].mean():.2f} m2")
print(f"Mediaan oppervlakte: {df_cleaned_bag_tno['vbo_opp_m2'].median()} m2")
print()

# Count van verschillende kleine oppervlaktes
print("Aantal gebouwen per zeer kleine oppervlakte:")
for size in [1, 2, 3, 4, 5, 10, 15, 20]:
    count = len(df_cleaned_bag_tno[df_cleaned_bag_tno['vbo_opp_m2'] == size])
    if count > 0:
        print(f"{size} m2: {count} gebouwen ({count/len(df_cleaned_bag_tno)*100:.2f}%)")

print()

# Bekijk oppervlakte ranges
print("Gebouwen per oppervlakte range:")
ranges = [
    (1, 1, "Exact 1 m2"),
    (1, 5, "1-5 m2"), 
    (6, 10, "6-10 m2"),
    (11, 20, "11-20 m2"),
    (21, 50, "21-50 m2"),
    (51, 100, "51-100 m2")
]

for min_size, max_size, label in ranges:
    count = len(df_cleaned_bag_tno[
        (df_cleaned_bag_tno['vbo_opp_m2'] >= min_size) & 
        (df_cleaned_bag_tno['vbo_opp_m2'] <= max_size)
    ])
    print(f"{label}: {count} gebouwen ({count/len(df_cleaned_bag_tno)*100:.2f}%)")

Oppervlakte distributie:
Minimum oppervlakte: 1.0 m2
Maximum oppervlakte: 999999.0 m2
Gemiddelde oppervlakte: 294.38 m2
Mediaan oppervlakte: 48.0 m2

Aantal gebouwen per zeer kleine oppervlakte:
1 m2: 10387 gebouwen (1.00%)
2 m2: 1044 gebouwen (0.10%)
3 m2: 1923 gebouwen (0.18%)
4 m2: 2631 gebouwen (0.25%)
5 m2: 3612 gebouwen (0.35%)
10 m2: 5068 gebouwen (0.49%)
15 m2: 40102 gebouwen (3.84%)
20 m2: 22003 gebouwen (2.11%)

Gebouwen per oppervlakte range:
Exact 1 m2: 10387 gebouwen (1.00%)
1-5 m2: 19597 gebouwen (1.88%)
6-10 m2: 23156 gebouwen (2.22%)
11-20 m2: 319461 gebouwen (30.62%)
21-50 m2: 109379 gebouwen (10.48%)
51-100 m2: 152919 gebouwen (14.65%)


Let's drop the building with 999999 m2 of surface area

In [443]:
# Drop the buildings with a surface area of 999999 m2
df_filtered_bag_tno = df_filtered_bag_tno[df_filtered_bag_tno['vbo_opp_m2'] != 999999]

# check if this went well
print(f"Max surface area after filtering: {df_filtered_bag_tno['vbo_opp_m2'].max()} m2")

Max surface area after filtering: 400000.0 m2


There seem to be a surprising number of very small buildings in the dataset. Let's analyse what these buildings are.

In [444]:
# Analyseer gebouwen met <5 m2 oppervlakte
small_buildings = df_cleaned_bag_tno[df_cleaned_bag_tno['vbo_opp_m2'] < 4.0]
print(f"Aantal gebouwen met <5 m2: {len(small_buildings)}")
print(f"Percentage van totaal: {len(small_buildings) / len(df_cleaned_bag_tno) * 100:.2f}%")
print()

# Bekijk de functies van deze kleine gebouwen
print("BAG functies van <5 m2 gebouwen:")
function_columns = [col for col in df_cleaned_bag_tno.columns if col.startswith('f') and col[1:].isdigit()]
for col in function_columns:
    if col in small_buildings.columns:
        count = small_buildings[col].sum()
        if count > 0:
            print(f"{col}: {count} gebouwen")

print()

# Bekijk bouwjaren van <5 m2 gebouwen
print("Bouwjaar distributie van <5 m2 gebouwen:")
print(small_buildings['bouwjaar'].value_counts().head(10))

print()

# Bekijk gemeenten met meeste <5 m2 gebouwen
print("Top 10 gemeenten met meeste <5 m2 gebouwen:")
if 'gemeentenaam' in small_buildings.columns:
    print(small_buildings['gemeentenaam'].value_counts().head(10))

print()

# Bekijk energielabels van <5 m2 gebouwen
print("Energielabels van <5 m2 gebouwen:")
print(small_buildings['pand_label_keus'].value_counts())

# Analyse van <5m2 gebouwen in de originele dataset
small_buildings_orig = df_cleaned_bag_tno[df_cleaned_bag_tno['vbo_opp_m2'] < 5.0]
print(f"Aantal <5 m2 gebouwen in originele dataset: {len(small_buildings_orig)}")

print()

if len(small_buildings_orig) > 0:
    # Kijk naar functie kolommen
    print("\nFunctie analyse van <5 m2 gebouwen:")
    for col in function_columns:
        if col in small_buildings_orig.columns:
            # Tel hoeveel gebouwen deze functie hebben (waarde = 1)
            if small_buildings_orig[col].dtype in ['int64', 'float64']:
                count = small_buildings_orig[col].sum()
                if count > 0:
                    print(f"{col}: {count} gebouwen")

# Preview enkele voorbeelden
print("\nVoorbeelden van <5 m2 gebouwen:")
display(small_buildings[['vboid', 'bouwjaar', 'gemeentenaam', 'pand_label_keus', 'vbo_opp_m2']].head(10))

Aantal gebouwen met <5 m2: 13354
Percentage van totaal: 1.28%

BAG functies van <5 m2 gebouwen:

Bouwjaar distributie van <5 m2 gebouwen:
bouwjaar
1967    870
1968    600
1965    563
1960    507
1959    417
1961    407
1969    378
1962    358
1966    329
1931    325
Name: count, dtype: int64

Top 10 gemeenten met meeste <5 m2 gebouwen:
gemeentenaam
Amsterdam           9722
Utrecht              234
Lochem               142
Weststellingwerf     110
Kampen               105
Alkmaar              102
Ooststellingwerf     100
Ommen                 92
Zeist                 91
Dalfsen               86
Name: count, dtype: int64

Energielabels van <5 m2 gebouwen:
pand_label_keus
G      7910
F      1726
A       750
C       740
A2+     524
E       491
A+      399
D       385
B       269
A4+     107
A3+      53
Name: count, dtype: int64
Aantal <5 m2 gebouwen in originele dataset: 15985


Functie analyse van <5 m2 gebouwen:

Voorbeelden van <5 m2 gebouwen:


Unnamed: 0,vboid,bouwjaar,gemeentenaam,pand_label_keus,vbo_opp_m2
4,363010000000000.0,1971,Amsterdam,G,1.0
30,363010000000000.0,1957,Amsterdam,G,1.0
152,1859010000000000.0,1993,Berkelland,D,3.0
216,148010000000000.0,2005,Dalfsen,A,3.0
299,193010000000000.0,1999,Zwolle,B,2.0
367,363010000000000.0,1969,Amsterdam,G,1.0
425,363010000000000.0,1967,Amsterdam,G,1.0
474,363010000000000.0,1965,Amsterdam,G,1.0
984,363010000000000.0,1958,Amsterdam,G,1.0
1004,363010000000000.0,1967,Amsterdam,G,1.0


These small buildings probably don't add up to much energy use but let's check that later on.

#### Enriching the data with ETM classification for energy labels

First, create a copy of the dataframe. Then, set the index to the BAG VBO ID.

In [445]:
# Create a copy of the data
df_building_stock = df_filtered_bag_tno.copy()

# Set a multi-level index based on the municipal code and the BAG VBO ID
index_columns = ['vboid']
df_building_stock.set_index(index_columns, inplace=True)

# Preview data
df_building_stock.head(3)

Unnamed: 0_level_0,vbo_opp_m2,bouwjaar,gemeentenaam,gemeente_id,pand_label_keus
vboid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
193010000000000.0,548.0,1977,Zwolle,GM0193,F
160010000000000.0,42.0,1989,Hardenberg,GM0160,E
363010000000000.0,1.0,1971,Amsterdam,GM0363,G


Calculate the net heat demand in kWh/m2 as well as the useful heat demand in kWh for each building:

In [446]:
# Apply the classification function to the "Pand_energieklasse" column
df_building_stock['Netto warmtevraag (kWh/m2)'] = df_building_stock['pand_label_keus'].apply(classify_label)

# Calculate useful demand based on the EPI 
df_building_stock['Functionele vraag ruimteverwarming (kWh)'] = df_building_stock['vbo_opp_m2'] * df_building_stock['Netto warmtevraag (kWh/m2)']

# Preview data
df_building_stock

Unnamed: 0_level_0,vbo_opp_m2,bouwjaar,gemeentenaam,gemeente_id,pand_label_keus,Netto warmtevraag (kWh/m2),Functionele vraag ruimteverwarming (kWh)
vboid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1.930100e+14,548.0,1977,Zwolle,GM0193,F,358,196184.0
1.600100e+14,42.0,1989,Hardenberg,GM0160,E,313,13146.0
3.630100e+14,1.0,1971,Amsterdam,GM0363,G,403,403.0
5.460100e+14,350.0,1915,Leiden,GM0546,G,403,141050.0
6.270100e+14,6116.0,2009,Waddinxveen,GM0627,A2+,118,721688.0
...,...,...,...,...,...,...,...
4.701000e+13,36.0,1875,Veendam,GM0047,G,403,14508.0
6.260100e+14,17.0,1971,Voorschoten,GM0626,G,403,6851.0
6.320100e+14,15.0,1957,Woerden,GM0632,G,403,6045.0
2.940100e+14,80.0,1997,Winterswijk,GM0294,C,220,17600.0


#### Grouping the data

In [447]:
groups = [
    'gemeente_id'
]

filter = [
    'gemeente_id',
    'vbo_opp_m2',
    'Functionele vraag ruimteverwarming (kWh)'
]

df_building_stock_by_groups = df_building_stock.loc[:, filter].groupby(by=groups).sum()

# Preview
df_building_stock_by_groups

Unnamed: 0_level_0,vbo_opp_m2,Functionele vraag ruimteverwarming (kWh)
gemeente_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GM0014,5455756.0,1.342173e+09
GM0034,2520116.0,4.075398e+08
GM0037,391694.0,9.980311e+07
GM0047,330811.0,8.852588e+07
GM0050,357397.0,7.100362e+07
...,...,...
GM1979,460870.0,1.234622e+08
GM1980,865967.0,1.761288e+08
GM1982,1048583.0,2.645844e+08
GM1991,882509.0,1.889169e+08


#### Enriching the data by adding the average net heat demand and the number of buildings

In [448]:
# Add column for the average net heat demand to grouped dataframe and calculate the values
df_building_stock_by_groups['Gemiddelde netto warmtevraag (kWh/m2)'] = df_building_stock_by_groups['Functionele vraag ruimteverwarming (kWh)'] / df_building_stock_by_groups['vbo_opp_m2']

# Preview
df_building_stock_by_groups

Unnamed: 0_level_0,vbo_opp_m2,Functionele vraag ruimteverwarming (kWh),Gemiddelde netto warmtevraag (kWh/m2)
gemeente_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GM0014,5455756.0,1.342173e+09,246.010423
GM0034,2520116.0,4.075398e+08,161.714709
GM0037,391694.0,9.980311e+07,254.798677
GM0047,330811.0,8.852588e+07,267.602574
GM0050,357397.0,7.100362e+07,198.668752
...,...,...,...
GM1979,460870.0,1.234622e+08,267.889531
GM1980,865967.0,1.761288e+08,203.389771
GM1982,1048583.0,2.645844e+08,252.325620
GM1991,882509.0,1.889169e+08,214.067911


Let's do the same but then excluding all buildings with surface area < 5 m2

In [449]:
# Filter df_building_stock to exclude buildings with an area of less than 5 m2
df_building_stock_filtered = df_building_stock[df_building_stock['vbo_opp_m2'] >= 5]

# group the filtered dataframe by municipality and calculate the sum of the functional heat demand
df_building_stock_filtered_by_groups = df_building_stock_filtered.loc[:, filter].groupby(by=groups).sum()

# preview
df_building_stock_filtered_by_groups

Unnamed: 0_level_0,vbo_opp_m2,Functionele vraag ruimteverwarming (kWh)
gemeente_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GM0014,5455722.0,1.342164e+09
GM0034,2520068.0,4.075282e+08
GM0037,391687.0,9.980229e+07
GM0047,330775.0,8.852049e+07
GM0050,357373.0,7.099896e+07
...,...,...
GM1979,460859.0,1.234589e+08
GM1980,865802.0,1.760708e+08
GM1982,1048537.0,2.645722e+08
GM1991,882450.0,1.888993e+08


In [450]:
# Calculate average net heat demand for the filtered data
df_building_stock_filtered_by_groups['Gemiddelde netto warmtevraag (kWh/m2)'] = df_building_stock_filtered_by_groups['Functionele vraag ruimteverwarming (kWh)'] / df_building_stock_filtered_by_groups['vbo_opp_m2']

# Calculate the difference in average net heat demand between the filtered and unfiltered data
df_diff_average_net_heat_demand = df_building_stock_by_groups['Gemiddelde netto warmtevraag (kWh/m2)'] - df_building_stock_filtered_by_groups['Gemiddelde netto warmtevraag (kWh/m2)']

# List the municipalities for which the average net heat demand differs by more than a threshold
threshold = 0.5 # kWh/m2
df_diff_average_net_heat_demand[abs(df_diff_average_net_heat_demand) > threshold]

Series([], Name: Gemiddelde netto warmtevraag (kWh/m2), dtype: float64)

Fortunately the <5 m2 buildings don't significantly change the average net heat demand. Let's leave them in the data for now.

In the 2023 municipal dataset update, the number of buildings is expressed as resident-equivalents ('woningequivalenten', weq). For utilities one weq = 130 m2 of floor surface area (source: https://www.nplw.nl/uploads/files/Warmteprogramma/Handreiking-Warmteprogramma-NPLW.pdf, p. 89). This equivalence will also appear in the Begrippenkader Warmte, which is not yet published as of September 2025.

We therefore calculate the number of buildings per municipality by dividing the surface area by the weq.

In [451]:
weq = 130 # m2/building

# Add column for the number of buildings expressed in weq, rounded to the nearest integer
df_building_stock_by_groups['Aantal gebouwen (weq)'] = np.round(df_building_stock_by_groups['vbo_opp_m2'] / weq)
df_building_stock_by_groups


Unnamed: 0_level_0,vbo_opp_m2,Functionele vraag ruimteverwarming (kWh),Gemiddelde netto warmtevraag (kWh/m2),Aantal gebouwen (weq)
gemeente_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GM0014,5455756.0,1.342173e+09,246.010423,41967.0
GM0034,2520116.0,4.075398e+08,161.714709,19386.0
GM0037,391694.0,9.980311e+07,254.798677,3013.0
GM0047,330811.0,8.852588e+07,267.602574,2545.0
GM0050,357397.0,7.100362e+07,198.668752,2749.0
...,...,...,...,...
GM1979,460870.0,1.234622e+08,267.889531,3545.0
GM1980,865967.0,1.761288e+08,203.389771,6661.0
GM1982,1048583.0,2.645844e+08,252.325620,8066.0
GM1991,882509.0,1.889169e+08,214.067911,6789.0


Finally, we expect that the <5 m2 buildings don't make a difference for the number of buildings either. Let's check that.

In [452]:
# Add column for the number of buildings expressed in weq, rounded to the nearest integer
df_building_stock_filtered_by_groups['Aantal gebouwen (weq)'] = np.round(df_building_stock_filtered_by_groups['vbo_opp_m2'] / weq)

df_diff_number_of_buildings = (df_building_stock_filtered_by_groups['Aantal gebouwen (weq)'] - df_building_stock_by_groups['Aantal gebouwen (weq)']) / df_building_stock_by_groups['Aantal gebouwen (weq)'] * 100 # Percentage difference
threshold = 1 # Percentage difference threshold
df_diff_number_of_buildings[abs(df_diff_number_of_buildings) > threshold] # Only show differences larger than 1%

Series([], Name: Aantal gebouwen (weq), dtype: float64)

#### Summing municipal data to national values

Also here, make sure to calculate the same thing for the Netherlands (**nl2023**) as a whole (instead of per municipality)

In [453]:
# Sum the columns to get the values for nl2023
df_building_stock_by_groups_nl2023 = pd.DataFrame({'nl2023': df_building_stock_by_groups.sum()})

# Calculate the average net heat demand again and overwrite the (incorrect) value
df_building_stock_by_groups_nl2023.loc['Gemiddelde netto warmtevraag (kWh/m2)', 'nl2023'] = df_building_stock_by_groups_nl2023.loc['Functionele vraag ruimteverwarming (kWh)', 'nl2023'] / df_building_stock_by_groups_nl2023.loc['vbo_opp_m2', 'nl2023']

# Add the average building surface area in m2
# df_building_stock_by_groups_nl2023.loc['Gemiddelde oppervlakte (m2)', 'nl2023'] = df_building_stock_by_groups_nl2023.loc['vbo_opp_m2', 'nl2023'] / df_building_stock_by_groups_nl2023.loc['Aantal gebouwen (#)', 'nl2023']

# Preview data
df_building_stock_by_groups_nl2023

Unnamed: 0,nl2023
vbo_opp_m2,270622000.0
Functionele vraag ruimteverwarming (kWh),65587200000.0
Gemiddelde netto warmtevraag (kWh/m2),242.3572
Aantal gebouwen (weq),2081704.0


Optionally we can load the municipal data to file.

In [454]:
# # Specify path for the to be created CSV file
# path = Path("data", "intermediate", "municipal_building_stock_by_groups.csv")

# # Write the dataframe to this path
# df_building_stock_by_groups.to_csv(path)

### Load to template

#### Municipal data: load to template

Here we upload the Buildings data into the ETLocal template. 

Preview the ETLocal keys that are relevant for the **buildings** energy demand category

In [455]:
# Filter the ETLocal keys that are relevant for the households and buildings building stock and insulation level categories
filter_building_stock_and_insulation = (slice(None), 'buildings', 'buildings_building_stock')

# Preview the filtered template
df_template_local.loc[filter_building_stock_and_insulation, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM1680,buildings,buildings_building_stock,present_number_of_buildings,#,2206.000000,The present number of buildings is expressed i...
GM1680,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,249.777253,The typical useful demand for space heating in...
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0358,buildings,buildings_building_stock,present_number_of_buildings,#,2787.000000,The present number of buildings is expressed i...
...,...,...,...,...,...,...
GM0642,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,present_number_of_buildings,#,22273.000000,The present number of buildings is expressed i...
GM0193,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,222.436674,The typical useful demand for space heating in...


In [456]:
# Filter the ETLocal keys that are relevant for the households and buildings building stock and insulation level categories
filter_building_stock_and_insulation = (slice(None), 'buildings', 'buildings_building_stock')

# Preview the filtered template
df_template_local.loc[filter_building_stock_and_insulation, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM1680,buildings,buildings_building_stock,present_number_of_buildings,#,2206.000000,The present number of buildings is expressed i...
GM1680,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,249.777253,The typical useful demand for space heating in...
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0358,buildings,buildings_building_stock,present_number_of_buildings,#,2787.000000,The present number of buildings is expressed i...
...,...,...,...,...,...,...
GM0642,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,present_number_of_buildings,#,22273.000000,The present number of buildings is expressed i...
GM0193,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,222.436674,The typical useful demand for space heating in...


In [457]:
# List all ETLocal keys that are relevant for the households energy demand category
keys_building_stock_and_insulation = list(df_template_local.loc[filter_building_stock_and_insulation, :].index.get_level_values(3).unique())

# Preview list
keys_building_stock_and_insulation

['present_number_of_buildings',
 'buildings_roof_surface_available_for_pv',
 'typical_useful_demand_for_space_heating_buildings_present',
 'typical_useful_demand_for_space_heating_buildings_future']

Add **building** stock and insulation level values to the (ETLocal) dataset manager template

In [458]:
# TODO: this could also be a CSV transformed into a data frame--is this more readable for the notebook user?
mapping_buildings = {
    
    # Number of buildings
    'present_number_of_buildings': 'Aantal gebouwen (weq)',
    
    # Insulation level buildings
    'typical_useful_demand_for_space_heating_buildings_present': 'Gemiddelde netto warmtevraag (kWh/m2)'   

}

In [459]:
commit_messages_buildings = {
    'present_number_of_buildings': f"The present number of buildings is expressed in woningequivalent (weq) and derives from the 'verrijkte BAG' by TNO (source: https://energy.nl/publications/verrijkte-bag-energetische-vraagstukken/). The total floor surface area in a municipality is divided by 130 m2/weq (Handreiking Warmteprogramma, NPLW, 2024, p. 89).",
    'typical_useful_demand_for_space_heating_buildings_present': f"The typical useful demand for space heating in existing buildings is based on the 'verrijkte BAG' by TNO (source: https://energy.nl/publications/verrijkte-bag-energetische-vraagstukken/). The energy labels are mapped to a typical demand in kWh/m2 and then combined with the floor surface area from the 'verrijkte BAG' dataset to give an average typical demand."
}    

In [460]:
for municipality in municipalities:
    for etlocal_key, building_stock_by_groups_col_name in mapping_buildings.items():
        commit_message = commit_messages_buildings[etlocal_key]
        try:
            value = df_building_stock_by_groups.loc[municipality, building_stock_by_groups_col_name]
            df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'value'] = value
            df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'commit'] = commit_message
        except KeyError:
            df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'value'] = 0
            df_template_local.loc[(municipality, slice(None), slice(None), etlocal_key), 'commit'] = f"No data available for {etlocal_key} in {municipality}. Fallback value set to 0."

In [461]:
# Preview the filtered template
df_template_local.loc[filter_building_stock_and_insulation, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM1680,buildings,buildings_building_stock,present_number_of_buildings,#,2206.000000,The present number of buildings is expressed i...
GM1680,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,249.777253,The typical useful demand for space heating in...
GM1680,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0358,buildings,buildings_building_stock,present_number_of_buildings,#,2787.000000,The present number of buildings is expressed i...
...,...,...,...,...,...,...
GM0642,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_future,kWh/m<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,present_number_of_buildings,#,22273.000000,The present number of buildings is expressed i...
GM0193,buildings,buildings_building_stock,buildings_roof_surface_available_for_pv,km<sup>2</sup>,,
GM0193,buildings,buildings_building_stock,typical_useful_demand_for_space_heating_buildings_present,kWh/m<sup>2</sup>,222.436674,The typical useful demand for space heating in...


The 'future' values are still missing. These are imported from the national dataset in the Preprocessing notebook.

#### National data: load to file

In [462]:
# Specify path for the to be created CSV file
path = Path("data", "processed", "building_stock_by_groups_nl2023.csv")

# Write the dataframe to this path
df_building_stock_by_groups_nl2023.to_csv(path)

## Export template

In [463]:
path = Path("data","processed", "etlocal_template_built_environment_stock_filled.csv")
# Save the template to a CSV file
df_template_local.to_csv(path, index=True)
print(f"Template saved to {path}")

Template saved to data/processed/etlocal_template_built_environment_stock_filled.csv
