# Create and/or update (Dutch) municipal datasets

**TO DO** | Add "getting started" documentation

In [1]:
# internal modules
import os
import sys

# external modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xlwings as xw
import yaml
from pathlib import Path

# project modules
import config.config as config
from src.checks import Checker
from src.extract import PblService
from src.transform import Transformer

## Scope

Before we start we need to specify which datasets we want to create or update. This can be done with the `data.csv` file in the `config` directory. Here you can specify the geo ID, parent dataset and name for each region.

Also, make sure to specify the parent dataset and the year in the cell below.

**TO DO** | Add variable all_municipalities (based on the default etlocal config)

In [2]:
# Select the parent data set. Make sure to use the geo ID. It should be existing in the Dataset Manager.
parent = "nl"

# Specify the year
year = 2019

# Specify the CSV-separator (presumably either "," or ";")
sep=","

# Either specify the municipalities by using the data.csv file in the config directory.
# Make sure to specify the right separator in the pd.read_csv() function.
path = Path("config", "data.csv")
municipalities = pd.read_csv(path, sep=sep)['geo_id'].to_list()

# Preview municipality geo IDs
# municipalities

## Setup

In this step, we will setup some files and data which is relevant for all different sectors:
* Collecting the parent dataset from the ETM dataset manager
* Initializing the (empty) ETLocal template

#### Collect the parent dataset from the ETM dataset manager 

In case certain data points are not available for the given region, we want to inherit the values of the parent dataset. For the municipality datasets, the parent is the NL2019 dataset.

##### Extract
In this step we call the ETM Dataset Manager API to get the parent dataset.

In [3]:
# Connect to the ETM dataset manager API and collect the parent dataset 
response = requests.get(f"https://data.energytransitionmodel.com/api/v1/exports/{parent}")

In [4]:
# Check if the response has been successful. If the code is anything else than 200, something went wrong.
response.status_code

200

Here is an overview of the different status codes and what these imply:

- **200**: Everything went okay, and the result has been returned (if any).
- **301**: The server is redirecting you to a different endpoint. This can happen when a company switches domain names, or an endpoint name is changed.
- **400**: The server thinks you made a bad request. This can happen when you don’t send along the right data, among other things.
- **401**: The server thinks you’re not authenticated. Many APIs require login ccredentials, so this happens when you don’t send the right credentials to access an API.
- **403**: The resource you’re trying to access is forbidden: you don’t have the right perlessons to see it.
- **404**: The resource you tried to access wasn’t found on the server.
- **503**: The server is not ready to handle the request.

In [5]:
# In that case, print the error message to see what caused the error
response.reason

'OK'

##### Transform
Create a dataframe from the API response in which the parent dataset is included.

In [6]:
# If the reponse has been successful, collect its data.

# The data is stored as a dict in a single-itemed array. To get the dict, get the first item of the array.
raw_parent_data = response.json()[0]

# Then, create a dataframe from the dict.
df_parent = pd.DataFrame(raw_parent_data, index=[0])

# Rename the index to nl
df_parent = df_parent.rename(index={0: 'nl'})

In [7]:
# Preview the data in its transposed form (by using .T)
df_parent.T

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442
...,...
input_transport_ship_electricity_demand,0.0
bunkers_total_useful_demand_ships_demand,478316.636
area,NL_netherlands
base_dataset,eu


##### Load 
Save the parent data set to the intermediate data folder.

In [8]:
# Write dataframe to intermediate data folder
df_parent.to_csv(f"data/intermediate/parent_data.csv")

#### Initialize (empty) ETLocal templates

Only run these cells if you want to create empty ETLocal templates. If the templates have been filled in other steps, these changes will be overwritten. If you don't want to overwrite your changes, **DON'T** run the cells below. :-) 

In [9]:
# First, initialize data.csv file based on the config file
path = Path("config", "data.csv")
df_data_csv = pd.read_csv(path, index_col=[0], sep=sep)

# Drop unnamed columns
df_data_csv = df_data_csv.drop(columns=df_data_csv.columns[df_data_csv.columns.str.startswith('Unnamed')])

# Preview dataframe representing data.csv
df_data_csv

Unnamed: 0_level_0,country,name
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GM0003,nl2019,Appingedam
GM0010,nl2019,Delfzijl
GM0024,nl2019,Loppersum
GM1680,nl2019,Aa en Hunze
GM0358,nl2019,Aalsmeer
...,...,...
GM0879,nl2019,Zundert
GM0301,nl2019,Zutphen
GM1896,nl2019,Zwartewaterland
GM0642,nl2019,Zwijndrecht


In [10]:
# Then, initialize commits.yml file by specifying the file path where you want to create the YAML file
path = Path("data", "reporting", "commits.yml")

# Write the document separator to the YAML file
with open(path, 'w') as file:
    file.write('---\n')

In [11]:
# TODO: Describe that by running this cell an empty template is created. This overwrites the filled template.

# Load ETLocal template with target keys from config
path = Path("config", "etlocal_interface_elements.csv")
empty_template = pd.read_csv(path, header=[0], sep=sep)#, index_col=[2])

# Add columns geo_id, value and commit to the template    
for column in ['geo_id', 'value', 'commit']:
    empty_template[column] = float('nan')
    
# Fill the geo_id column with all relevant municipal geo IDs
templates = []
for municipality in municipalities:
    template_to_add = empty_template.copy()
    template_to_add['geo_id'] = municipality
    templates.append(template_to_add)
    
# Concatenate list of templates to one big template with all municipalities
template = pd.concat(templates)

# Transform the templates into a multi-index dataframe
index = pd.MultiIndex.from_frame(template[['geo_id', 'group', 'subgroup', 'key']])
template = template.drop(columns=['geo_id', 'group', 'subgroup', 'key'])
template.index = index

# Preview merged template
template

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,
GM0003,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,,
GM0003,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0003,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0003,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,,
...,...,...,...,...,...,...
GM0193,energy,energy_heat_production,energy_heat_import_mt_steam_hot_water_demand,TJ,,
GM0193,energy,energy_heat_production,energy_heat_import_ht_steam_hot_water_demand,TJ,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,


## General

### 1. Data collection

In [12]:
# TODO

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [13]:
# TODO

### 2. Data quality

In [14]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [15]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [16]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [17]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [18]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Emission factors
Transform the data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the agriculture energy demand category

In [19]:
# Filter the ETLocal keys that are relevant for the emission factors category
filter_emission_factors = (slice(None), 'area', 'area_emission_factors')

# Preview the filtered template
template.loc[filter_emission_factors, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,
GM0003,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,,
GM0010,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,
GM0010,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,,
GM0024,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,
...,...,...,...,...,...,...
GM1896,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,,
GM0642,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,
GM0642,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,,
GM0193,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,,


##### Fill missing values

Then, fill the missing values for:
- Imported heat
- Propane
- ...

**Imported heat** | Emission factors are cross-regionals. Hence, we use the same values for all datasets.

In [20]:
# TODO: we could also use the parent data shares instead?
key = f'file_carriers_imported_heat_co2_conversion_per_mj'

val = 0.036 # TODO: this should be moved to a config or raw data file?

# Update value and corresponding commit message for the temperature level share
template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"Based on https://www.co2emissiefactoren.nl/lijst-emissiefactoren/" # TODO: Describe which exact data is used

**Propane** | Emission factors are cross-regionals. Hence, we use the same values for all datasets.

In [21]:
key = f'file_carriers_propane_co2_conversion_per_mj'

# Let's use the parent data
val = df_parent.loc['nl','file_carriers_propane_co2_conversion_per_mj']

# Update value and corresponding commit message for the temperature level share
template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"Adopted from the parent dataset (the Netherlands)" # TODO: Describe which exact data is used

In [22]:
# Preview template after changes
template.loc[filter_emission_factors, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,0.036,Based on https://www.co2emissiefactoren.nl/lij...
GM0003,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,0.06448,Adopted from the parent dataset (the Netherlands)
GM0010,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,0.036,Based on https://www.co2emissiefactoren.nl/lij...
GM0010,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,0.06448,Adopted from the parent dataset (the Netherlands)
GM0024,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,0.036,Based on https://www.co2emissiefactoren.nl/lij...


#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [23]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [24]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

### 4. Data analysis and visualisation

In [25]:
# TODO

## Households

In this section we focus on the data of the households sector: energy demand, energy supply, housing stock and application shares.

### 1. Data collection (extract, transform, load)

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **Klimaatmonitor** | to collect municipal data about energy demand, supply and housing stock
* **Referentieverbruiken PBL** | to collect municipal data about application shares
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


#### Klimaatmonitor

Klimaatmonitor is used to determine the final demand per carrier for each municipality, using the following the keys:

- inwoners
- woningen
- el_woningen_tj_incl_zonachtermeter
- gaswoningen
- warmwontier2
- houtskool
- houtwontj
- zonpvtj_won

**TO DO** | Describe the steps for retrieving the raw data from Klimaatmonitor

##### Extract

We start with collecting the relevant data from the data export. This results in a dataframe with key/value combinations for all Dutch municipalities.

In [26]:
# # Source data retrieves all KM data for each municipality
# path_km = Path("data", "raw", f"Thema's - {year} - Gemeenten.xlsx")
# wb_km = xw.Book(str(path_km))
# ws_km_source_data = wb_km.sheets["Data"]

# df_km_source_data = pd.DataFrame(ws_km_source_data.used_range.value)
# df_km_source_data.columns = df_km_source_data.iloc[0]
# df_km_source_data = df_km_source_data[1:]
# df_km_source_data = df_km_source_data.set_index(df_km_source_data.columns[1])

# # Close the Excel workbook
# wb_km.close()

In [27]:
# # Preview
# df_km_source_data.head()

Next, we collect the relevant meta data for each key (or "topic"). This provides us with information about, among others, the topic and unit for each key.

In [28]:
# # Meta data is used to generate commit messages and extract the unit
# path_km = Path("data", "raw", f"Thema's - {year} - Gemeenten.xlsx")
# wb_km = xw.Book(str(path_km))
# ws_km_meta_data = wb_km.sheets["Onderwerp Informatie"]

# df_km_meta_data = pd.DataFrame(ws_km_meta_data.used_range.value)
# df_km_meta_data.columns = df_km_meta_data.iloc[0]
# df_km_meta_data = df_km_meta_data[1:]
# df_km_meta_data = df_km_meta_data.set_index(df_km_meta_data.columns[0])

# # Close the Excel workbook
# wb_km.close()

In [29]:
# # Preview
# df_km_meta_data.head()

##### Transform

We need to transform the municipal codes to the geo ID format that the ETM is familiar with. Thus, we need to paste "GM" and possibly some zeros in front of the code.

In [30]:
# # Fill GM code to match desired area code structure
# df_km_source_data.index = df_km_source_data.index.str.zfill(4).map(lambda x: 'GM' + x)

In [31]:
# # Preview
# df_km_source_data.head()

##### Load

We're now done with the data collection from Klimaatmonitor. Let's store our data in the `data / intermediate` directory.

In [32]:
# # Write dataframe to intermediate data folder
# df_km_source_data.to_csv(f"data/intermediate/klimaatmonitor_source_data_{year}.csv")

In [33]:
# # Write dataframe to intermediate data folder
# df_km_meta_data.to_csv(f"data/intermediate/klimaatmonitor_meta_data_{year}.csv")

#### Referentieverbruiken PBL

The "referentieverbruiken" study by PBL is used to calculate the application shares for each municipality.

##### Extract
First, we extract the raw data in Excel format from the PBL website. Then, we create a dataframe from the relevant Excel data sheet.

In [34]:
# # Call the PBL referentieverbruiken service and store the raw Excel data in the "data / raw" directory
# # It's not necessary to run this step if the raw Excel data is yet present in the directory
# for municipality in municipalities:
#     PblService(municipality).call()

In [35]:
# # Transform and load the raw Excel data to a workable dataframe format
# df_pbl_raw = {}
# for municipality in municipalities:
#     # Initialise the path to the application shares data by PBL
#     path = Path("data", "raw", f"pbl_referentieverbruiken_{municipality}.xlsx")
        
#     # Load the relevant sheet ("Resultaten gemeente") into a dataframe
#     wb = xw.Book(path)
#     ws = wb.sheets["Resultaten gemeente"]
    
#     # Add the dataframe to the dict
#     df_pbl_raw[municipality] = pd.DataFrame(ws.used_range.value)

#     # Close the workbook
#     wb.close()
    
# # NB: Excel will open the relevant files after running this cell.
# # Make sure not to actively close them. The files should close themselves 
# # after all steps from the data collection have been run.

In [36]:
# # Preview dataframe for the first municipality in the dict
# df_pbl_raw[municipalities[0]]

##### Transform
The raw data should be cleaned before we can use it. We should remove columns without headers or values, use the bag vbo id as index, and set the right headers. Also, we want to merge the data for all municipalities into one big dataframe.

In [37]:
# df_pbl = {}
# for municipality in municipalities:
#     # Columns 62 through 74 seem to be empty--thus, these can be removed
#     df_pbl[municipality] = df_pbl_raw[municipality].drop(df_pbl_raw[municipality].iloc[:, 62:75], axis=1)

#     # Use first column as (woning vbo_id) as index and rename it
#     df_pbl[municipality] = df_pbl[municipality].set_index(df_pbl[municipality][0])
#     df_pbl[municipality].index.names = ['bag_vbo_id']
#     df_pbl[municipality] = df_pbl[municipality].drop(df_pbl[municipality].columns[0], axis=1)

#     # Use first four rows to create a multi-row header
#     df_pbl[municipality].columns = [df_pbl[municipality].iloc[0], df_pbl[municipality].iloc[1], df_pbl[municipality].iloc[2], df_pbl[municipality].iloc[3]]
#     df_pbl[municipality] = df_pbl[municipality][4:]

#     # Remove redundant top-level index names 
#     df_pbl[municipality].columns.names = [None] * len(df_pbl[municipality].columns.names)

In [38]:
# # Run this cell if you want the transformed dataframe to be previewed for the first municipality in the dict
# df_pbl[municipalities[0]]

Since the raw data is not formatted properly, we need to create a new dataframe and fill it with the relevant data.

In [39]:
# # Setup dataframe to collect the relevant data from the PBL referentieverbruiken overview
# header=[['Metervraag aardgas','Metervraag aardgas','Metervraag aardgas',
#          'Metervraag elektriciteit','Metervraag elektriciteit','Metervraag elektriciteit','Metervraag elektriciteit',
#          'Metervraag warmtenet','Metervraag warmtenet',
#          'Metervraag waterstof','Metervraag waterstof',
#          'Metervraag biomassa','Metervraag biomassa',
#          'Metervraag olie',
#          'Metervraag totaal','Metervraag totaal','Metervraag totaal','Metervraag totaal','Metervraag totaal','Metervraag totaal','Metervraag totaal'],
#         ['koken','warm tapwater','ruimteverwarming', # aardgas
#          'koken','warm tapwater','ruimteverwarming','hulpvraag', # elektriciteit
#          'warm tapwater','ruimteverwarming', # warmtenet
#          'warm tapwater','ruimteverwarming', # waterstof
#          'warm tapwater','ruimteverwarming', # biomassa
#          'ruimteverwarming', # olie
#          'aardgas', 'elektriciteit', 'warmtenet', 'waterstof', 'biomassa', 'olie', 'totaal']]

# df_pbl_new = {}
# for municipality in municipalities:
#     df_pbl_new[municipality] = pd.DataFrame(columns=header)
#     df_pbl_new[municipality].reindex(index=list(df_pbl[municipality].index))
#     df_pbl_new[municipality].index.names = ['bag_vbo_id']

In [40]:
# for municipality in municipalities:
#     # Fill df_new with relevant data from df
#     # TODO: restructure df so that we don't have to collect the data in this VERY VERY ugly way (which is also VERY VERY error prone)
#     df_pbl_new[municipality].loc[:, ('Metervraag aardgas', 'koken')] = df_pbl[municipality].iloc[:,29]
#     df_pbl_new[municipality].loc[:, ('Metervraag aardgas', 'warm tapwater')] = df_pbl[municipality].iloc[:,30]
#     df_pbl_new[municipality].loc[:, ('Metervraag aardgas', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,32]
#     df_pbl_new[municipality].loc[:, ('Metervraag elektriciteit', 'koken')] = df_pbl[municipality].iloc[:,34]
#     df_pbl_new[municipality].loc[:, ('Metervraag elektriciteit', 'warm tapwater')] = df_pbl[municipality].iloc[:,35]
#     df_pbl_new[municipality].loc[:, ('Metervraag elektriciteit', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,37]
#     df_pbl_new[municipality].loc[:, ('Metervraag elektriciteit', 'hulpvraag')] = df_pbl[municipality].iloc[:,39]
#     df_pbl_new[municipality].loc[:, ('Metervraag warmtenet', 'warm tapwater')] = df_pbl[municipality].iloc[:,40]
#     df_pbl_new[municipality].loc[:, ('Metervraag warmtenet', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,42]
#     df_pbl_new[municipality].loc[:, ('Metervraag waterstof', 'warm tapwater')] = df_pbl[municipality].iloc[:,44]
#     df_pbl_new[municipality].loc[:, ('Metervraag waterstof', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,46]
#     df_pbl_new[municipality].loc[:, ('Metervraag biomassa', 'warm tapwater')] = df_pbl[municipality].iloc[:,48]
#     df_pbl_new[municipality].loc[:, ('Metervraag biomassa', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,50]
#     df_pbl_new[municipality].loc[:, ('Metervraag olie', 'ruimteverwarming')] = df_pbl[municipality].iloc[:,52]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'aardgas')] = df_pbl[municipality].iloc[:,54]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'elektriciteit')] = df_pbl[municipality].iloc[:,55]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'warmtenet')] = df_pbl[municipality].iloc[:,56]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'waterstof')] = df_pbl[municipality].iloc[:,57]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'biomassa')] = df_pbl[municipality].iloc[:,58]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'olie')] = df_pbl[municipality].iloc[:,59]
#     df_pbl_new[municipality].loc[:, ('Metervraag totaal', 'totaal')] = df_pbl[municipality].iloc[:,60]

In [41]:
# # Preview new dataframe (df_pbl_new) filled with data from the old dataframe (df_pbl) for the first municipality in the dict
# df_pbl_new[municipalities[0]]

Now concatenate all separate dataframes into one big one containing data for all municipalities

In [42]:
# # Load application shares and store it in a dict of dataframes per municipality
# dfs = []
# for municipality in municipalities:
#     # Add municipal geo ID to the dataframe and create a multi-index
#     df_pbl_new[municipality]['geo_id'] = municipality
#     df_pbl_new[municipality].set_index('geo_id', append=True, inplace=True)
#     df_pbl_new[municipality] = df_pbl_new[municipality].swaplevel()

#     # Remove the top-level index name
#     df_pbl_new[municipality].columns.names = [None] * len(df_pbl_new[municipality].columns.names)
    
#     # Append dataframe to the list
#     dfs.append(df_pbl_new[municipality])

# # Concatenate list of dataframes to one big dataframe
# df_application_shares = pd.concat(dfs)

# # Preview dataframe
# df_application_shares

##### Load
Load the data into the intermediate data folder. This is the data we will be using for the next steps in the pipeline.

In [43]:
# # Specify path for the to be created CSV file
# path = Path("data", "intermediate", "application_shares.csv")

# # Write the dataframe to this path
# df_application_shares.to_csv(path)

### 2. Data quality

Addressing and managing the data quality of our data is crucial; it will ensure we have the correct data to answer the business question and therefore are able to have a good, reliable analytical solution. Not only for our current intention but also for future purposes. Hence, perform checks on accuracy, relevancy, completeness, timeliness, consistency:

* **Accuracy** | for whatever data described, it needs to be accurate
* **Relevancy** | the data should meet the requirements for the intended use
* **Completeness** | the data should not have missing values or miss records
* **Timeliness** | the data should be up to date
* **Consistency** |the data should have the data format as expected and can be cross reference-able with the same results


Typical checks that could be performed:

* Are there any negative values or shares?
* Are there any NaN values?
* Are the data types per column as expected?
* Do all keys occur in the dataset?
* Are there keys without data? Could we inherit parent data here?
* Are there any duplicate values for a key?
* Do all share groups sum up to 100%?
* Do the hourly curves sum up to 1?
* Do the hourly curves correspond to the dataset year?

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [44]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- Klimaatmonitor source data
- Klimaatmonitor meta data
- application shares
- parent data

In [45]:
# # Load Klimaatmonitor source data
# df_km_source_data = pd.read_csv(f'data/intermediate/klimaatmonitor_source_data_{year}.csv', index_col=[0])

# # Preview data
# df_km_source_data

In [46]:
# # Load Klimaatmonitor meta data
# df_km_meta_data = pd.read_csv(f'data/intermediate/klimaatmonitor_meta_data_{year}.csv', index_col=[0])

# # Preview data
# df_km_meta_data

In [47]:
# # Load application shares
# df_application_shares = pd.read_csv(f'data/intermediate/application_shares.csv', header=[0,1], index_col=[0,1])

# # Preview data
# df_application_shares

In [48]:
# # Import parent data
# path = Path("data", "intermediate", f"parent_data.csv")
# df_parent = pd.read_csv(path, index_col=0)

# # Preview data
# df_parent

#### Checks

Perform checks on accuracy, relevancy, completeness, timeliness and consistency.

**TO DO** | Add checks

##### Accuracy checks

In [49]:
# TODO

##### Relevancy checks

In [50]:
# TODO

##### Completeness checks

Does the data have missing values or miss records?

In [51]:
# TODO
# checker.has_missing_values(values)

Are there any NaN values?

In [52]:
# TODO

Do all keys occur in the dataset?

In [53]:
# TODO

##### Timeliness checks

In [54]:
# TODO

##### Consistency checks

Are the data types per column as expected? Are there any columns with non-numerical values? Check for columns with datatype "object". Are these as expected?

In [55]:
# # Check if the parent data has any columns with unexpected non-numerical values
# checker.get_non_numeric_columns(df_parent)

In [56]:
# # Perform the same check for the Klimaatmonitor source data
# checker.get_non_numeric_columns(df_km_source_data)

In [57]:
# # The 'warmwontier2_2019' variable was expected to have only float values.
# # Let's dive deeper into this column:
# for key, value in df_km_source_data['warmwontier2_2019'].items():
#     try:
#         float_value = float(value)
#     except ValueError:
#         print(f"Non float value for {key}: {value}")

In [58]:
# # Let's first check out the entire data row for this municipality
# df_km_source_data.loc['GM2000', :]

In [59]:
# # GM2000 doesn't seem to be a relevant (and maybe not even existing) municipality.
# # Hence, let's remove the entire row from our source data.
# df_km_source_data = df_km_source_data.drop('GM2000')

# # Check if the data for GM2000 has indeed been removed
# df_km_source_data.tail()

In [60]:
# # Lastly, check if the application shares data has any columns with unexpected non-numerical values
# checker.get_non_numeric_columns(df_application_shares)

Are there any negative values or shares?

In [61]:
# TODO

#### Load 

After all checks and preprocessing steps, make sure to save your data again to the `data / intermediate` directory.

In [62]:
# # Klimaatmonitor source data
# df_km_source_data.to_csv(f"data/intermediate/klimaatmonitor_source_data_{year}.csv")

In [63]:
# # Klimaatmonitor meta data
# df_km_meta_data.to_csv(f"data/intermediate/klimaatmonitor_meta_data_{year}.csv")

In [64]:
# # Application shares data
# df_application_shares.to_csv("data/intermediate/application_shares.csv")

In [65]:
# # Parent data
# df_parent.to_csv("data/intermediate/parent_data.csv")

### 3. Data transformation

In the data transformation step, we bring our data together into one final analysis dataset or database.
The data transformation phase usually involves 4 steps:

* **Edit data** — filter, and select the relevant data
* **Aggregate data** — aggregate the data into the necessary level
* **Combine data** — combine all data sources into a single table
* **Extend data** — create new variables based on existing data

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [66]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- mapping to map Klimaatmonitor keys to ETLocal keys
- Klimaatmonitor data and meta data
- parent data

##### Setup Klimaatmonitor mapping files

In [67]:
# # Use mapping from CSV and make sure that it works (TODO!)
# path = Path("config", "klimaatmonitor_to_etlocal_mapping.csv")
# df_etlocal_to_km = pd.read_csv(str(path))

In [68]:
# # Load intermediate Klimaatmonitor data
# df_km_source_data = pd.read_csv(f'data/intermediate/klimaatmonitor_source_data_{year}.csv', index_col=[0])
# df_km_meta_data = pd.read_csv(f'data/intermediate/klimaatmonitor_meta_data_{year}.csv', index_col=[0])

##### Setup parent data

In [69]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


##### What is your goal?

At this point, the empty template is ready to fill for all specified datasets and all relevant (config) files have been setup for use. Based on your goal, decide where to continue in the pipeline. If your aim is to fill it with all data, continue running the cells below. If you aim to fill it with specific data, search for the relevant chapter to continue.

#### Energy demand
Transform the energy demand data from Klimaatmonitor to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the households energy demand category

In [70]:
# Filter the ETLocal keys that are relevant for the households energy demand category
filter_households_demand = (slice(None), 'households', 'households_energy_demand')

# Preview the filtered template
template.loc[filter_households_demand, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0003,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0003,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0010,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0010,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,,
...,...,...,...,...,...,...
GM0642,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0642,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0193,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,,
GM0193,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,,


In [71]:
# List all ETLocal keys that are relevant for the households energy demand category
keys_households_demand = list(template.loc[filter_households_demand, :].index.get_level_values(3).unique())

# Preview list
keys_households_demand

['input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water',
 'input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water',
 'input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water']

##### List Klimaatmonitor keys

List the Klimaatmonitor keys that are relevant for filling our ETLocal template for the energy demand in households.

In [72]:
# km_keys_household_demand = []

# for key in keys_households_demand:
#     if df_etlocal_to_km["interface_elements"].isin([key]).any():
#         km_keys = df_etlocal_to_km.loc[df_etlocal_to_km["interface_elements"] == key, "klimaatmonitor_keys"].values[0]
#         km_key = km_keys.split(',')
        
#         km_keys_household_demand += km_key

In [73]:
# # Preview
# km_keys_household_demand

##### Extract relevant data from Klimaatmonitor

For each key that is relevant for section energy demand in households

In [74]:
# # Transform Klimaatmonitor data to the right format and fill the data frame
# input_data_households_demand = transformer.filter_km_data(km_keys_household_demand, df_km_source_data, df_km_meta_data)

# # Preview
# input_data_households_demand.head(len(km_keys_household_demand))

##### Transform source units to target units

Some values are specified in GJ. We need to convert this to TJ.

In [75]:
# # Transform all GJ rows to TJ rows

# source_unit = "GJ"
# target_unit = "TJ"

# source_to_target = f"{source_unit.lower()}_to_{target_unit.lower()}"
# source_to_target_conversion = transformer.constants[source_to_target]

# input_data_households_demand.loc[input_data_households_demand["unit"] == source_unit, "value"] *= source_to_target_conversion
# input_data_households_demand.loc[input_data_households_demand["unit"] == source_unit, "unit"] = target_unit

The gas demand values are specified in mln m3. We need to convert this to TJ by using the lower heating value (LHV).

In [76]:
# # Transform all miljoen m3 rows to TJ rows

# source_unit = "miljoen m3"
# target_unit = "TJ"
# source_to_target = "m3_to_mj" # miljoen m3 to TJ is the same as m3 to mj

# source_to_target_conversion = transformer.constants[source_to_target]

# input_data_households_demand.loc[input_data_households_demand["unit"] == source_unit, "value"] *= source_to_target_conversion
# input_data_households_demand.loc[input_data_households_demand["unit"] == source_unit, "unit"] = target_unit

In [77]:
# # Preview
# input_data_households_demand.head(len(km_keys_household_demand))

##### Load Klimaatmonitor data in ETLocal template

For all municipalities, fill the ETLocal template with the relevant Klimaatmonitor data

In [78]:
# # Load relevant Klimaatmonitor data into template
# template = transformer.load_km_data(input_data_households_demand, df_etlocal_to_km, template, keys_households_demand)

In [79]:
# # Preview template (filtered for the households energy demand keys)
# template.loc[filter_households_demand, :]

##### Fill missing data and create fallback values

Now, create fallback values for the keys for which no data is available on Klimaatmonitor.

**Oil mix** | There is no public data available about the oil mix in households. Hence, we assume that all oil is crude oil.

In [80]:
# # TODO: retrieve share_groups from ETLocal in df_etlocal, possibly including the flexible share and automatically set fallback

# # Define interface element for which the fallback value should be created
# interface_element = 'input_percentage_of_crude_oil_households_final_demand_crude_oil'

# # Set fallback value to 1.0 and update corresponding commit message
# template.loc[(slice(None), slice(None), slice(None), interface_element), 'value'] = 1.0
# template.loc[(slice(None), slice(None), slice(None), interface_element), 'commit'] = "No data available on Klimaatmonitor. Fallback value set to 1.0 to ensure share totals sum to 1.0."

**Temperature levels of district heating networks** | Klimaatmonitor doesn't provide any data about the demand of district heating networks per temperature level. Hence, we assume that all residential district heating networks require MT heat.

In [81]:
# TODO: we could also use the parent data shares instead?
for temperature_level in ['lt', 'mt', 'ht']:
    key = f'input_percentage_of_{temperature_level}_steam_hot_water_households_final_demand_steam_hot_water'
    
    val = 0.
    
    if temperature_level == 'ht':
        val = 1.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Fallback value set to {val} to ensure share totals sum to 1."

For all ETLocal keys (or interface elements) that still have a Nan value, we assume there is no relevant data available on Klimaatmonitor. Hence, we set these values to zero.

In [82]:
# Set NaN values to zero
template.loc[filter_households_demand, 'value'] = template.loc[filter_households_demand, 'value'].fillna(0)

# Add corresponding commit message
template.loc[filter_households_demand, 'commit'] = template.loc[filter_households_demand, 'commit'].fillna("No data available on Klimaatmonitor. Set to 0.0.")

In [83]:
# Preview filtered template
template.loc[filter_households_demand, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,1.0,No data available. Fallback value set to 1.0 t...
GM0010,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0010,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
...,...,...,...,...,...,...
GM0642,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0642,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,1.0,No data available. Fallback value set to 1.0 t...
GM0193,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0193,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...


#### Energy supply
Transform the energy supply data from Klimaatmonitor to the right format.

##### **SETUP** | List ETLocal and Klimaatmonitor keys

Preview the ETLocal keys that are relevant for the households energy supply category

In [84]:
# # Filter the ETLocal keys that are relevant for the households energy demand category
# filter_households_supply = (slice(None), 'households', 'households_energy_supply')

# # Preview the filtered template
# template.loc[filter_households_supply, :].head()

In [85]:
# # List all ETLocal keys that are relevant for the households energy supply category
# keys_households_supply = list(template.loc[filter_households_supply, :].index.get_level_values(3).unique())

# # Preview list
# keys_households_supply

List the Klimaatmonitor keys that are relevant for filling our ETLocal template for the energy supply in households

In [86]:
# km_keys_household_supply = []

# for key in keys_households_supply:
#     if df_etlocal_to_km["interface_elements"].isin([key]).any():
#         km_keys = df_etlocal_to_km.loc[df_etlocal_to_km["interface_elements"] == key, "klimaatmonitor_keys"].values[0]
#         km_key = km_keys.split(',')
        
#         km_keys_household_supply += km_key

In [87]:
# # Preview Klimaatmonitor keys relevant for energy supply in households
# km_keys_household_supply

##### **EDIT** | Extract data from Klimaatmonitor

For each key that is relevant for section energy supply in households

In [88]:
# # Transform Klimaatmonitor data to the right format and fill the data frame
# input_data_households_supply = transformer.filter_km_data(km_keys_household_supply, df_km_source_data, df_km_meta_data)

# # Preview
# input_data_households_supply.head(len(km_keys_household_supply))

##### **COMBINE** | Load Klimaatmonitor data in ETLocal template, fill missing data and create fallback values

For all municipalities, fill the ETLocal template with the relevant Klimaatmonitor data

In [89]:
# # Load relevant Klimaatmonitor data into template
# template = transformer.load_km_data(input_data_households_supply, df_etlocal_to_km, template, keys_households_supply)

In [90]:
# # Preview template (filtered for the households energy demand keys)
# template.loc[filter_households_supply, :].head()

For all ETLocal keys (or interface elements) that still have a Nan value, we assume there is no relevant data available on Klimaatmonitor. Hence, we set these values to zero.

In [91]:
# # Set NaN values to zero
# template.loc[filter_households_supply, 'value'] = template.loc[filter_households_supply, 'value'].fillna(0)

# # Add corresponding commit message
# template.loc[filter_households_supply, 'commit'] = template.loc[filter_households_supply, 'commit'].fillna("No data available on Klimaatmonitor. Set to 0.0.")

# # Preview filtered template
# template.loc[filter_households_supply, :]

In [92]:
# # Set NaN values to zero
# template.loc[filter_households_supply, 'value'] = template.loc[filter_households_supply, 'value'].fillna(0)

# # Add corresponding commit message
# template.loc[filter_households_supply, 'commit'] = template.loc[filter_households_supply, 'commit'].fillna("No data available on Klimaatmonitor. Set to 0.0.")

# # Preview filtered template
# template.loc[filter_households_supply, :].head()

#### Housing stock
Transform the housing stock data from Klimaatmonitor to the right format.

##### **SETUP** | List ETLocal keys
Preview the ETLocal keys that are relevant for the housing stock category

In [93]:
# Filter the ETLocal keys that are relevant for the housing stock category
filter_housing_stock = (slice(None), 'households', 'households_housing_stock')

# Preview the filtered template
template.loc[filter_housing_stock, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,households,households_housing_stock,households_number_of_apartments_2005,#,,
GM0003,households,households_housing_stock,households_number_of_apartments_1985_2004,#,,
GM0003,households,households_housing_stock,households_number_of_apartments_1965_1984,#,,
GM0003,households,households_housing_stock,households_number_of_apartments_1945_1964,#,,
GM0003,households,households_housing_stock,households_number_of_apartments_1945,#,,


In [94]:
# List all ETLocal keys that are relevant for the housing stock category
keys_housing_stock = list(template.loc[filter_housing_stock, :].index.get_level_values(3).unique())

# Preview list
keys_housing_stock

['households_number_of_apartments_2005',
 'households_number_of_apartments_1985_2004',
 'households_number_of_apartments_1965_1984',
 'households_number_of_apartments_1945_1964',
 'households_number_of_apartments_1945',
 'households_number_of_terraced_2005',
 'households_number_of_terraced_1985_2004',
 'households_number_of_terraced_1965_1984',
 'households_number_of_terraced_1945_1964',
 'households_number_of_terraced_1945',
 'households_number_of_semi_detached_2005',
 'households_number_of_semi_detached_1985_2004',
 'households_number_of_semi_detached_1965_1984',
 'households_number_of_semi_detached_1945_1964',
 'households_number_of_semi_detached_1945',
 'households_number_of_detached_2005',
 'households_number_of_detached_1985_2004',
 'households_number_of_detached_1965_1984',
 'households_number_of_detached_1945_1964',
 'households_number_of_detached_1945']

##### **SETUP** | List Klimaatmonitor keys

List the Klimaatmonitor keys that are relevant for filling our ETLocal template for the energy supply in households

In [95]:
# km_keys_housing_stock = []

# for key in keys_housing_stock:
#     if df_etlocal_to_km["interface_elements"].isin([key]).any():
#         km_keys = df_etlocal_to_km.loc[df_etlocal_to_km["interface_elements"] == key, "klimaatmonitor_keys"].values[0]
#         km_key = km_keys.split(',')
        
#         km_keys_housing_stock += km_key

In [96]:
# # Preview Klimaatmonitor keys relevant for energy supply in households
# km_keys_housing_stock

##### **EDIT** | Extract data from Klimaatmonitor

For each key that is relevant for the housing stock category

In [97]:
# # Transform Klimaatmonitor data to the right format and fill the data frame
# input_data_housing_stock = transformer.filter_km_data(km_keys_housing_stock, df_km_source_data, df_km_meta_data)

# # Preview
# input_data_housing_stock.head(len(km_keys_housing_stock))

##### **COMBINE** | Load Klimaatmonitor data in ETLocal template

For all municipalities, fill the ETLocal template with the relevant Klimaatmonitor data

In [98]:
# # Load relevant Klimaatmonitor data into template
# template = transformer.load_km_data(input_data_housing_stock, df_etlocal_to_km, template, keys_housing_stock)

In [99]:
# # Preview template (filtered for the households energy demand keys)
# template.loc[filter_housing_stock, :].head()

In [100]:
# TODO add additional data source for households_housing_stock

#### Application shares

Calculate the applciation shares for each municipality based on the data from the "referentieverbruiken" study by PBL and transform it to the right format.

##### Load relevant files

Load the intermediate data for application shares and store it in a dict of dataframes per municipality. Also, create a new dataframe in which the totals per municipality can be stored. Further, load the config with ETLocal keys that should be filled with our intermediate data. 

In [101]:
# # Filter the ETLocal keys that are relevant for the households application shares category
# filter_households_applications = (slice(None), 'households', 'households_applications')

# # Preview the filtered template
# template.loc[filter_households_applications, :].head()

In [102]:
# # List all ETLocal keys that are relevant for the households energy demand category
# keys_households_applications = list(template.loc[filter_households_applications, :].index.get_level_values(3).unique())

# # Preview list
# keys_households_applications

In [103]:
# # Load intermediate data for application shares
# path = Path("data", "intermediate", "application_shares.csv")
# input_data_households_applications = pd.read_csv(path, header=[0,1], index_col=[0,1])

# # Preview application shares data
# input_data_households_applications.head()

##### Sum individual building data to totals on municipal level and calculate shares

First, calculate the sums of the application demands for all municipalities.

In [104]:
# # For each municipality sum the data per column
# households_applications_totals = input_data_households_applications.groupby(level='geo_id').sum()

# # Preview summed input data
# households_applications_totals

Then, create a new dataframe (`households_applications_shares`) to store the application _shares_ for all municipalities. Hereafter, we can calculate the shares based on the summed application (total) demands per municipality.

In [105]:
# # Reuse the headers of the existing dataframes 
# households_applications_shares = pd.DataFrame(columns=households_applications_totals.columns)

# # Set the index to municipality
# households_applications_shares = households_applications_shares.rename_axis('municipality')

# # Preview the empty dataframe
# households_applications_shares.head()

In [106]:
# for municipality in municipalities:
#     for carrier in ['aardgas', 'elektriciteit', 'warmtenet', 'waterstof', 'biomassa', 'olie']:
#         application_demands = households_applications_totals.loc[municipality, f'Metervraag {carrier}']
#         total_demand = application_demands.sum(axis=0)

#         for application in application_demands.keys():
#             # Suppress possible warnings
#             with np.errstate(divide='ignore', invalid='ignore'):
#                 # The np.divide method is used to easily handle a division by zero error.
#                 # If this is the case, the share is set to nan.
#                 share_of_application_demand = np.divide(application_demands[application], total_demand)

#             # Add the share to the application shares dataframe for the given municipality
#             households_applications_shares.loc[municipality, (f'Metervraag {carrier}', f'{application}')] = share_of_application_demand
            
# # Preview dataframe with application shares
# households_applications_shares

##### Select the relevant data from the source data

In order to fill the ETLocal template, we need to map the PBL input data to the ETLocal (dataset manager) interface elements.

In [107]:
# # TODO: this could also be a CSV transformed into a data frame--is this more readable for the notebook user?
# mapping = {
#     'households_final_demand_electricity_households_final_demand_for_cooking_electricity_parent_share': 
#     ('Metervraag elektriciteit', 'koken'),
    
#     'households_final_demand_electricity_households_final_demand_for_hot_water_electricity_parent_share': 
#     ('Metervraag elektriciteit', 'warm tapwater'),
    
#     'households_final_demand_electricity_households_final_demand_for_space_heating_electricity_parent_share': 
#     ('Metervraag elektriciteit', 'ruimteverwarming'),
    
#     'households_final_demand_network_gas_households_final_demand_for_cooking_network_gas_parent_share': 
#     ('Metervraag aardgas', 'koken'),
    
#     'households_final_demand_network_gas_households_final_demand_for_hot_water_network_gas_parent_share': 
#     ('Metervraag aardgas', 'warm tapwater'),
    
#     'households_final_demand_network_gas_households_final_demand_for_space_heating_network_gas_parent_share': 
#     ('Metervraag aardgas', 'ruimteverwarming'),
    
#     'households_final_demand_steam_hot_water_households_final_demand_for_hot_water_steam_hot_water_parent_share':
#     ('Metervraag warmtenet', 'warm tapwater'),
    
#     'households_final_demand_steam_hot_water_households_final_demand_for_space_heating_steam_hot_water_parent_share':
#     ('Metervraag warmtenet', 'ruimteverwarming'),
    
#     'households_final_demand_wood_pellets_households_final_demand_for_hot_water_wood_pellets_parent_share':
#     ('Metervraag biomassa', 'warm tapwater'),
    
#     'households_final_demand_wood_pellets_households_final_demand_for_space_heating_wood_pellets_parent_share':
#     ('Metervraag biomassa', 'ruimteverwarming'),
    
#     'households_final_demand_crude_oil_households_final_demand_for_space_heating_crude_oil_parent_share':
#     ('Metervraag olie', 'ruimteverwarming')
# }

##### Add values for application shares to the (ETLocal) dataset manager template

First, load relevant PBL data into the ETLocal template based on the mapping.

In [108]:
# # Add PBL values to (ETLocal) dataset manager template
# for municipality in municipalities:
#     for etlocal_key, pbl_key in mapping.items():
#         # Update the value
#         template.loc[(municipality, slice(None), slice(None), etlocal_key), 'value'] = households_applications_shares.loc[municipality, pbl_key]
        
#         # Add a commit message
#         template.loc[(municipality, slice(None), slice(None), etlocal_key), 'commit'] = f"Calculated the shares based on the value of {pbl_key} as specified in the Referentieverbruiken data export by PBL"

Then, fill the missing values for:
- Coal and crude oil
- Wood pellets
- District heating
- Technology splits

**Coal** | PBL doesn't provide any data about coal. Also, it doesn't provide any data about crude oil for hot water. Hence, we assume that all coal and crude oil is used for space heating.

In [109]:
# # TODO: we could also use the parent data shares instead?
# for carrier in ['coal', 'crude_oil']:
#     space_heating_key = f'households_final_demand_{carrier}_households_final_demand_for_space_heating_{carrier}_parent_share'
#     hot_water_key = f'households_final_demand_{carrier}_households_final_demand_for_hot_water_{carrier}_parent_share'
    
#     # Update value and corresponding commit message for the share of space heating
#     template.loc[(slice(None), slice(None), slice(None), space_heating_key), 'value'] = 1.
#     template.loc[(slice(None), slice(None), slice(None), space_heating_key), 'commit'] = "No data available. Fallback value set to 1. to ensure share totals sum to 1."
    
#     # Update value and corresponding commit message for the share of hot water
#     template.loc[(slice(None), slice(None), slice(None), hot_water_key), 'value'] = 0.
#     template.loc[(slice(None), slice(None), slice(None), hot_water_key), 'commit'] = "No data available. Fallback value set to 0. to ensure share totals sum to 1."

**Wood pellets** | PBL doesn't assume wood pellets to be used for cooking. Since the parent data share is also set to zero, 
we will use this value in our template as well.

In [110]:
# # Define the key for which the value should be updated
# wood_pellets_key = 'households_final_demand_wood_pellets_households_final_demand_for_cooking_wood_pellets_parent_share'

# # Set the value to zero
# template.loc[(slice(None), slice(None), slice(None), wood_pellets_key), 'value'] = 0.

# # And add the corresponding commit message
# template.loc[(slice(None), slice(None), slice(None), wood_pellets_key), 'commit'] = f"No data available. The value of the parent dataset ({parent}) is inherited."

**District heating** | If there's no discrict heating in a municipality (in other words, no steam/hot water demand at all), the template values haven't been set yet. If this is the case--if the district heating shares are still nan values--use the parent data shares instead.

In [111]:
# # Define the key for which the value should be updated
# space_heating_key = 'households_final_demand_steam_hot_water_households_final_demand_for_space_heating_steam_hot_water_parent_share'
# hot_water_key = 'households_final_demand_steam_hot_water_households_final_demand_for_hot_water_steam_hot_water_parent_share'

# for municipality in municipalities:

#     # If the value for the space heating share of district heating is still NaN for the municipality:
#     if pd.isna(template.loc[(municipality, slice(None), slice(None), space_heating_key), 'value'].any()): 
        
#         # Update it to 1.
#         template.loc[(municipality, slice(None), slice(None), space_heating_key), 'value'] = 1.
        
#         # Add a corresponding commit message
#         template.loc[(municipality, slice(None), slice(None), space_heating_key), 
#                      'commit'] = f"Assumed this share to be 100% since the district heating demand is zero according to the Referentieverbruiken data export by PBL"
        
#         # Update the value for the hot water share to 0.
#         template.loc[(municipality, slice(None), slice(None), hot_water_key), 'value'] = 0.
        
#         # Add a corresponding commit message
#         template.loc[(municipality, slice(None), slice(None), hot_water_key),
#                      'commit'] = f"Assumed this share to be 0% since the district heating demand is zero according to the Referentieverbruiken data export by PBL"

**Technology splits** | There is no municipal data for the technology splits. Hence, we inherit these values from the parent dataset.

In [112]:
# # Define a list of ETLocal keys that should be inherited from the parent dataset.
# # These are typically the technology shares.
# inherited_keys = [
#     # technology split space heating
#     'households_final_demand_for_space_heating_network_gas_households_space_heater_combined_network_gas_parent_share',
#     'households_final_demand_for_space_heating_network_gas_households_space_heater_network_gas_parent_share',
#     'households_final_demand_for_space_heating_network_gas_households_space_heater_hybrid_heatpump_air_water_electricity_parent_share',
#     'households_final_demand_for_space_heating_electricity_households_space_heater_electricity_parent_share',
#     'households_final_demand_for_space_heating_electricity_households_space_heater_heatpump_air_water_electricity_parent_share',
#     'households_final_demand_for_space_heating_electricity_households_space_heater_hybrid_heatpump_air_water_electricity_parent_share',
#     'households_final_demand_for_space_heating_electricity_households_space_heater_heatpump_ground_water_electricity_parent_share',
    
#     # technology split cooking
#     'households_final_demand_for_cooking_electricity_households_cooker_halogen_electricity_parent_share',
#     'households_final_demand_for_cooking_electricity_households_cooker_induction_electricity_parent_share',
#     'households_final_demand_for_cooking_electricity_households_cooker_resistive_electricity_parent_share',
    
#     # technology split lighting
#     'households_final_demand_for_lighting_electricity_households_lighting_incandescent_electricity_parent_share',
#     'households_final_demand_for_lighting_electricity_households_lighting_efficient_fluorescent_electricity_parent_share',
#     'households_final_demand_for_lighting_electricity_households_lighting_led_electricity_parent_share',
    
#     # technology split appliances
#     'households_final_demand_for_appliances_electricity_households_appliances_clothes_dryer_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_computer_media_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_dishwasher_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_fridge_freezer_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_other_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_television_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_vacuum_cleaner_electricity_parent_share',
#     'households_final_demand_for_appliances_electricity_households_appliances_washing_machine_electricity_parent_share',
    
#     # technology split cooling
#     'households_final_demand_for_cooling_electricity_households_cooling_airconditioning_electricity_parent_share',
#     'households_final_demand_for_cooling_electricity_households_cooling_heatpump_ground_water_electricity_parent_share',
#     'households_final_demand_for_cooling_electricity_households_cooling_heatpump_air_water_electricity_parent_share'
# ]

In [113]:
# # Inherit missing data values from parent dataset
# for key in inherited_keys:
#     # Update the value
#     template.loc[(slice(None), slice(None), slice(None), key), 'value'] = df_parent.at['nl', key]

#     # And add a commit message
#     template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"Inherited the value of the parent dataset ({parent})"

In [114]:
# # Preview template after changes
# template.loc[filter_households_applications, :].head()

#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [115]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [116]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

#### Export reporting data
Merge application shares template in the `data.csv` and `commits.yml` files that are necessary for a ETLocal migration.

In [117]:
# Preview data.csv dataframe
df_data_csv

Unnamed: 0_level_0,country,name
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GM0003,nl2019,Appingedam
GM0010,nl2019,Delfzijl
GM0024,nl2019,Loppersum
GM1680,nl2019,Aa en Hunze
GM0358,nl2019,Aalsmeer
...,...,...
GM0879,nl2019,Zundert
GM0301,nl2019,Zutphen
GM1896,nl2019,Zwartewaterland
GM0642,nl2019,Zwijndrecht


In [118]:
# Load commits.yml file into a Python data structure
path = Path("data", "reporting", "commits.yml")

with open(path, 'r') as file:
    commits = yaml.safe_load(file)

if commits is None:
    commits = []

In [119]:
# List all keys that have been added to the template
keys = list(template.index.get_level_values(3).unique())

# For each municipality fill data.csv and commits.yml
for key in keys:
    df_data_csv.loc[:, key] = float('nan')
    for municipality in municipalities:
        # Add data value to data.csv
        df_data_csv.loc[municipality, key] = template.loc[(municipality, slice(None), slice(None), key), 'value'].values[0]
        
        # Add commit message to commits.yml
        commits.append({'fields': [key], 'message': template.loc[(municipality, slice(None), slice(None), key), 'commit'].values[0]})
    
# Preview (transposed) data for all municipalities in the format required for the data.csv file
df_data_csv.T

  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')
  df_data_csv.loc[:, key] = float('nan')


geo_id,GM0003,GM0010,GM0024,GM1680,GM0358,GM0197,GM0059,GM0482,GM0613,GM0361,...,GM0355,GM0299,GM0637,GM0638,GM1892,GM0879,GM0301,GM1896,GM0642,GM0193
country,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,...,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019,nl2019
name,Appingedam,Delfzijl,Loppersum,Aa en Hunze,Aalsmeer,Aalten,Achtkarspelen,Alblasserdam,Albrandswaard,Alkmaar,...,Zeist,Zevenaar,Zoetermeer,Zoeterwoude,Zuidplas,Zundert,Zutphen,Zwartewaterland,Zwijndrecht,Zwolle
file_carriers_imported_heat_co2_conversion_per_mj,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036,...,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036,0.036
file_carriers_propane_co2_conversion_per_mj,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,...,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448,0.06448
input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
energy_heat_import_mt_steam_hot_water_demand,,,,,,,,,,,...,,,,,,,,,,
energy_heat_import_ht_steam_hot_water_demand,,,,,,,,,,,...,,,,,,,,,,
input_energy_heat_distribution_ht_loss_share,,,,,,,,,,,...,,,,,,,,,,
input_energy_heat_distribution_mt_loss_share,,,,,,,,,,,...,,,,,,,,,,


In [120]:
# Write data.csv to processed data directory
df_data_csv.to_csv(f"data/reporting/data.csv", index=False)

In [121]:
# TODO: Beautify the yaml data and make it more readable
# commits_yaml = yaml.dump(commits, sort_keys=False, indent=4, default_flow_style=False, default_style='|')

# Write the updated data back to the YAML file
path = Path("data", "reporting", "commits.yml")

with open(path, 'w') as file:
    yaml.safe_dump(commits, file)

### 4. Data analysis and visualisation (sanity check)

When we have a complete dataset, we can apply exploratory data analysis and visualisation to validate our data and perform sanity checks such as:

* Do the totals of energy demand per sector correspond to the original (or alternative) source data? 
* Does the distribution per sector / category / carrier make sense? Or does it imply inaccurate data?
* Do the totals of energy demand and supply per carrier correspond to the original (or alternative) source data?
* Does the distribution of energy demand and supply per carrier make sense? Or does it imply inaccurate data?
* How does the energy demand per inhabitant compare to the parent value?
* How do the added hourly curves relate to the parent ones?

A dashboard of charts and key figures will be designed to support the sanity check. The visualisations will make it easier to detect flaws and insanities at a glance. Here, among others, parent values are compared to child values to check whether the results meet our expectations. Same goes for the balance between demand and supply for each carrier.

In [122]:
# TODO: Create reusable modules to visualise charts/data

In [123]:
# TODO: Reporting MW installed capacities based on demands per carrier and full load hours

In [124]:
# TODO: Reporting application split per transport mode based on demands and modal split per carrier

In [125]:
# TODO: Reporting change log compared to current version of the dataset in the ETM dataset manager (e.g. have you changed the year?) 

#### Setup

Setup the checker (if not setup yet) and load all processed data files for the next step in the pipeline.

In [126]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, load the processed data file(s):
- etlocal_template.csv

In [127]:
# Load data
template = pd.read_csv(f'data/processed/etlocal_template.csv', index_col=[0,1,2,3])

In [128]:
# Preview data
template

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,0.03600,Based on https://www.co2emissiefactoren.nl/lij...
GM0003,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,0.06448,Adopted from the parent dataset (the Netherlands)
GM0003,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,0.00000,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.00000,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,1.00000,No data available. Fallback value set to 1.0 t...
...,...,...,...,...,...,...
GM0193,energy,energy_heat_production,energy_heat_import_mt_steam_hot_water_demand,TJ,,
GM0193,energy,energy_heat_production,energy_heat_import_ht_steam_hot_water_demand,TJ,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,


#### Sanity check

Visualise datasets in order to perform a sanity check

**TO DO** | Create methods for visualisations

**TO DO** | Expand sanity checks and visualisations

In [129]:
# data_to_plot = template.swaplevel(0,3).loc[[
#     "households_final_demand_electricity_demand", 
#     "households_final_demand_network_gas_demand", 
#     "households_final_demand_steam_hot_water_demand", 
#     "households_final_demand_wood_pellets_demand",
#     "households_final_demand_coal_demand",
#     "input_households_final_demand_crude_oil_demand",
#     "households_final_demand_solar_thermal_demand"
# ]]

# data_to_plot.index = data_to_plot.index.droplevel([1,2,3])

# data_to_plot = data_to_plot["value"].reset_index()

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.boxplot(data=data_to_plot, y='key', x='value')
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.show()

In [130]:
# # For households: visualise distribution of inhabitants and residences across all municipalities

# data_to_plot = "number_of_inhabitants"
# variable_to_plot = "value"
# df_to_plot = template.loc[(slice(None), slice(None), slice(None), data_to_plot), :]

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.histplot(data=df_to_plot, x=variable_to_plot)
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.xlabel('Number of inhabitants')
# plt.ylabel('Count of municipalities')

# plt.show()

In [131]:
# data_to_plot = template.swaplevel(0,3).loc[["number_of_inhabitants", "number_of_residences"]]

# data_to_plot.index = data_to_plot.index.droplevel([1,2,3])

# data_to_plot = data_to_plot["value"].reset_index()

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.boxplot(data=data_to_plot, y='key', x='value')
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.show()

In [132]:
# # For households: visualise ratio between residences and inhabitants as scatterplot

# # Transform code below to function

# variable_x_to_plot = template.loc[(slice(None), slice(None), slice(None), 'number_of_inhabitants'), 'value'].values
# variable_x_to_plot = pd.DataFrame(variable_x_to_plot, columns=['number_of_inhabitants'])

# variable_y_to_plot = template.loc[(slice(None), slice(None), slice(None), 'number_of_residences'), 'value'].values
# variable_y_to_plot = pd.DataFrame(variable_y_to_plot, columns=['number_of_residences'])

# data_to_plot = pd.concat([variable_x_to_plot, variable_y_to_plot], axis=1)

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.scatterplot(data=data_to_plot, x='number_of_inhabitants', y='number_of_residences')
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.xlabel('Number of inhabitants')
# plt.ylabel('Number of residences')

# plt.show()

In [133]:
# # For households: visualise energy demand per inhabitant or per residence

# data_to_convert = template.swaplevel(0,3).loc[[
#     "households_final_demand_electricity_demand", 
#     "households_final_demand_network_gas_demand", 
#     "households_final_demand_steam_hot_water_demand", 
#     "households_final_demand_wood_pellets_demand",
#     "households_final_demand_coal_demand",
#     "input_households_final_demand_crude_oil_demand",
#     "households_final_demand_solar_thermal_demand"
# ]]

# data_to_convert.index = data_to_convert.index.droplevel([1,2])
# data_to_convert = data_to_convert.swaplevel(0,1)

# data_to_use = template.swaplevel(0,3).loc[[
#     "number_of_inhabitants"
# ]]

# data_to_use.index = data_to_use.index.droplevel([1,2])
# data_to_use = data_to_use.swaplevel(0,1)
# data_to_use.index = data_to_use.index.droplevel(1)
# data_to_use = data_to_use['value']

# data_to_convert["relative"] = data_to_convert["value"] / data_to_use
 
# data_to_convert.index = data_to_convert.index.droplevel(0)

# data_to_plot = data_to_convert["relative"].reset_index()

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.boxplot(data=data_to_plot, y='key', x='relative')
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.show()

In [134]:
# data_to_plot = template.swaplevel(0,3).loc[[
#     "households_final_demand_electricity_demand", 
#     "households_final_demand_network_gas_demand", 
#     "households_final_demand_steam_hot_water_demand", 
#     "households_final_demand_wood_pellets_demand",
#     "households_final_demand_coal_demand",
#     "input_households_final_demand_crude_oil_demand",
#     "households_final_demand_solar_thermal_demand"
# ]]

# data_to_plot.index = data_to_plot.index.droplevel([1,2,3])

# data_to_plot = data_to_plot["value"].reset_index()

# plt.figure(figsize=(8, 6), dpi = 100)

# sns.boxplot(data=data_to_plot, y='key', x='value')
# sns.set_theme(style="darkgrid")
# sns.set()

# plt.show()

## Services

### 1. Data collection

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [135]:
# TODO

### 2. Data quality

In [136]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [137]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [138]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [139]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [140]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Energy demand
Transform the energy demand data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the services energy demand category

In [141]:
# Filter the ETLocal keys that are relevant for the services energy demand category
filter_buildings_demand = (slice(None), 'buildings', 'buildings_energy_demand')

# Preview the filtered template
template.loc[filter_buildings_demand, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,buildings,buildings_energy_demand,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0003,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0003,buildings,buildings_energy_demand,input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0010,buildings,buildings_energy_demand,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0010,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
...,...,...,...,...,...,...
GM0642,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0642,buildings,buildings_energy_demand,input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0193,buildings,buildings_energy_demand,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,
GM0193,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,,


##### Fill missing values

Then, fill the missing values for:
- Temperature levels of district heating networks
- ...

**Temperature levels of district heating networks** | Klimaatmonitor doesn't provide any data about the demand of district heating networks per temperature level. Hence, we assume that all residential district heating networks require MT heat.

In [142]:
# TODO: we could also use the parent data shares instead?
for temperature_level in ['lt', 'mt', 'ht']:
    key = f'input_percentage_of_{temperature_level}_steam_hot_water_buildings_final_demand_steam_hot_water'
    
    val = 0.
    
    if temperature_level == 'ht':
        val = 1.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Fallback value set to {val} to ensure share totals sum to 1."

In [143]:
# Preview template after changes
template.loc[filter_buildings_demand, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,buildings,buildings_energy_demand,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0003,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0003,buildings,buildings_energy_demand,input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water,%,1.0,No data available. Fallback value set to 1.0 t...
GM0010,buildings,buildings_energy_demand,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0010,buildings,buildings_energy_demand,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...


### 4. Data analysis and visualisation

In [144]:
# TODO

## Agriculture

### 1. Data collection

In [145]:
# TODO

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [146]:
# TODO

### 2. Data quality

In [147]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [148]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [149]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [150]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [151]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Energy demand
Transform the energy demand data from Klimaatmonitor to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the agriculture energy demand category

In [152]:
# Filter the ETLocal keys that are relevant for the agriculture energy demand category
filter_agriculture_demand = (slice(None), 'agriculture', 'agriculture_energy_demand')

# Preview the filtered template
template.loc[filter_agriculture_demand, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_local_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0010,agriculture,agriculture_energy_demand,input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0010,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
...,...,...,...,...,...,...
GM0642,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0642,agriculture,agriculture_energy_demand,input_percentage_of_local_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0193,agriculture,agriculture_energy_demand,input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,
GM0193,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,,


##### Fill missing values

Then, fill the missing values for:
- Temperature levels of district heating networks
- ...

**Temperature levels of district heating networks** | Klimaatmonitor doesn't provide any data about the demand of district heating networks per temperature level. Hence, we assume that all agricultural district heating networks require MT heat.

In [153]:
# TODO: we could also use the parent data shares instead?
for temperature_level in ['mt', 'ht']:
    key = f'input_percentage_of_central_{temperature_level}_steam_hot_water_agriculture_final_demand_steam_hot_water'
    
    val = 0.
    
    if temperature_level == 'ht':
        val = 1.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Fallback value set to {val} to ensure share totals sum to 1."

In [154]:
key = f'input_percentage_of_local_steam_hot_water_agriculture_final_demand_steam_hot_water'

val = 0.

# Update value and corresponding commit message for the temperature level share
template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Fallback value set to {val} to ensure share totals sum to 1."

In [155]:
# Preview template after changes
template.loc[filter_agriculture_demand, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,1.0,No data available. Fallback value set to 1.0 t...
GM0003,agriculture,agriculture_energy_demand,input_percentage_of_local_steam_hot_water_agriculture_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0010,agriculture,agriculture_energy_demand,input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water,%,0.0,No data available. Fallback value set to 0.0 t...
GM0010,agriculture,agriculture_energy_demand,input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water,%,1.0,No data available. Fallback value set to 1.0 t...


#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [156]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [157]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

### 4. Data analysis and visualisation

In [158]:
# TODO

## Transport

### 1. Data collection

In [159]:
# TODO

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [160]:
# TODO

### 2. Data quality

In [161]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [162]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [163]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [164]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [165]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Energy demand
Transform the energy demand data from Klimaatmonitor to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the transport energy demand category

In [166]:
# # Filter the ETLocal keys that are relevant for the agriculture energy demand category
# filter_transport_demand = (slice(None), 'transport', 'transport_energy_demand')

# # Preview the filtered template
# template.loc[filter_transport_demand, :]

##### Fill missing values

Then, fill the missing values for:
- ...

**??** | ..

In [167]:
# # Preview template after changes
# template.loc[filter_transport_demand, :].head()

#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [168]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [169]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

### 4. Data analysis and visualisation

In [170]:
# TODO

## Industry

In [171]:
# TODO

### 1. Data collection

In [172]:
# TODO

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [173]:
# TODO

### 2. Data quality

In [174]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [175]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [176]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [177]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [178]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Energy demand
Transform the energy demand data from Klimaatmonitor to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the industry energy demand category

In [179]:
# # Filter the ETLocal keys that are relevant for the industry energy demand category
# filter_industry_demand = (slice(None), 'industry', 'industry_energy_demand')

# # Preview the filtered template
# template.loc[filter_industry_demand, :]

##### Fill missing values

Then, fill the missing values for:
- ...

**??** | ..

In [180]:
# # Preview template after changes
# template.loc[filter_industry_demand, :].head()

#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [181]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [182]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

### 4. Data analysis and visualisation

In [183]:
# TODO

## Energy production

In [184]:
# TODO

### 1. Data collection

In [185]:
# TODO

In the data collection phase, we make an overview of the required data and design a data model to efficiently build our database. If possible, we call the API to collect our data. Otherwise, we first manually collect a data export and store it in the `data / raw` directory.

![image.png](attachment:75b96a93-92f4-444a-9a96-18eb4a6d5295.png)

For the municipal datasets we use the following data sources:
* **??** | ...
* **Dataset manager** | to collect the parent dataset

For each data source, the following steps are followed:
* **Extract** | Extracting the raw data from its source
* **Transform** | Transforming the raw data to a 'workable' format 
* **Load** | Writing the processed data to the intermediate data directory


In [186]:
# TODO

### 2. Data quality

In [187]:
# TODO

#### Setup

Setup the checker and load all files from the `data / intermediate` directory.

First, setup the checker class

In [188]:
# Initialise checker class to access transformation functions
checker = Checker()

Then, setup all data files:
- ...
- parent data

### 3. Data transformation

In [189]:
# TODO

#### Setup

Setup the transformer and load all intermediate data files for the next step in the pipeline.

First, setup the transformer class:

In [190]:
# Initialise transformer class to access transformation functions
transformer = Transformer(municipalities, year)

Then, setup all data files:
- dataframe representing data.csv 
- commits.yml
- dataframe with all etlocal interface elements
- ...
- parent data

##### Load parent data

In [191]:
# Import parent data
path = Path("data", "intermediate", "parent_data.csv")
df_parent = pd.read_csv(path, index_col=0)

# Preview parent data
df_parent.T.head()

Unnamed: 0,nl
agriculture_final_demand_electricity_demand,41584.615
agriculture_final_demand_network_gas_demand,91072.918
input_agriculture_final_demand_steam_hot_water_demand,2783.107
agriculture_final_demand_wood_pellets_demand,5437.224
input_agriculture_final_demand_crude_oil_demand,17965.442


#### Energy demand (own use)
Transform the energy demand (own use) production data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the fossil electricity production category

In [192]:
# Filter the ETLocal keys that are relevant for the energy demand (own use) category
filter_energy_energy_demand = (slice(None), 'energy', 'energy_energy_demand')

# Preview the filtered template
template.loc[filter_energy_energy_demand, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,,
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_lt_loss_share,%,,
GM0010,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,,
GM0010,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,
...,...,...,...,...,...,...
GM0642,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,
GM0642,energy,energy_energy_demand,input_energy_heat_distribution_lt_loss_share,%,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,,
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,,


##### Fill missing values

Then, fill the missing values for:
- Heat network distribution losses
- ...

**Heat network distribution losses** | In the present situation we presume all residual heat networks to be HT. Hence, all distribution losses are allocated to HT networks.

In [193]:
for temperature_level in ['lt', 'mt', 'ht']:
    key = f'input_energy_heat_distribution_{temperature_level}_loss_share'
    # The parent data hasn't been migrated yet so for now we're using a static value
    # val = df_parent[key]
    val = 100-75.71885721737949

    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. We assume the share of loss is equal to the parent dataset ({parent})."

**Preview** the template to see all changes

In [194]:
# Preview template after changes
template.loc[filter_energy_energy_demand, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,24.281143,No data available. We assume the share of loss...
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,24.281143,No data available. We assume the share of loss...
GM0003,energy,energy_energy_demand,input_energy_heat_distribution_lt_loss_share,%,24.281143,No data available. We assume the share of loss...
GM0010,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,24.281143,No data available. We assume the share of loss...
GM0010,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,24.281143,No data available. We assume the share of loss...


#### Fossil electricity production
Transform the fossil electricity production data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the fossil electricity production category

In [195]:
# Filter the ETLocal keys that are relevant for the fossil electricity production category
filter_fossil_electricity_production = (slice(None), 'energy', 'energy_fossil_electricity_production')

# Preview the filtered template
template.loc[filter_fossil_electricity_production, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_ht_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_mt_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_cofiring_ht_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_cofiring_mt_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_ht_lignite_production,TJ,,
...,...,...,...,...,...,...
GM0193,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_mt_lignite_production,TJ,,
GM0193,energy,energy_fossil_electricity_production,input_energy_chp_combined_cycle_ht_network_gas_production,TJ,,
GM0193,energy,energy_fossil_electricity_production,input_energy_chp_combined_cycle_mt_network_gas_production,TJ,,
GM0193,energy,energy_fossil_electricity_production,input_energy_chp_local_engine_ht_network_gas_production,TJ,,


##### Fill missing values

Then, fill the missing values for:
- MT CHPs
- ...

**MT CHPs** | In the present situatie we presume all CHPs to produce HT heat, meaning the production of MT heat by CHPs can be set to zero for all producers.

In [196]:
# TODO: we could also use the parent data shares instead?
for producer in ['ultra_supercritical_mt_coal', 'ultra_supercritical_cofiring_mt_coal', 'ultra_supercritical_mt_lignite', 'combined_cycle_mt_network_gas', 'local_engine_mt_network_gas']:
    key = f'input_energy_chp_{producer}_production'
    val = 0.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. All CHPs are assumed to produce only HT heat and zero MT heat."

**Preview** the template to see all changes

In [197]:
# Preview template after changes
template.loc[filter_fossil_electricity_production, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_ht_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_mt_coal_production,TJ,0.0,No data available. All CHPs are assumed to pro...
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_cofiring_ht_coal_production,TJ,,
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_cofiring_mt_coal_production,TJ,0.0,No data available. All CHPs are assumed to pro...
GM0003,energy,energy_fossil_electricity_production,input_energy_chp_ultra_supercritical_ht_lignite_production,TJ,,


#### Renewable electricity production
Transform the renewable electricity production data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the renewable electricity production category

In [198]:
# Filter the ETLocal keys that are relevant for the renewable electricity production category
filter_renewable_electricity_production = (slice(None), 'energy', 'energy_renewable_electricity_production')

# Preview the filtered template
template.loc[filter_renewable_electricity_production, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_supercritical_ht_waste_mix_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_supercritical_mt_waste_mix_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_ht_wood_pellets_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_mt_wood_pellets_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_engine_ht_biogas_production,TJ,,
...,...,...,...,...,...,...
GM0193,energy,energy_renewable_electricity_production,input_energy_chp_supercritical_mt_waste_mix_production,TJ,,
GM0193,energy,energy_renewable_electricity_production,input_energy_chp_local_ht_wood_pellets_production,TJ,,
GM0193,energy,energy_renewable_electricity_production,input_energy_chp_local_mt_wood_pellets_production,TJ,,
GM0193,energy,energy_renewable_electricity_production,input_energy_chp_local_engine_ht_biogas_production,TJ,,


##### Fill missing values

Then, fill the missing values for:
- MT CHPs
- ...

**MT CHPs** | In the present situatie we presume all CHPs to produce HT heat, meaning the production of MT heat by CHPs can be set to zero for all producers.

In [199]:
# TODO: we could also use the parent data shares instead?
for producer in ['supercritical_mt_waste_mix', 'local_mt_wood_pellets', 'local_engine_mt_biogas']:
    key = f'input_energy_chp_{producer}_production'
    val = 0.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. All CHPs are assumed to produce only HT heat and zero MT heat."

Preview the template to see all changes

In [200]:
# Preview template after changes
template.loc[filter_renewable_electricity_production, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_supercritical_ht_waste_mix_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_supercritical_mt_waste_mix_production,TJ,0.0,No data available. All CHPs are assumed to pro...
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_ht_wood_pellets_production,TJ,,
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_mt_wood_pellets_production,TJ,0.0,No data available. All CHPs are assumed to pro...
GM0003,energy,energy_renewable_electricity_production,input_energy_chp_local_engine_ht_biogas_production,TJ,,


#### Heat production
Transform the heat production data to the right format. 

##### List ETLocal keys

Preview the ETLocal keys that are relevant for the heat production category

In [201]:
# Filter the ETLocal keys that are relevant for the energy heat production category
filter_energy_heat_production = (slice(None), 'energy', 'energy_heat_production')

# Preview the filtered template
template.loc[filter_energy_heat_production, :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_heat_production,input_energy_heat_well_deep_ht_geothermal_production,TJ,,
GM0003,energy,energy_heat_production,input_energy_heat_well_deep_mt_geothermal_production,TJ,,
GM0003,energy,energy_heat_production,input_energy_heat_well_shallow_heatpump_mt_geothermal_production,TJ,,
GM0003,energy,energy_heat_production,input_energy_heat_well_shallow_lt_geothermal_production,TJ,,
GM0003,energy,energy_heat_production,input_energy_heat_burner_ht_wood_pellets_production,TJ,,
...,...,...,...,...,...,...
GM0193,energy,energy_heat_production,input_energy_heat_burner_ht_crude_oil_production,TJ,,
GM0193,energy,energy_heat_production,input_energy_heat_burner_mt_crude_oil_production,TJ,,
GM0193,energy,energy_heat_production,energy_heat_import_lt_steam_hot_water_demand,TJ,,
GM0193,energy,energy_heat_production,energy_heat_import_mt_steam_hot_water_demand,TJ,,


##### Fill missing values

Then, fill the missing values for:
- MT/LT heat production
- Imported heat
- ...

**MT/LT heat production** | In the present situation we presume all HT/MT producers to produce only HT heat, meaning the production of MT heat by those producers can be set to zero. Furthermore, there is no data available about LT heat production. Hence, we assume that all production of LT heat is zero.

In [202]:
for producer in ['well_deep_mt_geothermal', 'burner_mt_wood_pellets', 'burner_mt_waste_mix', 'burner_lt_hydrogen', 'burner_mt_hydrogen', 'burner_mt_network_gas', 'heatpump_water_water_mt_electricity', 'boiler_mt_electricity', 'boiler_lt_electricity', 'burner_mt_coal', 'burner_mt_crude_oil']:
    key = f'input_energy_heat_{producer}_production'
    val = 0.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. All heat producers are assumed to produce only HT heat and zero MT or LT heat."

In [203]:
for producer in ['well_shallow_heatpump_mt_geothermal', 'solar_mt_solar_thermal']:
    key = f'input_energy_heat_{producer}_production'
    val = 0.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Production of MT heat is set to zero."

In [204]:
for producer in ['well_shallow_lt_geothermal', 'heatpump_water_water_lt_electricity', 'solar_lt_solar_thermal']:
    key = f'input_energy_heat_{producer}_production'
    val = 0.
    
    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. Production of LT heat is set to zero."

**Imported heat** | In the present situation we assume no heat is imported.

In [205]:
for temperature_level in ['lt', 'mt', 'ht']:
    # TODO: turn into method
    key = f'energy_heat_import_{temperature_level}_steam_hot_water_demand'
    val = 0.

    # Update value and corresponding commit message for the temperature level share
    template.loc[(slice(None), slice(None), slice(None), key), 'value'] = val
    template.loc[(slice(None), slice(None), slice(None), key), 'commit'] = f"No data available. By default, the import of heat is set to zero."

**Preview** the template to see all changes

In [206]:
# Preview template after changes
template.loc[filter_energy_heat_production, :].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,energy,energy_heat_production,input_energy_heat_well_deep_ht_geothermal_production,TJ,,
GM0003,energy,energy_heat_production,input_energy_heat_well_deep_mt_geothermal_production,TJ,0.0,No data available. All heat producers are assu...
GM0003,energy,energy_heat_production,input_energy_heat_well_shallow_heatpump_mt_geothermal_production,TJ,0.0,No data available. Production of MT heat is se...
GM0003,energy,energy_heat_production,input_energy_heat_well_shallow_lt_geothermal_production,TJ,0.0,No data available. Production of LT heat is se...
GM0003,energy,energy_heat_production,input_energy_heat_burner_ht_wood_pellets_production,TJ,,


#### Exceptions

If there are any exceptions for municipal datasets, you can overwrite the values below.

**TO DO** | Explain to the user that this section should always be checked. Should the exception be overwritten by the update? Or should it remain untouched?

##### Gemeente X

In [207]:
#

#### Export processed data
Write the transformed data to our  `data / processed` directory. This is the data we will be using for the next (analysis and visualisation) steps in the pipeline.

In [208]:
# Write dataframe to intermediate data folder
path = Path("data", "processed", "etlocal_template.csv")
template.to_csv(path)

### 4. Data analysis and visualisation

In [209]:
# TODO

## Migration

#### Initialize migration files

In [210]:
# First, initialize data.csv file based on the config file
path = Path("config", "data.csv")
df_data_csv = pd.read_csv(path, sep=sep, index_col=[0])

# Drop unnamed columns
df_data_csv = df_data_csv.drop(columns=df_data_csv.columns[df_data_csv.columns.str.startswith('Unnamed')])

In [211]:
# Preview dataframe representing data.csv
df_data_csv

Unnamed: 0_level_0,country,name
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
GM0003,nl2019,Appingedam
GM0010,nl2019,Delfzijl
GM0024,nl2019,Loppersum
GM1680,nl2019,Aa en Hunze
GM0358,nl2019,Aalsmeer
...,...,...
GM0879,nl2019,Zundert
GM0301,nl2019,Zutphen
GM1896,nl2019,Zwartewaterland
GM0642,nl2019,Zwijndrecht


In [212]:
# Then, initialize commits.yml file by specifying the file path where you want to create the YAML file
path = Path("data", "reporting", "commits.yml")

# Write the document separator to the YAML file
with open(path, 'w') as file:
    file.write('---\n')

#### Fill migration files
Fill the relevant processed data from the ETLocal template in the `data.csv` and `commits.yml` files that are necessary for a ETLocal migration.

In [213]:
# Preview ETLocal template
template

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unit,value,commit
geo_id,group,subgroup,key,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GM0003,area,area_emission_factors,file_carriers_imported_heat_co2_conversion_per_mj,kg/MJ,0.036000,Based on https://www.co2emissiefactoren.nl/lij...
GM0003,area,area_emission_factors,file_carriers_propane_co2_conversion_per_mj,kg/MJ,0.064480,Adopted from the parent dataset (the Netherlands)
GM0003,households,households_energy_demand,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,%,0.000000,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,%,0.000000,No data available. Fallback value set to 0.0 t...
GM0003,households,households_energy_demand,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,%,1.000000,No data available. Fallback value set to 1.0 t...
...,...,...,...,...,...,...
GM0193,energy,energy_heat_production,energy_heat_import_mt_steam_hot_water_demand,TJ,0.000000,"No data available. By default, the import of h..."
GM0193,energy,energy_heat_production,energy_heat_import_ht_steam_hot_water_demand,TJ,0.000000,"No data available. By default, the import of h..."
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_ht_loss_share,%,24.281143,No data available. We assume the share of loss...
GM0193,energy,energy_energy_demand,input_energy_heat_distribution_mt_loss_share,%,24.281143,No data available. We assume the share of loss...


In [214]:
# List all keys for which values have been added to the template
keys = list(template.dropna().index.get_level_values(3).unique())

# Preview list of keys
keys

['file_carriers_imported_heat_co2_conversion_per_mj',
 'file_carriers_propane_co2_conversion_per_mj',
 'input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water',
 'input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water',
 'input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water',
 'input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water',
 'input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water',
 'input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water',
 'input_percentage_of_central_mt_steam_hot_water_agriculture_final_demand_steam_hot_water',
 'input_percentage_of_central_ht_steam_hot_water_agriculture_final_demand_steam_hot_water',
 'input_percentage_of_local_steam_hot_water_agriculture_final_demand_steam_hot_water',
 'input_energy_chp_ultra_supercritical_mt_coal_production',
 'input_energy_chp_ultra_supercritical_cofiring_mt_coal_production',
 'i

In [215]:
# Add keys to data.csv migration file
for key in keys:
    df_data_csv.loc[:, key] = float('nan')
    
# Preview data.csv
df_data_csv

Unnamed: 0_level_0,country,name,file_carriers_imported_heat_co2_conversion_per_mj,file_carriers_propane_co2_conversion_per_mj,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water,...,input_energy_heat_boiler_lt_electricity_production,input_energy_heat_burner_mt_network_gas_production,input_energy_heat_burner_mt_coal_production,input_energy_heat_burner_mt_crude_oil_production,energy_heat_import_lt_steam_hot_water_demand,energy_heat_import_mt_steam_hot_water_demand,energy_heat_import_ht_steam_hot_water_demand,input_energy_heat_distribution_ht_loss_share,input_energy_heat_distribution_mt_loss_share,input_energy_heat_distribution_lt_loss_share
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GM0003,nl2019,Appingedam,,,,,,,,,...,,,,,,,,,,
GM0010,nl2019,Delfzijl,,,,,,,,,...,,,,,,,,,,
GM0024,nl2019,Loppersum,,,,,,,,,...,,,,,,,,,,
GM1680,nl2019,Aa en Hunze,,,,,,,,,...,,,,,,,,,,
GM0358,nl2019,Aalsmeer,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GM0879,nl2019,Zundert,,,,,,,,,...,,,,,,,,,,
GM0301,nl2019,Zutphen,,,,,,,,,...,,,,,,,,,,
GM1896,nl2019,Zwartewaterland,,,,,,,,,...,,,,,,,,,,
GM0642,nl2019,Zwijndrecht,,,,,,,,,...,,,,,,,,,,


In [216]:
# Initialize an empty array for the commits
commits = []

# For all relevant keys and for each municipality fill data.csv and commits.yml
for key in keys:
    for municipality in municipalities:
        # Add data value to data.csv
        df_data_csv.loc[municipality, key] = template.loc[(municipality, slice(None), slice(None), key), 'value'].values[0]
        
    # Add commit message to commits.yml
    commits.append({'fields': [key], 'message': template.loc[(slice(None), slice(None), slice(None), key), 'commit'].values[0]})
    
# Preview data for all municipalities in the format required for the data.csv file
df_data_csv

Unnamed: 0_level_0,country,name,file_carriers_imported_heat_co2_conversion_per_mj,file_carriers_propane_co2_conversion_per_mj,input_percentage_of_lt_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_mt_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_ht_steam_hot_water_households_final_demand_steam_hot_water,input_percentage_of_lt_steam_hot_water_buildings_final_demand_steam_hot_water,input_percentage_of_mt_steam_hot_water_buildings_final_demand_steam_hot_water,input_percentage_of_ht_steam_hot_water_buildings_final_demand_steam_hot_water,...,input_energy_heat_boiler_lt_electricity_production,input_energy_heat_burner_mt_network_gas_production,input_energy_heat_burner_mt_coal_production,input_energy_heat_burner_mt_crude_oil_production,energy_heat_import_lt_steam_hot_water_demand,energy_heat_import_mt_steam_hot_water_demand,energy_heat_import_ht_steam_hot_water_demand,input_energy_heat_distribution_ht_loss_share,input_energy_heat_distribution_mt_loss_share,input_energy_heat_distribution_lt_loss_share
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GM0003,nl2019,Appingedam,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM0010,nl2019,Delfzijl,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM0024,nl2019,Loppersum,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM1680,nl2019,Aa en Hunze,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM0358,nl2019,Aalsmeer,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GM0879,nl2019,Zundert,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM0301,nl2019,Zutphen,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM1896,nl2019,Zwartewaterland,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143
GM0642,nl2019,Zwijndrecht,0.036,0.06448,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281143,24.281143,24.281143


In [217]:
# Write data.csv to processed data directory
df_data_csv.to_csv(f"data/reporting/data.csv")

In [218]:
# TODO: Beautify the yaml data and make it more readable
# commits_yaml = yaml.dump(commits, sort_keys=False, indent=4, default_flow_style=False, default_style='|')

# Write the updated data back to the YAML file
path = Path("data", "reporting", "commits.yml")

with open(path, 'w') as file:
    yaml.safe_dump(commits, file)