# Post processing

Takes the 'processed' data and converts it into a format useful for export to ETLocal. These steps could also be appended to the other workbooks in order to get data ready for export immediately.

In [26]:
# Imports
import importlib
from pathlib import Path

import pandas as pd

import src.helper

importlib.reload(src.helper)

<module 'src.helper' from '/Users/koenvanbemmelen/work/etdataset/pipelines/src/helper.py'>

In [27]:
# Setup
# Input files: one filled template per sector from data/processed,
# followed by the sector-notebook template from data/intermediate.
path_inputs = [
    Path("data") / "processed" / f"etlocal_template_{sector}_filled.csv"
    for sector in (
        "households",
        "buildings",
        "industry",
        "transport",
        "built_environment_stock",
        "agriculture",
        "area",
        "energy_production",
        "non_energetic_emissions",
        "other_energy",
    )
] + [Path("data") / "intermediate" / "etlocal_template_for_sector_notebooks.csv"]

# Output locations used by the cells below
path_output = Path("data") / "post_processed"
path_reporting_output = Path("data") / "reporting"

# Fill this in appropriately:
country = "nl2023"
analysis_year = "2023"
dataset_names = Path("config") / "all_datasets_mapping.csv"
output_name = "etlocal_filled_and_formatted.csv"

In [28]:
# Csv: hand all input files to the helper, which reports the path of the
# combined csv it produced inside path_output.
src.helper.pivot_n_merge_processed(
    input_csv_paths=path_inputs,
    config_path=dataset_names,
    country=country,
    analysis_year=analysis_year,
    output_dir=path_output,
    csv_filename="combined_unfilled_processed_data.csv",
)

Found 708 total unique keys
Final result: 708 data columns


{'csv_path': 'data/post_processed/combined_unfilled_processed_data.csv'}

In [29]:
# Read the combined data written by the csv step above
df = pd.read_csv(path_output / "combined_unfilled_processed_data.csv")

# Municipalities that keep a name (geo_id -> name); every other row gets an empty name
municipalities_to_keep = {
  "GM1980": "Dijk en Waard",
  "GM1982": "Land van Cuijk",
  "GM1991": "Maashorst",
  "GM1992": "Voorne aan Zee"
}

# Overwrite the name column in one pass: listed geo_ids get their name,
# all other rows (including rows without a geo_id) get ''
df['name'] = df['geo_id'].map(municipalities_to_keep).fillna('')

# Sort the dataframe alphabetically by geo_id before saving
df = df.sort_values('geo_id')

# Save the modified dataframe as data.csv in reporting
# This is the file to be used in the ETLocal data migration
# (create the directory first: to_csv fails if it does not exist yet)
path_reporting_output.mkdir(parents=True, exist_ok=True)
df.to_csv(path_reporting_output / "data.csv", index=False)

# Report the municipalities actually found in the data rather than the number
# requested, so a geo_id that is absent from the csv does not go unnoticed
geo_ids_present = set(df['geo_id'])
geo_ids_missing = [geo_id for geo_id in municipalities_to_keep if geo_id not in geo_ids_present]
print(f"Updated Name column for {len(municipalities_to_keep) - len(geo_ids_missing)} municipalities")
if geo_ids_missing:
    print(f"Warning: geo_id(s) not found in the data: {', '.join(geo_ids_missing)}")



Updated Name column for 4 municipalities


In [30]:
# Commits yaml - this cell must be run: it strips the commits row off the end
# of the combined unfilled processed data.
# The resulting commits.yml file is used in the ETLocal data migration.
src.helper.generate_commits_yml(
    input_csv_paths=path_reporting_output / "data.csv",
    output_dir=path_reporting_output,
    analysis_year=analysis_year,
    yml_filename="commits.yml",
)

{'yml_path': 'data/reporting/commits.yml'}

Note: Currently the csv method leaves missing values empty (''). The yml method filters out rows with missing key OR commit values.

This cell uses hierarchical CSVs to fill in the missing values in the post_processed data. Filled-in values are displayed in bold, and the total number of cells filled is shown in a short printed report.