# Rails amalgamator command generator

This notebook generates the Rails commands for the ETLocal amalgamator. It processes the CBS regional data to derive the set of municipality codes belonging to each RES region and each province.

## Setup

In [25]:
# internal modules
import csv
import os
import sys
import importlib

# external modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import xlwings as xw
import yaml
from pathlib import Path
import src.load_data_manager
from src.load_data_manager import LoadDataManager
from typing import List

# Reload the local module so that edits to src/load_data_manager.py are picked
# up without restarting the kernel.
# NOTE(review): `LoadDataManager` was bound by the from-import above, before
# this reload, so that name still refers to the pre-reload class object —
# confirm whether it should be re-imported after this call.
importlib.reload(src.load_data_manager)

<module 'src.load_data_manager' from '/Users/koenvanbemmelen/work/etdataset/pipelines/src/load_data_manager.py'>

## General

## Extract

In [26]:
# Delimiter and location of the CBS "Gebieden in Nederland 2023" CSV (ETM version)
sep = ','
path = Path("config") / "CBS Gebieden in Nederland 2023 - ETM version.csv"

# Read the table; the bare name on the last line displays it as the cell output
df_cbs = pd.read_csv(path, sep=sep)
df_cbs

Unnamed: 0,Codes en namen van gemeenten|Naam,Codes en namen van gemeenten|Code,Lokaliseringen van gemeenten|Provincies|Code,Lokaliseringen van gemeenten|Provincies|Naam,Lokaliseringen van gemeenten|Regionale Energiestrategie regio’s|Code,Lokaliseringen van gemeenten|Regionale Energiestrategie regio’s|Naam
0,Aa en Hunze,GM1680,PV22,Drenthe,ES03,Drenthe
1,Aalsmeer,GM0358,PV27,Noord-Holland,ES16,Noord-Holland Zuid
2,Aalten,GM0197,PV25,Gelderland,ES07,Achterhoek
3,Achtkarspelen,GM0059,PV21,Fryslân,ES02,Friesland
4,Alblasserdam,GM0482,PV28,Zuid-Holland,ES18,Drechtsteden
...,...,...,...,...,...,...
337,Zundert,GM0879,PV30,Noord-Brabant,ES28,West-Brabant
338,Zutphen,GM0301,PV25,Gelderland,ES12,Stedendriehoek
339,Zwartewaterland,GM1896,PV23,Overijssel,ES05,West-Overijssel
340,Zwijndrecht,GM0642,PV28,Zuid-Holland,ES18,Drechtsteden


Define the RES regions as a dictionary

In [27]:
# Build a lookup of RES regions keyed by RES code
res_code_column = "Lokaliseringen van gemeenten|Regionale Energiestrategie regio’s|Code"
res_name_column = "Lokaliseringen van gemeenten|Regionale Energiestrategie regio’s|Naam"
municipality_code_column = "Codes en namen van gemeenten|Code"

# Distinct RES codes in ascending order, so the dictionary is filled in a stable order
res_codes = sorted(df_cbs[res_code_column].unique().tolist())
res_dict = {}

for res_code in res_codes:
    # Rows of the CBS table that belong to this RES region
    in_region = df_cbs[res_code_column] == res_code
    # The region name is taken from the first matching row
    res_name = df_cbs.loc[in_region, res_name_column].values[0]
    municipalities = df_cbs.loc[in_region, municipality_code_column].tolist()
    # Each entry holds the region's name and the codes of its municipalities
    res_dict[res_code] = {"name": res_name, "municipalities": municipalities}



In [29]:
# Sanity check: total number of municipalities summed over all RES regions
total_municipalities = sum(len(region["municipalities"]) for region in res_dict.values())
total_municipalities

342

Now do the same for provinces

In [30]:
# Build a lookup of provinces keyed by province code
province_code_column = "Lokaliseringen van gemeenten|Provincies|Code"
province_name_column = "Lokaliseringen van gemeenten|Provincies|Naam"
municipality_code_column = "Codes en namen van gemeenten|Code"

# Distinct province codes in ascending order, so the dictionary is filled in a stable order
province_codes = sorted(df_cbs[province_code_column].unique().tolist())
province_dict = {}

for province_code in province_codes:
    # Rows of the CBS table that belong to this province
    in_province = df_cbs[province_code_column] == province_code
    # The province name is taken from the first matching row
    province_name = df_cbs.loc[in_province, province_name_column].values[0]
    municipalities = df_cbs.loc[in_province, municipality_code_column].tolist()
    # Each entry holds the province's name and the codes of its municipalities
    province_dict[province_code] = {"name": province_name, "municipalities": municipalities}

# Use the ASCII spelling 'Fryslan' instead of 'Fryslân' to avoid encoding issues
province_dict['PV21']['name'] = 'Fryslan'

province_dict

{'PV20': {'name': 'Groningen',
  'municipalities': ['GM1979',
   'GM0014',
   'GM1966',
   'GM1952',
   'GM1895',
   'GM0765',
   'GM0037',
   'GM0047',
   'GM1969',
   'GM1950']},
 'PV21': {'name': 'Fryslan',
  'municipalities': ['GM0059',
   'GM0060',
   'GM1891',
   'GM1940',
   'GM0072',
   'GM0074',
   'GM0080',
   'GM1970',
   'GM0085',
   'GM0086',
   'GM0088',
   'GM0090',
   'GM1900',
   'GM0093',
   'GM0737',
   'GM0096',
   'GM1949',
   'GM0098']},
 'PV22': {'name': 'Drenthe',
  'municipalities': ['GM1680',
   'GM0106',
   'GM1681',
   'GM0109',
   'GM0114',
   'GM0118',
   'GM0119',
   'GM1731',
   'GM1699',
   'GM1730',
   'GM1701',
   'GM1690']},
 'PV23': {'name': 'Overijssel',
  'municipalities': ['GM0141',
   'GM0147',
   'GM0148',
   'GM0150',
   'GM1774',
   'GM0153',
   'GM0158',
   'GM0160',
   'GM0163',
   'GM0164',
   'GM1735',
   'GM0166',
   'GM0168',
   'GM0173',
   'GM1773',
   'GM0175',
   'GM0177',
   'GM1742',
   'GM0180',
   'GM1708',
   'GM0183',
   'GM17

In [31]:
# Sanity check: total number of municipalities summed over all provinces
total_municipalities = sum(len(province["municipalities"]) for province in province_dict.values())
total_municipalities

342

## Generate amalgamator code

### RES regions

In [35]:
# Script to generate Ruby code from res_dict
def generate_ruby_code_for_res_dict(
    res_dict,
    source_data_year='2023',
    target_country_name='nl2023',
    migration_slug='update_2023',
):
    """Generate one Amalgamator::Combiner Ruby snippet per RES region.

    Args:
        res_dict: Mapping of RES code (e.g. 'ES01') to a dict with the keys
            'name' (area name) and 'municipalities' (iterable of municipality
            codes).
        source_data_year: Value written to the `source_data_year:` argument.
        target_country_name: Value written to the `target_country_name:`
            argument.
        migration_slug: Value written to the `migration_slug:` argument.

    Returns:
        A string containing the snippets in the iteration order of
        `res_dict`, separated by one blank line. An empty mapping yields
        an empty string.
    """
    def ruby_single_quoted(value):
        """Escape a value for use inside a Ruby single-quoted string literal."""
        # Escape the backslash first so the backslashes added in front of
        # single quotes are not doubled afterwards.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    year = ruby_single_quoted(source_data_year)
    country = ruby_single_quoted(target_country_name)
    slug = ruby_single_quoted(migration_slug)

    ruby_code_lines = []

    for res_code, res_data in res_dict.items():
        # Convert ES01 to es01 for variable naming
        variable_name = res_code.lower()

        # Sort municipalities for consistent output and join them with
        # spaces, the element separator of a Ruby %w[] array
        municipalities_str = ' '.join(sorted(res_data['municipalities']))

        # Names such as "Regio's" would otherwise terminate the Ruby string early
        geo_id = ruby_single_quoted(res_code)
        area_name = ruby_single_quoted(res_data['name'])

        # Generate the combiner code block
        combiner_block = f"""{variable_name}_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: '{geo_id}',
  source_data_year: '{year}',
  source_dataset_geo_ids: %w[{municipalities_str}],
  target_area_name: '{area_name}',
  target_country_name: '{country}',
  migration_slug: '{slug}'
)
{variable_name}_combined_data = {variable_name}_combiner.result
{variable_name}_migration_filename = {variable_name}_combiner.export_data"""

        ruby_code_lines.append(combiner_block)

    return '\n\n'.join(ruby_code_lines)

# Generate the Ruby combiner snippets for all RES regions and print them as
# plain text (print rather than a bare expression, so newlines are rendered
# instead of shown as a string repr)
ruby_code = generate_ruby_code_for_res_dict(res_dict)
print(ruby_code)

es01_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'ES01',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0014 GM0037 GM0047 GM0765 GM1895 GM1950 GM1952 GM1966 GM1969 GM1979],
  target_area_name: 'Groningen',
  target_country_name: 'nl2023',
  migration_slug: 'update_2023'
)
es01_combined_data = es01_combiner.result
es01_migration_filename = es01_combiner.export_data

es02_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'ES02',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0059 GM0060 GM0072 GM0074 GM0080 GM0085 GM0086 GM0088 GM0090 GM0093 GM0096 GM0098 GM0737 GM1891 GM1900 GM1940 GM1949 GM1970],
  target_area_name: 'Friesland',
  target_country_name: 'nl2023',
  migration_slug: 'update_2023'
)
es02_combined_data = es02_combiner.result
es02_migration_filename = es02_combiner.export_data

es03_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'ES03',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0106 GM01

### Provinces

In [34]:
# Script to generate Ruby code from province_dict
def generate_ruby_code_for_province_dict(
    province_dict,
    source_data_year='2023',
    target_country_name='nl2023',
    migration_slug='update_2023',
):
    """Generate one Amalgamator::Combiner Ruby snippet per province.

    Args:
        province_dict: Mapping of province code (e.g. 'PV20') to a dict with
            the keys 'name' (area name) and 'municipalities' (iterable of
            municipality codes).
        source_data_year: Value written to the `source_data_year:` argument.
        target_country_name: Value written to the `target_country_name:`
            argument.
        migration_slug: Value written to the `migration_slug:` argument.

    Returns:
        A string containing the snippets in the iteration order of
        `province_dict`, separated by one blank line. An empty mapping
        yields an empty string.
    """
    def ruby_single_quoted(value):
        """Escape a value for use inside a Ruby single-quoted string literal."""
        # Escape the backslash first so the backslashes added in front of
        # single quotes are not doubled afterwards.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    year = ruby_single_quoted(source_data_year)
    country = ruby_single_quoted(target_country_name)
    slug = ruby_single_quoted(migration_slug)

    ruby_code_lines = []

    for province_code, province_data in province_dict.items():
        # Convert PV20 to pv20 for variable naming
        variable_name = province_code.lower()

        # Sort municipalities for consistent output and join them with
        # spaces, the element separator of a Ruby %w[] array
        municipalities_str = ' '.join(sorted(province_data['municipalities']))

        # A name containing a single quote would otherwise terminate the
        # Ruby string early
        geo_id = ruby_single_quoted(province_code)
        area_name = ruby_single_quoted(province_data['name'])

        # Generate the combiner code block
        combiner_block = f"""{variable_name}_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: '{geo_id}',
  source_data_year: '{year}',
  source_dataset_geo_ids: %w[{municipalities_str}],
  target_area_name: '{area_name}',
  target_country_name: '{country}',
  migration_slug: '{slug}'
)
{variable_name}_combined_data = {variable_name}_combiner.result
{variable_name}_migration_filename = {variable_name}_combiner.export_data"""

        ruby_code_lines.append(combiner_block)

    return '\n\n'.join(ruby_code_lines)

# Generate the Ruby combiner snippets for all provinces and print them as
# plain text (print rather than a bare expression, so newlines are rendered
# instead of shown as a string repr)
ruby_code = generate_ruby_code_for_province_dict(province_dict)
print(ruby_code)

pv20_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'PV20',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0014 GM0037 GM0047 GM0765 GM1895 GM1950 GM1952 GM1966 GM1969 GM1979],
  target_area_name: 'Groningen',
  target_country_name: 'nl2023',
  migration_slug: 'update_2023'
)
pv20_combined_data = pv20_combiner.result
pv20_migration_filename = pv20_combiner.export_data

pv21_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'PV21',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0059 GM0060 GM0072 GM0074 GM0080 GM0085 GM0086 GM0088 GM0090 GM0093 GM0096 GM0098 GM0737 GM1891 GM1900 GM1940 GM1949 GM1970],
  target_area_name: 'Fryslan',
  target_country_name: 'nl2023',
  migration_slug: 'update_2023'
)
pv21_combined_data = pv21_combiner.result
pv21_migration_filename = pv21_combiner.export_data

pv22_combiner = Amalgamator::Combiner.new(
  target_dataset_geo_id: 'PV22',
  source_data_year: '2023',
  source_dataset_geo_ids: %w[GM0106 GM0109