In [1]:
import csv
import json
import os
import warnings
from pathlib import Path
from pprint import pprint

import numpy as np
import yaml  # or json


In [2]:
def load_config(config_path):
    """Load a YAML or JSON config file and validate its top-level structure.

    Args:
        config_path: Path (``str`` or ``Path``) of the config file. A suffix of
            ``.yaml`` or ``.yml`` (any letter case) is parsed as YAML; every
            other suffix is parsed as JSON.

    Returns:
        The parsed config, a dict holding at least a ``"microdata"`` entry and
        a non-empty ``"constraints"`` entry.

    Raises:
        AssertionError: If the document is not a mapping, lacks a
            ``"microdata"`` section, or defines no constraints.
    """
    config_path = Path(config_path)
    # Accept both common YAML suffixes; previously ".yml" fell through to JSON.
    is_yaml = config_path.suffix.lower() in {".yaml", ".yml"}
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f) if is_yaml else json.load(f)

    # Validate config structure.
    # An empty YAML file or a JSON `null` parses to None; report that as a
    # validation failure instead of a TypeError from the `in` test below.
    assert isinstance(config, dict), "Config must be a mapping at the top level"
    assert "microdata" in config, "Config missing 'microdata' section"
    # .get() + truthiness also covers `constraints: null` and an empty list.
    assert config.get("constraints"), "No constraints defined"
    return config

In [3]:
# Load the run configuration and pretty-print it so the constraint
# definitions can be inspected before any processing.
config = load_config("data/2025-08-08/data/config.yaml")
pprint(config)

{'constraints': [{'constraint_prefix': 's1_hh_urban_rural%',
                  'dataprocess': 'onehot',
                  'file': 'data/2025-08-08/data/census2021_c1_urbanrural_master.csv',
                  'geography_column': 'areacode',
                  'microdata_id': 's1_hh_urban_rural',
                  'set_as_population_total': True},
                 {'constraint_prefix': 's2_hh_size%',
                  'dataprocess': 'onehot',
                  'file': 'data/2025-08-08/data/census2021_c2_hhsize.csv',
                  'geography_column': 'areacode',
                  'microdata_id': 's2_hh_size',
                  'set_as_population_total': False},
                 {'constraint_prefix': 's3_hh_tenure_child%',
                  'dataprocess': 'onehot',
                  'file': 'data/2025-08-08/data/census2021_c3_hhtenure_child.csv',
                  'geography_column': 'areacode',
                  'microdata_id': 's3_hh_tenure_child',
                  'set_as_population

In [4]:
def build_constraint_arrays(config):
    """
    Read every constraint CSV listed in ``config["constraints"]`` and combine
    them into one table of per-geography targets.

    Each CSV must have a header row. The column named by the constraint's
    ``geography_column`` holds the area code; every other column is a numeric
    category whose label becomes ``constraint_prefix + column_name``.

    The first constraint with ``set_as_population_total`` true supplies the
    population total of each area (the row sum of its categories). Further
    flagged constraints are ignored with a warning.

    Args:
        config: Parsed config dict. Each constraint entry needs the keys
            ``file``, ``geography_column``, ``constraint_prefix`` and
            ``dataprocess``; ``set_as_population_total`` defaults to False.

    Returns:
        {
            "constraint_labels": List[str],
            "constraint_targets": List[List[float]],  # one row per geography
            "geography_codes": List[str],
            "population_totals": List[float],         # one value per geography
            "data_handeling": List[str],              # 'dataprocess' per constraint (key spelling kept for callers)
            "table": List[list],                      # header row followed by one row per geography
        }

    Raises:
        ValueError: If a file's geography codes differ (in value or order)
            from those of the first file, or if no constraint is flagged with
            ``set_as_population_total``.
    """
    constraint_labels = []
    target_blocks = []        # one (n_areas, n_categories) array per constraint
    geography_codes = None    # codes of the first file; reference for all others
    population_totals = None  # set by the first constraint flagged as population total
    data_handeling = []

    for constraint in config["constraints"]:
        # newline='' is what the csv module requires for correct newline handling.
        with open(constraint["file"], mode='r', newline='') as f:
            reader = csv.reader(f)
            headers = next(reader)
            # csv.reader yields [] for blank lines; drop them instead of crashing.
            data = [row for row in reader if row]

        poptotal_constraint = constraint.get("set_as_population_total", False)
        print(poptotal_constraint,constraint["file"])

        geo_idx = headers.index(constraint["geography_column"])

        # Rows are joined across files purely by position, so every file must
        # list the same areas in the same order as the first one.
        file_codes = [row[geo_idx] for row in data]
        if geography_codes is None:
            geography_codes = file_codes
        elif file_codes != geography_codes:
            raise ValueError(
                f"Geography codes in {constraint['file']} do not match the first "
                "constraint file (different areas or different row order)"
            )

        # Process categories
        value_idx = [i for i in range(len(headers)) if i != geo_idx]
        prefix = constraint["constraint_prefix"]
        constraint_labels.extend(f"{prefix}{headers[i]}" for i in value_idx)

        # Extract targets
        target_rows = [[float(row[i]) for i in value_idx] for row in data]
        # reshape keeps the block 2-D even when the file has no data rows.
        target_blocks.append(
            np.array(target_rows, dtype=float).reshape(len(data), len(value_idx))
        )

        if poptotal_constraint:
            if population_totals is None:
                population_totals = [sum(values) for values in target_rows]
            else:
                warnings.warn(
                    f"More than one constraint sets 'set_as_population_total'; "
                    f"ignoring it for {constraint['file']}"
                )
        data_handeling.append(constraint['dataprocess'])

    if population_totals is None:
        raise ValueError(
            "No constraint has 'set_as_population_total' set; "
            "cannot compute population totals"
        )

    # Concatenate once instead of re-copying the growing array on every file.
    constraint_targets = np.hstack(target_blocks)
    target_lists = constraint_targets.tolist()

    header = ['geography_code','population_total']+constraint_labels
    print(header)
    print(data_handeling)
    table = [header] + [
        [code, total] + values
        for code, total, values in zip(geography_codes, population_totals, target_lists)
    ]
    return {
        "constraint_labels": constraint_labels,
        "constraint_targets": target_lists,
        "geography_codes": geography_codes,
        "population_totals":population_totals,
        "data_handeling":data_handeling,
        "table":table
    }

In [5]:
# Build the constraint targets from the loaded config; the resulting dict is
# reused by the microdata-encoding and CSV-export cells.
constraints_dict = build_constraint_arrays(config)

True data/2025-08-08/data/census2021_c1_urbanrural_master.csv
False data/2025-08-08/data/census2021_c2_hhsize.csv
False data/2025-08-08/data/census2021_c3_hhtenure_child.csv
False data/2025-08-08/data/census2021_c4_hhcars_hhsize.csv
False data/2025-08-08/data/census2021_c5_hhbeds.csv
False data/2025-08-08/data/census2021_c6_hhheating.csv
False data/2025-08-08/data/census2021_c7_hhtype.csv
False data/2025-08-08/data/census2021_c8_hhref_activity.csv
False data/2025-08-08/data/census2021_c9_hhref_sex_hhsize.csv
False data/2025-08-08/data/census2021_c10_hhref_ethnicity.csv
False data/2025-08-08/data/census2021_c11_hhref_age_size_child.csv
False data/2025-08-08/data/census2021_c12_employment_hh_size.csv
False data/2025-08-08/data/census2021_c13_unpaid_carer.csv
False data/2025-08-08/data/census2021_c14_deprivation_count.csv
['geography_code', 'population_total', 's1_hh_urban_rural%urban', 's1_hh_urban_rural%rural', 's2_hh_size%hhsize_1', 's2_hh_size%hhsize_2', 's2_hh_size%hhsize_3', 's2_hh_

In [6]:
def encode_microdata(config, constraint_labels,data_handeling=None):
    """
    Encode the microdata CSV into an integer array with one column per
    constraint label; missing or unmatched values are left as 0.

    For a constraint whose ``dataprocess`` is ``"onehot"``, the cell whose
    label equals ``constraint_prefix + value`` is set to 1. For any other
    ``dataprocess`` the integer value itself is stored in the column whose
    label equals ``constraint_prefix`` (NOTE(review): this follows the
    evident intent of the original ``lable = prefix`` line; confirm that
    count-type constraints really are labelled by their bare prefix).

    Args:
        config: Parsed config dict; reads ``config["microdata"]["file"]``,
            ``config["microdata"]["id_column"]`` and, per constraint,
            ``microdata_id``, ``constraint_prefix`` and ``dataprocess``.
        constraint_labels: Column labels, in output column order.
        data_handeling: Unused; kept so existing three-argument calls still work.

    Returns:
        {
            "microdata_encoded": np.ndarray of shape (n_individuals, n_constraints),
            "ids": list of ID strings read from the microdata,
            "table": list of rows ``[id, *encoded_values]`` (no header row)
        }
        The array dtype is ``int8`` when every constraint is one-hot and
        ``int32`` otherwise, so stored counts above 127 do not overflow.
    """
    # Step 1: Load microdata from CSV (newline='' as the csv module requires)
    with open(config["microdata"]["file"], mode='r', newline='') as f:
        microdata = list(csv.DictReader(f))

    n_individuals = len(microdata)
    n_constraints = len(constraint_labels)

    # Step 2: Create label-to-index mapping
    label_to_idx = {label: idx for idx, label in enumerate(constraint_labels)}

    # Step 3: Initialize output array (all zeros)
    all_onehot = all(c["dataprocess"] == "onehot" for c in config["constraints"])
    microdata_encoded = np.zeros(
        (n_individuals, n_constraints),
        dtype=np.int8 if all_onehot else np.int32,
    )

    # Step 4: Extract IDs
    id_column = config["microdata"]["id_column"]
    ids = [row[id_column] for row in microdata]

    # Step 5: Encode each constraint
    for constraint in config["constraints"]:
        print(constraint["microdata_id"])
        col = constraint["microdata_id"]
        prefix = constraint["constraint_prefix"]
        dathandling = constraint["dataprocess"]
        print(dathandling)
        count2 = 0  # rows of this constraint that carried a non-empty value
        for row_idx, row in enumerate(microdata):
            if row_idx % 10000 == 0:
                print(f"{row_idx/n_individuals*100:.2g}% complete {count2}  {row_idx} {row_idx}")
            value = row.get(col)
            # Skip missing values (leave as 0)
            if value is None or value.strip() == '':
                continue
            if dathandling == "onehot":
                label = f"{prefix}{value}"
                if label in label_to_idx:  # Ensure label exists in constraints
                    microdata_encoded[row_idx, label_to_idx[label]] = 1
            else:
                # Was `lable = prefix`, which left `label` stale from an earlier
                # one-hot row (or undefined) and wrote into the wrong column.
                label = prefix
                if label in label_to_idx:  # Ensure label exists in constraints
                    microdata_encoded[row_idx, label_to_idx[label]] = int(value.strip())
            count2 += 1

    # Build the export table once, after all constraints are encoded. It used to
    # be rebuilt for every row, and was never defined for empty microdata.
    table = [[row_id] + encoded for row_id, encoded in zip(ids, microdata_encoded.tolist())]
    return {
        "microdata_encoded":microdata_encoded,
        "ids":ids,
        "table":  table
    }

In [7]:
# Encode the microdata against the label order produced by
# build_constraint_arrays, so encoded columns line up with the target columns.
microdata_dict = encode_microdata(config,constraints_dict["constraint_labels"],constraints_dict['data_handeling'])

s1_hh_urban_rural
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s2_hh_size
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s3_hh_tenure_child
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s4_hh_ncars_hh_size
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s5_hh_beds
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s6_hh_heating
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s7_hh_type
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s8_hhref_activity
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s9_hhref_sex_hh_size
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s10_hhref_ethnicity
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s11_hhref_age_size_child
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s12_employment_hh_size
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s13_unpaid_carer
onehot
0% complete 0  0 0
51% complete 10000  10000 10000
s14_deprivation_co

In [10]:
def to_file(file_path,data):
    """Write ``data`` (an iterable of rows) to ``file_path`` as CSV.

    An existing file at ``file_path`` is overwritten.
    """
    # newline='' hands line-ending control to the csv writer.
    with open(file_path, 'w', newline='') as out_handle:
        csv.writer(out_handle).writerows(data)

In [11]:
# Export both tables as CSV. The constraint table starts with a header row;
# the microdata table is written without one.
to_file('data/england_wales_microdata_encoded_counts_individuals_1808.csv', microdata_dict["table"])
to_file('data/england_wales_constraint_targets_counts_individuals_1808.csv', constraints_dict["table"])

In [8]:
# Completion marker printed once this cell is reached.
print("done!")

done!
