In [30]:
import csv
import json
from pathlib import Path

import numpy as np
import yaml


In [31]:
def load_config(config_path):
    """Load a YAML or JSON config file and validate its top-level structure.

    Args:
        config_path: Path (str or Path) of the config file. A suffix of
            ``.yaml`` or ``.yml`` (any letter case) is parsed with
            ``yaml.safe_load``; every other suffix is parsed as JSON.

    Returns:
        The parsed config mapping.

    Raises:
        AssertionError: if the parsed config is not a mapping, has no
            "microdata" section, or has no (or an empty) "constraints" list.
    """
    config_path = Path(config_path)
    # Accept both common YAML suffixes; previously '.yml' was sent to the JSON parser.
    is_yaml = config_path.suffix.lower() in ('.yaml', '.yml')
    with open(config_path) as f:
        config = yaml.safe_load(f) if is_yaml else json.load(f)

    # Validate config structure
    # An empty YAML file parses to None; report that as a validation failure
    # rather than a TypeError from the membership tests below.
    assert isinstance(config, dict), "Config must be a mapping at the top level"
    assert "microdata" in config, "Config missing 'microdata' section"
    assert "constraints" in config and len(config["constraints"]) > 0, "No constraints defined"
    return config

In [32]:
# Parse and validate the run configuration.
# NOTE(review): `config_path` is not defined in the cells visible here; presumably an
# earlier cell sets it — confirm it exists before a Restart & Run All.
config = load_config(config_path)
# Echo the parsed config for a quick visual check.
print(config)

{'microdata': {'file': '../data/Survey/urban_size_tenure.csv', 'id_column': 'n_hidp'}, 'constraints': [{'file': '../data/Census/census2022_c1_urbanrural_master.csv', 'microdata_id': 'n_urban_dv', 'constraint_prefix': 'Urban%', 'geography_column': 'areacode', 'census_headings': ['urban', 'rural'], 'microdat_range': ['1', '2'], 'set_as_population_total': True}, {'file': '../data/Census/census2022_c2_hhsize.csv', 'microdata_id': 'n_hhsize', 'constraint_prefix': 'HHSize%', 'geography_column': 'areacode', 'census_headings': ['hhsize_1', 'hhsize_2', 'hhsize_3', 'hhsize_4', 'hhsize_5', 'hhsize_6', 'hhsize_7', 'hhsize_8'], 'microdat_range': ['1', '2', '3', '4', '5', '6', '7', '8'], 'set_as_population_total': False}, {'file': '../data/Census/census2022_c3_hhtenure.csv', 'microdata_id': 'n_tenure_dv', 'constraint_prefix': 'Tenure%', 'geography_column': 'areacode', 'census_headings': ['owned_outright', 'owned_mortgage', 'private_rented', 'social_rented'], 'microdat_range': ['1', '2', '5-6', '3-4'

In [33]:
def build_constraint_arrays(config):
    """
    Read each census constraint table named in ``config["constraints"]`` and
    turn its counts into per-area proportions.

    For every constraint, the CSV at ``constraint["file"]`` is read. The column
    named by ``constraint["geography_column"]`` holds the area code; every other
    column is treated as a category count. Each row is divided by its own sum,
    and labels are built as ``constraint["constraint_prefix"] + column header``.

    Population totals are the row sums of the FIRST constraint whose
    ``set_as_population_total`` flag is truthy (a missing flag counts as False).
    Geography codes come from the first constraint file, and every later file
    must list the same codes in the same order.

    Returns:
        {
            "constraint_labels": List[str],
            "constraint_targets": List[List[float]],  # one row per area
            "geography_codes": List[str],
            "population_totals": List[float],
            "table": List[list],  # header row followed by one row per area
        }

    Raises:
        ValueError: if no constraint is flagged ``set_as_population_total``, or
            if a constraint file's geography codes differ from the first file's.
        ZeroDivisionError: if an area's category counts sum to zero, so that
            proportions cannot be computed.
    """
    constraint_labels = []
    target_blocks = []        # one (n_areas, n_categories) array per constraint
    geography_codes = None    # fixed by the first constraint file
    population_totals = None  # fixed by the first flagged constraint

    for constraint in config["constraints"]:
        # newline='' is the csv module's documented way to open files for reading.
        with open(constraint["file"], mode='r', newline='') as f:
            reader = csv.reader(f)
            headers = next(reader)
            # csv.reader yields [] for blank lines; skip them instead of crashing.
            data = [row for row in reader if row]

        poptotal_constraint = constraint.get("set_as_population_total", False)
        # Kept from the original so the recorded cell output stays reproducible.
        print(poptotal_constraint)

        geo_idx = headers.index(constraint["geography_column"])
        codes = [row[geo_idx] for row in data]

        if geography_codes is None:
            geography_codes = codes
        elif codes != geography_codes:
            # Blocks are stacked side by side by row position, so a different
            # order or set of areas would silently misalign the targets.
            raise ValueError(
                f"Geography codes in {constraint['file']} do not match those of the "
                "first constraint file (same codes in the same order are required)"
            )

        # Every column except the geography column is a category.
        value_idx = [i for i in range(len(headers)) if i != geo_idx]
        prefix = constraint["constraint_prefix"]
        constraint_labels.extend(f"{prefix}{headers[i]}" for i in value_idx)

        row_totals = []
        proportions = []
        for code, row in zip(codes, data):
            counts = [float(row[i]) for i in value_idx]
            total = sum(counts)
            if counts and total == 0:
                raise ZeroDivisionError(
                    f"Counts for area {code!r} in {constraint['file']} sum to zero; "
                    "cannot convert to proportions"
                )
            row_totals.append(total)
            proportions.append([count / total for count in counts])

        # Only the first flagged constraint defines the totals; taking more than
        # one would make the list longer than the number of areas.
        if poptotal_constraint and population_totals is None:
            population_totals = row_totals

        # reshape keeps the block 2-D even when the file has no data rows.
        target_blocks.append(
            np.array(proportions, dtype=float).reshape(len(proportions), len(value_idx))
        )

    if population_totals is None:
        raise ValueError(
            "No constraint has 'set_as_population_total' set to true; "
            "population totals cannot be derived"
        )

    constraint_targets = np.hstack(target_blocks).tolist()

    header = ['geography_code', 'population_total'] + constraint_labels
    print(header)
    table = [header] + [
        [code, total] + targets
        for code, total, targets in zip(geography_codes, population_totals, constraint_targets)
    ]
    return {
        "constraint_labels": constraint_labels,
        "constraint_targets": constraint_targets,
        "geography_codes": geography_codes,
        "population_totals": population_totals,
        "table": table
    }

In [34]:
# Build labels, per-area proportions, and population totals from the census files in the config.
constraints_dict = build_constraint_arrays(config)

True
False
False
['geography_code', 'population_total', 'Urban%urban', 'Urban%rural', 'HHSize%hhsize_1', 'HHSize%hhsize_2', 'HHSize%hhsize_3', 'HHSize%hhsize_4', 'HHSize%hhsize_5', 'HHSize%hhsize_6', 'HHSize%hhsize_7', 'HHSize%hhsize_8', 'Tenure%owned_outright', 'Tenure%owned_mortgage', 'Tenure%private_rented', 'Tenure%social_rented']


In [35]:
# Display the header-plus-rows table returned by build_constraint_arrays.
constraints_dict['table']

[['geography_code',
  'population_total',
  'Urban%urban',
  'Urban%rural',
  'HHSize%hhsize_1',
  'HHSize%hhsize_2',
  'HHSize%hhsize_3',
  'HHSize%hhsize_4',
  'HHSize%hhsize_5',
  'HHSize%hhsize_6',
  'HHSize%hhsize_7',
  'HHSize%hhsize_8',
  'Tenure%owned_outright',
  'Tenure%owned_mortgage',
  'Tenure%private_rented',
  'Tenure%social_rented'],
 ['S00135307',
  62.0,
  0.0,
  1.0,
  0.3225806451612903,
  0.3709677419354839,
  0.11290322580645161,
  0.12903225806451613,
  0.04838709677419355,
  0.016129032258064516,
  0.0,
  0.0,
  0.4838709677419355,
  0.2903225806451613,
  0.1935483870967742,
  0.03225806451612903],
 ['S00135308',
  33.0,
  0.0,
  1.0,
  0.24242424242424243,
  0.5454545454545454,
  0.09090909090909091,
  0.09090909090909091,
  0.0,
  0.0,
  0.030303030303030304,
  0.0,
  0.36363636363636365,
  0.36363636363636365,
  0.21212121212121213,
  0.06060606060606061],
 ['S00135309',
  71.0,
  0.0,
  1.0,
  0.36619718309859156,
  0.43661971830985913,
  0.08450704225352113

In [37]:
def _value_label_map(constraint):
    """Map each raw microdata value (as a string) to its full constraint label."""
    prefix = constraint["constraint_prefix"]
    census_headings = constraint["census_headings"]
    mapping = {}
    for i, raw_spec in enumerate(constraint["microdat_range"]):
        # YAML may deliver unquoted entries as ints; normalise to a trimmed string.
        spec = str(raw_spec).strip()
        label = f"{prefix}{census_headings[i]}"
        # Search for the range separator from position 1 so a leading minus sign
        # (e.g. the single code '-9') is not mistaken for a range like '5-6'.
        sep = spec.find('-', 1)
        if sep > 0:
            start, end = int(spec[:sep]), int(spec[sep + 1:])
            for val in range(start, end + 1):
                mapping[str(val)] = label
        else:
            mapping[spec] = label
    return mapping


def encode_microdata(config, constraint_labels):
    """
    Encode microdata into a one-hot-like array where missing values are 0.

    The CSV at ``config["microdata"]["file"]`` is read with ``csv.DictReader``.
    For each constraint, the column ``constraint["microdata_id"]`` is looked up
    in every row; its value (with surrounding whitespace removed) is mapped via
    ``microdat_range`` / ``census_headings`` to a label, and the matching column
    of the output is set to 1. Empty, missing, or unmapped values, and labels
    absent from ``constraint_labels``, leave the row at 0 for that constraint.

    Args:
        config: parsed config with "microdata" and "constraints" sections.
        constraint_labels: list of column labels fixing the output column order.

    Returns:
        {
            "microdata_encoded": np.array of int8, shape (n_individuals, n_constraints),
            "ids": list of values from the configured id column,
            "table": header row ['id', *constraint_labels] followed by one row per record
        }
    """
    # Step 1: Load microdata from CSV (newline='' as the csv module recommends)
    with open(config["microdata"]["file"], mode='r', newline='') as f:
        microdata = list(csv.DictReader(f))

    # Step 2: Create label-to-index mapping
    label_to_idx = {label: idx for idx, label in enumerate(constraint_labels)}

    # Step 3: Initialize output array (all zeros)
    microdata_encoded = np.zeros((len(microdata), len(constraint_labels)), dtype=np.int8)

    # Step 4: Extract IDs
    id_column = config["microdata"]["id_column"]
    ids = [row[id_column] for row in microdata]

    # Step 5: Encode each constraint
    for constraint in config["constraints"]:
        col = constraint["microdata_id"]
        value_to_label = _value_label_map(constraint)

        for row_idx, row in enumerate(microdata):
            value = row.get(col)
            if value is None:
                continue
            # Strip before the lookup so padded cells such as ' 2 ' still match.
            value = value.strip()
            if not value:
                continue  # missing value: leave the row at 0
            label = value_to_label.get(value)
            if label in label_to_idx:  # ignore labels not present in the constraints
                microdata_encoded[row_idx, label_to_idx[label]] = 1

    header = ['id'] + constraint_labels
    table = [header] + [
        [row_id] + encoded.tolist() for row_id, encoded in zip(ids, microdata_encoded)
    ]
    return {
        "microdata_encoded": microdata_encoded,
        "ids": ids,
        "table": table
    }

In [38]:
# Encode the survey microdata against the same label order used for the census constraints.
microdata_dict = encode_microdata(config,constraints_dict["constraint_labels"])

In [39]:
# Display the header-plus-rows table returned by encode_microdata.
microdata_dict['table']

[['id',
  'Urban%urban',
  'Urban%rural',
  'HHSize%hhsize_1',
  'HHSize%hhsize_2',
  'HHSize%hhsize_3',
  'HHSize%hhsize_4',
  'HHSize%hhsize_5',
  'HHSize%hhsize_6',
  'HHSize%hhsize_7',
  'HHSize%hhsize_8',
  'Tenure%owned_outright',
  'Tenure%owned_mortgage',
  'Tenure%private_rented',
  'Tenure%social_rented'],
 ['68006826', 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 ['68013626', 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 ['68020426', 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 ['68027226', 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
 ['68047626', 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0],
 ['68054426', 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0],
 ['68061226', 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 ['68068026', 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
 ['68088426', 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0],
 ['68115626', 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
 ['68129226', 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
 ['68136026', 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]

In [40]:
def to_file(file_path,data):
    """Write ``data`` (an iterable of rows) to ``file_path`` as CSV, replacing any existing file."""
    # newline='' hands line-ending control to the csv writer.
    with open(file_path, mode='w', newline='') as out_handle:
        csv.writer(out_handle).writerows(data)

In [41]:
# Persist both tables (header row first) as CSV files under ../data/Sandbox/.
to_file('../data/Sandbox/microdata_encoded.csv', microdata_dict["table"])
# NOTE(review): the doubled slash in the path below looks like a typo; it presumably
# resolves to the same directory on common filesystems — confirm and tidy.
to_file('../data/Sandbox//constraint_targets.csv', constraints_dict["table"])