In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Census Demographics - Bronze Layer Ingestion
# MAGIC
# MAGIC Ingests ACS 5-Year demographic data via Census API into Unity Catalog.
# MAGIC
# MAGIC **Configuration:** Externalized to YAML (`resources/configs/census_variables.yml`)  
# MAGIC **Orchestration:** Databricks Asset Bundle with task-level retries  
# MAGIC **Storage:** Unity Catalog managed tables and volumes
# MAGIC

In [None]:
!pip install pyyaml

In [None]:
import uuid
from datetime import datetime

import requests
import yaml
from pyspark.sql import functions as F
from pyspark.sql.types import *  # NOTE(review): wildcard import kept for compatibility; prefer explicit names

# Widget parameters (injected by DABs job). Every widget defaults to an empty
# string, so an unset parameter shows up as "" and is caught by the validation below.
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("bronze_schema", "")
# NOTE(review): the API key arrives as a plain widget value; consider reading it
# from a secret scope instead so it does not appear in job parameters.
dbutils.widgets.text("census_api_key", "")
dbutils.widgets.text("census_data_volume", "")
dbutils.widgets.text("config_path", "")
dbutils.widgets.text("acs_year", "")
dbutils.widgets.text("state_fips", "")

# Extract parameters
catalog = dbutils.widgets.get("catalog")
bronze_schema = dbutils.widgets.get("bronze_schema")
census_api_key = dbutils.widgets.get("census_api_key")
census_data_volume = dbutils.widgets.get("census_data_volume")
config_path = dbutils.widgets.get("config_path")
acs_year = dbutils.widgets.get("acs_year")
state_fips = dbutils.widgets.get("state_fips")

# Validate required parameters. acs_year and state_fips are included because
# they are interpolated into the Census API URL; an empty value would only
# surface later as an opaque HTTP error. census_data_volume is left optional
# because no cell visible here reads it.
required_parameters = {
    "catalog": catalog,
    "bronze_schema": bronze_schema,
    "census_api_key": census_api_key,
    "config_path": config_path,
    "acs_year": acs_year,
    "state_fips": state_fips,
}
missing_parameters = [
    name for name, value in required_parameters.items() if not value.strip()
]
assert not missing_parameters, (
    f"Missing required parameters: {', '.join(missing_parameters)}"
)

In [None]:
# Load census variables from externalized YAML config.
# The encoding is pinned so the result does not depend on the cluster locale.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Flatten the nested {category: {census_code: friendly_name}} structure into a
# single {census_code: friendly_name} mapping. If the same code appears in two
# categories, the later category wins.
census_variables = {}
for variables in config["acs_5_year_variables"].values():
    # A category declared with no entries loads from YAML as None; treat it as
    # empty instead of letting dict.update(None) raise TypeError.
    census_variables.update(variables or {})

# Fail fast: with no variables the API request would ask for nothing useful.
assert census_variables, f"No census variables found in {config_path}"

In [None]:
def get_census_data(geography_level, state_fips, variables_dict, api_key, year):
    """
    Fetch ACS 5-Year block-group data for one state from the Census API.

    The request always targets block groups (``for=block group:*``) across every
    county and tract of ``state_fips``. Block groups require the full geographic
    hierarchy, expressed as multiple ``in`` parameters. ``geography_level`` does
    not influence the request; it only labels the error raised for an unusable
    response.

    Args:
        geography_level: Label for the requested geography (e.g. "block_group").
        state_fips: State FIPS code placed in the ``in=state:`` clause.
        variables_dict: Mapping whose keys are the Census variable codes to fetch.
        api_key: Census API key, sent as the ``key`` query parameter.
        year: ACS vintage used in the endpoint path.

    Returns:
        A ``(headers, rows)`` tuple: the first row of the JSON payload and the
        list of all remaining rows.

    Raises:
        requests.RequestException: On transport failures or non-success HTTP
            status. The exception is re-created with the API key redacted from
            its message.
        AssertionError: If the payload is empty or has fewer than two rows.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    # NOTE(review): the Census API caps the number of variables per request
    # (50, to be confirmed against the API guide); larger configs need batching.
    var_string = ",".join(variables_dict.keys())

    # Build URL manually to handle multiple 'in' parameters
    url = (
        f"{base_url}?get=NAME,{var_string}"
        f"&for=block%20group:*&in=state:{state_fips}&in=county:*&in=tract:*"
        f"&key={api_key}"
    )

    try:
        response = requests.get(url, timeout=120)
        response.raise_for_status()
    except requests.RequestException as exc:
        # requests embeds the full URL (including the key) in its error text.
        # Re-raise the same exception type with the key masked, and suppress
        # the original exception so the traceback does not print it either.
        message = str(exc)
        if api_key:
            message = message.replace(str(api_key), "***")
        raise type(exc)(
            message,
            request=getattr(exc, "request", None),
            response=getattr(exc, "response", None),
        ) from None

    data = response.json()
    assert data and len(data) >= 2, f"Invalid API response for {geography_level}"

    return (data[0], data[1:])


def transform_to_dataframe(headers, rows, geography_level, variables_dict,
                           ingest_id, ingest_timestamp, year=None):
    """
    Transform API response to Spark DataFrame with type casting and metadata.

    Columns whose name is a key of ``variables_dict`` are renamed to the mapped
    friendly name. Every column that is not a geography or metadata column is
    cast to ``long``. Four literal metadata columns are appended:
    ``geography_level``, ``acs_year``, ``ingestion_id``, ``ingestion_timestamp``.

    Args:
        headers: Column names for ``rows``, in order.
        rows: Row values as returned by the API.
        geography_level: Value stored in the ``geography_level`` column.
        variables_dict: Mapping of Census variable code -> friendly column name.
        ingest_id: Value stored in the ``ingestion_id`` column.
        ingest_timestamp: Value stored in the ``ingestion_timestamp`` column.
        year: Value stored in the ``acs_year`` column. When omitted, the
            notebook-level ``acs_year`` variable is used, which keeps existing
            six-argument callers working.

    Returns:
        A Spark DataFrame with renamed, cast and metadata-enriched columns.
    """
    df = spark.createDataFrame(rows, schema=headers)

    year_value = acs_year if year is None else year

    # Columns that must keep their original type (identifiers and metadata).
    geo_cols = {"NAME", "state", "county", "tract", "block_group",
                "geography_level", "acs_year", "ingestion_id", "ingestion_timestamp"}

    # Build one projection instead of chaining withColumn/withColumnRenamed in
    # loops: each chained call adds a projection and inflates the query plan.
    projected = []
    for source_name in df.columns:
        target_name = variables_dict.get(source_name, source_name)
        column = F.col(source_name)
        if target_name not in geo_cols:
            # NOTE(review): non-integer strings become null or raise depending
            # on the Spark ANSI setting — confirm this suits every variable.
            column = column.cast("long")
        projected.append(column.alias(target_name))

    # Add metadata
    projected.extend([
        F.lit(geography_level).alias("geography_level"),
        F.lit(year_value).alias("acs_year"),
        F.lit(ingest_id).alias("ingestion_id"),
        F.lit(ingest_timestamp).alias("ingestion_timestamp"),
    ])

    return df.select(*projected)

In [None]:
# Generate ingestion metadata; the same id and timestamp are stamped on every
# row produced by this run.
ingest_id = str(uuid.uuid4())
ingest_timestamp = datetime.now()

# Fetch block groups only, for the state and ACS year supplied as parameters
bg_headers, bg_rows = get_census_data(
    geography_level="block_group",
    state_fips=state_fips,
    variables_dict=census_variables,
    api_key=census_api_key,
    year=acs_year,
)

# Transform to Spark DataFrame. The API header "block group" contains a space,
# so it is renamed to "block_group" before the DataFrame is built.
bg_headers = [header.replace("block group", "block_group") for header in bg_headers]
census_df = transform_to_dataframe(
    headers=bg_headers,
    rows=bg_rows,
    geography_level="block_group",
    variables_dict=census_variables,
    ingest_id=ingest_id,
    ingest_timestamp=ingest_timestamp,
)

In [None]:
# Write to Unity Catalog: the target is <catalog>.<bronze_schema>.bronze_census_demographics
census_table = f"{catalog}.{bronze_schema}.bronze_census_demographics"

# Each run replaces the table contents; mergeSchema is enabled on the write.
bronze_writer = census_df.write.mode("overwrite").option("mergeSchema", "true")
bronze_writer.saveAsTable(census_table)