In [0]:
# src/ingestion/bronze_ingest.py

import json
import os
from glob import glob
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import (BooleanType, DoubleType, IntegerType,
                               StringType, StructField, StructType, ArrayType,
                               StructType,TimestampType)

# Mapping YAML/JSON types to Spark types.
# Keys are lowercase type names; build_spark_schema lowercases the configured
# type before looking it up here.
TYPE_MAP = {
    "string": StringType(),
    "int": IntegerType(),
    "integer": IntegerType(),  # alias of "int"
    "double": DoubleType(),
    "timestamp": TimestampType(),
    "boolean": BooleanType(),
    # build_spark_schema handles "struct" and "array" column types in dedicated
    # branches before consulting this map, so these two entries only act as
    # generic fallbacks (an empty struct / an array of strings).
    "struct": StructType(), 
    "array": ArrayType(StringType()), 
}

def build_spark_schema(columns: list) -> StructType:
    """
    Build a Spark StructType schema from a list of columns from YAML config.

    Supports nested structs and arrays (including arrays of structs).

    Args:
        columns: Column definitions. Each entry is a dict with:
            - "name": column name (required).
            - "type": type name, matched case-insensitively (required).
            - "nullable": optional bool, defaults to True.
            - "fields": nested column definitions; required when "type" is
              "struct", or when "type" is "array" with "element_type" "struct".
            - "element_type": element type name for "array" columns,
              defaults to "string".

    Returns:
        A StructType with one StructField per entry, in input order.

    Raises:
        KeyError: If a required key is missing or a type name is not in TYPE_MAP.
    """
    fields = []
    for col in columns:
        col_type = col["type"].lower()
        nullable = col.get("nullable", True)

        if col_type == "struct":
            # Recursively build the schema for the nested struct.
            data_type = build_spark_schema(col["fields"])
        elif col_type == "array":
            # Arrays default to string elements, but may also hold nested structs.
            element_type = col.get("element_type", "string").lower()
            if element_type == "struct":
                element_schema = build_spark_schema(col["fields"])
            else:
                element_schema = TYPE_MAP[element_type]
            # Always wrap the element type in ArrayType; declaring the bare
            # element type would turn the column into a scalar field.
            data_type = ArrayType(element_schema)
        else:
            data_type = TYPE_MAP[col_type]

        fields.append(StructField(col["name"], data_type, nullable))

    return StructType(fields)

def validate_yaml_against_json_schema(columns: list, schema_file: str, yaml_file: str = None):
    """
    Validate that all fields in the JSON schema exist in the YAML columns.

    Only top-level names are compared; nested struct fields are not inspected.
    Extra YAML columns that the JSON schema does not mention are allowed.

    Args:
        columns: Column definitions from the YAML config; each needs a "name" key.
        schema_file: Path to a JSON file containing a top-level "fields" list
            whose entries each have a "name" key.
        yaml_file: Optional path of the YAML config, used only to label the
            error message.

    Raises:
        ValueError: If the JSON schema names a field that is absent from
            ``columns``. Missing names are listed in sorted order.
    """
    # JSON interchange files are UTF-8; do not depend on the platform default.
    with open(schema_file, "r", encoding="utf-8") as f:
        schema_json = json.load(f)

    yaml_fields = {col["name"] for col in columns}
    json_fields = {field["name"] for field in schema_json["fields"]}
    # Sort so the error message is deterministic (set order is not).
    missing_fields = sorted(json_fields - yaml_fields, key=str)
    if missing_fields:
        # Keep the historical "{'a', 'b'}" rendering of the missing names.
        listing = "{" + ", ".join(repr(name) for name in missing_fields) + "}"
        raise ValueError(
            f"{yaml_file or 'YAML config'} is missing columns defined in JSON schema: {listing}"
        )

def ingest_json_to_bronze(
    json_file: str,
    yaml_file: str,
    ingestion_type: str = "overwrite"
):
    """
    Ingest a single JSON file to bronze Delta table using its YAML config.

    The YAML config must define "table_name" and "columns"; it may define
    "schema_file", in which case the columns are validated against that JSON
    schema before anything is read.

    Args:
        json_file: Path of the JSON data, passed to ``spark.read.json``.
        yaml_file: Path of the YAML config describing the target table.
        ingestion_type: Save mode passed to ``DataFrameWriter.mode``:

        | `ingestion_type` / `mode`      | Description                                                         | Typical Use Case                                   |
        | ------------------------------ | ------------------------------------------------------------------- | -------------------------------------------------- |
        | `"overwrite"`                  | Replaces the existing table with the new data.                      | Full refresh of a table.                           |
        | `"append"`                     | Adds new rows to the existing table without removing existing data. | Incremental ingestion / new batch of data.         |
        | `"ignore"`                     | If the table already exists, Spark does nothing.                    | Avoid overwriting existing tables.                 |
        | `"error"` or `"errorifexists"` | Raises an error if the table already exists.                        | Strict ingestion to prevent accidental overwrites. |

    Returns:
        A ``(message, catalog_table)`` tuple: a human-readable summary and the
        fully qualified table name ``lingokids.bronze.<table_name>``.

    Raises:
        KeyError: If the YAML config lacks "table_name" or "columns".
        ValueError: If "schema_file" is set and the YAML columns do not cover it.
    """
    spark = SparkSession.builder.getOrCreate()

    # Load YAML (read as UTF-8 rather than the platform default encoding)
    with open(yaml_file, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    table_name = config["table_name"]
    schema_file = config.get("schema_file")
    columns = config["columns"]

    # Validate if schema_file is provided
    if schema_file:
        validate_yaml_against_json_schema(columns, schema_file, yaml_file)

    # Build Spark schema
    spark_schema = build_spark_schema(columns)

    # Read JSON with the explicit schema (no schema inference)
    df = spark.read.schema(spark_schema).json(json_file)

    # Write to catalog as Delta table Bronze
    catalog_table = f"lingokids.bronze.{table_name}"
    df.write.format("delta").mode(ingestion_type).saveAsTable(catalog_table)

    return (f"lingokids.bronze.{table_name} ingested from {json_file}", catalog_table)

def ingest_all_yaml_configs(
    yaml_dir: str = "configs/bronze",
    ingestion_type: str = "overwrite"
):
    """
    Scan all YAML configs in yaml_dir and ingest their corresponding JSON files.

    Only files matching ``*.yaml`` directly inside ``yaml_dir`` are considered
    (no recursion). Configs are processed in sorted path order so runs are
    reproducible; if two configs share a "table_name", the later one wins in
    the returned mapping.

    Args:
        yaml_dir: Directory containing the YAML configs. Each config must
            define "json_file" and "table_name".
        ingestion_type: Save mode forwarded to ``ingest_json_to_bronze``.

    Returns:
        A dictionary of table_name -> result of ``ingest_json_to_bronze``,
        i.e. a ``(message, catalog_table)`` tuple. Empty if no configs match.

    Raises:
        KeyError: If a config lacks "json_file" or "table_name".
    """
    # glob() order depends on the filesystem; sort for a stable ingest order.
    yaml_files = sorted(glob(os.path.join(yaml_dir, "*.yaml")))
    results = {}

    for yaml_file in yaml_files:
        with open(yaml_file, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)
        json_file = config["json_file"]
        table_name = config["table_name"]
        results[table_name] = ingest_json_to_bronze(json_file, yaml_file, ingestion_type)

    return results


In [0]:

# Notebook driver cell: ingest every bronze YAML config found in this
# workspace directory, using the default ingestion_type ("overwrite").
yaml_dir = "/Workspace/Users/pablo.sanchez.armas@gmail.com/lingokids/configs/bronze"
# Maps each table_name to the (message, catalog_table) tuple of its ingestion.
ingested_tables = ingest_all_yaml_configs(yaml_dir=yaml_dir)
