 Bronze → Silver ETL (Incremental with Merge)

Loads activities from Bronze to Silver using a YAML config, and avoids re-inserting existing rows via an insert-only merge on the key configured in `load.merge_key` (e.g. `activity_id`).

In [0]:
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.functions import size, from_json, size, current_timestamp
from pyspark.sql.types import ArrayType, StringType
from delta.tables import DeltaTable

# Obtain the Spark session used by every cell below (reuses the active
# session when one already exists, otherwise builds a new one).
spark = SparkSession.builder.getOrCreate()


In [0]:
# Location of the YAML file describing the Bronze source, the Silver target
# and the load options for this job.
config_path = "/Workspace/Users/pablo.sanchez.armas@gmail.com/lingokids/configs/silver/activities.yaml"

# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale default encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The three top-level sections are mandatory; a missing one raises KeyError.
source_cfg = config["source"]
target_cfg = config["target"]
load_cfg = config["load"]

# Fully qualified table names: <catalog>.<schema>.<table>
bronze_table = f"{source_cfg['catalog']}.{source_cfg['schema']}.{source_cfg['table']}"
silver_table = f"{target_cfg['catalog']}.{target_cfg['schema']}.{target_cfg['table']}"

# The merge key is interpolated into the merge condition further down, so it
# must be present and must be a single column name.
merge_key = load_cfg.get("merge_key")
if not merge_key:
    raise ValueError("merge_key must be defined in the config file")
if not isinstance(merge_key, str):
    # A list/dict here would otherwise be formatted into an invalid SQL
    # condition and only fail later, inside the merge.
    raise ValueError(
        f"merge_key must be a single column name (string), got {type(merge_key).__name__}"
    )

print(f"Source: {bronze_table}")
print(f"Target: {silver_table}")
print(f"Merge Key: {merge_key}")

In [0]:
# Read the whole Bronze table; the merge below decides which rows are new.
df_bronze = spark.read.table(bronze_table)

# An insert-only merge compares source rows against the *target* only, so
# several Bronze rows sharing the same key would all be inserted. Collapse
# the source to one row per key first so Silver stays duplicate-free.
# NOTE(review): dropDuplicates keeps an arbitrary row per key — if a specific
# row must win (e.g. the most recent), add an explicit ordering here.
df_bronze = df_bronze.dropDuplicates([merge_key])

# Add processing timestamp
df_bronze = df_bronze.withColumn("updated_at", current_timestamp())

if not spark.catalog.tableExists(silver_table):
    # First run: the (deduplicated) Bronze snapshot becomes the Silver table.
    # No merge is needed afterwards — it would only join the data with itself.
    print(f"Silver table {silver_table} does not exist — creating it.")
    (
        df_bronze.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(silver_table)
    )
else:
    # Subsequent runs: insert only the rows whose key is not in Silver yet.
    silver_delta = DeltaTable.forName(spark, silver_table)

    # Merge condition (based on merge_key from config).
    # NOTE(review): rows whose merge key is NULL never satisfy `=` and are
    # therefore re-inserted on every run — confirm the key is non-nullable.
    merge_condition = f"t.{merge_key} = s.{merge_key}"

    (
        silver_delta.alias("t")
        .merge(
            df_bronze.alias("s"),
            merge_condition
        )
        # Existing rows are left untouched; only unmatched keys are inserted.
        .whenNotMatchedInsertAll()
        .execute()
    )

    print("✅ Merge completed: new activites added.")