In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility1_circuits.csv"
table_suffix  = "utility1_circuits"
utility_label = "utility1"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility1_circuits.csv → iedr_dev_bronze.utility1_circuits
Saved 64539 rows to iedr_dev_bronze.utility1_circuits
+---+-----------------------+-------------------------+-------------------------+---------------------+-------------------+------------------+-------------------+-----------------+-------------------------+------------------+-------------------+-----------------+-----------------+-------------------+-----------------+------------+----------+-------------------------+---------------------+
|_c0|Circuits_Phase3_CIRCUIT|Circuits_Phase3_NUMPHASES|Circuits_Phase3_OVERUNDER|Circuits_Phase3_PHASE|NYHCPV_csv_NSECTION|NYHCPV_csv_NFEEDER|NYHCPV_csv_NVOLTAGE|NYHCPV_csv_NMAXHC|NYHCPV_csv_NMAPCOLOR     |NYHCPV_csv_FFEEDER|NYHCPV_csv_FVOLTAGE|NYHCPV_csv_FMAXHC|NYHCPV_csv_FMINHC|NYHCPV_csv_FHCADATE|NYHCPV_csv_FNOTES|Shape_Length|utility_id|ingest_timestamp         |source_file          |
+---+-----------------------+-------------------------+-------------------------+------------

In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility1_install_der.csv"
table_suffix  = "utility1_install_der"
utility_label = "utility1"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility1_install_der.csv → iedr_dev_bronze.utility1_install_der
Saved 13727 rows to iedr_dev_bronze.utility1_install_der
+---------+------------+---------------+-----------------+------------------------+---------+-----------------+----------------+------+-------+-------------------+-------+------------+--------------------+------------------+---------+--------+--------------------+----------+-----+------------------------+------------+-----+----------+--------------------------+------------------------+
|ProjectID|ProjectType |NamePlateRating|TotalChargesCESIR|TotalChargesConstruction|CESIR_EST|SystemUpgrade_EST|ProjectCircuitID|Hybrid|SolarPV|EnergyStorageSystem|Wind   |MicroTurbine|SynchronousGenerator|InductionGenerator|FarmWaste|FuelCell|CombinedHeatandPower|GasTurbine|Hydro|InternalCombustionEngine|SteamTurbine|Other|utility_id|ingest_timestamp          |source_file             |
+---------+------------+---------------+-----------------+------------------------+-------

In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility1_planned_der.csv"
table_suffix  = "utility1_planned_der"
utility_label = "utility1"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility1_planned_der.csv → iedr_dev_bronze.utility1_planned_der
Saved 1688 rows to iedr_dev_bronze.utility1_planned_der
+-----------+---------------+-------------+-------------+---------+--------------+----------------+------+-------+-------------------+----+------------+--------------------+------------------+---------+--------+--------------------+----------+-----+------------------------+------------+-----+----------+--------------------------+------------------------+
|ProjectType|NamePlateRating|InServiceDate|ProjectStatus|ProjectID|CompletionDate|ProjectCircuitID|Hybrid|SolarPV|EnergyStorageSystem|Wind|MicroTurbine|SynchronousGenerator|InductionGenerator|FarmWaste|FuelCell|CombinedHeatandPower|GasTurbine|Hydro|InternalCombustionEngine|SteamTurbine|Other|utility_id|ingest_timestamp          |source_file             |
+-----------+---------------+-------------+-------------+---------+--------------+----------------+------+-------+-------------------+----+------------+---

In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility2_circuits.csv"
table_suffix  = "utility2_circuits"
utility_label = "utility2"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility2_circuits.csv → iedr_dev_bronze.utility2_circuits
Saved 1909 rows to iedr_dev_bronze.utility2_circuits
+-----------+--------------+-------------+-------------+---------------------------------+----------------------+---------+------------+----------+--------------------------+---------------------+
|Master_CDF |feeder_voltage|feeder_max_hc|feeder_min_hc|feeder_dg_connected_since_refresh|hca_refresh_date      |color    |shape_length|utility_id|ingest_timestamp          |source_file          |
+-----------+--------------+-------------+-------------+---------------------------------+----------------------+---------+------------+----------+--------------------------+---------------------+
|36_13_81756|13.2          |0.0          |0.0          |0.0                              |2022/10/01 00:00:00+00|brown    |1.277295106 |utility2  |2026-01-19 23:04:26.788543|utility2_circuits.csv|
|36_13_81757|13.2          |1.1          |0.1          |0.01                             |

In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility2_install_der.csv"
table_suffix  = "utility2_install_der"
utility_label = "utility2"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility2_install_der.csv → iedr_dev_bronze.utility2_install_der
Saved 25537 rows to iedr_dev_bronze.utility2_install_der
+------+----------------------+--------+--------------------+----------------------------+--------------------+----------+--------------------------+------------------------+
|DER_ID|SERVICE_STREET_ADDRESS|DER_TYPE|DER_NAMEPLATE_RATING|DER_INTERCONNECTION_LOCATION|INTERCONNECTION_COST|utility_id|ingest_timestamp          |source_file             |
+------+----------------------+--------+--------------------+----------------------------+--------------------+----------+--------------------------+------------------------+
|391308|null                  |Solar   |7.6                 |36_39_01251                 |0.0                 |utility2  |2026-01-19 23:08:23.670109|utility2_install_der.csv|
|386550|null                  |Solar   |6.0                 |36_32_36452                 |0.0                 |utility2  |2026-01-19 23:08:23.670109|utility2_install_de

In [0]:
from pyspark.sql.functions import lit, current_timestamp


# Bronze ingest for one raw CSV: read it from the /Volumes path, add lineage
# columns, append it to a Delta table in the bronze schema, then print a check.
file_name     = "utility2_planned_der.csv"
table_suffix  = "utility2_planned_der"
utility_label = "utility2"

catalog_name = "workspace"
schema_name  = "default"
volume_name  = "iedr_raw"

# Assemble the path from the settings above so they are the single source of
# truth (the resulting string is the same as the previously hand-typed path).
file_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/{file_name}"

bronze_schema = "iedr_dev_bronze"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {bronze_schema}")

full_table = f"{bronze_schema}.{table_suffix}"
print(f"Processing {file_name} → {full_table}")

# Read: first line supplies the column names; column types are inferred.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Add metadata columns recording which utility/file each row came from and
# when it was ingested.
df_bronze = (
    df_raw
    .withColumn("utility_id", lit(utility_label))
    .withColumn("ingest_timestamp", current_timestamp())
    .withColumn("source_file", lit(file_name))
)

# Save as Delta table.
# NOTE(review): mode("append") adds the file's rows again on every run of this
# cell; confirm that is intended before re-running against an existing table.
df_bronze.write.mode("append").format("delta").saveAsTable(full_table)

# Verify: the count below is the table's total row count after the write, not
# only the rows written by this run.
saved_table = spark.table(full_table)
row_count = saved_table.count()
print(f"Saved {row_count} rows to {full_table}")
saved_table.show(5, truncate=False)
print("-" * 60)

Processing utility2_planned_der.csv → iedr_dev_bronze.utility2_planned_der
Saved 30957 rows to iedr_dev_bronze.utility2_planned_der
+--------+--------------------+-------------------------+-------------------------+----------+--------------------+-----------------------+--------------------------------+------------------------------+----------------------------+----------+--------------------------+------------------------+
|DER_TYPE|DER_NAMEPLATE_RATING|INVERTER_NAMEPLATE_RATING|PLANNED_INSTALLATION_DATE|DER_STATUS|DER_STATUS_RATIONALE|TOTAL_MW_FOR_SUBSTATION|INTERCONNECTION_QUEUE_REQUEST_ID|INTERCONNECTION_QUEUE_POSITION|DER_INTERCONNECTION_LOCATION|utility_id|ingest_timestamp          |source_file             |
+--------+--------------------+-------------------------+-------------------------+----------+--------------------+-----------------------+--------------------------------+------------------------------+----------------------------+----------+--------------------------+------

In [0]:
from pyspark.sql.functions import col, count, when, sum as spark_sum

# Bronze quality summary: for each listed bronze table, print its row count and
# the null counts of its first five columns plus two metadata columns. The
# schema of the first table in the list is printed as an example.
bronze_schema = "iedr_dev_bronze"

tables = [
    "utility1_circuits",
    "utility1_install_der",
    "utility1_planned_der",
    "utility2_circuits",
    "utility2_install_der",
    "utility2_planned_der"
]

print("Bronze Layer Summary")
print("-" * 60)

for tbl in tables:
    full_tbl = f"{bronze_schema}.{tbl}"
    try:
        df = spark.table(full_tbl)
        row_count = df.count()
        print(f"Table: {full_tbl}")
        print(f"  Rows: {row_count:,}")

        # Quick null check on first 5 columns + metadata. dict.fromkeys keeps
        # the order but drops repeats, so a metadata column that is already
        # among the first five is not aggregated (and aliased) twice.
        check_cols = list(
            dict.fromkeys(df.columns[:5] + ["utility_id", "ingest_timestamp"])
        )
        null_counts = df.select([
            spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
            for c in check_cols
        ]).collect()[0]

        print("  Nulls in sample columns + metadata:")
        for c, cnt in null_counts.asDict().items():
            # SUM over zero rows yields NULL (None here); treat it as 0 so an
            # empty table is not reported as an error by the handler below.
            if (cnt or 0) > 0:
                print(f"    {c}: {cnt:,}")

        # Show schema once for first table
        if tbl == tables[0]:
            print(f"\nExample Schema ({tbl}):")
            df.printSchema()

        print("-" * 60)
    except Exception as e:
        # Best-effort report: a failure on one table is printed and the loop
        # moves on to the next table.
        print(f"Error checking {full_tbl}: {e}")
        print("-" * 60)

print("Bronze quality check complete!")

Bronze Layer Summary
------------------------------------------------------------
Table: iedr_dev_bronze.utility1_circuits
  Rows: 64,539
  Nulls in sample columns + metadata:

Example Schema (utility1_circuits):
root
 |-- _c0: integer (nullable = true)
 |-- Circuits_Phase3_CIRCUIT: integer (nullable = true)
 |-- Circuits_Phase3_NUMPHASES: integer (nullable = true)
 |-- Circuits_Phase3_OVERUNDER: string (nullable = true)
 |-- Circuits_Phase3_PHASE: string (nullable = true)
 |-- NYHCPV_csv_NSECTION: integer (nullable = true)
 |-- NYHCPV_csv_NFEEDER: integer (nullable = true)
 |-- NYHCPV_csv_NVOLTAGE: double (nullable = true)
 |-- NYHCPV_csv_NMAXHC: double (nullable = true)
 |-- NYHCPV_csv_NMAPCOLOR: string (nullable = true)
 |-- NYHCPV_csv_FFEEDER: integer (nullable = true)
 |-- NYHCPV_csv_FVOLTAGE: double (nullable = true)
 |-- NYHCPV_csv_FMAXHC: double (nullable = true)
 |-- NYHCPV_csv_FMINHC: double (nullable = true)
 |-- NYHCPV_csv_FHCADATE: timestamp (nullable = true)
 |-- NYHCPV_c