In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType, StringType, IntegerType
from delta.tables import DeltaTable
from pyspark.sql.window import Window

# --------------------------------------
# 0) Spark session
# --------------------------------------
# getOrCreate() hands back the active session when there is one; otherwise
# a new session is built under this application name.
spark = (
    SparkSession.builder
    .appName("ENPPI_Silver")
    .getOrCreate()
)

# --------------------------------------
# 1) Bronze source table
# --------------------------------------
# Three-part name of the bronze table that feeds the silver layer.
bronze_table_inc = "sicinc.bronze.enppi_smart_data_inc"
bronze_df = spark.table(bronze_table_inc)
# --------------------------------------
# 2) Silver Transformations
# --------------------------------------
# Builds the silver DataFrame from the bronze rows: typed event time, derived
# date, defaulted gas type, rule-based risk level and outlier flag, one row per
# reading key, and a load-time stamp.
silver_df = (
    bronze_df
    # Cast the event time to a real timestamp. NOTE(review): with non-ANSI cast
    # semantics an unparseable value becomes NULL rather than failing — confirm
    # the cluster setting.
    .withColumn("timestamp", F.col("timestamp").cast(TimestampType()))
    # Calendar date as a "yyyy-MM-dd" string derived from the event time.
    .withColumn("date", F.date_format(F.col("timestamp"), "yyyy-MM-dd"))
    # A missing gas type is stored as the literal string "None", not SQL NULL.
    .withColumn("gas_type", F.coalesce(F.col("gas_type"), F.lit("None")))
    # High: methane leak flagged or H2S alert level above 2.
    # Medium: any other H2S alert level above 0. Low: everything else.
    .withColumn(
        "risk_level",
        F.when(
            (F.col("methane_leak_detected") == True) | (F.col("h2s_alert_level") > 2),
            "High",
        )
        .when(F.col("h2s_alert_level") > 0, "Medium")
        .otherwise("Low"),
    )
    # True when any one of the three measurements exceeds its threshold;
    # a NULL comparison result falls through to False.
    .withColumn(
        "outlier_flag",
        F.when(
            (F.col("gas_concentration_ppm") > 500)
            | (F.col("temperature_celsius") > 100)
            | (F.col("pressure_bar") > 100),
            True,
        ).otherwise(False),
    )
    # Keep one row per (timestamp, sensor_id, facility_id). Which duplicate
    # survives is arbitrary: nothing in this chain ranks duplicates against
    # each other, so no ordering (and no sort) is applied.
    # TODO(review): if bronze carries a load-time column, rank by it to keep
    # the most recently loaded duplicate instead.
    .dropDuplicates(["timestamp", "sensor_id", "facility_id"])
    # Stamp every surviving row with the time of this silver run.
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .drop("_rescued_data")
)

# Quality checks: print a one-row summary of the silver DataFrame
# (row count, flagged outliers, distinct facilities and sensors).
print("Silver Quality Checks:")
silver_df.agg(
    F.count("*").alias("total_rows"),
    # outlier_flag is boolean, so it can drive the CASE expression directly.
    F.sum(
        F.when(F.col("outlier_flag"), 1).otherwise(0)
    ).alias("outliers_count"),
    F.countDistinct("facility_id").alias("unique_facilities"),
    F.countDistinct("sensor_id").alias("unique_sensors"),
).show()

# --------------------------------------
# 3) Write to Silver Delta Table
# --------------------------------------
# Creates the silver table on the first run; on later runs inserts only the
# readings whose key (timestamp, sensor_id, facility_id) is not stored yet.
silver_table_inc = "sicinc.silver.enppi_smart_data_inc"

# Every row of silver_df is stamped with current_timestamp() during this run,
# so comparing ingestion_timestamp with the maximum stored by earlier runs can
# never exclude a row. The whole batch is therefore passed to the writer, and
# the insert-only MERGE below is what keeps already-loaded readings out.
silver_new_df = silver_df

# Check existence explicitly, once, so that genuine read failures
# (permissions, connectivity) surface instead of being mistaken for a
# missing table.
silver_table_exists = spark.catalog.tableExists(silver_table_inc)

if not silver_table_exists:
    # First load: create the table, partitioned by date and facility.
    (
        silver_new_df.write.format("delta")
        .mode("overwrite")
        .partitionBy("date", "facility_id")
        .option("overwriteSchema", "true")
        .saveAsTable(silver_table_inc)
    )
else:
    # `<=>` is null-safe equality. The de-duplication that builds silver_df
    # treats NULL key values as equal, whereas plain `=` never matches NULL,
    # which would re-insert NULL-keyed rows on every run.
    merge_condition = " AND ".join(
        f"target.{key_col} <=> source.{key_col}"
        for key_col in ("timestamp", "sensor_id", "facility_id")
    )
    silver_delta = DeltaTable.forName(spark, silver_table_inc)
    (
        silver_delta.alias("target")
        .merge(silver_new_df.alias("source"), merge_condition)
        # Insert-only: rows whose key already exists are left untouched.
        .whenNotMatchedInsertAll()
        .execute()
    )


print(f"Silver table written: {silver_table_inc}")

Silver Quality Checks:
+----------+--------------+-----------------+--------------+
|total_rows|outliers_count|unique_facilities|unique_sensors|
+----------+--------------+-----------------+--------------+
|  10000000|       2535675|                3|           497|
+----------+--------------+-----------------+--------------+

Silver table written: sicinc.silver.enppi_smart_data_inc


In [0]:
# Load the silver table by name into a DataFrame.
silver_df_inc = spark.read.table("sicinc.silver.enppi_smart_data_inc")

In [0]:
# Add an integer date key: the timestamp formatted as yyyyMMdd, cast to int.
silver_df_inc = silver_df_inc.withColumn(
    "date_key",
    F.date_format(F.col("timestamp"), "yyyyMMdd").cast(IntegerType()),
)
