In [16]:
%%html
<h4> 02_Silver_Processing </h4>


StatementMeta(, cfc06aad-a3f6-4fae-9237-ec36ca71eefe, 19, Finished, Available, Finished)




In [25]:
from pyspark.sql.functions import col, to_timestamp, current_timestamp
from delta.tables import DeltaTable

# --- CONFIGURATION ---
# Folder of raw JSON files that is read in step 1.
BRONZE_PATH = "Files/Bronze/Landing"
# Catalog name of the Delta table written in step 3.
SILVER_TABLE_NAME = "silver_news"

# 1. READ RAW DATA
print("Reading raw files...")
# The read is guarded so that a missing/empty landing folder on the very first
# run stops the notebook with a hint instead of a stack trace.
try:
    df_raw = spark.read.json(BRONZE_PATH)
except Exception as e:
    print("No files found in Bronze yet. Run Phase 1 first!")
    # This handler catches every exception type, so also report the real cause:
    # otherwise permission errors or malformed input would be indistinguishable
    # from the "no files yet" case.
    print(f"Underlying error: {type(e).__name__}: {e}")
    dbutils.notebook.exit("Stop")

# 2. CLEANING
# Single pipeline: cast "date" to a timestamp, stamp each row with the
# processing time, keep only the Silver columns (exposing "body" as "snippet"),
# and discard rows whose URL is null.
df_clean = (
    df_raw
    .withColumn("date", to_timestamp(col("date")))
    .withColumn("processed_time", current_timestamp())
    .select(
        "title",
        "url",
        col("body").alias("snippet"),
        "source",
        "date",
        "competitor_tag",
        "processed_time",
    )
    .where(col("url").isNotNull())
)

# 3. DEDUPLICATION
# De-duplicate the incoming batch on URL first. The merge below only guards
# against URLs that are already in the target table: if the same URL occurs
# several times inside the batch, every copy counts as "not matched" and would
# be inserted. The first-run branch has no merge at all, so it needs this too.
# NOTE: dropDuplicates keeps an arbitrary row per URL.
df_dedup = df_clean.dropDuplicates(["url"])

# Check the catalog (registry) instead of a file path.
if spark.catalog.tableExists(SILVER_TABLE_NAME):
    print(f"Table '{SILVER_TABLE_NAME}' exists. Performing Incremental Merge...")

    # Load the Delta table by name, not by path.
    delta_table = DeltaTable.forName(spark, SILVER_TABLE_NAME)

    # Insert-only merge keyed on URL: rows whose URL already exists in the
    # target are left untouched; only new URLs are appended.
    (
        delta_table.alias("target")
        .merge(df_dedup.alias("source"), "target.url = source.url")
        .whenNotMatchedInsertAll()
        .execute()
    )

    print("Merge Complete.")

else:
    print(f"Table '{SILVER_TABLE_NAME}' does not exist. Creating new...")
    # First run: create the table from the de-duplicated batch.
    df_dedup.write.format("delta").saveAsTable(SILVER_TABLE_NAME)
    print(f"Table '{SILVER_TABLE_NAME}' created successfully.")

StatementMeta(, cfc06aad-a3f6-4fae-9237-ec36ca71eefe, 28, Finished, Available, Finished)

Reading raw files...
Table 'silver_news' exists. Performing Incremental Merge...
Merge Complete.
