In [0]:
%run ../../config/project_config

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
def process_silver_time_series(table_name):
    """Promote one Bronze time-series table to Silver.

    Steps, in order:
      1. Read ``{CATALOG_NAME}.{SCHEMA_BRONZE}.{table_name}``.
      2. Split off rows whose ``Date`` or ``RegionName`` is null and append
         them to ``{CATALOG_NAME}.{SCHEMA_BRONZE}.quarantine_{table_name}``.
      3. Keep a single row per (``Date``, ``RegionName``) pair.
      4. Replace nulls in top-level numeric columns with 0.
      5. Overwrite ``{CATALOG_NAME}.{SCHEMA_SILVER}.{table_name}``, schema
         included.

    Relies on ``spark``, ``F``, ``CATALOG_NAME``, ``SCHEMA_BRONZE`` and
    ``SCHEMA_SILVER`` already being defined in the notebook scope.

    Args:
        table_name: Unqualified table name; the same name is used for the
            Bronze source and the Silver target.

    Returns:
        None. Results are written to tables and progress is printed.
    """
    # Imported here so the function carries its own type dependencies.
    from pyspark.sql.types import (
        DecimalType,
        DoubleType,
        FloatType,
        IntegerType,
        LongType,
    )

    # Construct full table paths using parameters from project_config
    bronze_table = f"{CATALOG_NAME}.{SCHEMA_BRONZE}.{table_name}"
    silver_table = f"{CATALOG_NAME}.{SCHEMA_SILVER}.{table_name}"
    quarantine_table = f"{CATALOG_NAME}.{SCHEMA_BRONZE}.quarantine_{table_name}"

    print(f"Reading from: {bronze_table}")

    # 1. Load from Bronze
    df = spark.read.table(bronze_table)

    # 2. Quarantine: Identify records with null primary keys
    # Date and RegionName are mandatory for time-series integrity
    quarantine_condition = F.col("Date").isNull() | F.col("RegionName").isNull()

    quarantine_df = df.filter(quarantine_condition)
    clean_base_df = df.filter(~quarantine_condition)

    # Save quarantined records for audit. The target lives in the Bronze
    # schema (see quarantine_table above).
    # count() is a Spark action, so run it once and reuse the result.
    # NOTE(review): the write appends, so re-running against unchanged Bronze
    # data adds the same rows to the quarantine table again.
    quarantine_count = quarantine_df.count()
    if quarantine_count > 0:
        print(f"Quarantining {quarantine_count} records to {quarantine_table}")
        quarantine_df.write.format("delta").mode("append").saveAsTable(quarantine_table)

    # 3. Deduplication: One record per Region per Date
    # NOTE(review): no ordering is applied, so which of several duplicate rows
    # is kept is not controlled here.
    df_dedup = clean_base_df.dropDuplicates(["Date", "RegionName"])

    # 4. Imputation: Handle numeric columns
    # Match on the type class itself. A substring test on str(dataType) also
    # matches container types whose element type is numeric (for example an
    # array of doubles), which must not be filled with a scalar 0.
    numeric_types = (DoubleType, IntegerType, LongType, FloatType, DecimalType)
    numeric_cols = [
        field.name
        for field in df_dedup.schema.fields
        if isinstance(field.dataType, numeric_types)
    ]

    # Build dictionary: Fill null numbers with 0
    impute_dict = {name: 0 for name in numeric_cols}

    # Fix for [CANNOT_BE_EMPTY] error: only call fillna with a non-empty dict
    if impute_dict:
        df_final = df_dedup.fillna(impute_dict)
    else:
        df_final = df_dedup
        print(f"Warning: No numeric columns found to impute for {table_name}")

    # 5. Write to Silver
    df_final.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(silver_table)

    # Count what was actually written instead of calling df_final.count(),
    # which would re-run the whole read/filter/dedup/impute lineage.
    clean_count = spark.read.table(silver_table).count()
    print(f"Successfully loaded {silver_table}. Clean: {clean_count}")

In [0]:
# Run the Bronze -> Silver load once per time-series table, in this order.
TIME_SERIES_TABLES = (
    "metro_time_series",
    "neighborhood_time_series",
    "state_time_series",
)

for series_table in TIME_SERIES_TABLES:
    process_silver_time_series(series_table)

In [0]:
%sql
-- Preview one row of the Silver neighborhood table.
-- NOTE(review): catalog/schema are hard-coded here; confirm they match
-- CATALOG_NAME / SCHEMA_SILVER used by the load function.
SELECT *
FROM zillow.silver.neighborhood_time_series
LIMIT 1;

In [0]:
%sql
-- Preview one row of the Silver state table.
-- NOTE(review): catalog/schema are hard-coded here; confirm they match
-- CATALOG_NAME / SCHEMA_SILVER used by the load function.
SELECT *
FROM zillow.silver.state_time_series
LIMIT 1;

In [0]:
%sql
-- Preview one row of the Silver metro table.
-- NOTE(review): catalog/schema are hard-coded here; confirm they match
-- CATALOG_NAME / SCHEMA_SILVER used by the load function.
SELECT *
FROM zillow.silver.metro_time_series
LIMIT 1;