In [0]:
%run ../../config/project_config

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [0]:
df = spark.table(f"{CATALOG_NAME}.{SCHEMA_SILVER}.county_crosswalk_metrics")

In [0]:
excluded_cols = ["load_dt", "source_file", "_rescued_data"]

In [0]:
display(df.dtypes)

_1,_2
date,string
region_name,string
days_on_zillow_all_homes,string
inventory_seasonally_adjusted_all_homes,string
inventory_raw_all_homes,string
median_listing_price_per_sqft_1_bedroom,string
median_listing_price_per_sqft_2_bedroom,string
median_listing_price_per_sqft_3_bedroom,string
median_listing_price_per_sqft_4_bedroom,string
median_listing_price_per_sqft_5_bedroom_or_more,string


In [0]:
date_col = "date"
# All columns starting with 'median', 'zhvi', 'zri', 'sale', 'pct', or 'zhvi'
metric_cols = [c for c in df.columns if any(prefix in c for prefix in 
               ['median', 'zhvi', 'zri', 'sale', 'pct', 'price_to_rent', 'inventory'])]

In [0]:
df_standardized = df.withColumn(date_col, F.to_date(F.col(date_col))) # Date Standardization

for col_name in metric_cols:
    # Currency/Metric Normalization: Cast to double and round to 2 decimals
    df_standardized = df_standardized.withColumn(col_name, F.round(F.col(col_name).cast("double"), 2))

In [0]:
display(df_standardized.dtypes)

_1,_2
date,date
region_name,string
days_on_zillow_all_homes,string
inventory_seasonally_adjusted_all_homes,double
inventory_raw_all_homes,double
median_listing_price_per_sqft_1_bedroom,double
median_listing_price_per_sqft_2_bedroom,double
median_listing_price_per_sqft_3_bedroom,double
median_listing_price_per_sqft_4_bedroom,double
median_listing_price_per_sqft_5_bedroom_or_more,double


In [0]:
df_standardized.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(f"{CATALOG_NAME}.{SCHEMA_SILVER}.county_crosswalk_metrics")

## Delta Lake Time Travel Evidence

In [0]:
display(spark.sql(f"DESCRIBE HISTORY {CATALOG_NAME}.{SCHEMA_SILVER}.county_crosswalk_metrics"))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
3,2026-01-02T19:54:45.000Z,74222979012178,saswat.n.behera@v4c.ai,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true"",""delta.enableRowTracking"":""true"",""delta.checkpointPolicy"":""v2"",""delta.rowTracking.materializedRowCommitVersionColumnName"":""_row-commit-version-col-a74087f6-dfd8-4aed-acfb-bf711f1817bb"",""delta.rowTracking.materializedRowIdColumnName"":""_row-id-col-37eef9ac-3c28-453f-bb8e-9792893ce882""}, statsOnLoad -> true, clusteringOnWriteStatus -> Reason for skipping: Nonexistent clustering column, generally when replacing a Liquid table with a table without clustering columns)",,List(3477484132944233),0102-194029-zta1qb9o-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 17551851, numDeletionVectorsRemoved -> 0, numOutputRows -> 155638, numOutputBytes -> 10361834)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
2,2026-01-02T06:02:18.000Z,74222979012178,saswat.n.behera@v4c.ai,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [""region_name"",""date""], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true"",""delta.enableRowTracking"":""true"",""delta.checkpointPolicy"":""v2"",""delta.rowTracking.materializedRowCommitVersionColumnName"":""_row-commit-version-col-a74087f6-dfd8-4aed-acfb-bf711f1817bb"",""delta.rowTracking.materializedRowIdColumnName"":""_row-id-col-37eef9ac-3c28-453f-bb8e-9792893ce882""}, statsOnLoad -> true, clusteringOnWriteStatus -> late-stage clustering triggered)","List(1086061569120203, [dev saswat_n_behera] Zillow_Silver_Enriched, 937874594628388, 1031284339868486, 74222979012178, manual)",List(963590887321560),0102-055617-ws4qs1xd-v2n,1.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 17551849, numDeletionVectorsRemoved -> 0, numOutputRows -> 155638, numOutputBytes -> 17551851)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
1,2026-01-02T04:12:32.000Z,74222979012178,saswat.n.behera@v4c.ai,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [""region_name"",""date""], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true"",""delta.enableRowTracking"":""true"",""delta.checkpointPolicy"":""v2"",""delta.rowTracking.materializedRowCommitVersionColumnName"":""_row-commit-version-col-a74087f6-dfd8-4aed-acfb-bf711f1817bb"",""delta.rowTracking.materializedRowIdColumnName"":""_row-id-col-37eef9ac-3c28-453f-bb8e-9792893ce882""}, statsOnLoad -> true, clusteringOnWriteStatus -> late-stage clustering triggered)","List(580393071454010, Zillow_Silver_Enriched, 233580990431539, 909302618638789, 74222979012178, manual)",List(963590887321560),0102-034516-rko8b1uq-v2n,0.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 17551794, numDeletionVectorsRemoved -> 0, numOutputRows -> 155638, numOutputBytes -> 17551849)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13
0,2026-01-01T11:55:49.000Z,74222979012178,saswat.n.behera@v4c.ai,CREATE OR REPLACE TABLE AS SELECT,"Map(partitionBy -> [], clusterBy -> [""region_name"",""date""], description -> null, isManaged -> true, properties -> {""delta.parquet.compression.codec"":""zstd"",""delta.enableDeletionVectors"":""true"",""delta.enableRowTracking"":""true"",""delta.checkpointPolicy"":""v2"",""delta.rowTracking.materializedRowCommitVersionColumnName"":""_row-commit-version-col-a74087f6-dfd8-4aed-acfb-bf711f1817bb"",""delta.rowTracking.materializedRowIdColumnName"":""_row-id-col-37eef9ac-3c28-453f-bb8e-9792893ce882""}, statsOnLoad -> true, clusteringOnWriteStatus -> late-stage clustering triggered)",,List(963590887321560),0101-112534-5vyjoyku-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numDeletionVectorsRemoved -> 0, numOutputRows -> 155638, numOutputBytes -> 17551794)",,Databricks-Runtime/17.3.x-aarch64-photon-scala2.13


In [0]:
df_v0 = spark.read.option("versionAsOf", 0).table(f"{CATALOG_NAME}.{SCHEMA_SILVER}.county_crosswalk_metrics")

df_latest = spark.table(f"{CATALOG_NAME}.{SCHEMA_SILVER}.county_crosswalk_metrics")

print("VERSION 0:")
df_v0.select("date", "median_listing_price_all_homes").printSchema()

print("LATEST VERSION")
df_latest.select("date", "median_listing_price_all_homes").printSchema()

VERSION 0:
root
 |-- date: string (nullable = true)
 |-- median_listing_price_all_homes: string (nullable = true)

LATEST VERSION
root
 |-- date: date (nullable = true)
 |-- median_listing_price_all_homes: double (nullable = true)



In [0]:
print("VERSION 0:")
df_v0.printSchema()

print("LATEST VERSION")
df_latest.printSchema()

VERSION 0:
root
 |-- date: string (nullable = true)
 |-- region_name: string (nullable = true)
 |-- days_on_zillow_all_homes: string (nullable = true)
 |-- inventory_seasonally_adjusted_all_homes: string (nullable = true)
 |-- inventory_raw_all_homes: string (nullable = true)
 |-- median_listing_price_per_sqft_1_bedroom: string (nullable = true)
 |-- median_listing_price_per_sqft_2_bedroom: string (nullable = true)
 |-- median_listing_price_per_sqft_3_bedroom: string (nullable = true)
 |-- median_listing_price_per_sqft_4_bedroom: string (nullable = true)
 |-- median_listing_price_per_sqft_5_bedroom_or_more: string (nullable = true)
 |-- median_listing_price_per_sqft_all_homes: string (nullable = true)
 |-- median_listing_price_per_sqft_condo_coop: string (nullable = true)
 |-- median_listing_price_per_sqft_duplex_triplex: string (nullable = true)
 |-- median_listing_price_per_sqft_single_family_residence: string (nullable = true)
 |-- median_listing_price_1_bedroom: string (nullable = 