In [0]:
from pyspark.sql import functions as F

def _null_count_exprs(names):
    """Build one aggregate expression per column name that counts its NULL rows.

    Each expression is aliased with the column's own name, so selecting the
    returned list yields a single row keyed by column name.
    """
    exprs = []
    for name in names:
        is_missing = F.col(name).isNull()
        # when() without otherwise() is non-null only on matching rows,
        # and count() ignores nulls, so this tallies the NULL rows.
        exprs.append(F.count(F.when(is_missing, name)).alias(name))
    return exprs


# Table under inspection.
df_silver = spark.table("zillow.silver.county_crosswalk_metrics")

# --- Check 1: NULLs in the business-key / geographic identifier columns ---
key_cols = ["region_name", "date", "state_name", "county_name", "city"]
key_nulls = df_silver.select(_null_count_exprs(key_cols))

print("Null counts in Primary Keys/Geographic identifiers:")
display(key_nulls)

# --- Check 2: NULLs in a hand-picked sample of the metric columns ---
# A NULL here may indicate that the string-to-double cast failed for that value.
metric_cols = ["zhvi_all_homes", "median_listing_price_all_homes", "sale_counts"]
metric_nulls = df_silver.select(_null_count_exprs(metric_cols))

print("Null counts in key metrics:")
display(metric_nulls)

In [0]:
# Show the column names of df_silver (bare last expression, so the notebook echoes the list).
df_silver.columns

In [0]:
from pyspark.sql.functions import col, count, when

# One aggregate per column of df_silver: count the rows where that column is NULL.
# Each aggregate is aliased to its column name, so the result is a single wide row.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_silver.columns
]
null_summary = df_silver.select(null_count_exprs)

# Bring the aggregated row to the driver as {column: null_count}, then keep
# only the columns with at least one NULL so the printout is easier to read.
collected_rows = null_summary.collect()
null_results = collected_rows[0].asDict()
significant_nulls = dict(
    filter(lambda item: item[1] > 0, null_results.items())
)

print(f"Columns with Null Values: {significant_nulls}")

In [0]:
# Render null_summary (the per-column NULL counts) with display().
# NOTE(review): null_summary does not appear to be cached, so this likely re-runs the aggregation — confirm.
display(null_summary)