In [0]:
%sql
-- Compact the feature table's files and Z-order the data on customer_id.
OPTIMIZE churn_catalog.analytics.churn_features ZORDER BY (customer_id);

-- Clean up old files, retaining 168 hours (7 days) of history.
VACUUM churn_catalog.analytics.churn_features RETAIN 168 HOURS;

-- Operation log of the table.
DESCRIBE HISTORY churn_catalog.analytics.churn_features;

-- Table-level metadata (e.g. size in bytes, number of files).
DESCRIBE DETAIL churn_catalog.analytics.churn_features;


In [0]:
%sql
-- Operation history of the churn feature table.
DESCRIBE HISTORY churn_catalog.analytics.churn_features;

# Total Rows

In [0]:
# Count the rows of the raw table and of the processed table, in that order.
raw_count, clean_count = (
    spark.table(table_name).count()
    for table_name in (
        "churn_catalog.raw.customer_data",
        "churn_catalog.processed.customer_profiles",
    )
)

# Net difference in row count between the two tables.
removed_rows = raw_count - clean_count

# Emit the whole summary in one call; each argument lands on its own line.
print(
    " ROW CLEANING SUMMARY ",
    f"Total rows in RAW layer     : {raw_count}",
    f"Total rows in SILVER layer  : {clean_count}",
    f"Removed/cleaned rows        : {removed_rows}",
    sep="\n",
)

# Removed Rows During Data Cleaning

In [0]:
# Show the raw rows that have no counterpart in the processed table.

df_raw, df_silver = (
    spark.table("churn_catalog.raw.customer_data"),
    spark.table("churn_catalog.processed.customer_profiles"),
)

# exceptAll is a multiset difference: duplicates are preserved, so a row that
# occurs more often in the raw table than in the processed one is still listed.
# NOTE(review): exceptAll matches columns by position, so this assumes both
# tables share the same column layout -- TODO confirm against the schemas.
removed_df = df_raw.exceptAll(df_silver)

print("REMOVED ROWS (Invalid / Duplicate / Corrupted)")
display(removed_df)


# DELTA OPTIMIZATION SUMMARY

In [0]:
print("DELTA OPTIMIZATION SUMMARY ")

# Table-level metadata and the operation log of the feature table.
detail = spark.sql("""
    DESCRIBE DETAIL churn_catalog.analytics.churn_features
""")

history = spark.sql("""
    DESCRIBE HISTORY churn_catalog.analytics.churn_features
""")

# Fetch the needed metadata fields with a single collect instead of running
# the DESCRIBE DETAIL query once per field.
detail_row = detail.select("sizeInBytes", "numFiles", "partitionColumns").collect()[0]
total_size = detail_row["sizeInBytes"]
file_count = detail_row["numFiles"]
partition_col = detail_row["partitionColumns"]

# numFiles is the number of data files, not the number of rows, so the row
# count is taken from the table itself rather than from the metadata.
row_count = spark.table("churn_catalog.analytics.churn_features").count()

print(f"Total Rows: {row_count}")
print(f"Total Files: {file_count}")
print(f"Table Size (MB): {total_size / (1024*1024):.2f} MB")
print(f"Partition Columns: {partition_col}")

print("\nRecent Operation History:")
# Show only the first five entries of the operation log.
display(history.limit(5))
