In [0]:
from pyspark.sql import functions as F

# Tables to profile, listed in the order they are processed.
table_names = [
    # sales_* tables
    "sales_customers",
    "sales_franchises",
    "sales_suppliers",
    "sales_transactions",
    # media_* tables
    "media_customer_reviews",
    "media_gold_reviews_chunked",
]


In [0]:
def perform_eda_on_table(db_name, table_name):
    """
    Print a quick exploratory profile of one table to stdout.

    Steps, in order:
      1. Print the table's storage location (using DESCRIBE DETAIL)
      2. Load the table into a DataFrame
      3. Show the schema and first 5 rows
      4. Print out basic summary statistics
      5. Count nulls per column
      6. Check for duplicates (by comparing total vs distinct row count)

    Failures in steps 1, 4, 5 and 6 are printed instead of raised, so one
    problematic table does not stop a caller that loops over many tables.
    If the table cannot be loaded (step 2) the error is printed and the
    function returns early.

    Uses the notebook-level `spark` session and `F` (pyspark.sql.functions).

    Args:
        db_name: Qualifier placed in front of the table name.
        table_name: Table to profile; looked up as f"{db_name}.{table_name}".

    Returns:
        None. All results are printed.
    """
    full_table_name = f"{db_name}.{table_name}"
    print(f"\n========== EDA for {full_table_name} ==========")

    # 1. Storage location; best-effort because DESCRIBE DETAIL may not be
    #    available for every table.
    print("Table Detail (Storage Location):")
    try:
        detail_df = spark.sql(f"DESCRIBE DETAIL {full_table_name}")
        detail_df.select("location").show(truncate=False)
    except Exception as e:
        print(f"Error getting table details for {full_table_name}: {e}")

    # 2. Load the table; nothing below can run without it, so bail out on error.
    try:
        df = spark.table(full_table_name)
    except Exception as e:
        print(f"Error loading table {full_table_name}: {e}")
        return

    # 3. Schema and a small sample of rows.
    print("Schema:")
    df.printSchema()

    print("Sample Rows:")
    df.show(5, truncate=False)

    # 4. Summary statistics via DataFrame.describe().
    print("Basic Summary Statistics:")
    try:
        df.describe().show()
    except Exception as e:
        print(f"Error running describe() on {full_table_name}: {e}")

    # 5. Null count per column.
    print("Null Count per Column:")
    try:
        null_exprs = []
        for c in df.columns:
            # Backtick-quote the name (doubling any embedded backtick) so a
            # column whose name contains a dot or other special character is
            # resolved as a single column rather than parsed as a field path.
            quoted = "`" + c.replace("`", "``") + "`"
            # when() yields a non-null literal only for null cells, so count()
            # ends up counting exactly the nulls.
            null_exprs.append(F.count(F.when(F.col(quoted).isNull(), c)).alias(c))
        df.select(null_exprs).show()
    except Exception as e:
        print(f"Error counting nulls on {full_table_name}: {e}")

    # 6. Duplicate check: distinct() compares whole rows, so any difference
    #    between the two counts means at least one fully repeated row.
    try:
        total_count = df.count()
        distinct_count = df.distinct().count()
    except Exception as e:
        print(f"Error checking duplicates on {full_table_name}: {e}")
        return

    print(f"Total Count: {total_count}, Distinct Count: {distinct_count}")
    if total_count != distinct_count:
        print("WARNING: Duplicates found!")
    else:
        print("No duplicates detected.")

# Qualifier placed in front of each table name; perform_eda_on_table builds
# the full name as f"{db_name}.{table_name}".
db_name = "samples.bakehouse"


In [0]:
# Profile every table listed in table_names, one after another, printing
# each table's EDA report to the cell output.
for t in table_names:
    perform_eda_on_table(db_name, t)