In [None]:
# Databricks notebook or Python script
# ------------------------------------
# Title: Explore TPCH Sample Database (Auto Catalog Detection)
# Description:
#   This script explores the TPCH sample dataset in Databricks.
#   It detects the catalog that contains the "tpch" schema (e.g. "samples",
#   giving "samples.tpch"), lists its tables, prints their schemas, and
#   previews example records.
# ------------------------------------

from pyspark.sql import SparkSession
import pandas as pd

# ---------------------------------------------------
# Step 0: Initialize Spark session
# ---------------------------------------------------
# getOrCreate() returns the existing session when there is one and only
# builds a new session otherwise.
spark = SparkSession.builder.getOrCreate()

print("=== CHECKING AVAILABLE CATALOGS ===")
# Pull the SHOW CATALOGS result to the driver, then keep only the names.
catalog_rows = spark.sql("SHOW CATALOGS").collect()
catalogs = [catalog_row.catalog for catalog_row in catalog_rows]
print("Catalogs found:", catalogs)
print()


In [None]:

# ---------------------------------------------------
# Step 1: Detect which catalog contains the TPCH schema
# ---------------------------------------------------
def _quote_ident(name):
    """Return `name` as a backtick-delimited SQL identifier.

    Embedded backticks are doubled so that names containing hyphens, spaces
    or other special characters can be interpolated into a statement safely.
    """
    return "`" + name.replace("`", "``") + "`"


schema_name = "tpch"
target_schema = None
target_catalog = None

for catalog in catalogs:
    try:
        schema_rows = spark.sql(f"SHOW SCHEMAS IN {_quote_ident(catalog)}").collect()
    except Exception as exc:
        # A single catalog that cannot be listed (for example because of
        # missing privileges) should not abort the search; report and move on.
        print(f"Skipping catalog {catalog}: {exc}")
        continue

    # Read the single result column by position: depending on the runtime it
    # is named `databaseName` or `namespace`, so attribute access is fragile.
    schemas = [row[0] for row in schema_rows]
    if schema_name in schemas:
        target_catalog = catalog
        target_schema = schema_name
        break

if not target_catalog:
    raise ValueError(f"Could not find schema '{schema_name}' in any available catalog.")

print(f"Using catalog: {target_catalog}, schema: {target_schema}")
print()

# Set active catalog and schema
spark.sql(f"USE CATALOG {_quote_ident(target_catalog)}")
spark.sql(f"USE {_quote_ident(target_schema)}")


In [None]:

# ---------------------------------------------------
# Step 2: List tables in the TPCH schema
# ---------------------------------------------------
print(f"=== TABLES IN {target_catalog}.{target_schema} ===")
tables = spark.catalog.listTables(f"{target_catalog}.{target_schema}")

# Reduce each catalog entry to a (name, type) pair and show the result as a
# small pandas frame.
table_rows = [(entry.name, entry.tableType) for entry in tables]
tables_df = pd.DataFrame(table_rows, columns=["Table", "Type"])
print(tables_df)
print()


In [None]:

# ---------------------------------------------------
# Step 3: Inspect the schema of a sample table
# ---------------------------------------------------
sample_table = "customer"

print(f"=== SCHEMA: {target_catalog}.{target_schema}.{sample_table} ===")
# Build the three-part name (catalog.schema.table) once and load the table
# through it; `df` is reused by the preview step that follows.
sample_table_fqn = ".".join([target_catalog, target_schema, sample_table])
df = spark.table(sample_table_fqn)
df.printSchema()
print()


In [None]:

# ---------------------------------------------------
# Step 4: Display a small data sample
# ---------------------------------------------------
print(f"=== SAMPLE DATA: {target_catalog}.{target_schema}.{sample_table} ===")
# Preview only the first five rows of the table loaded in Step 3.
df.show(n=5)
print()


In [None]:

# ---------------------------------------------------
# Step 5: Inspect all table schemas (optional)
# ---------------------------------------------------
print(f"=== ALL TABLE SCHEMAS IN {target_catalog}.{target_schema} ===")
# Walk the entries listed in Step 2 and print each table's schema under a
# heading carrying the table name.
for table_info in tables:
    print(f"--- {table_info.name} ---")
    table_fqn = ".".join([target_catalog, target_schema, table_info.name])
    tdf = spark.table(table_fqn)
    tdf.printSchema()
    print()
