# External Table Partitioned by Year

- Creates separate directories for each year value
- A query that filters on the partition column (e.g. `year = 1998`) reads only the data in the matching partition
- Best for low-cardinality columns with predictable access patterns
- Can cause the small-files problem if partitions are too granular

In [0]:
-- Set the default catalog and schema for the cells that follow.
USE CATALOG demonhid15;
USE SCHEMA customer_a;

In [0]:
-- External Delta table for line-item rows, partitioned by `year`.
-- `year` is declared in PARTITIONED BY rather than in the column list.
CREATE EXTERNAL TABLE IF NOT EXISTS demonhid15.customer_a.lineitem_partitioned_sql (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DECIMAL(15,2),
    l_extendedprice DECIMAL(15,2),
    l_discount      DECIMAL(15,2),
    l_tax           DECIMAL(15,2),
    l_returnflag    STRING,
    l_linestatus    STRING,
    l_shipdate      DATE,
    l_commitdate    DATE,
    l_receiptdate   DATE,
    l_shipmode      STRING,
    l_comment       STRING
)
USING DELTA
PARTITIONED BY (year INT)
LOCATION 's3://demonhid15-rootbucket-sm/customer_a_ext/lineitem_partitioned_sql';

In [0]:
-- Load the partitioned table from samples.tpch.lineitem.
-- The target columns are listed explicitly so the load does not depend on the
-- physical column order of the table (in particular, on where the partition
-- column `year` sits). `year` is derived from the ship date.
-- NOTE: INSERT INTO appends; re-running this cell loads the rows again.
INSERT INTO demonhid15.customer_a.lineitem_partitioned_sql (
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment,
    year
)
SELECT
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment,
    YEAR(l_shipdate) AS year
FROM samples.tpch.lineitem;

In [0]:
-- Count the rows of a single year; the filter is on the partition column,
-- so this is the query used to demonstrate partition pruning.
SELECT
    COUNT(*)
FROM demonhid15.customer_a.lineitem_partitioned_sql
WHERE year = 1998;

In [0]:
-- Second external partitioned table with the same schema as
-- lineitem_partitioned_sql.
-- It gets its own storage path: two external tables must not point at the
-- same LOCATION, otherwise they would share one Delta log and set of data
-- files (and Unity Catalog rejects overlapping external table paths).
CREATE EXTERNAL TABLE IF NOT EXISTS demonhid15.customer_a.lineitem_partitioned_sql2 (
  l_orderkey BIGINT,
  l_partkey BIGINT,
  l_suppkey BIGINT,
  l_linenumber INT,
  l_quantity DECIMAL(15,2),
  l_extendedprice DECIMAL(15,2),
  l_discount DECIMAL(15,2),
  l_tax DECIMAL(15,2),
  l_returnflag STRING,
  l_linestatus STRING,
  l_shipdate DATE,
  l_commitdate DATE,
  l_receiptdate DATE,
  l_shipmode STRING,
  l_comment STRING
)
USING DELTA
PARTITIONED BY (year INT)
LOCATION 's3://demonhid15-rootbucket-sm/customer_a_ext/lineitem_partitioned_sql2';

# External Table with Z-ORDER

- Physically sorts data by specified columns within files
- Requires full file rewrite every time OPTIMIZE runs (not incremental)
- Choose 1-4 high-cardinality columns frequently used in WHERE or JOIN clauses
- Schedule regular OPTIMIZE operations (daily/weekly) for maintenance
- Not idempotent - repeated runs are not guaranteed to get faster, and re-running OPTIMIZE will not show any improvement unless new data has been added


In [0]:
-- Unpartitioned external Delta table; its layout is tuned afterwards with
-- OPTIMIZE ... ZORDER BY instead of partition directories.
CREATE EXTERNAL TABLE IF NOT EXISTS demonhid15.customer_a.lineitem_zorder_sql (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DECIMAL(15,2),
    l_extendedprice DECIMAL(15,2),
    l_discount      DECIMAL(15,2),
    l_tax           DECIMAL(15,2),
    l_returnflag    STRING,
    l_linestatus    STRING,
    l_shipdate      DATE,
    l_commitdate    DATE,
    l_receiptdate   DATE,
    l_shipmode      STRING,
    l_comment       STRING
)
USING DELTA
LOCATION 's3://demonhid15-rootbucket-sm/customer_a_ext/lineitem_zorder_sql';

In [0]:
-- Load the Z-ORDER demo table from samples.tpch.lineitem.
-- The target columns are named explicitly so the load keeps working (or
-- fails loudly) if the table's column order ever changes.
-- NOTE: INSERT INTO appends; re-running this cell loads the rows again.
INSERT INTO demonhid15.customer_a.lineitem_zorder_sql (
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment
)
SELECT
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment
FROM samples.tpch.lineitem;

In [0]:
-- Compact the table and co-locate rows by the two high-cardinality key
-- columns that the lookup query filters on.
OPTIMIZE demonhid15.customer_a.lineitem_zorder_sql
    ZORDER BY (l_orderkey, l_partkey);

In [0]:
-- Point lookup on the Z-ORDER columns, used to demonstrate data skipping.
SELECT
    l_orderkey,
    l_extendedprice,
    l_quantity
FROM demonhid15.customer_a.lineitem_zorder_sql
WHERE l_orderkey = 12345
  AND l_partkey = 67890;

In [0]:
-- Turn off Delta's VACUUM retention safety check for the current session so
-- that the VACUUM ... RETAIN 0 HOURS cell is accepted.
-- This key is a Spark session configuration, not a table property: storing it
-- via ALTER TABLE ... SET TBLPROPERTIES has no effect on VACUUM, and the value
-- must be false (true is the default and keeps the check on).
-- WARNING: demo only. Do not disable the check on tables with concurrent
-- readers/writers or where time travel is needed.
SET spark.databricks.delta.retentionDurationCheck.enabled = false;

In [0]:
-- Delete every data file that the current table version no longer references.
-- A 0-hour retention is below Delta's default threshold, so this statement is
-- rejected unless the retention duration check is disabled for the session,
-- and it removes the files needed to time travel to earlier versions.
VACUUM demonhid15.customer_a.lineitem_zorder_sql
    RETAIN 0 HOURS;

# External Table with Liquid Clustering

- Uses Hilbert curves for better data layout than Z-ORDER
- Incremental optimization - only rewrites files that need clustering
- 7x faster writes compared to partitioning + Z-ORDER
- 2-12x faster queries in customer deployments
- Supports flexible clustering key changes without table recreation
- Automatic clustering on write (when thresholds are met)
- Requires Databricks Runtime 13.3 LTS or above

In [0]:
-- External Delta table that uses liquid clustering (CLUSTER BY) on the order
-- and part keys instead of partitioning or Z-ORDER.
CREATE EXTERNAL TABLE IF NOT EXISTS demonhid15.customer_a.lineitem_liquid_sql (
    l_orderkey      BIGINT,
    l_partkey       BIGINT,
    l_suppkey       BIGINT,
    l_linenumber    INT,
    l_quantity      DECIMAL(15,2),
    l_extendedprice DECIMAL(15,2),
    l_discount      DECIMAL(15,2),
    l_tax           DECIMAL(15,2),
    l_returnflag    STRING,
    l_linestatus    STRING,
    l_shipdate      DATE,
    l_commitdate    DATE,
    l_receiptdate   DATE,
    l_shipmode      STRING,
    l_comment       STRING
)
USING DELTA
CLUSTER BY (l_orderkey, l_partkey)
LOCATION 's3://demonhid15-rootbucket-sm/customer_a_ext/lineitem_liquid_sql';

In [0]:
-- Load the liquid-clustered table from samples.tpch.lineitem
-- (clustering happens automatically on write).
-- The target columns are named explicitly so the load keeps working (or
-- fails loudly) if the table's column order ever changes.
-- NOTE: INSERT INTO appends; re-running this cell loads the rows again.
INSERT INTO demonhid15.customer_a.lineitem_liquid_sql (
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment
)
SELECT
    l_orderkey,
    l_partkey,
    l_suppkey,
    l_linenumber,
    l_quantity,
    l_extendedprice,
    l_discount,
    l_tax,
    l_returnflag,
    l_linestatus,
    l_shipdate,
    l_commitdate,
    l_receiptdate,
    l_shipmode,
    l_comment
FROM samples.tpch.lineitem;

In [0]:
-- Show the table's details, including its clustering information.
DESCRIBE DETAIL
    demonhid15.customer_a.lineitem_liquid_sql;

In [0]:
-- Incremental clustering pass: only files that still need clustering are
-- rewritten.
OPTIMIZE
    demonhid15.customer_a.lineitem_liquid_sql;

In [0]:
-- Full clustering pass; intended for when clustering is first enabled or the
-- clustering keys have changed.
OPTIMIZE
    demonhid15.customer_a.lineitem_liquid_sql
    FULL;

In [0]:
-- Point lookup on the clustering keys, used to demonstrate data skipping.
SELECT
    l_orderkey,
    l_extendedprice,
    l_quantity
FROM demonhid15.customer_a.lineitem_liquid_sql
WHERE l_orderkey = 12345
  AND l_partkey = 67890;

In [0]:
-- Switch the clustering keys in place (no table recreation): from
-- (l_orderkey, l_partkey) to (l_suppkey, l_shipdate).
ALTER TABLE demonhid15.customer_a.lineitem_liquid_sql
    CLUSTER BY (l_suppkey, l_shipdate);

In [0]:
-- Show the table's details again to check its clustering information after
-- the clustering keys were changed.
DESCRIBE DETAIL
    demonhid15.customer_a.lineitem_liquid_sql;

In [0]:
-- Full clustering pass after the key change, so that existing data is
-- reclustered as well.
OPTIMIZE
    demonhid15.customer_a.lineitem_liquid_sql
    FULL;