In [0]:
%sql
-- Set the session's default catalog and schema to ecommerce.silver.
-- The queries visible further down use fully qualified three-part names,
-- so this mainly affects any unqualified table references.
USE CATALOG ecommerce;
USE SCHEMA silver;


# Analyze Query Execution Plans

In [0]:
# Baseline plan: every column of silver.events, filtered to purchase events.
# The extended explain prints the parsed, analyzed, optimized and physical plans.
purchases_all_cols = spark.sql("""
SELECT *
FROM ecommerce.silver.events
WHERE event_type = 'purchase'
""")
purchases_all_cols.explain(extended=True)


== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('event_type = purchase)
   +- 'UnresolvedRelation [ecommerce, silver, events], [], false

== Analyzed Logical Plan ==
event_id: int, event_type: string, event_ts: timestamp
Project [event_id#13635, event_type#13636, event_ts#13637]
+- Filter (event_type#13636 = purchase)
   +- SubqueryAlias ecommerce.silver.events
      +- Relation ecommerce.silver.events[event_id#13635,event_type#13636,event_ts#13637] parquet

== Optimized Logical Plan ==
Filter (isnotnull(event_type#13636) AND (event_type#13636 = purchase))
+- Relation ecommerce.silver.events[event_id#13635,event_type#13636,event_ts#13637] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- PhotonResultStage
   +- PhotonScan parquet ecommerce.silver.events[event_id#13635,event_type#13636,event_ts#13637] DataFilters: [isnotnull(event_type#13636), (event_type#13636 = purchase)], DictionaryFilters: [(event_type#13636 = purchase)], Format: parquet, Location: PreparedDeltaFileIndex(1 paths

In [0]:
# Same filter as the SELECT * query, but only event_id and event_ts are
# requested, so the plans gain an explicit Project over those two columns.
purchases_projected = spark.sql("""
SELECT event_id, event_ts
FROM ecommerce.silver.events
WHERE event_type = 'purchase'
""")
purchases_projected.explain(extended=True)


== Parsed Logical Plan ==
'Project ['event_id, 'event_ts]
+- 'Filter ('event_type = purchase)
   +- 'UnresolvedRelation [ecommerce, silver, events], [], false

== Analyzed Logical Plan ==
event_id: int, event_ts: timestamp
Project [event_id#13659, event_ts#13661]
+- Filter (event_type#13660 = purchase)
   +- SubqueryAlias ecommerce.silver.events
      +- Relation ecommerce.silver.events[event_id#13659,event_type#13660,event_ts#13661] parquet

== Optimized Logical Plan ==
Project [event_id#13659, event_ts#13661]
+- Filter (isnotnull(event_type#13660) AND (event_type#13660 = purchase))
   +- Relation ecommerce.silver.events[event_id#13659,event_type#13660,event_ts#13661] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- PhotonResultStage
   +- PhotonProject [event_id#13659, event_ts#13661]
      +- PhotonScan parquet ecommerce.silver.events[event_id#13659,event_type#13660,event_ts#13661] DataFilters: [isnotnull(event_type#13660), (event_type#13660 = purchase)], DictionaryFilters: [(even

In [0]:
%sql
-- Materialise a Delta copy of silver.events that is partitioned by calendar day.
-- event_date is derived from event_ts. Because of IF NOT EXISTS, re-running this
-- cell leaves an already existing events_optimized table untouched.
CREATE TABLE IF NOT EXISTS ecommerce.silver.events_optimized
USING DELTA
PARTITIONED BY (event_date)
AS
SELECT
  event_id,
  event_type,
  event_ts,
  CAST(event_ts AS DATE) AS event_date
FROM ecommerce.silver.events;


num_affected_rows,num_inserted_rows


# Partition Large Table

In [0]:
# Inspect the table-level metadata of the partitioned copy; the result
# includes columns such as partitionColumns, numFiles and sizeInBytes.
events_optimized_detail = spark.sql("""
DESCRIBE DETAIL ecommerce.silver.events_optimized
""")
events_optimized_detail.show(truncate=False)



+------+------------------------------------+---------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+------------------------------------------------------------------------------+----------------+----------------+-----------------------------------------+---------------------------------------------------------------+-------------+
|format|id                                  |name                             |description|location|createdAt              |lastModified       |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties                                                                    |minReaderVersion|minWriterVersion|tableFeatures                            |statistics                                                     |clusterByAuto|
+------+------------------------------------+---------------------------------+-----------+--------+-----------------------+--------

In [0]:
%sql
-- Run Delta OPTIMIZE on the partitioned copy without a ZORDER clause.
-- NOTE(review): the following cell runs OPTIMIZE again with ZORDER BY (event_type);
-- confirm whether this plain pass is still needed.
OPTIMIZE ecommerce.silver.events_optimized;


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 1, null, null, 0, 0, 1, 1, true, 0, 0, 1768806775975, 1768806776797, 8, 0, null, List(0, 0), null, 4, 4, 0, 0, null)"


In [0]:
%sql
-- Run OPTIMIZE with Z-ordering on event_type, the column used in the
-- WHERE clause of the queries in this notebook.
OPTIMIZE ecommerce.silver.events_optimized
ZORDER BY (event_type);


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 1, List(minCubeSize(107374182400), List(0, 0), List(1, 1383), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1768806797215, 1768806797782, 8, 0, null, List(0, 0), null, 4, 4, 0, 0, null)"


# Benchmark Performance Improvements

In [0]:
%time
spark.sql("""
SELECT COUNT(*)
FROM ecommerce.silver.events
WHERE event_type = 'purchase'
""").collect()


CPU times: user 3 μs, sys: 0 ns, total: 3 μs
Wall time: 6.68 μs


[Row(COUNT(*)=1)]

In [0]:
%time
spark.sql("""
SELECT COUNT(*)
FROM ecommerce.silver.events_optimized
WHERE event_type = 'purchase'
""").collect()


CPU times: user 0 ns, sys: 3 μs, total: 3 μs
Wall time: 6.2 μs


[Row(COUNT(*)=1)]

# Using OPTIMIZE + ZORDER (best replacement)

In [0]:
%sql
-- Same statement as the earlier OPTIMIZE ... ZORDER BY (event_type) cell in this
-- notebook, run a second time against events_optimized.
-- NOTE(review): no writes to the table are visible between the two runs; confirm
-- whether this repeat is intentional.
OPTIMIZE ecommerce.silver.events_optimized
ZORDER BY (event_type);


path,metrics
,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 1, List(minCubeSize(107374182400), List(0, 0), List(1, 1383), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1768806938757, 1768806939279, 8, 0, null, List(0, 0), null, 4, 4, 0, 0, null)"


# Validate Optimization via Query Plan

In [0]:
# Print the extended plans for the purchase filter on events_optimized so they
# can be compared with the plans produced for the original events table.
optimized_purchases = spark.sql("""
SELECT *
FROM ecommerce.silver.events_optimized
WHERE event_type = 'purchase'
""")
optimized_purchases.explain(extended=True)


== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('event_type = purchase)
   +- 'UnresolvedRelation [ecommerce, silver, events_optimized], [], false

== Analyzed Logical Plan ==
event_id: int, event_type: string, event_ts: timestamp, event_date: date
Project [event_id#14939, event_type#14940, event_ts#14941, event_date#14942]
+- Filter (event_type#14940 = purchase)
   +- SubqueryAlias ecommerce.silver.events_optimized
      +- Relation ecommerce.silver.events_optimized[event_id#14939,event_type#14940,event_ts#14941,event_date#14942] parquet

== Optimized Logical Plan ==
Filter (isnotnull(event_type#14940) AND (event_type#14940 = purchase))
+- Relation ecommerce.silver.events_optimized[event_id#14939,event_type#14940,event_ts#14941,event_date#14942] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- PhotonResultStage
   +- PhotonScan parquet ecommerce.silver.events_optimized[event_id#14939,event_type#14940,event_ts#14941,event_date#14942] DataFilters: [isnotnull(event_type#14940), (eve