In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, year, month, dayofmonth
from delta.tables import DeltaTable

# Initialize Spark Session
spark = SparkSession.builder.appName("DeltaLakeEndToEndProject").getOrCreate()

# Single source of truth for the table location: the cleanup below and the
# write in Step 1 must always point at the same directory.
delta_table_path = "/dbfs/delta/transactions"

# Drop existing tables and files if they exist so the notebook can be re-run
# from the top (True = delete the directory recursively).
spark.sql("DROP TABLE IF EXISTS transactions")
dbutils.fs.rm(delta_table_path, True)

# Step 1: Data Ingestion
transactions_data = [
    (1, 101, 50.0, "2024-01-01"),
    (2, 102, 150.0, "2024-01-02"),
    (3, 103, 200.0, "2024-01-03"),
    (4, 101, 75.0, "2024-01-04"),
    (5, 104, 125.0, "2024-01-05"),
]
transactions_columns = ["transaction_id", "product_id", "amount", "transaction_date"]

# Build the frame in one expression: the date strings are parsed to timestamps
# before the result is bound, so the name never refers to the unparsed frame.
transactions_df = spark.createDataFrame(
    transactions_data, transactions_columns
).withColumn("transaction_date", to_timestamp("transaction_date", "yyyy-MM-dd"))

# Persist the initial batch as a Delta table at delta_table_path.
transactions_df.write.format("delta").mode("overwrite").save(delta_table_path)

# Step 2: Data Transformation
# Derive calendar parts from the timestamp. This frame is only displayed here;
# it is not written back to the Delta table.
transformed_df = (
    transactions_df
    .withColumn("year", year("transaction_date"))
    .withColumn("month", month("transaction_date"))
    .withColumn("day", dayofmonth("transaction_date"))
)
transformed_df.show()

In [0]:
# Step 3: Incremental Data Processing
# Change set to merge into the Delta table in the next cell.
transactions_updates_columns = ["transaction_id", "product_id", "amount", "transaction_date"]
transactions_updates_data = [
    (2, 102, 175.0, "2024-01-02"),  # transaction_id 2 was ingested earlier with amount 150.0
    (6, 105, 300.0, "2024-01-06"),  # transaction_id 6 is not in the initial batch
]

# Parse the date strings to timestamps, matching the ingested table's column type.
parsed_transaction_date = to_timestamp("transaction_date", "yyyy-MM-dd")
transactions_updates_df = spark.createDataFrame(
    transactions_updates_data, transactions_updates_columns
).withColumn("transaction_date", parsed_transaction_date)
display(transactions_updates_df)

In [0]:
# Upsert the change set into the Delta table, matching rows on transaction_id.
delta_table = DeltaTable.forPath(spark, delta_table_path)

# Columns overwritten on an existing row (product_id is left untouched).
matched_update_columns = {
    "amount": "src.amount",
    "transaction_date": "src.transaction_date",
}
# Full row inserted when the transaction_id is not yet in the table.
new_row_values = {
    "transaction_id": "src.transaction_id",
    "product_id": "src.product_id",
    "amount": "src.amount",
    "transaction_date": "src.transaction_date",
}

merge_builder = delta_table.alias("tgt").merge(
    transactions_updates_df.alias("src"),
    "tgt.transaction_id = src.transaction_id",
)
merge_builder = merge_builder.whenMatchedUpdate(set=matched_update_columns)
merge_builder = merge_builder.whenNotMatchedInsert(values=new_row_values)
merge_builder.execute()

# Re-read the table from storage to show the post-merge state.
updated_df = spark.read.format("delta").load(delta_table_path)
updated_df.show()

In [0]:
# Step 4: Data Analysis and Querying
# Register the merged table as a temporary view so it can be queried with SQL.
updated_df.createOrReplaceTempView("transactions_view")

# Sum of amount per product_id, largest total first.
total_sales_query = """
SELECT product_id, SUM(amount) as total_sales
FROM transactions_view
GROUP BY product_id
ORDER BY total_sales DESC
"""
total_sales_per_product = spark.sql(total_sales_query)
total_sales_per_product.show()

In [0]:
# Time travel: read the table as of version 1 instead of its latest state.
versioned_reader = spark.read.format("delta").option("versionAsOf", 1)
version_df = versioned_reader.load(delta_table_path)
version_df.show()

In [0]:
# Step 5: Handling ACID Transactions
# Show the table's commit history without truncating long columns.
history_df = delta_table.history()
history_df.show(truncate=False)

# NOTE(review): if the table was freshly created by the ingestion cell and
# merged exactly once, version 1 is presumably already the current version,
# which would make this restore a no-op. Confirm against the history output
# above and target version 0 if a pre-merge rollback is intended.
restore_target_version = 1
delta_table.restoreToVersion(restore_target_version)

# Re-read from storage to display the table state after the restore.
delta_reader = spark.read.format("delta")
rolled_back_df = delta_reader.load(delta_table_path)
rolled_back_df.show()

In [0]:
# Step 6: Optimizing and Managing Data
# Each statement is built first, then executed, so the SQL sent to Spark is
# visible by name. The path-based identifier is delta.`<delta_table_path>`.

# Compact the table's files.
compact_sql = "OPTIMIZE delta.`{}`".format(delta_table_path)
spark.sql(compact_sql)

# Remove files no longer referenced by the table (default retention applies).
delta_table.vacuum()

# Compact again, this time clustering the data by product_id.
zorder_sql = "OPTIMIZE delta.`{}` ZORDER BY (product_id)".format(delta_table_path)
spark.sql(zorder_sql)

# Attach a description to the table metadata.
comment_sql = "COMMENT ON TABLE delta.`{}` IS 'This is a transactions table with sales data.'".format(delta_table_path)
spark.sql(comment_sql)

### Explanation for Step 6 and how it is useful
1. Optimizing the Delta Table
- Code: spark.sql("OPTIMIZE delta.`{}`".format(delta_table_path))
- Explanation: OPTIMIZE Command: This command compacts small files into larger ones. Small files can result from frequent updates, inserts, and deletes, which can degrade query performance.
- How It Works: By combining smaller files into larger ones, it reduces the number of files that need to be read during query execution, thus improving performance.

2. Vacuuming the Delta Table
- Vacuum the Delta table to remove old files
- Code: delta_table.vacuum()
- Explanation: VACUUM Command: This command removes old data files that are no longer needed for the latest state of the table.
- Why It's Needed: Delta Lake keeps old files to allow for time travel and versioning. However, over time, these files can accumulate and consume significant storage space.
- How It Works: Vacuuming cleans up these old files based on a retention threshold (default is 7 days).

3. Z-Ordering the Delta Table
- Z-Ordering for better query performance
- Code: spark.sql("OPTIMIZE delta.`{}` ZORDER BY (product_id)".format(delta_table_path))
- Explanation: ZORDER Command: This command orders data files by specified columns. Z-ordering is a technique to colocate related information in the same set of files.
- Why It's Needed: This improves the performance of read queries that filter on the specified columns (e.g., product_id). By organizing data in this manner, the system can skip large amounts of data that are not relevant to a query, thereby speeding up query execution.

4. Adding a Comment to the Table
- Add a comment to the table
- Code: spark.sql("COMMENT ON TABLE delta.`{}` IS 'This is a transactions table with sales data.'".format(delta_table_path))
- Explanation: COMMENT Command: Adds a comment to the table metadata. This is useful for documentation and providing context about the table.
- Why It's Needed: Helps users understand the purpose of the table and any specific details or notes about the data it contains.

### Additional Commands

#### Reviewing Table History
- Display the commit history of the Delta table
- Code:

```python
history_df = delta_table.history()
history_df.show(truncate=False)
```

- Explanation: history Method: This method returns the transaction history of the Delta table, showing all the changes made over time.
- Why It's Useful: It helps in auditing changes, debugging issues, and understanding the evolution of the table's data.
#### Rolling Back to a Previous Version
- Code:

```python
# Roll back to a previous version
delta_table.restoreToVersion(1)

# Show the rolled-back Delta table
rolled_back_df = spark.read.format("delta").load(delta_table_path)
rolled_back_df.show()
```

- Explanation: restoreToVersion Method: This method restores the Delta table to a specified previous version.
- Why It's Needed: Useful for undoing changes that introduced errors or issues, effectively rolling back the state of the table to a known good state.

### Rolling back to the just previous version
- Use this when a downstream step fails and the whole batch has to be discarded: look up the table's latest version, then restore the version immediately before it.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from delta.tables import DeltaTable

# Initialize Spark Session
spark = SparkSession.builder.appName("DeltaLakeVersioningExample").getOrCreate()

# Location of the Delta table, shared by the cleanup, the initial write and
# every read below so they cannot drift apart.
delta_table_path = "/dbfs/delta/transactions"
transactions_columns = ["transaction_id", "product_id", "amount", "transaction_date"]


def build_transactions_df(rows):
    """Create a transactions DataFrame from raw tuples.

    Args:
        rows: List of (transaction_id, product_id, amount, transaction_date)
            tuples, with transaction_date given as a "yyyy-MM-dd" string.

    Returns:
        A DataFrame with the columns named in ``transactions_columns`` and
        ``transaction_date`` parsed to a timestamp.
    """
    return spark.createDataFrame(rows, transactions_columns).withColumn(
        "transaction_date", to_timestamp("transaction_date", "yyyy-MM-dd")
    )


def upsert_transactions(target_table, updates_df):
    """Merge ``updates_df`` into ``target_table``, matching on transaction_id.

    Matched rows get their amount and transaction_date overwritten (product_id
    is left as is); unmatched rows are inserted in full.

    Args:
        target_table: DeltaTable to merge into.
        updates_df: DataFrame with the columns in ``transactions_columns``.
    """
    target_table.alias("tgt").merge(
        updates_df.alias("src"),
        "tgt.transaction_id = src.transaction_id"
    ).whenMatchedUpdate(set={
        "tgt.amount": "src.amount",
        "tgt.transaction_date": "src.transaction_date"
    }).whenNotMatchedInsert(values={
        "transaction_id": "src.transaction_id",
        "product_id": "src.product_id",
        "amount": "src.amount",
        "transaction_date": "src.transaction_date"
    }).execute()


# Drop existing tables and files if they exist so the example starts from an
# empty location (True = delete the directory recursively).
spark.sql("DROP TABLE IF EXISTS transactions")
dbutils.fs.rm(delta_table_path, True)

# Step 1: Initial Data Ingestion
transactions_data = [
    (1, 101, 50.0, "2024-01-01"),
    (2, 102, 150.0, "2024-01-02"),
    (3, 103, 200.0, "2024-01-03"),
    (4, 101, 75.0, "2024-01-04"),
    (5, 104, 125.0, "2024-01-05")
]
transactions_df = build_transactions_df(transactions_data)
transactions_df.write.format("delta").mode("overwrite").save(delta_table_path)

# Step 2: Update the Delta Table
# The initial write plus the three merges below are four commits in total,
# which is why the version lookup in the following cell reports 3 as latest.
delta_table = DeltaTable.forPath(spark, delta_table_path)

# First update
transactions_updates_data_1 = [
    (2, 102, 175.0, "2024-01-02"),
    (6, 105, 300.0, "2024-01-06")
]
transactions_updates_df_1 = build_transactions_df(transactions_updates_data_1)
upsert_transactions(delta_table, transactions_updates_df_1)

# Second update
transactions_updates_data_2 = [
    (1, 101, 60.0, "2024-01-01"),
    (7, 106, 350.0, "2024-01-07")
]
transactions_updates_df_2 = build_transactions_df(transactions_updates_data_2)
upsert_transactions(delta_table, transactions_updates_df_2)

# Show the updated Delta table
updated_df = spark.read.format("delta").load(delta_table_path)
updated_df.show()

# Third update (kept under its own _3 names so the second update's data is
# not silently overwritten)
transactions_updates_data_3 = [
    (1, 101, 60.0, "2024-02-02"),
    (8, 108, 450.0, "2024-01-05")
]
transactions_updates_df_3 = build_transactions_df(transactions_updates_data_3)
upsert_transactions(delta_table, transactions_updates_df_3)

# Show the updated Delta table
updated_df = spark.read.format("delta").load(delta_table_path)
updated_df.show()

+--------------+----------+------+-------------------+
|transaction_id|product_id|amount|   transaction_date|
+--------------+----------+------+-------------------+
|             3|       103| 200.0|2024-01-03 00:00:00|
|             5|       104| 125.0|2024-01-05 00:00:00|
|             2|       102| 175.0|2024-01-02 00:00:00|
|             6|       105| 300.0|2024-01-06 00:00:00|
|             1|       101|  60.0|2024-01-01 00:00:00|
|             7|       106| 350.0|2024-01-07 00:00:00|
|             4|       101|  75.0|2024-01-04 00:00:00|
+--------------+----------+------+-------------------+

+--------------+----------+------+-------------------+
|transaction_id|product_id|amount|   transaction_date|
+--------------+----------+------+-------------------+
|             3|       103| 200.0|2024-01-03 00:00:00|
|             5|       104| 125.0|2024-01-05 00:00:00|
|             2|       102| 175.0|2024-01-02 00:00:00|
|             6|       105| 300.0|2024-01-06 00:00:00|
|        

In [0]:
# Step 3: Fetch the Current Version and Roll Back to Previous Version
# history(1) yields only the most recent commit; read its version number.
latest_history_rows = delta_table.history(1).select("version").collect()
latest_version = latest_history_rows[0]["version"]

# The rollback target is the commit immediately before the latest one.
previous_version = latest_version - 1
(latest_version, previous_version)

Out[14]: (3, 2)

In [0]:
# History limited to the single most recent commit (the same call the version
# lookup uses); left as the cell's last expression so the notebook displays it.
delta_table.history(1)

In [0]:
# Announce the rollback, then restore the version just before the latest one.
print(f"Rolling back from version {latest_version} to version {previous_version}")
delta_table.restoreToVersion(previous_version)

# Re-read the table from storage and display its state after the restore.
delta_reader = spark.read.format("delta")
rolled_back_df = delta_reader.load(delta_table_path)
rolled_back_df.show()

Rolling back from version 3 to version 2
+--------------+----------+------+-------------------+
|transaction_id|product_id|amount|   transaction_date|
+--------------+----------+------+-------------------+
|             2|       102| 175.0|2024-01-02 00:00:00|
|             1|       101|  60.0|2024-01-01 00:00:00|
|             3|       103| 200.0|2024-01-03 00:00:00|
|             6|       105| 300.0|2024-01-06 00:00:00|
|             7|       106| 350.0|2024-01-07 00:00:00|
|             5|       104| 125.0|2024-01-05 00:00:00|
|             4|       101|  75.0|2024-01-04 00:00:00|
+--------------+----------+------+-------------------+



In [0]:
# Restore a fixed, older version: version 1 is the table right after the
# first update (transaction 2 at 175.0, transaction 6 added, transaction 1
# still at 50.0).
version_after_first_update = 1
delta_table.restoreToVersion(version_after_first_update)

# Re-read the table from storage and display its state after the restore.
delta_reader = spark.read.format("delta")
rolled_back_df = delta_reader.load(delta_table_path)
rolled_back_df.show()

+--------------+----------+------+-------------------+
|transaction_id|product_id|amount|   transaction_date|
+--------------+----------+------+-------------------+
|             1|       101|  50.0|2024-01-01 00:00:00|
|             2|       102| 175.0|2024-01-02 00:00:00|
|             3|       103| 200.0|2024-01-03 00:00:00|
|             6|       105| 300.0|2024-01-06 00:00:00|
|             5|       104| 125.0|2024-01-05 00:00:00|
|             4|       101|  75.0|2024-01-04 00:00:00|
+--------------+----------+------+-------------------+



In [0]:
# Can we go forward again after rolling back? Yes: version 3 (the state after
# the third update) is still available and can be restored.
# NOTE(review): each restore presumably appends its own commit, so version 3
# is likely no longer the newest entry in the history at this point. Confirm
# with delta_table.history().
version_after_third_update = 3
delta_table.restoreToVersion(version_after_third_update)

# Re-read the table from storage and display its state after the restore.
delta_reader = spark.read.format("delta")
rolled_back_df = delta_reader.load(delta_table_path)
rolled_back_df.show()

+--------------+----------+------+-------------------+
|transaction_id|product_id|amount|   transaction_date|
+--------------+----------+------+-------------------+
|             2|       102| 175.0|2024-01-02 00:00:00|
|             1|       101|  60.0|2024-02-02 00:00:00|
|             3|       103| 200.0|2024-01-03 00:00:00|
|             6|       105| 300.0|2024-01-06 00:00:00|
|             7|       106| 350.0|2024-01-07 00:00:00|
|             8|       108| 450.0|2024-01-05 00:00:00|
|             5|       104| 125.0|2024-01-05 00:00:00|
|             4|       101|  75.0|2024-01-04 00:00:00|
+--------------+----------+------+-------------------+

