- **Name:** 21_delta_advanced
- **Author:** Shamas Imran
- **Description:** Advanced operations with Delta Lake
- **Date:** 19-Aug-2025
<!--
REVISION HISTORY
Version          Date        Author           Description
01           19-Aug-2025   Shamas Imran       Implemented upserts (merge) in Delta  
                                              Used time travel queries  
                                              Performed vacuum and optimize  
-->

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from delta.tables import DeltaTable

# ------------------------------------------------------------
# 1) Spark Session
# ------------------------------------------------------------
# Delta Lake settings applied to the session builder: the SQL extension
# and the Delta-aware implementation of the default catalog.
delta_session_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("Delta_Advanced_Features")
for setting_name, setting_value in delta_session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse the active session if one exists, otherwise create a new one.
spark = session_builder.getOrCreate()


In [0]:
# ------------------------------------------------------------
# 2) Sample Student DataFrame
# ------------------------------------------------------------
# Column layout as (name, type, nullable); only StudentID is non-nullable.
student_columns = [
    ('StudentID', IntegerType(), False),
    ('StudentName', StringType(), True),
    ('StudentAge', IntegerType(), True),
]
student_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in student_columns
])

# Four sample rows matching the schema above: (StudentID, StudentName, StudentAge)
student_data = [
    (1, "Alice", 34),
    (2, "Bob", 45),
    (3, "Charlie", 29),
    (4, "Shamas", 40),
]

df_student = spark.createDataFrame(student_data, student_schema)

# Location where the Delta table files are written
delta_path = "/Volumes/datapurcatalog/default/datapurvolume/delta/student_table_advanced"

# ------------------------------------------------------------
# 3) Create Delta Table
# ------------------------------------------------------------
# Write the sample rows in Delta format, overwriting whatever is at delta_path.
student_writer = df_student.write.format("delta").mode("overwrite")
student_writer.save(delta_path)
# df_student is now persisted as a Delta table at delta_path

In [0]:
                                                            # ------------------------------------------------------------
                                                            # 4) Time Travel Queries
                                                            # ------------------------------------------------------------

In [0]:
# Bind a handle to the Delta table at delta_path and display its latest snapshot
delta_table = DeltaTable.forPath(spark, delta_path)
latest_snapshot_df = delta_table.toDF()
latest_snapshot_df.show()

In [0]:
# Fetch the table's commit history; the limit of 1000 is a large number meant to cover all versions
history_columns = ["version", "timestamp", "operation", "operationParameters"]
full_history = delta_table.history(1000)

# Print only the columns of interest, without truncating long cell values
full_history.select(*history_columns).show(truncate=False)


In [0]:

# Example: Read data as it was at version 0 (the first commit in the table's history).
# versionAsOf must be 0 here: right after the initial write above, version 0 is the
# only version that exists, so asking for version 1 would fail (or, on a rerun,
# return a different snapshot than the one this example is meant to show).
df_version0 = spark.read.format("delta").option("versionAsOf", 0).load(delta_path)
df_version0.show()
# Key Point: Time travel allows you to query historical snapshots of Delta table



In [0]:
# ------------------------------------------------------------
# 5) Upserts / MERGE INTO (handling CDC)
# ------------------------------------------------------------
# Incoming change set to upsert: a mix of existing and new StudentIDs
student_updates = [
    (3, "Charlie", 30),  # existing StudentID, age updated
    (5, "Faizan", 25),   # new StudentID
]
df_updates = spark.createDataFrame(student_updates, student_schema)

# Re-bind the table handle; target (tgt) and source (src) rows are paired on StudentID
delta_table = DeltaTable.forPath(spark, delta_path)
merge_condition = "tgt.StudentID = src.StudentID"

# Columns overwritten from the source row when the StudentID already exists
matched_assignments = {
    "StudentName": "src.StudentName",
    "StudentAge": "src.StudentAge",
}

# Columns populated from the source row when the StudentID is not in the table yet
unmatched_assignments = {
    "StudentID": "src.StudentID",
    "StudentName": "src.StudentName",
    "StudentAge": "src.StudentAge",
}

upsert = delta_table.alias("tgt").merge(df_updates.alias("src"), merge_condition)
upsert = upsert.whenMatchedUpdate(set=matched_assignments)
upsert = upsert.whenNotMatchedInsert(values=unmatched_assignments)
upsert.execute()

# Display the table contents after the merge
delta_table.toDF().show()

In [0]:
# Enumerate the entries directly under the Delta table folder
all_files = dbutils.fs.ls(delta_path)

# Keep just the paths that end in ".parquet"
parquet_files = [
    entry.path
    for entry in all_files
    if entry.path.endswith(".parquet")
]

# Print a heading followed by one Parquet path per line
print("Parquet files in Delta folder:")
for parquet_path in parquet_files:
    print(parquet_path)

In [0]:
                                                    # ------------------------------------------------------------
                                                    # 6.1) OPTIMIZE
                                                    # ------------------------------------------------------------
# OPTIMIZE improves query performance by compacting small files (available on Databricks and in open-source Delta Lake 2.0+)
# delta_table.optimize().executeCompaction()   # Uncomment to compact the table's small files

In [0]:
                                                    # ------------------------------------------------------------
                                                    # 6.2) VACUUM
                                                    # ------------------------------------------------------------
# VACUUM removes data files that are no longer referenced by the table and are
# older than the retention window, to clean up storage.
# The window is kept at 168 hours (7 days, Delta's default safety threshold):
# a shorter value such as 1 hour is rejected by Delta's retention-duration check
# unless that check is explicitly disabled, and it would also delete the files
# that the time travel queries above rely on.
delta_table.vacuum(retentionHours=168)


In [0]:
# ------------------------------------------------------------
# 7) Schema Evolution
# ------------------------------------------------------------
# Append a row that carries an extra column (StudentGrade), with schema
# evolution enabled on the write.
new_schema_fields = [
    StructField('StudentID', IntegerType(), False),
    StructField('StudentName', StringType(), True),
    StructField('StudentAge', IntegerType(), True),
    StructField('StudentGrade', StringType(), True),  # column being added
]
new_schema = StructType(new_schema_fields)

# Single sample row: (StudentID, StudentName, StudentAge, StudentGrade)
new_student_data = [(6, "Adeel", 28, "A")]
df_new = spark.createDataFrame(new_student_data, new_schema)

# Key Point: mergeSchema=True allows adding new columns without breaking the table
evolving_writer = df_new.write.format("delta").mode("append")
evolving_writer = evolving_writer.option("mergeSchema", "true")
evolving_writer.save(delta_path)

# Read the table back and display it, including the newly appended row
evolved_table_df = spark.read.format("delta").load(delta_path)
evolved_table_df.show()