- Name: 04.3_dataframe_delta_folder
- Author: Shamas Imran
- Description: Reading and writing Delta tables stored in folder format
- Date: 14-Oct-2025

In [1]:
from pyspark.sql.types import *
from datetime import date

# Schema of the enrollment table: three mandatory integer key columns
# followed by a nullable enrollment date.
enrollment_schema = StructType(
    [
        StructField(column_name, IntegerType(), False)
        for column_name in ("EnrollmentID", "StudentID_FK", "CourseID_FK")
    ]
    + [StructField("EnrollmentDate", DateType(), True)]
)

# Sample rows: (EnrollmentID, StudentID_FK, CourseID_FK, EnrollmentDate)
enrollment_data = [
    (1, 1, 1, date(2023, 9, 1)),  # Alice  -> Physics
    (2, 2, 2, date(2023, 9, 2)),  # Bob    -> Chemistry
    (3, 4, 4, date(2023, 9, 4)),  # Shamas -> Computer Science
    (4, 1, 2, date(2023, 9, 5)),  # Alice  -> Chemistry
]

# Build the DataFrame from the sample rows using the explicit schema.
df_enrollment = spark.createDataFrame(
    data=enrollment_data,
    schema=enrollment_schema,
)

# Display the freshly created rows.
df_enrollment.show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 3, Finished, Available, Finished)

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           1|           1|          1|    2023-09-01|
|           2|           2|          2|    2023-09-02|
|           3|           4|          4|    2023-09-04|
|           4|           1|          2|    2023-09-05|
+------------+------------+-----------+--------------+



In [2]:
# Base folder for Delta output and the enrollment table's location inside it.
rootPath = "Files/client_output_data/delta/"
deltaPath = f"{rootPath}enrollment_delta"

# Persist the enrollment rows as a folder-based Delta table, replacing
# whatever is already stored at that path.
(
    df_enrollment.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

# Load the table back from the folder and display what was stored.
df_enrollment_uc = spark.read.load(deltaPath, format="delta")
df_enrollment_uc.show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 4, Finished, Available, Finished)

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           4|           1|          2|    2023-09-05|
|           2|           2|          2|    2023-09-02|
|           1|           1|          1|    2023-09-01|
|           3|           4|          4|    2023-09-04|
+------------+------------+-----------+--------------+



In [3]:
from delta.tables import DeltaTable

# Open a DeltaTable handle on the folder-based enrollment table.
rootPath = "Files/client_output_data/delta/"
deltaPath = f"{rootPath}enrollment_delta"
deltaTable = DeltaTable.forPath(spark, deltaPath)

# Set the enrollment date to 2025-01-01 for the row(s) where
# StudentID_FK = 1 AND CourseID_FK = 1. The new value is passed as a
# SQL expression string, hence the inner single quotes.
deltaTable.update(
    condition="StudentID_FK = 1 AND CourseID_FK = 1",
    set={"EnrollmentDate": "'2025-01-01'"},
)

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 5, Finished, Available, Finished)

In [4]:
# Read the Delta folder again so the displayed rows come from storage
# rather than from a previously loaded DataFrame.
df_enrollment_uc = spark.read.load(deltaPath, format="delta")
df_enrollment_uc.show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 6, Finished, Available, Finished)

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           4|           1|          2|    2023-09-05|
|           2|           2|          2|    2023-09-02|
|           1|           1|          1|    2025-01-01|
|           3|           4|          4|    2023-09-04|
+------------+------------+-----------+--------------+



In [5]:
# Open a new DeltaTable handle on the same folder and print the table's
# history: one row per table version, including the operation that produced it.
deltaTableV2 = DeltaTable.forPath(spark, deltaPath)
deltaTableV2.history().show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 7, Finished, Available, Finished)

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      1|2025-10-15 18:49:...|  NULL|    NULL|   UPDATE|{predicate -> ["(...|NULL|    NULL|     NULL|          0|  Serializable|        false|{numRemovedFiles ...|        NULL|Apache-Spark/3.5....|
|      0|2025-10-15 18:44:...|  NULL|    NULL|    WRITE|{mode -> Overwrit...|NULL|    NULL|     NULL|       NULL|  Serializable|        false|{numFiles -> 5, n...|        NULL|Apache-Spark/3.5....|
+-------+-

In [6]:
# Delete the row(s) matching EnrollmentID = 4 through the existing table handle.
deltaTable.delete(condition="EnrollmentID = 4")

# Open a fresh handle on the folder, then display the table's current rows
# followed by its version history.
deltaTableV2 = DeltaTable.forPath(spark, deltaPath)

deltaTableV2.toDF().show()

deltaTableV2.history().show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 8, Finished, Available, Finished)

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           2|           2|          2|    2023-09-02|
|           1|           1|          1|    2025-01-01|
|           3|           4|          4|    2023-09-04|
+------------+------------+-----------+--------------+

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      2|2025-10-15 

In [7]:

# View the current (latest) version of the table: no version option is set
df_new = spark.read.format("delta").load(deltaPath)
df_new.show()

# Time travel: view an older snapshot by version number
# (version 1 here, i.e. the table after the UPDATE but before the DELETE)
df_old = spark.read.format("delta").option("versionAsOf", 1).load(deltaPath)
df_old.show()

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 9, Finished, Available, Finished)

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           2|           2|          2|    2023-09-02|
|           1|           1|          1|    2025-01-01|
|           3|           4|          4|    2023-09-04|
+------------+------------+-----------+--------------+

+------------+------------+-----------+--------------+
|EnrollmentID|StudentID_FK|CourseID_FK|EnrollmentDate|
+------------+------------+-----------+--------------+
|           4|           1|          2|    2023-09-05|
|           2|           2|          2|    2023-09-02|
|           1|           1|          1|    2025-01-01|
|           3|           4|          4|    2023-09-04|
+------------+------------+-----------+--------------+



In [8]:
# Clean up: remove the Delta table folder at deltaPath.
# NOTE(review): the second argument is presumably the "recursive" flag,
# needed to delete a non-empty folder; confirm against the mssparkutils.fs docs.
mssparkutils.fs.rm(deltaPath, True)

StatementMeta(, 68671e46-a0a2-4b79-bed8-92c8d3bf2425, 10, Finished, Available, Finished)

True