- **Name:** 001_Introduction
- **Author:** Shamas Imran
- **Description:** Read/Write as Delta folder in Unity Catalog

In [0]:
from pyspark.sql import SparkSession
# Get the current SparkSession, or create one with the application name "DatapurProgram"
spark = (
    SparkSession.builder
    .appName("DatapurProgram")
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import *
from datetime import date

# Schema for enrollment rows: the ID and both foreign keys are non-nullable,
# while the enrollment date may be null
enrollment_schema = (
    StructType()
    .add("EnrollmentID", IntegerType(), False)
    .add("StudentID_FK", IntegerType(), False)
    .add("CourseID_FK", IntegerType(), False)
    .add("EnrollmentDate", DateType(), True)
)

# Sample rows as (EnrollmentID, StudentID_FK, CourseID_FK, EnrollmentDate)
enrollment_data = [
    (1, 1, 1, date(2023, 9, 1)),  # Alice -> Physics
    (2, 2, 2, date(2023, 9, 2)),  # Bob -> Chemistry
    (3, 4, 4, date(2023, 9, 4)),  # Shamas -> Computer Science
    (4, 1, 2, date(2023, 9, 5)),  # Alice -> Chemistry
]

# Build the DataFrame from the rows and the explicit schema, then print it
df_enrollment = spark.createDataFrame(data=enrollment_data, schema=enrollment_schema)
df_enrollment.show()

In [0]:
# Unity Catalog volume root (keeps its trailing slash) and the Delta folder beneath it
rootPath = "/Volumes/datapurcatalog/default/datapurvolume/"
deltaPath = f"{rootPath}enrollment_delta"

In [0]:
# Write the enrollment DataFrame in Delta format to deltaPath, using overwrite mode
(
    df_enrollment.write
    .format("delta")
    .mode("overwrite")
    .save(deltaPath)
)

In [0]:
# Keep the DataFrame itself: show() only prints and returns None, so it must not be
# chained into the assignment (later cells call methods on df_enrollment_uc).
df_enrollment_uc = spark.read.format("delta").load(deltaPath)
df_enrollment_uc.show()

In [0]:
df_enrollment_uc.printSchema()      # Print the column names, types and nullability
df_enrollment_uc.show()             # Print the rows
# count() and .columns return values instead of printing; as bare mid-cell expressions
# their results would be discarded, so print them explicitly.
print(df_enrollment_uc.count())     # Total number of rows
print(df_enrollment_uc.columns)     # List of column names
df_enrollment_uc.describe().show()  # Summary stats (count, mean, stddev, min, max)

In [0]:
from delta.tables import DeltaTable

# Open the Delta table stored at the Unity Catalog volume path
deltaPath = "/Volumes/datapurcatalog/default/datapurvolume/enrollment_delta"
deltaTable = DeltaTable.forPath(spark, deltaPath)

# Set the enrollment date to 2025-01-01 for rows with StudentID_FK = 1 and CourseID_FK = 1.
# String values in `set` are SQL expressions, hence the inner quotes around the date.
deltaTable.update(
    condition="StudentID_FK = 1 AND CourseID_FK = 1",
    set={"EnrollmentDate": "'2025-01-01'"},
)

In [0]:
# Delete the row(s) whose EnrollmentID is 4
deltaTable.delete(condition="EnrollmentID = 4")

# Re-open the table by path and print its commit history
deltaTableV2 = DeltaTable.forPath(spark, deltaPath)
deltaTableV2.history().show()

In [0]:
# Time travel: read the table as it was at version 0 and print it
df_old = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(deltaPath)
)
df_old.show()

In [0]:
# Compare with latest: a read without versionAsOf returns the table's current snapshot,
# whatever its version number is. A hard-coded version number is only the latest when
# exactly that many commits have followed the initial write.
df_latest = spark.read.format("delta").load(deltaPath)
df_latest.show()

In [0]:
# Clean up: remove the Delta folder at deltaPath, recursing into its contents
dbutils.fs.rm(
    deltaPath,
    recurse=True,
)