In [1]:
# Import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from delta import *

In [2]:
# ==> THIS IS THE MOST IMPORTANT CONFIG <==
# Two settings wire Delta Lake into Spark: the Delta SQL extension and the
# Delta catalog registered as the session catalog ("spark_catalog").
builder = (
    SparkSession.builder
    .appName("AwesomeDeltaLake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta helper, then start (or reuse) the session
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

print("âœ¨ Spark and Delta Lake are ready to go! âœ¨")

:: loading settings :: url = jar:file:/Users/jesses_fables/Desktop/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/jesses_fables/.ivy2.5.2/cache
The jars for the packages stored in: /Users/jesses_fables/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-365d4173-dcbe-47f6-8a33-99bd7dcd8012;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-spark_2.13/4.0.0/delta-spark_2.13-4.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-spark_2.13;4.0.0!delta-spark_2.13.jar (1203ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/4.0.0/delta-storage-4.0.0.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;4.0.0!delta-storage.jar (70ms)
downloading https

âœ¨ Spark and Delta Lake are ready to go! âœ¨


In [3]:
# Filesystem location where the demo Delta table is stored
delta_table_path = "/tmp/delta-table"

# Three sample people, each row given as (id, name, age)
people_rows = [(1, "Alice", 25), (2, "Bob", 30), (3, "Charlie", 35)]
people_columns = ["id", "name", "age"]
data = spark.createDataFrame(people_rows, people_columns)

# Save the rows in Delta format; "overwrite" replaces whatever is already at the path
print(f"Writing data to Delta table at: {delta_table_path}")
delta_writer = data.write.format("delta").mode("overwrite")
delta_writer.save(delta_table_path)

print("âœ… Write complete!")

Writing data to Delta table at: /tmp/delta-table


25/09/07 11:35:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

âœ… Write complete!


In [4]:
# Load the Delta table stored at delta_table_path and display its rows
delta_reader = spark.read.format("delta")
df = delta_reader.load(delta_table_path)

print("Reading data from Delta table:")
df.show()

Reading data from Delta table:


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+



In [5]:
# Read the Delta table at delta_table_path into `df` and print its contents
df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)

print("Reading data from Delta table:")
df.show()

Reading data from Delta table:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  1|  Alice| 25|
|  2|    Bob| 30|
+---+-------+---+



In [6]:
# Incoming batch for the merge: one changed record and one brand-new record
incoming_rows = [
    (1, "Alicia", 26),  # Update Alice's name and age
    (4, "David", 40),   # New person
]
newData = spark.createDataFrame(incoming_rows, schema=["id", "name", "age"])

print("New data to merge:")
newData.show()


New data to merge:
+---+------+---+
| id|  name|age|
+---+------+---+
|  1|Alicia| 26|
|  4| David| 40|
+---+------+---+



In [7]:
# Import only the class this cell uses; a wildcard import would hide where
# names come from and could silently shadow existing notebook variables.
from delta.tables import DeltaTable

# Open the existing Delta table at delta_table_path as a DeltaTable handle
deltaTable = DeltaTable.forPath(spark, delta_table_path)

# Upsert newData into the table, pairing rows on their id:
#   - ids present on both sides get name and age overwritten from newData
#   - ids present only in newData are inserted as new rows
print("Performing merge...")
(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(
        set={"name": col("newData.name"), "age": col("newData.age")}
    )
    .whenNotMatchedInsert(
        values={
            "id": col("newData.id"),
            "name": col("newData.name"),
            "age": col("newData.age"),
        }
    )
    .execute()
)

print("âœ… Merge complete!")


Performing merge...


                                                                                

âœ… Merge complete!


25/09/07 11:37:21 WARN MapPartitionsRDD: RDD 57 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


In [8]:
# Expected after the merge: Bob and Charlie unchanged,
# Alice replaced by Alicia, and David present as a new row.
print("Table content after merge:")
merged_table = spark.read.format("delta").load(delta_table_path)
merged_table.orderBy("id").show()

Table content after merge:


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| Alicia| 26|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|  David| 40|
+---+-------+---+



In [9]:
# Time travel: request version 0 of the table, i.e. the initial write
# made before the merge.
print("Reading table at version 0 (before merge):")
df_v0 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(delta_table_path)
)
df_v0.orderBy("id").show()

Reading table at version 0 (before merge):


                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
+---+-------+---+



In [10]:
# Version 1 is the commit produced by the merge, which is the newest
# version of the table at this point in the notebook.
print("Reading table at version 1 (after merge):")
version_1_reader = spark.read.format("delta").option("versionAsOf", 1)
df_v1 = version_1_reader.load(delta_table_path)
df_v1.orderBy("id").show()

Reading table at version 1 (after merge):
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1| Alicia| 26|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|  David| 40|
+---+-------+---+



In [11]:
# Catch only Spark analysis errors (the category a schema mismatch is reported
# under), so unrelated failures are not mislabelled as the "expected" error.
from pyspark.sql.utils import AnalysisException

# One-row DataFrame carrying an extra 'city' column that the table does not have
badData = spark.createDataFrame([
    (5, "Eve", 28, "Miami")
], ["id", "name", "age", "city"])

# Append without enabling schema merging; Delta should reject the write
print("Attempting to write data with a different schema...")
try:
    badData.write.format("delta").mode("append").save(delta_table_path)
except AnalysisException as e:
    print("ðŸš¨ ERROR! As expected, the write failed.")
    print(e)
else:
    # The demonstration relies on this write failing. If it succeeded, the row
    # is already in the table and the next append will insert it a second time.
    print("Unexpected: the write succeeded even though the schemas differ.")

Attempting to write data with a different schema...
ðŸš¨ ERROR! As expected, the write failed.
[_LEGACY_ERROR_TEMP_DELTA_0007] A schema mismatch detected when writing to the Delta table (Table ID: e1e7b773-9c0c-40e1-8a4d-9ace3f45cb8e).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- id: long (nullable = true)
-- name: string (nullable = true)
-- age: long (nullable = true)


Data schema:
root
-- id: long (nullable = true)
-- name: string (nullable = true)
-- age: long (nullable = true)
-- city: string (nullable = true)

         


In [12]:
# Retry the same append, this time with the mergeSchema option switched on
# so the write is allowed to add the new column to the table schema.
print("Writing again with schema evolution enabled...")

evolving_writer = badData.write.format("delta").mode("append")
evolving_writer.option("mergeSchema", "true").save(delta_table_path)

print("âœ… Write successful!")

Writing again with schema evolution enabled...


                                                                                

âœ… Write successful!


In [13]:
# Check the new schema and data.
# The 'city' column has been added for the new row,
# and is null for the old rows.
# Load the table once and reuse the DataFrame, so the rows and the schema
# shown below come from the same read rather than two separate loads.
evolved_table = spark.read.format("delta").load(delta_table_path)

print("Table content after schema evolution:")
evolved_table.orderBy("id").show()

print("New schema:")
evolved_table.printSchema()

Table content after schema evolution:


                                                                                

+---+-------+---+-----+
| id|   name|age| city|
+---+-------+---+-----+
|  1| Alicia| 26| NULL|
|  2|    Bob| 30| NULL|
|  3|Charlie| 35| NULL|
|  4|  David| 40| NULL|
|  5|    Eve| 28|Miami|
+---+-------+---+-----+

New schema:
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)



In [14]:
# Tidy up: delete the directory that holds the demo Delta table
import shutil

try:
    shutil.rmtree(delta_table_path)
except OSError as cleanup_error:
    # Report the operating system's description of the failure and carry on
    print(f"Error: {cleanup_error.strerror}")
else:
    print(f"âœ… Successfully removed the Delta table at {delta_table_path}")


âœ… Successfully removed the Delta table at /tmp/delta-table


In [15]:
# Stop the Spark Session that was created at the top of the notebook.
# This is the final step of the walkthrough.
spark.stop()