In [0]:
import pyspark
from delta import *

In [0]:
builder = pyspark.sql.SparkSession.builder.appName("MyApp") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

In [0]:
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [0]:
data = spark.range(0, 5)
data.write.format("delta").save("/tmp/delta-table")

In [0]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()
display(df)

+---+
| id|
+---+
|  3|
|  4|
|  0|
|  1|
|  2|
+---+



id
3
4
0
1
2


In [0]:
data = spark.range(5, 10)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")
df.show()

+---+
| id|
+---+
|  8|
|  9|
|  5|
|  6|
|  7|
+---+



In [0]:
streamingDf = spark.readStream.format("rate").load()
stream = streamingDf.selectExpr("value as id").writeStream.format("delta").option("checkpointLocation", "/tmp/checkpoint").start("/tmp/delta-table")

In [0]:
from delta.tables import *
from pyspark.sql.functions import *

In [0]:
deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

In [0]:
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })


In [0]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()
display(df)

+---+
| id|
+---+
|291|
|255|
|259|
|247|
|303|
|267|
|279|
|305|
|271|
|293|
|295|
|283|
|307|
|281|
|275|
|285|
|287|
|273|
|253|
|309|
+---+
only showing top 20 rows



id
5
106
108
100
102
104
106
108
110
112


In [0]:
deltaTable.delete(condition = expr("id % 2 == 0"))

In [0]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()
display(df)

+---+
| id|
+---+
|  5|
|  7|
|  9|
|  1|
|  3|
|  5|
|  7|
|  9|
| 11|
| 13|
| 15|
| 17|
| 19|
| 21|
| 23|
| 25|
| 27|
| 29|
| 31|
| 33|
+---+
only showing top 20 rows



id
5
7
9
1
3
5
7
9
11
13


In [0]:
newData = spark.range(0, 20)

deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()

+---+
| id|
+---+
|  5|
|  5|
|  6|
|  7|
|  7|
|  8|
|  9|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
|  0|
|  1|
+---+
only showing top 20 rows

