In [None]:
from pyspark.sql.session import SparkSession

# Build (or reuse) a Spark session running in local mode on all available
# cores, with the Delta Lake SQL extension and catalog enabled.
# To target a standalone cluster instead, swap the master URL for
# "spark://spark-master:7077".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparkByExamples.com")
    # Delta Lake core 1.1.0 (Scala 2.12 build), resolved as a Spark package.
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.1.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    # A single shuffle partition is configured for this notebook.
    .config("spark.sql.shuffle.partitions", "1")
    .getOrCreate()
)

In [None]:
# Start from a clean slate: make sure the data directory exists, remove any
# Delta table and checkpoint directories matching the patterns below, then
# list what is left in the directory.
!mkdir -p data
!rm -rf data/delta-table* data/checkpoint*
!ls data

In [None]:
# Create a DataFrame with ids 0 through 4 and save it in Delta format
# at data/delta-table.
data = spark.range(0, 5)
(
    data.write
    .format("delta")
    .save("data/delta-table")
)

In [None]:
# Load the Delta table back and display its rows ordered by id.
df = (
    spark.read
    .format("delta")
    .load("data/delta-table")
)
df.sort("id").show()

In [None]:
# Replace the table contents with ids 5 through 9 using overwrite mode.
data = spark.range(5, 10)
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save("data/delta-table")
)

In [None]:
# Re-read the Delta table after the overwrite and display its rows ordered by id.
df = (
    spark.read
    .format("delta")
    .load("data/delta-table")
)
df.sort("id").show()

In [None]:
# Import only the names this cell uses. A wildcard import from
# pyspark.sql.functions would shadow Python builtins such as sum, min, max,
# abs and round for the rest of the kernel session.
from delta.tables import DeltaTable
from pyspark.sql.functions import col, expr

# Handle used to run update / delete / merge against the table at this path.
deltaTable = DeltaTable.forPath(spark, "data/delta-table")

# Update every even value by adding 100 to it
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

# Delete every even value
deltaTable.delete(condition=expr("id % 2 == 0"))

# Upsert (merge) new data: rows whose id already exists are updated,
# all other ids from newData are inserted.
newData = spark.range(0, 20)

(
    deltaTable.alias("oldData")
    .merge(
        newData.alias("newData"),
        "oldData.id = newData.id",
    )
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

# Show the table contents after the merge.
deltaTable.toDF().show()

In [None]:
# Time travel: load the table as of version 0 and display its rows ordered by id.
df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("data/delta-table")
)
df.sort("id").show()

In [None]:
# Open a streaming read on the "rate" source, expose its `value` column as
# `id`, and start a streaming write into the Delta table. Streaming progress
# is checkpointed under data/checkpoint.
streamingDf = spark.readStream.format("rate").load()
stream = (
    streamingDf.selectExpr("value as id")
    .writeStream
    .format("delta")
    .option("checkpointLocation", "data/checkpoint")
    .start("data/delta-table")
)

In [None]:
# Start a second streaming query that reads the Delta table as a stream
# and writes what it reads to the console sink.
stream2 = (
    spark.readStream
    .format("delta")
    .load("data/delta-table")
    .writeStream
    .format("console")
    .start()
)

In [None]:
# Stop every streaming query still running on this session, not only
# `stream`. The console reader started as `stream2` would otherwise keep
# reading data/delta-table after the notebook has finished with it.
for active_query in spark.streams.active:
    active_query.stop()

In [None]:
# Clean up: delete the data directory and everything under it.
!rm -rf data
