## Quickstart

The code snippets below are taken from the [Delta Lake quick start guide](https://docs.delta.io/latest/quick-start.html).

In [1]:
import shutil

import pyspark
from delta import *

In [2]:
# Start from the session builder for an app named "MyApp", then register
# Delta Lake's SQL extension and catalog on it one setting at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through delta's configure_spark_with_delta_pip helper and
# start (or reuse) the session. The Ivy log printed below shows the
# io.delta#delta-core dependency being resolved as part of this step.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

23/12/10 20:08:19 WARN Utils: Your hostname, Pradeeps-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.220 instead (on interface en0)
23/12/10 20:08:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/pradeep/.ivy2/cache
The jars for the packages stored in: /Users/pradeep/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f22cec8f-d06c-458a-95bf-0da4f9388412;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/pradeep/anaconda3/envs/pyspark-340-delta-240/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 86ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-f22cec8f-d06c-458a-95bf-0da4f9388412
	confs: [default]
	0 artifacts copied, 3 already retrieved (0kB/3ms)
23/12/10 20:08:20 WA

## Create a table

In [4]:
# Clear anything a previous run left at this path. The save() below uses the
# default save mode, so re-running the notebook against an existing table
# would fail here; starting from an empty path also means version 0 of the
# table is always the write made in this cell.
shutil.rmtree("tmp/delta-table", ignore_errors=True)

# Create the table: a single `id` column holding 0 through 4.
data = spark.range(0, 5)
data.write.format("delta").save("tmp/delta-table")

23/12/10 20:08:49 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## Read a table

In [5]:
# Load the Delta table stored at tmp/delta-table into a DataFrame.
df = spark.read.format("delta").load("tmp/delta-table")

In [6]:
# Print the rows. They come back in whatever order Spark returns them
# (the output below is not sorted).
df.show()

+---+
| id|
+---+
|  3|
|  0|
|  1|
|  4|
|  2|
+---+



## Update table - overwrite

In [7]:
# Write ids 5 through 9 to the same path in overwrite mode, replacing the
# rows written when the table was created (see the output of the next cell).
data = spark.range(5, 10)
data.write.format("delta").mode("overwrite").save("tmp/delta-table")

In [8]:
# Re-read the table and print it to confirm that it now holds the new ids.
df = spark.read.format("delta").load("tmp/delta-table")
df.show()

+---+
| id|
+---+
|  9|
|  8|
|  6|
|  5|
|  7|
+---+



## Conditional update, delete, and upsert without overwrite

In [10]:
from delta.tables import *
from pyspark.sql.functions import *

In [11]:
# Get a DeltaTable handle for the table at this path; the cells below use it
# to update, delete and merge rows.
deltaTable = DeltaTable.forPath(spark, "tmp/delta-table")

In [12]:
# Add 100 to every row whose id is even; rows with odd ids are not matched
# by the condition and stay as they are.
deltaTable.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)

                                                                                

In [13]:
# View the table through the DeltaTable handle to see the result of the update.
deltaTable.toDF().show()

+---+
| id|
+---+
|  7|
|  9|
|108|
|106|
|  5|
+---+



In [None]:
# Delete every row whose id is even. At this point those are the ids the
# update above turned into 106 and 108.
deltaTable.delete(condition=expr("id % 2 == 0"))

In [45]:
# Show the table again; only the odd ids are left.
deltaTable.toDF().show()

+---+
| id|
+---+
|  5|
|  7|
|  9|
+---+



In [46]:
# Upsert (merge): bring in ids 0 through 19.
newData = spark.range(0, 20)

# Pair stored rows with incoming rows on id. A stored row that has a match
# takes its id from the incoming row; an incoming row with no match is
# inserted.
(
    deltaTable.alias("oldData")
    .merge(newData.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": col("newData.id")})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

In [47]:
# Show the merged table: the output below lists every id from 0 to 19.
deltaTable.toDF().show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



## Read older versions of data using time travel

In [48]:
# Time travel: ask the reader for version 0 of the table, i.e. its state
# after the first write (ids 0 through 4 in the output below).
df = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load("tmp/delta-table")
)
df.show()

+---+
| id|
+---+
|  4|
|  2|
|  0|
|  3|
|  1|
+---+

