# Delta Lake features

In [None]:
from pyspark.sql.types import *
import numpy as np
import pandas as pd
from delta.tables import DeltaTable

# Explicit schema handed to the CSV reader; every column is declared nullable.
manualSchema = (
    StructType()
    .add("CustomerId", StringType(), True)
    .add("ProductId", StringType(), True)
    .add("Rating", LongType(), True)
    .add("Cost", FloatType(), True)
    .add("Size", FloatType(), True)
    .add("Price", FloatType(), True)
    .add("PrimaryBrandId", LongType(), True)
    .add("GenderId", LongType(), True)
    .add("MaritalStatus", LongType(), True)
    .add("LowerIncomeBound", FloatType(), True)
    .add("UpperIncomeBound", FloatType(), True)
)

# Source CSV; the first line is a header and column types come from manualSchema.
url = "wasbs://files@synapsemlpublic.blob.core.windows.net/PersonalizedData.csv"
raw_data = (
    spark.read
    .schema(manualSchema)
    .option("header", True)
    .csv(url)
)

print("Schema: ")
raw_data.printSchema()

# Convert the Spark DataFrame to pandas and report its (rows, columns) shape.
df = raw_data.toPandas()
print("Shape: ", df.shape)

In [None]:
# Destination folder for the Delta table.
delta_table_path = 'abfss://delta@asadatalake20220206.dfs.core.windows.net/customer-rating'

# Overwrite mode keeps this cell re-runnable: with the default save mode
# ('errorifexists') a second run fails because the path already exists.
raw_data.write.format('delta').mode('overwrite').save(delta_table_path)

# List what was written under the table folder.
mssparkutils.fs.ls(delta_table_path)

In [None]:
# Take the first entry listed under the table's _delta_log folder,
# print its path, then show the beginning of that file.
log_entries = mssparkutils.fs.ls(f'{delta_table_path}/_delta_log')
delta_log_path = log_entries[0].path
print(delta_log_path)
mssparkutils.fs.head(delta_log_path)

In [None]:
# Load the table from its Delta folder and print a sample of rows.
delta_reader = spark.read.format('delta')
data = delta_reader.load(delta_table_path)
data.show()

In [None]:
# Bind a DeltaTable handle to the table path and display its version history.
delta_table = DeltaTable.forPath(spark, delta_table_path)
table_history = delta_table.history()
display(table_history)

In [None]:
# Raise Price by 5% on every row whose Price is below 1500.
# Both the predicate and the new value are given as SQL-formatted strings.
price_increase = {"Price": "Price * 1.05"}
delta_table.update(condition="Price < 1500", set=price_increase)


In [None]:
# Show the table history again, now that the update has been applied.
updated_history = delta_table.history()
display(updated_history)

It's possible to query previous snapshots of your Delta Lake table by using a feature called Time Travel. If you want to access the data as it was before the update above, you can query an earlier snapshot of the table by using the `versionAsOf` option.

In [None]:
# Time travel: read the table as of version 0.
version_0 = (
    spark.read.format("delta")
    .option("versionAsOf", "0")
    .load(delta_table_path)
)
display(version_0)

In [None]:
# Time travel: read the table as of version 1.
version_1 = (
    spark.read.format("delta")
    .option("versionAsOf", "1")
    .load(delta_table_path)
)
display(version_1)


In [None]:
# Register the existing Delta folder as a table in the metastore.
# IF NOT EXISTS keeps the cell re-runnable: a plain CREATE TABLE fails
# once the table has already been registered.
spark.sql(f"CREATE TABLE IF NOT EXISTS CustomerRating USING DELTA LOCATION '{delta_table_path}'")

In [None]:
# Print the result of SHOW TABLES.
tables = spark.sql("SHOW TABLES")
tables.show()

In [None]:
# Print the extended description of the table without truncating long values.
table_details = spark.sql("DESCRIBE EXTENDED customerrating")
table_details.show(truncate=False)

To query the delta table from the serverless SQL pool, navigate to the `Develop` hub in Synapse Studio and create a new SQL script. Make sure `Built-in` is selected for the `Connect to` option and `default` is selected for the `Use database` option.

Enter the query as shown in the picture below and make sure you replace the name of the Data Lake account with the one from your lab environment.

![Query Delta Lake with serverless SQL pool](https://solliancepublicdata.blob.core.windows.net/synapse-l400/notebook-images/query-delta-table.png)

This concludes the Delta Lake section of this notebook.

To learn more about Delta Lake support in Synapse Spark, take a look at the [Work with Delta Lake](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-delta-lake-overview?pivots=programming-language-python) section in the Azure Synapse Analytics documentation.