# Using Linux Foundation Delta Lake in Synapse Spark
This notebook demonstrates how to read a Delta table, how to write to a Delta table, and how to use time travel to query earlier versions of the table.

In [None]:
# Storage location settings -- replace every <placeholder> before running
account_name = '<storage_account_name>'  # primary storage account name
container_name = '<container_name>'  # container inside that storage account
relative_csv_path = '<csv_folder_name>'  # CSV folder, relative to the container root
relative_delta_path = '<Delta_Lake_folder_name>'  # Delta Lake folder, relative to the container root

# Root ABFSS URI of the container; every other path is built on top of it
adls_path = 'abfss://{}@{}.dfs.core.windows.net/'.format(container_name, account_name)

# Full path of the CSV input file
csvfilepath = '{}{}/00 HRData.csv'.format(adls_path, relative_csv_path)

# Full path of the Delta table folder (with trailing slash)
deltatablepath = '{}{}/'.format(adls_path, relative_delta_path)

# Echo the resolved locations so a wrong placeholder is easy to spot
print('Primary storage account path: ' + adls_path)
print('CSV file path: ' + csvfilepath)
print('Delta Lake path: ' + deltatablepath)

### Read data in CSV format



In [5]:
# Load the HR CSV, taking column names from its first line, and preview ten rows.
# No schema is supplied or inferred here, so the reader's default column types apply.
csv_reader = spark.read.format("csv").option("header", True)
csvhrdatadf = csv_reader.load(csvfilepath)
csvhrdatadf.show(10)

+--------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-------------+
|       Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|   Department|
+--------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-------------+
|          Brown, Mia|1103024456|        1|     1|   28.5|         1|        Accountant I|   MA|11/24/1987|  F|10/27/2008|Admin Offices|
|LaRotonda, William  |1106026572|        0|     1|     23|         1|        Accountant I|   MA| 4/26/1984| M |  1/6/2014|Admin Offices|
|    Steans, Tyrone  |1302053333|        0|     1|     29|         1|        Accountant I|   MA|  9/1/1986| M | 9/29/2014|Admin Offices|
|     Howard, Estelle|1211050782|        1|     1|   21.5|         2|Administrative As...|   MA| 9/16/1985|  F| 2/16/2015|Admin Offices|
|         Singh, Nan |1307059817|        

### Write data in delta format


In [22]:
# Persist the CSV data as a Delta table partitioned by Department,
# replacing whatever is already stored at the target path.
(csvhrdatadf.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Department")
    .save(deltatablepath))

In [23]:
# Read the Delta table back from storage and preview ten rows
delta_reader = spark.read.format("delta")
df_hr = delta_reader.load(deltatablepath)
df_hr.show(10)

+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|     Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|       Department|
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|   Bramante, Elisa|1006020066|        0|     5|     60|        10|Director of Opera...|   MA| 3/19/1983|  F|  1/5/2009|Production       |
| Albert, Michael  |1501072311|        0|     5|   54.5|        17|  Production Manager|   MA|10/10/1968| M |  8/1/2011|Production       |
|    Bozzi, Charles|1303054580|        0|     5|   50.5|        18|  Production Manager|   MA| 3/10/1970| M | 9/30/2013|Production       |
|Butler, Webster  L|1110029990|        0|     5|     55|        18|  Production Manager|   MA|  8/9/1983| M | 1/28/2016|Production       |
|       Dunn, Amy  |1409070

### Add New Column YearsOfService



In [24]:
import pyspark.sql.functions as f

# YearsOfService = fixed reference year minus the year parsed from DateofHire (MM/dd/yyyy).
# The reference year is hard-coded to 2020, so the values do not follow the current date.
reference_year = 2020
hire_year = f.year(f.to_timestamp('DateofHire', 'MM/dd/yyyy'))

df_hr_service = df_hr.withColumn('YearsOfService', reference_year - hire_year)
df_hr_service.show(5)

+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+--------------+
|     Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|       Department|YearsOfService|
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+--------------+
|   Bramante, Elisa|1006020066|        0|     5|     60|        10|Director of Opera...|   MA| 3/19/1983|  F|  1/5/2009|Production       |            11|
| Albert, Michael  |1501072311|        0|     5|   54.5|        17|  Production Manager|   MA|10/10/1968| M |  8/1/2011|Production       |             9|
|    Bozzi, Charles|1303054580|        0|     5|   50.5|        18|  Production Manager|   MA| 3/10/1970| M | 9/30/2013|Production       |             7|
|Butler, Webster  L|1110029990|        0|     5|     55|        18|  Product

### Overwrite the entire delta table


In [25]:
# Replace the table contents with the frame that carries YearsOfService.
# mergeSchema is set so the write is allowed to extend the table schema with the new column.
(df_hr_service.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", True)
    .partitionBy("Department")
    .save(deltatablepath))

### Validate delta table is updated with new column


In [26]:
# Reload the latest table state to confirm the YearsOfService column is present
hrdataframe = spark.read.load(deltatablepath, format="delta")
hrdataframe.show(10)

+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+--------------+
|     Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|       Department|YearsOfService|
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+--------------+
|   Bramante, Elisa|1006020066|        0|     5|     60|        10|Director of Opera...|   MA| 3/19/1983|  F|  1/5/2009|Production       |            11|
| Albert, Michael  |1501072311|        0|     5|   54.5|        17|  Production Manager|   MA|10/10/1968| M |  8/1/2011|Production       |             9|
|    Bozzi, Charles|1303054580|        0|     5|   50.5|        18|  Production Manager|   MA| 3/10/1970| M | 9/30/2013|Production       |             7|
|Butler, Webster  L|1110029990|        0|     5|     55|        18|  Product

### Check version with timetravel

Here we can see that the YearsOfService column is not present in the original version (version 0) of the Delta table.

In [27]:
# Time travel: the versionAsOf option selects which table version to load;
# 0 is the version requested here.
version0_reader = spark.read.format("delta").option("versionAsOf", 0)
hrdataoriginal = version0_reader.load(deltatablepath)
hrdataoriginal.show(10)

+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|     Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|       Department|
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|   Bramante, Elisa|1006020066|        0|     5|     60|        10|Director of Opera...|   MA| 3/19/1983|  F|  1/5/2009|Production       |
| Albert, Michael  |1501072311|        0|     5|   54.5|        17|  Production Manager|   MA|10/10/1968| M |  8/1/2011|Production       |
|    Bozzi, Charles|1303054580|        0|     5|   50.5|        18|  Production Manager|   MA| 3/10/1970| M | 9/30/2013|Production       |
|Butler, Webster  L|1110029990|        0|     5|     55|        18|  Production Manager|   MA|  8/9/1983| M | 1/28/2016|Production       |
|       Dunn, Amy  |1409070

### Update records that match the given condition
Let's update PayRate for employees whose pay rate is 20 or less, raising it to the lowest pay rate value above 20 (20.5 is the value used in the update below).


In [28]:
from pyspark.sql.functions import *
from delta.tables import *

# Handle used by the following cells to update the table and read its history
deltaTable = DeltaTable.forPath(spark,deltatablepath)

# PayRate comes from a CSV read without a schema, so it is stored as a string.
# Cast it explicitly: an aggregate over the raw strings would be a lexicographic
# minimum, and comparing the strings with the integer literal 20 relies on an
# implicit cast (NOTE(review): the UPDATE predicate in the table history starts with
# a cast, presumably to int, which would truncate 20.5 to 20 -- confirm).
payRateNumeric = col("PayRate").cast("double")

# `min` is pyspark.sql.functions.min here (the star import shadows the builtin).
# The alias keeps the result column name the un-cast aggregate would have had.
minPayRateAbove20 = (hrdataframe
                     .filter(payRateNumeric > 20)
                     .select(min(payRateNumeric).alias("min(PayRate)")))
minPay = minPayRateAbove20[0].cast("string")   # Column expression: the minimum rendered as a string
test = minPayRateAbove20.collect()[0]          # Row holding the computed minimum


#Number of records that will be updated
deltaTable.toDF().filter(payRateNumeric <= 20).count()



89

In [40]:
# Raise PayRate to 20.5 for every employee whose PayRate is 20 or less.
# PayRate is a string column, so cast it to double explicitly instead of relying on
# the implicit cast Spark applies when a string is compared with an integer literal
# (NOTE(review): that implicit cast looks like a cast to int, which would also match
# fractional rates such as 20.75 and lower them to 20.5 -- confirm).
deltaTable.update(
    condition = (col("PayRate").cast("double") <= 20),
    set = {"PayRate":"20.5"}   # values in `set` are SQL expression strings
)
# Should print an empty table: no row may remain below 20.5 after the update
deltaTable.toDF().filter("PayRate<20.5").show()

+-------------+-----+---------+------+-------+----------+--------+-----+---+---+----------+----------+--------------+
|Employee_Name|EmpID|MarriedID|DeptID|PayRate|PositionID|Position|State|DOB|Sex|DateofHire|Department|YearsOfService|
+-------------+-----+---------+------+-------+----------+--------+-----+---+---+----------+----------+--------------+
+-------------+-----+---------+------+-------+----------+--------+-----+---+---+----------+----------+--------------+

### Validate changes by filtering records on condition
Validate no employees have PayRate less than or equal to 20


In [41]:
# Open a fresh handle on the table and count the rows still below 20.5
deltaTableAfterUpdate = DeltaTable.forPath(spark, deltatablepath)
rowsBelowThreshold = deltaTableAfterUpdate.toDF().filter("PayRate<20.5")
rowsBelowThreshold.count()

0

## Audit data changes
Check the version history of the Delta table to see which operations changed it.


In [42]:
# Display the commit history recorded for the Delta table
tableHistory = deltaTable.history()
tableHistory.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+
|      9|2020-08-11 02:41:00|  null|    null|   UPDATE|[predicate -> (ca...|null|    null|     null|          8|          null|        false|[numRemovedFiles ...|
|      8|2020-08-11 02:40:19|  null|    null|   UPDATE|[predicate -> (ca...|null|    null|     null|          7|          null|        false|[numRemovedFiles ...|
|      7|2020-08-11 02:39:33|  null|    null|   UPDATE|[predicate -> (ca...|null|    null|     null|          6|          null|        false|[numRemovedFiles ...|
|      6|2020-08-11 02

In [44]:
# Display only the most recent history entry (limit of 1)
latestCommit = deltaTable.history(1)
latestCommit.show()

+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+
|version|          timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+
|      9|2020-08-11 02:41:00|  null|    null|   UPDATE|[predicate -> (ca...|null|    null|     null|          8|          null|        false|[numRemovedFiles ...|
+-------+-------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+

### Undo changes for DeltaTable by restoring previous version
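Let's restore PayRate to the values it had in version 0 of the table by reading that version with time travel and writing it back over the current data.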

In [46]:
# Load the table as of version 0 using time travel
hrdataversion0 = (spark
                    .read
                    .format("delta")
                    .option("versionAsOf", 0)
                    .load(deltatablepath))
print("HR Dataframe as of version 0: ")
hrdataversion0.show(10)

# Count with the threshold the messages actually report (PayRate <= 20). The previous
# filter "PayRate<20.5" also counted rates strictly between 20 and 20.5. PayRate is a
# string column, so it is cast to double explicitly for the comparison.
lowPayFilter = "CAST(PayRate AS DOUBLE) <= 20"

print("In version 0 count of employees who have PayRate less than or equal to 20 are:%d" % hrdataversion0.filter(lowPayFilter).count())

# Revert changes by overwriting the current table contents with the version 0 data.
# NOTE(review): version 0 has no YearsOfService column and no overwriteSchema option is
# set, so the table schema presumably keeps that column (null-filled) -- confirm whether
# a full schema restore is wanted here.
hrdataversion0.write.format("delta").mode("overwrite").partitionBy("Department").save(deltatablepath)

# Read the latest table state and check the count of low-pay employees again
finalversion = spark.read.format("delta").load(deltatablepath)
print("In latest version count of employees who have PayRate less than or equal to 20 are: %d" % finalversion.filter(lowPayFilter).count())
finalversion.show(10)

HR Dataframe as of version 0: 
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|     Employee_Name|     EmpID|MarriedID|DeptID|PayRate|PositionID|            Position|State|       DOB|Sex|DateofHire|       Department|
+------------------+----------+---------+------+-------+----------+--------------------+-----+----------+---+----------+-----------------+
|   Bramante, Elisa|1006020066|        0|     5|     60|        10|Director of Opera...|   MA| 3/19/1983|  F|  1/5/2009|Production       |
| Albert, Michael  |1501072311|        0|     5|   54.5|        17|  Production Manager|   MA|10/10/1968| M |  8/1/2011|Production       |
|    Bozzi, Charles|1303054580|        0|     5|   50.5|        18|  Production Manager|   MA| 3/10/1970| M | 9/30/2013|Production       |
|Butler, Webster  L|1110029990|        0|     5|     55|        18|  Production Manager|   MA|  8/9/1983| M | 1/28/2016|Production     