In [0]:
dbutils.fs.cp("file:/Workspace/Shared/csv_orders.csv","dbfs:/FileStore/csv_orders.csv")

True

In [0]:
from pyspark.sql.functions import col
#  Assignment: Creating a Complete ETL Pipeline using Delta Live Tables

# Task 1
schema = "OrderID STRING, OrderDate STRING, CustomerID STRING, Product STRING, Quantity INT, Price INT"

df_orders = spark.read.format("csv").option("header","true").schema(schema).load("dbfs:/FileStore/csv_orders.csv")

# Transformations
df_orders = df_orders.withColumn("TotalAmount", col("Quantity") * col("Price"))

df_orders_filterd = df_orders.filter(col("Quantity") > 1)

# Write into delta
df_orders_filterd.write.format("delta").mode("overwrite").save("dbfs:/FileStore/delta/orders")

In [0]:
# Creating dlt
import dlt

@dlt.table
def orders_table1():
    return spark.read.format("delta").load("dbfs:/FileStore/delta/orders")

print("delta live table created")

delta live table created


Name,Type
OrderID,string
OrderDate,string
CustomerID,string
Product,string
Quantity,int
Price,int
TotalAmount,int


In [0]:
from pyspark.sql.functions import col, when
from pyspark.sql import Row

# Task 3 Operations on delta table

# Read
df_dltOrders = spark.read.format("delta").load("dbfs:/FileStore/delta/orders")
df_dltOrders.show()

# Update
df_dltOrders = df_dltOrders.withColumn("Price",
    when(col("Product") == "Laptop", col("Price") * 1.10).otherwise(col("Price")))

print("Updation")
df_dltOrders.show()

# Delete
print("Deleted")
df_deleted =  df_dltOrders.filter(col("Quantity") >= 2)
df_deleted.show()

# insert
spark.sql("""
INSERT INTO delta.`dbfs:/FileStore/delta/orders`
VALUES ('106', '2024-01-06', 'C006', 'Keyboard', 3, 50,150)
""")

updated_df = spark.read.format("delta").load("dbfs:/FileStore/delta/orders")
updated_df.show()



+-------+----------+----------+-------+--------+-----+-----------+
|OrderID| OrderDate|CustomerID|Product|Quantity|Price|TotalAmount|
+-------+----------+----------+-------+--------+-----+-----------+
|    101|2024-01-01|      C001| Laptop|       2| 1000|       2000|
|    103|2024-01-03|      C003| Tablet|       3|  300|        900|
|    105|2024-01-05|      C005|  Mouse|       5|   20|        100|
+-------+----------+----------+-------+--------+-----+-----------+

Updation
+-------+----------+----------+-------+--------+------+-----------+
|OrderID| OrderDate|CustomerID|Product|Quantity| Price|TotalAmount|
+-------+----------+----------+-------+--------+------+-----------+
|    101|2024-01-01|      C001| Laptop|       2|1100.0|       2000|
|    103|2024-01-03|      C003| Tablet|       3| 300.0|        900|
|    105|2024-01-05|      C005|  Mouse|       5|  20.0|        100|
+-------+----------+----------+-------+--------+------+-----------+

Deleted
+-------+----------+----------+-----

In [0]:
dbutils.fs.cp("file:/Workspace/Shared/csv_new_orders.csv", "dbfs:/FileStore/csv_new_orders.csv")

True

In [0]:
# Task 4 Merge
df_new_orders = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/csv_new_orders.csv")
df_new_orders = df_new_orders.withColumn("TotalAmount", col("Quantity") * col("Price"))

df_new_orders.createOrReplaceTempView("new_orders")

df_old_orders = spark.read.format("delta").load("dbfs:/FileStore/delta/orders")
df_old_orders.createOrReplaceTempView("orders")

spark.sql("""
          MERGE INTO orders as target 
          USING new_orders as source
          ON target.OrderID = source.OrderID
          WHEN MATCHED THEN
          UPDATE SET target.OrderDate = source.OrderDate,
                     target.CustomerID = source.CustomerID,
                     target.Product = source.Product,
                     target.Quantity = source.Quantity,
                     target.Price = source.Price,
                     target.TotalAmount = source.TotalAmount
          WHEN NOT MATCHED THEN
          INSERT (OrderID, OrderDate, CustomerID, Product, Quantity, Price, TotalAmount) 
          VALUES (source.OrderID, source.OrderDate, source.CustomerID, source.Product, source.Quantity, source.Price, source.TotalAmount)
""")

spark.sql("SELECT * FROM orders").show()

+-------+----------+----------+--------+--------+-----+-----------+
|OrderID| OrderDate|CustomerID| Product|Quantity|Price|TotalAmount|
+-------+----------+----------+--------+--------+-----+-----------+
|    103|2024-01-03|      C003|  Tablet|       3|  300|        900|
|    105|2024-01-05|      C005|   Mouse|       5|   20|        100|
|    101|2024-01-10|      C001|  Laptop|       2| 1200|       2400|
|    106|2024-01-12|      C006|Keyboard|       3|   50|        150|
+-------+----------+----------+--------+--------+-----+-----------+



In [0]:
# Task 5
spark.sql("CREATE TABLE IF NOT EXISTS orders_delta_table USING DELTA AS SELECT * FROM orders")

# History
spark.sql("DESCRIBE HISTORY orders_delta_table").show()

# Detail
spark.sql("DESCRIBE DETAIL orders_delta_table").show()

+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|           operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      0|2024-09-17 07:42:03|6822499424790006|azuser2121_mml.lo...|CREATE TABLE AS S...|{partitionBy -> [...|NULL|{4166212652284049}|0911-073432-r1idfcx3|       NULL|WriteSerializable|         true|{numFiles -> 1, n...|        NULL|Databr

In [0]:
# Task 6 Time Travel

spark.sql("SELECT * FROM orders_delta_table VERSION AS OF 0").show()

spark.sql("SELECT * FROM orders_delta_table TIMESTAMP AS OF '2024-09-17T07:42:03.000Z'").show()

+-------+----------+----------+--------+--------+-----+-----------+
|OrderID| OrderDate|CustomerID| Product|Quantity|Price|TotalAmount|
+-------+----------+----------+--------+--------+-----+-----------+
|    103|2024-01-03|      C003|  Tablet|       3|  300|        900|
|    105|2024-01-05|      C005|   Mouse|       5|   20|        100|
|    101|2024-01-10|      C001|  Laptop|       2| 1200|       2400|
|    106|2024-01-12|      C006|Keyboard|       3|   50|        150|
+-------+----------+----------+--------+--------+-----+-----------+

+-------+----------+----------+--------+--------+-----+-----------+
|OrderID| OrderDate|CustomerID| Product|Quantity|Price|TotalAmount|
+-------+----------+----------+--------+--------+-----+-----------+
|    103|2024-01-03|      C003|  Tablet|       3|  300|        900|
|    105|2024-01-05|      C005|   Mouse|       5|   20|        100|
|    101|2024-01-10|      C001|  Laptop|       2| 1200|       2400|
|    106|2024-01-12|      C006|Keyboard|       

In [0]:
# Task 7 Optimize and vacuum

spark.sql("OPTIMIZE orders_delta_table")

spark.sql("VACUUM orders_delta_table RETAIN 168 HOURS")

DataFrame[path: string]

In [0]:
# Task 8 convert parquet to delta

csv_par_df = spark.read.csv("file:/Workspace/Shared/csv_orders.csv",header=True,inferSchema=True)
csv_par_df.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/parquet_orders1")

# Load the Parquet files into a DataFrame
parquet_df = spark.read.format("parquet").load("dbfs:/FileStore/parquet_orders1/")

parquet_df.write.format("delta").mode("overwrite").save("dbfs:/FileStore/delta/orders_delta_parquet1")

# Load the Delta table
delta_df = spark.read.format("delta").load("dbfs:/FileStore/delta/orders_delta_parquet1")

delta_df.show()



+-------+----------+----------+-------+--------+-----+
|OrderID| OrderDate|CustomerID|Product|Quantity|Price|
+-------+----------+----------+-------+--------+-----+
|    101|2024-01-01|      C001| Laptop|       2| 1000|
|    102|2024-01-02|      C002|  Phone|       1|  500|
|    103|2024-01-03|      C003| Tablet|       3|  300|
|    104|2024-01-04|      C004|Monitor|       1|  150|
|    105|2024-01-05|      C005|  Mouse|       5|   20|
+-------+----------+----------+-------+--------+-----+

