In [0]:
dbutils.fs.cp("file:/Workspace/Shared/csv_employees.csv", "dbfs:/FileStore/csv_employees.csv")

dbutils.fs.cp("file:/Workspace/Shared/json_products.json", "dbfs:/FileStore/json_products.json")


True

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Load CSV and JSON datasets
employee_df = spark.read.csv("dbfs:/FileStore/csv_employees.csv", header=True, inferSchema=True)

schema = StructType([
    StructField("ProductID", StringType(), True),
    StructField("ProductName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("Price", IntegerType(), True)
])

product_df = spark.read.schema(schema).json("dbfs:/FileStore/json_products.json")

# Delta table from dataframe
employee_df.write.format("delta").mode("overwrite").save("/FileStore/delta/employees")

product_df.write.format("delta").mode("overwrite").save("/FileStore/delta/products")

# Delta table using SQL
employee_df.createOrReplaceTempView("employees_view")
product_df.createOrReplaceTempView("products_view")

spark.sql("CREATE TABLE delta_employees USING DELTA AS SELECT * FROM employees_view")

spark.sql("CREATE TABLE delta_products USING DELTA AS SELECT * FROM products_view")

# Convert csv and json directly to delta
employee_df.write.format("delta").mode("overwrite").saveAsTable("delta_employees_csv")

product_df.write.format("delta").mode("overwrite").saveAsTable("delta_products_json")

spark.sql("SELECT * FROM delta_employees").show()
spark.sql("SELECT * FROM delta_products").show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       102|       Alice|    Finance| 2023-02-15| 70000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
+----------+------------+-----------+-----------+------+

+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|     P101|     Laptop|Electronics| 1200|
|     P102|      Phone|Electronics|  800|
|     P103|     Tablet|Electronics|  600|
|     P104|    Monitor|Electronics|  300|
|     P105|      Mouse|Accessories|   25|
+---------+-----------+-----------+-----+



In [0]:
dbutils.fs.cp("file:/Workspace/Shared/csv_new_employees.csv", "dbfs:/FileStore/csv_new_employees.csv")

True

In [0]:
# Task 2
# Merge and upsert 
new_employee_df = spark.read.csv("dbfs:/FileStore/csv_new_employees.csv", header=True, inferSchema=True)

employee_df = spark.read.format("delta").load("/FileStore/delta/employees")

new_employee_df.createOrReplaceTempView("new_employees_view")

spark.sql("""
           MERGE INTO delta_employees AS target
           USING new_employees_view AS source
           ON target.EmployeeID = source.EmployeeID
           WHEN MATCHED THEN
           UPDATE SET target.Salary = source.Salary,
                      target.EmployeeName = source.EmployeeName,
                      target.Department = source.Department,
                      target.JoiningDate = source.JoiningDate
           WHEN NOT MATCHED THEN
           INSERT (EmployeeID, Salary, EmployeeName, Department, JoiningDate)
           VALUES (source.EmployeeID, source.Salary, source.EmployeeName, source.Department, source.JoiningDate) 
 """)

spark.sql("SELECT * FROM employees_view").show()


+----------+------------+-----------+-----------+------+
|EmployeeID|EmployeeName| Department|JoiningDate|Salary|
+----------+------------+-----------+-----------+------+
|       101|        John|         HR| 2023-01-10| 50000|
|       103|        Mark|Engineering| 2023-03-20| 85000|
|       104|        Emma|      Sales| 2023-04-01| 55000|
|       105|        Liam|  Marketing| 2023-05-12| 60000|
|       102|       Alice|    Finance| 2023-02-15| 75000|
|       106|      Olivia|         HR| 2023-06-10| 65000|
+----------+------------+-----------+-----------+------+



In [0]:
# Task 3
# Internals
spark.sql("DESCRIBE DETAIL delta_employees")

# Transaction History
spark.sql("DESCRIBE HISTORY delta_employees").show()

# Time Travel
spark.sql("SELECT * FROM delta_employees VERSION AS OF 0").show()


+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|           operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+--------------------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2024-09-17 04:26:08|6822499424790006|azuser2121_mml.lo...|            OPTIMIZE|{predicate -> [],...|NULL|{4166212652284004}|0911-073432-r1idfcx3|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databr

In [0]:
# Task 4 Optimizing
spark.sql("OPTIMIZE delta_employees")

# Z ordering
spark.sql("OPTIMIZE delta_employees ZORDER BY (Department)")

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,

In [0]:
# Vacuuming
spark.sql("VACUUM delta_employees RETAIN 168 HOURS")

DataFrame[path: string]