In [0]:
# Copy the employee CSV from the workspace into DBFS.
# The destination must be under /FileStore because the cells that load this
# file read it from dbfs:/FileStore/employee1_data.csv.
dbutils.fs.cp("file:/Workspace/Shared/employee1_data.csv", "dbfs:/FileStore/employee1_data.csv")

True

In [0]:
# Read the employee CSV from DBFS, treating the first row as the header
df_employee = (
    spark.read
    .format("csv")
    .option("header","true")
    .load("dbfs:/FileStore/employee1_data.csv")
)

# Write the DataFrame out in Delta format, replacing whatever is already there
(
    df_employee.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/employee1_data")
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the product JSON; every column is declared nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ProductID", IntegerType()),
        ("ProductName", StringType()),
        ("Category", StringType()),
        ("Price", DoubleType()),
        ("Stock", IntegerType()),
    )
])

# Copy the product JSON from the workspace into DBFS
dbutils.fs.cp(
    "file:/Workspace/Shared/product1_data.json",
    "dbfs:/FileStore/product1_data.json",
)

# Read the JSON with the schema above and print the rows
df_product = spark.read.schema(schema).json("dbfs:/FileStore/product1_data.json")
df_product.show()

# Expose the DataFrame to SQL under the name product_view
df_product.createOrReplaceTempView("product_view")

# Materialise the view as a Delta table (skipped if the table already exists)
spark.sql(
    "CREATE TABLE IF NOT EXISTS product1_data USING DELTA AS SELECT * FROM product_view"
)

+---------+-----------+-----------+------+-----+
|ProductID|ProductName|   Category| Price|Stock|
+---------+-----------+-----------+------+-----+
|      101|     Laptop|Electronics|1200.0|   35|
|      102| Smartphone|Electronics| 800.0|   80|
|      103| Desk Chair|  Furniture| 150.0|   60|
|      104|    Monitor|Electronics| 300.0|   45|
|      105|       Desk|  Furniture| 350.0|   25|
+---------+-----------+-----------+------+-----+



In [None]:
# Copy the employee updates CSV from the workspace into DBFS
dbutils.fs.cp(
    "file:/Workspace/Shared/employee_updated.csv",
    "dbfs:/FileStore/employee_updated.csv",
)

# --- Employee data: CSV -> Delta ---
# Read the CSV (first row is the header) into a DataFrame
df_employee1 = (
    spark.read
    .format("csv")
    .option("header","true")
    .load("dbfs:/FileStore/employee1_data.csv")
)

# Write the DataFrame in Delta format, overwriting any previous contents
(
    df_employee1.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/employee1_data")
)

# --- Employee updates: CSV -> Delta ---
df_employee_updates = (
    spark.read
    .format("csv")
    .option("header","true")
    .load("/FileStore/employee_updated.csv")
)
(
    df_employee_updates.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/employee_updates")
)



In [None]:
# Load the Delta tables produced by the CSV-to-Delta conversion.
# The employee data is saved under /delta/employee1_data, so it is read from
# that path; /delta/employee_data has not been written at this point and
# loading it would fail on a fresh run.
df_emp = spark.read.format("delta").load("/delta/employee1_data")
df_emp_updates = spark.read.format("delta").load("/delta/employee_updates")


# Register temporary views so the tables can be referenced from SQL
df_emp.createOrReplaceTempView("delta_employee")
df_emp_updates.createOrReplaceTempView("employee_updates")

In [None]:
# Upsert employee_updates into delta_employee, matching rows on EmployeeID:
# matched rows get their Salary and Department updated, unmatched rows are inserted.
merge_statement = """
    MERGE INTO delta_employee AS target
    USING employee_updates AS source
    ON target.EmployeeID = source.EmployeeID
    WHEN MATCHED THEN UPDATE SET target.Salary = source.Salary, target.Department = source.Department
    WHEN NOT MATCHED THEN INSERT (EmployeeID, Name, Department, JoiningDate, Salary)
    VALUES (source.EmployeeID, source.Name, source.Department, source.JoiningDate, source.Salary)
"""
spark.sql(merge_statement)

# Print the merged view to verify the updates and inserts
spark.sql("SELECT * FROM delta_employee").show()

In [None]:
# Save the employee DataFrame as a Delta table at /delta/employee_data,
# overwriting anything already stored there
(
    df_emp.write
    .format("delta")
    .mode("overwrite")
    .save("/delta/employee_data")
)

In [None]:
# Register the Delta files at /delta/employee_data as a named table
# (no-op if the table already exists)
spark.sql(
    "Create table if not exists delta_employee_table using DELTA Location '/delta/employee_data'"
)

In [None]:
# Run the OPTIMIZE command on the registered Delta table
spark.sql(
    "OPTIMIZE delta_employee_table"
)

In [None]:
# Show the history of the Delta table.
# spark.sql() only returns a DataFrame; call .show() so the history rows are
# actually printed, and disable truncation so long column values stay readable.
spark.sql("DESCRIBE HISTORY delta_employee_table").show(truncate=False)

In [None]:
# Run OPTIMIZE again, this time with ZORDER BY on the Department column
spark.sql(
    "OPTIMIZE delta_employee_table ZORDER BY Department"
)

In [None]:
# VACUUM the table with a 168-hour (7-day) retention window.
# Note: this cleans up old data files that the table no longer references;
# it does not delete rows from the current table.
spark.sql(
    "VACUUM delta_employee_table RETAIN 168 HOURS"
)

In [None]:
%sql
-- In a Databricks notebook a cell can be switched to SQL with the %sql magic,
-- which must be the first line of the cell.
-- Column definitions go inside parentheses, not braces.
CREATE TABLE IF NOT EXISTS managed_table (
    id INT,
    name STRING
);

In [None]:
%sql
-- External (unmanaged) table: the data lives at the given LOCATION.
-- Column definitions go inside parentheses, not braces.
CREATE EXTERNAL TABLE IF NOT EXISTS unmanaged_table (
    id INT,
    name STRING
)
LOCATION '/user/data/external_data/';