### Creating the database and tables for operations

In [0]:
%sql
-- Create the demo database under /FileStore/db/.
-- IF NOT EXISTS lets the notebook be re-run without a "database already exists" error.
-- Note: when my_db already exists this statement is a no-op, so the LOCATION is not
-- changed; the later cells that read /FileStore/db/<table> by path assume this location.
CREATE DATABASE IF NOT EXISTS my_db
LOCATION '/FileStore/db/';


In [0]:
%sql
-- Create the customers Delta table used by the delete/update examples.
-- OR REPLACE makes the cell idempotent: re-running the notebook starts from an
-- empty table instead of failing because my_db.customers already exists.
CREATE OR REPLACE TABLE my_db.customers (
    customer_id INT,
    name STRING,
    email STRING,
    created_date TIMESTAMP
) USING DELTA;

In [0]:
%sql
-- Load the four sample customers that the following cells delete and update.
insert into my_db.customers
  (customer_id, name, email, created_date)
values
  (1, 'Alice', 'alice@example.com', '2024-01-01 10:00:00'),
  (2, 'Bob', 'bob@example.com', '2024-01-05 12:30:00'),
  (3, 'Charlie', 'charlie@example.com', '2024-01-10 15:45:00'),
  (4, 'David', 'david@example.com', '2024-01-15 08:20:00');


num_affected_rows,num_inserted_rows
4,4


In [0]:
%sql
select * from my_db.customers

-- output
-- customer_id	name	email	created_date
-- 1	Alice	alice@example.com	2024-01-01T10:00:00.000+00:00
-- 2	Bob	bob@example.com	2024-01-05T12:30:00.000+00:00
-- 3	Charlie	charlie@example.com	2024-01-10T15:45:00.000+00:00
-- 4	David	david@example.com	2024-01-15T08:20:00.000+00:00

customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,Bob,bob@example.com,2024-01-05T12:30:00.000+0000
3,Charlie,charlie@example.com,2024-01-10T15:45:00.000+0000
4,David,david@example.com,2024-01-15T08:20:00.000+0000


### Delete operation on Delta table using SQL Approach

In [0]:
%sql
-- Remove the row for customer 4 with a plain SQL DELETE.
DELETE FROM my_db.customers
WHERE customer_id = 4

num_affected_rows
1


In [0]:
%sql
select * from my_db.customers

-- output
-- customer_id	name	email	created_date
-- 1	Alice	alice@example.com	2024-01-01T10:00:00.000+00:00
-- 2	Bob	bob@example.com	2024-01-05T12:30:00.000+00:00
-- 3	Charlie	charlie@example.com	2024-01-10T15:45:00.000+00:00

customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,Bob,bob@example.com,2024-01-05T12:30:00.000+0000
3,Charlie,charlie@example.com,2024-01-10T15:45:00.000+0000


### Delete the record using DeltaTable module

In [0]:
from delta.tables import DeltaTable

# Look the table up by its metastore name, then delete the row for customer 3.
# `delta_table` is reused by the DeltaTable update cell further down.
delta_table = DeltaTable.forName(spark, "my_db.customers")
delta_table.delete(condition="customer_id = 3")

In [0]:
%sql
select * from my_db.customers

customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,Bob,bob@example.com,2024-01-05T12:30:00.000+0000


### Updating the records using SQL Approach

In [0]:
%sql
-- Change the name of customer 2 with a plain SQL UPDATE.
UPDATE my_db.customers
SET name = 'bobby'
WHERE customer_id = 2

num_affected_rows
1


In [0]:
%sql
select * from my_db.customers

-- customer_id	name	email	created_date
-- 1	Alice	alice@example.com	2024-01-01T10:00:00.000+00:00
-- 2	bobby	bob@example.com	2024-01-05T12:30:00.000+00:00


customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,bobby,bob@example.com,2024-01-05T12:30:00.000+0000


### Updating the records using Delta Table module approach

In [0]:
# Change customer 2's email through the DeltaTable API (uses `delta_table`
# created in the DeltaTable delete cell). The new value is written as a quoted
# SQL string literal inside the Python string.
new_values = {"email": "'bobbynewemail@example.com'"}
delta_table.update(condition="customer_id = 2", set=new_values)

In [0]:
%sql
select * from DELTA.`/FileStore/db/customers`
--output
-- customer_id	name	email	created_date
-- 1	Alice	alice@example.com	2024-01-01T10:00:00.000+00:00
-- 2	bobby	bobbynewemail@example.com	2024-01-05T12:30:00.000+00:00

customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,bobby,bobbynewemail@example.com,2024-01-05T12:30:00.000+0000


### MERGE COMMAND USING SQL APPROACH

In [0]:
%sql
-- Replace the current table contents with the original four sample rows.
insert overwrite table my_db.customers
  (customer_id, name, email, created_date)
values
  (1, 'Alice', 'alice@example.com', '2024-01-01 10:00:00'),
  (2, 'Bob', 'bob@example.com', '2024-01-05 12:30:00'),
  (3, 'Charlie', 'charlie@example.com', '2024-01-10 15:45:00'),
  (4, 'David', 'david@example.com', '2024-01-15 08:20:00');

num_affected_rows,num_inserted_rows
4,4


In [0]:
%sql
select * from my_db.customers

customer_id,name,email,created_date
1,Alice,alice@example.com,2024-01-01T10:00:00.000+0000
2,Bob,bob@example.com,2024-01-05T12:30:00.000+0000
3,Charlie,charlie@example.com,2024-01-10T15:45:00.000+0000
4,David,david@example.com,2024-01-15T08:20:00.000+0000


In [0]:
def cleanup_tables():
    """Reset the two demo tables used by the MERGE examples.

    Drops ``my_db.customers`` and ``my_db.customer_updates`` if they exist,
    recreates both as Delta tables (each with a ``country`` column) and
    reloads their sample rows, so every MERGE example starts from the same data.

    Uses the notebook-provided ``spark`` session. Returns nothing.
    """
    # IF EXISTS matters on a fresh workspace: no earlier cell in this notebook
    # creates customer_updates, so an unconditional DROP fails on the first call.
    spark.sql("DROP TABLE IF EXISTS my_db.customers")
    spark.sql("""
        CREATE TABLE my_db.customers (
            customer_id INT,
            name STRING,
            email STRING,
            country STRING,
            created_date TIMESTAMP
        ) USING DELTA
    """)
    spark.sql("""
        INSERT INTO my_db.customers (customer_id, name, email, country, created_date) VALUES
            (1, 'Alice', 'alice@example.com', 'USA', '2024-01-01 10:00:00'),
            (2, 'Bob', 'bob@example.com', 'UK', '2024-01-05 12:30:00'),
            (3, 'Charlie', 'charlie@example.com', 'USA', '2024-01-10 15:45:00'),
            (4, 'David', 'david@example.com', 'India', '2024-01-15 08:20:00')
    """)

    spark.sql("DROP TABLE IF EXISTS my_db.customer_updates")
    spark.sql("""
        CREATE TABLE my_db.customer_updates (
            customer_id INT,
            name STRING,
            email STRING,
            country STRING,
            updated_date TIMESTAMP
        ) USING DELTA
    """)
    # Rows 2-4 overlap with customers (changed email/name/country); row 5 is new.
    spark.sql("""
        INSERT INTO my_db.customer_updates (customer_id, name, email, country, updated_date) VALUES
            (2, 'Bob', 'bob.new@example.com', 'UK', '2024-01-20 11:00:00'),
            (3, 'Charlie', 'charlie@example.com', 'Canada', '2024-01-22 09:30:00'),
            (4, 'Dave', 'dave.new@example.com', 'India', '2024-01-25 10:00:00'),
            (5, 'Eve', 'eve@example.com', 'USA', '2024-01-28 14:15:00')
    """)

In [0]:
cleanup_tables()

In [0]:
# Print the reset customers table without truncating wide column values.
customers_df = spark.sql("select * from my_db.customers")
customers_df.show(truncate=False)

+-----------+-------+-------------------+-------+-------------------+
|customer_id|name   |email              |country|created_date       |
+-----------+-------+-------------------+-------+-------------------+
|1          |Alice  |alice@example.com  |USA    |2024-01-01 10:00:00|
|2          |Bob    |bob@example.com    |UK     |2024-01-05 12:30:00|
|3          |Charlie|charlie@example.com|USA    |2024-01-10 15:45:00|
|4          |David  |david@example.com  |India  |2024-01-15 08:20:00|
+-----------+-------+-------------------+-------+-------------------+



In [0]:
%sql
select * from my_db.customer_updates

customer_id,name,email,country,updated_date
2,Bob,bob.new@example.com,UK,2024-01-20T11:00:00.000+0000
3,Charlie,charlie@example.com,Canada,2024-01-22T09:30:00.000+0000
4,Dave,dave.new@example.com,India,2024-01-25T10:00:00.000+0000
5,Eve,eve@example.com,USA,2024-01-28T14:15:00.000+0000


#### Insert the new records and update the modified records in target

In [0]:
%sql
use my_db

In [0]:
%sql
-- Upsert: matched customers get the source email and name; unmatched source rows
-- are inserted, with updated_date stored as created_date.
-- Unqualified table names resolve through the preceding `use my_db` cell.
merge into customers as tgt
using customer_updates as src
  on tgt.customer_id = src.customer_id
when matched then
  update set
    tgt.email = src.email,
    tgt.name = src.name
when not matched then
  insert (tgt.customer_id, tgt.name, tgt.email, tgt.country, tgt.created_date)
  values (src.customer_id, src.name, src.email, src.country, src.updated_date)


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
4,3,0,1


In [0]:
%sql
select * from customers

customer_id,name,email,country,created_date
2,Bob,bob.new@example.com,UK,2024-01-05T12:30:00.000+0000
3,Charlie,charlie@example.com,USA,2024-01-10T15:45:00.000+0000
4,Dave,dave.new@example.com,India,2024-01-15T08:20:00.000+0000
5,Eve,eve@example.com,USA,2024-01-28T14:15:00.000+0000
1,Alice,alice@example.com,USA,2024-01-01T10:00:00.000+0000


In [0]:
cleanup_tables()

#### DeltaTable Approach

In [0]:
from delta.tables import DeltaTable

# Both tables are addressed by storage path here rather than by metastore name.
customers_path = "/FileStore/db/customers"
customer_updates_path = "/FileStore/db/customer_updates"

# Target as a DeltaTable (needed for merge); source as a plain DataFrame.
customers_table = DeltaTable.forPath(spark, customers_path)
customer_updates_df = spark.read.format("delta").load(customer_updates_path)

In [0]:
# Upsert customer_updates into customers: update email/name on a customer_id
# match, insert the full row otherwise (updated_date is stored as created_date).
merge_condition = "tgt.customer_id = src.customer_id"
matched_updates = {
    "email": "src.email",
    "name": "src.name",
}
new_row_values = {
    "customer_id": "src.customer_id",
    "name": "src.name",
    "email": "src.email",
    "country": "src.country",
    "created_date": "src.updated_date",
}

(
    customers_table.alias("tgt")
    .merge(customer_updates_df.alias("src"), merge_condition)
    .whenMatchedUpdate(set=matched_updates)
    .whenNotMatchedInsert(values=new_row_values)
    .execute()
)

In [0]:
%sql
select * from customers

customer_id,name,email,country,created_date
2,Bob,bob.new@example.com,UK,2024-01-05T12:30:00.000+0000
3,Charlie,charlie@example.com,USA,2024-01-10T15:45:00.000+0000
4,Dave,dave.new@example.com,India,2024-01-15T08:20:00.000+0000
5,Eve,eve@example.com,USA,2024-01-28T14:15:00.000+0000
1,Alice,alice@example.com,USA,2024-01-01T10:00:00.000+0000


#### MATCHED WITH CONDITION

In [0]:
%sql
-- Conditional update: only matched rows whose source country is 'USA'
-- receive the source country and email; all other rows are left untouched.
merge into customers as tgt
using customer_updates as src
  on tgt.customer_id = src.customer_id
when matched and src.country = 'USA' then
  update set
    tgt.country = src.country,
    tgt.email = src.email


In [0]:
# DeltaTable equivalent of the conditional MERGE: update country and email only
# for matched rows whose source country is 'USA'.
usa_only_updates = {
    "country": "src.country",
    "email": "src.email",
}

(
    customers_table.alias("tgt")
    .merge(customer_updates_df.alias("src"), "tgt.customer_id = src.customer_id")
    .whenMatchedUpdate(condition="src.country = 'USA'", set=usa_only_updates)
    .execute()
)


#### UPDATE ALL

In [0]:
%sql
-- Overwrite every column of each matched customer with the source row.
-- The %sql magic is required: the notebook's default language is Python.
-- UPDATE SET * pairs source and target columns by name, so the source is first
-- projected onto the target schema (customer_updates has updated_date where
-- customers has created_date).
MERGE INTO customers AS tgt
USING (
  SELECT customer_id, name, email, country, updated_date AS created_date
  FROM customer_updates
) AS src
ON tgt.customer_id = src.customer_id
WHEN MATCHED THEN
  UPDATE SET *; -- Update all columns


In [0]:
# DeltaTable equivalent of UPDATE SET *: copy every column of the source row onto
# the matched target row. whenMatchedUpdateAll pairs columns by name, so the
# source's updated_date is renamed to the target's created_date first.
(
    customers_table.alias("tgt")
    .merge(
        customer_updates_df.withColumnRenamed("updated_date", "created_date").alias("src"),
        "tgt.customer_id = src.customer_id",
    )
    .whenMatchedUpdateAll()
    .execute()
)


#### INSERT ALL

In [0]:
%sql
-- Insert every source row that has no matching customer_id in the target.
-- The %sql magic is required: the notebook's default language is Python.
-- INSERT * pairs source and target columns by name, so the source is first
-- projected onto the target schema (customer_updates has updated_date where
-- customers has created_date).
MERGE INTO customers AS tgt
USING (
  SELECT customer_id, name, email, country, updated_date AS created_date
  FROM customer_updates
) AS src
ON tgt.customer_id = src.customer_id
WHEN NOT MATCHED THEN
  INSERT *; -- Insert all columns


In [0]:
# DeltaTable equivalent of INSERT *: add every source row that has no matching
# customer_id in the target. whenNotMatchedInsertAll pairs columns by name, so
# the source's updated_date is renamed to the target's created_date first.
(
    customers_table.alias("tgt")
    .merge(
        customer_updates_df.withColumnRenamed("updated_date", "created_date").alias("src"),
        "tgt.customer_id = src.customer_id",
    )
    .whenNotMatchedInsertAll()
    .execute()
)
