**Merge (Upsert) using PySpark and Spark SQL -- SCD Type I**

Creating the DataFrame schema with the `pyspark.sql.types` classes

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Column layout for the employee rows; every column is declared nullable.
# NOTE(review): contact_no is a 32-bit integer, so it cannot hold values above
# 2,147,483,647 or keep leading zeros -- confirm that is acceptable for contact numbers.
schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("city", StringType(), True)
    .add("country", StringType(), True)
    .add("contact_no", IntegerType(), True)
)

In [0]:
# First source batch: a single employee row matching the schema defined above.
data = [
    (1000, "Prakash", "Richmond", "USA", 8322662),
]
df = spark.createDataFrame(data, schema)
display(df)

emp_id,name,city,country,contact_no
1000,Prakash,Richmond,USA,8322662


In [0]:
%sql
-- Target dimension table for the merge examples, stored in Delta format at an explicit path.
-- CREATE OR REPLACE: re-running this cell replaces any existing dim_employee definition.
CREATE OR REPLACE TABLE dim_employee (
  emp_id     INT,
  name       STRING,
  city       STRING,
  country    STRING,
  contact_no INT
)
USING DELTA
LOCATION "/FileStore/tables/delta_merge"

In [0]:
%sql
-- Check the target table right after it has been created.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee

emp_id,name,city,country,contact_no


***Method 1 - Merge operation in Spark SQL for SCD Type I***

In [0]:
# Expose the source DataFrame to the %sql cells under the name "source_view".
df.createOrReplaceTempView(name="source_view")

In [0]:
%sql
-- Preview the rows exposed by the source temp view.
SELECT emp_id, name, city, country, contact_no
FROM source_view

emp_id,name,city,country,contact_no
1000,Prakash,Richmond,USA,8322662


In [0]:
%sql
-- Target table contents just before the merge below.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee

emp_id,name,city,country,contact_no


In [0]:
%sql
-- SCD Type I upsert: rows are matched on emp_id; matches are overwritten in place
-- (no history is kept) and unmatched source rows are inserted.
MERGE INTO dim_employee AS target
USING source_view AS source
  ON target.emp_id = source.emp_id
-- Employee already present in the target: overwrite every attribute with the source value.
WHEN MATCHED THEN UPDATE SET
  target.name       = source.name,
  target.city       = source.city,
  target.country    = source.country,
  target.contact_no = source.contact_no
-- Employee not present in the target: insert the source row.
WHEN NOT MATCHED THEN INSERT (emp_id, name, city, country, contact_no)
  VALUES (source.emp_id, source.name, source.city, source.country, source.contact_no)

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
1,0,0,1


In [0]:
%sql
-- Target table contents after the merge above.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee

emp_id,name,city,country,contact_no
1000,Prakash,Richmond,USA,8322662


***Insert/update more records - SCD Type I***

In [0]:
# Second source batch: emp_id 1000 arrives with a new city and contact number,
# and emp_id 2000 is a new employee.
data = [
    (1000, "Prakash", "Florida", "USA", 8052664),
    (2000, "Roshan", "Ontario", "CANADA", 9051444),
]
df = spark.createDataFrame(data, schema)
display(df)

emp_id,name,city,country,contact_no
1000,Prakash,Florida,USA,8052664
2000,Roshan,Ontario,CANADA,9051444


In [0]:
# Re-point the "source_view" temp view at the current source DataFrame.
df.createOrReplaceTempView(name="source_view")

In [0]:
%sql
-- Preview the rows currently exposed by the source temp view.
SELECT emp_id, name, city, country, contact_no
FROM source_view

emp_id,name,city,country,contact_no
1000,Prakash,Florida,USA,8052664
2000,Roshan,Ontario,CANADA,9051444


In [0]:
%sql
-- Target table contents just before the merge below.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee

emp_id,name,city,country,contact_no
1000,Prakash,Richmond,USA,8322662


In [0]:
%sql
-- SCD Type I upsert of the current source_view contents into dim_employee:
-- rows are matched on emp_id; matches are overwritten in place and the rest are inserted.
MERGE INTO dim_employee AS target
USING source_view AS source
  ON target.emp_id = source.emp_id
-- Existing employee: replace every attribute with the incoming value (no history is kept).
WHEN MATCHED THEN UPDATE SET
  target.name       = source.name,
  target.city       = source.city,
  target.country    = source.country,
  target.contact_no = source.contact_no
-- Unknown employee: add the incoming row.
WHEN NOT MATCHED THEN INSERT (emp_id, name, city, country, contact_no)
  VALUES (source.emp_id, source.name, source.city, source.country, source.contact_no)

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
2,1,0,1


In [0]:
%sql
-- Target table contents after the merge above.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee

emp_id,name,city,country,contact_no
2000,Roshan,Ontario,CANADA,9051444
1000,Prakash,Florida,USA,8052664


***Method 2 - PySpark***

The source data is kept as a DataFrame and passed to the merge directly, rather than being read from a table or view.

In [0]:
# Source batch for the PySpark merge: emp_id 3000 and emp_id 2000 (city "Calgary").
data = [
    (3000, "Ramesh", "Kathmandu", "NEPAL", 335341),
    (2000, "Roshan", "Calgary", "CANADA", 9051444),
]
df = spark.createDataFrame(data, schema)
display(df)

emp_id,name,city,country,contact_no
3000,Ramesh,Kathmandu,NEPAL,335341
2000,Roshan,Calgary,CANADA,9051444


In [0]:
# Open a handle to the Delta table stored at this path (the same LOCATION that
# dim_employee is created with); forPath loads a table by path, it does not create one.
# Despite its name, delta_df is a DeltaTable object, not a DataFrame.
# Only DeltaTable is needed, so it is imported explicitly instead of via a wildcard.
from delta.tables import DeltaTable
delta_df = DeltaTable.forPath(spark, "/FileStore/tables/delta_merge")

In [0]:
# SCD Type I upsert through the DeltaTable API: rows are matched on emp_id,
# matched rows have their attributes overwritten, unmatched rows are inserted.
scd1_attribute_cols = ["name", "city", "country", "contact_no"]

# Target column -> source expression for the update branch (the key is left untouched).
scd1_update_map = {c: f"source.{c}" for c in scd1_attribute_cols}

# The insert branch writes the key as well as every attribute.
scd1_insert_map = {"emp_id": "source.emp_id", **scd1_update_map}

(
    delta_df.alias("target")
    .merge(df.alias("source"), "target.emp_id = source.emp_id")
    .whenMatchedUpdate(set=scd1_update_map)
    .whenNotMatchedInsert(values=scd1_insert_map)
    .execute()
)

In [0]:
%sql
-- Final state of the target table, ordered by employee id.
SELECT emp_id, name, city, country, contact_no
FROM dim_employee
ORDER BY emp_id

emp_id,name,city,country,contact_no
1000,Prakash,Florida,USA,8052664
2000,Roshan,Calgary,CANADA,9051444
3000,Ramesh,Kathmandu,NEPAL,335341
