In [0]:
%sql
CREATE DATABASE IF NOT EXISTS training;

In [0]:
%sql
CREATE OR REPLACE TABLE training.EMP_SCD2
(
  EMPNO INT, 
	ENAME STRING, 
	JOB STRING, 
	MGR INT, 
	HIREDATE DATE, 
	SAL INT, 
	COMM INT, 
	DEPTNO INT,
	sk_EMPNO INT,
	EFFECTIVE_DATE DATE,
	EXPIRATION_DATE DATE,
	CURRENT_FLAG STRING,
	ETL_CHECKSUM STRING
)
USING DELTA
LOCATION '/FileStore/tables/delta-table-merge/EMP_SCD2';

In [0]:
%sql
select * from training.EMP_SCD2

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,sk_EMPNO,EFFECTIVE_DATE,EXPIRATION_DATE,CURRENT_FLAG,ETL_CHECKSUM


In [0]:
#SOURCE_PATH = "dbfs:/FileStore/tables/EMP.csv"
SOURCE_PATH = "dbfs:/FileStore/tables/EMP_SCD2SQL-1.csv"
df_source = spark.read.options(header=True, delimiter=',', inferSchema='True').csv(SOURCE_PATH)
#spark.read.options(header=True, delimiter=',',inferSchema='True')\
df_source.printSchema()
display(df_source)


root
 |-- EMPNO: integer (nullable = true)
 |-- ENAME: string (nullable = true)
 |-- JOB: string (nullable = true)
 |-- MGR: integer (nullable = true)
 |-- HIREDATE: string (nullable = true)
 |-- SAL: integer (nullable = true)
 |-- COMM: integer (nullable = true)
 |-- DEPTNO: integer (nullable = true)



EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
7839,KING,PRESIDENT,,17-Nov-81,5000,1,10
7698,BLAKE,MANAGER,7839.0,1-May-81,2850,5,30
7782,CLARK,MANAGER,7839.0,9-Jun-81,2450,4,10
7566,JONES,MANAGER,7839.0,2-Apr-81,2975,6,20
7788,SCOTT,ANALYST,7566.0,19-Apr-87,3000,7,20
7902,FORD,ANALYST,7566.0,3-Dec-81,3000,8,20
7369,SMITH,CLERK,7902.0,17-Dec-80,800,9,20
7499,ALLEN,SALESMAN,7698.0,20-Feb-81,1600,300,30
7521,WARD,SALESMAN,7698.0,22-Feb-81,1250,500,30
7654,MARTIN,SALESMAN,7698.0,28-Sep-81,1250,1400,30


In [0]:
df_source.createOrReplaceTempView('EMP_SOURCE')

In [0]:
%sql
select * from EMP_SOURCE

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO
7839,KING,PRESIDENT,,17-Nov-81,5000,1,10
7698,BLAKE,MANAGER,7839.0,1-May-81,2850,5,30
7782,CLARK,MANAGER,7839.0,9-Jun-81,2450,4,10
7566,JONES,MANAGER,7839.0,2-Apr-81,2975,6,20
7788,SCOTT,ANALYST,7566.0,19-Apr-87,3000,7,20
7902,FORD,ANALYST,7566.0,3-Dec-81,3000,8,20
7369,SMITH,CLERK,7902.0,17-Dec-80,800,9,20
7499,ALLEN,SALESMAN,7698.0,20-Feb-81,1600,300,30
7521,WARD,SALESMAN,7698.0,22-Feb-81,1250,500,30
7654,MARTIN,SALESMAN,7698.0,28-Sep-81,1250,1400,30


In [0]:
%sql
CREATE OR REPLACE TEMPORARY VIEW stg_EMP_vw
AS
SELECT DISTINCT
EMPNO,
ENAME,
 JOB,
 MGR,
 HIREDATE,
 SAL,
 COMM,
 DEPTNO
-- , row_number() OVER (ORDER BY EMPNO) AS SK_EMPNO
  ,md5(
    CONCAT_WS('|',
               JOB,
                MGR,
              HIREDATE,
             DEPTNO,
             SAL,
             COMM 
  
    )
   )AS ETL_CHECKSUM
  
FROM
  EMP_SOURCE


In [0]:
%sql
select * from stg_EMP_vw

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,ETL_CHECKSUM
7902,FORD,ANALYST,7566.0,3-Dec-81,3000,8,20,13de86bab8df5bc781803112b898d595
7782,CLARK,MANAGER,7839.0,9-Jun-81,2450,4,10,f910c9735baa9e9bce84c695b023882f
7698,BLAKE,MANAGER,7839.0,1-May-81,2850,5,30,ee5493135780100e8c2365c1ef46c859
7654,MARTIN,SALESMAN,7698.0,28-Sep-81,1250,1400,30,85ff118e74b3f1f657bb207a2954679e
7521,WARD,SALESMAN,7698.0,22-Feb-81,1250,500,30,532164d9a9a24675575fa207fdbfd590
7876,ADAMS,CLERK,7788.0,23-May-87,1100,9,20,c59e22513124c442c90c0f3d21d18ef4
7369,SMITH,CLERK,7902.0,17-Dec-80,800,9,20,686dab84b1783ca9d3af3a684a3719a7
7900,JAMES,CLERK,7698.0,3-Dec-81,950,6,30,9c7db86ec7eadd29076cf2afb493cd2e
7566,JONES,MANAGER,7839.0,2-Apr-81,2975,6,20,b3d1b9898a4938fcc6598356dc56a6e9
7934,MILLER,ANALYST,7782.0,23-Jan-82,1300,9,10,3e5efa499cfbccefb656b33bb5a52f91


In [0]:
%sql

MERGE INTO training.EMP_SCD2 TARGET
USING 
(SELECT SRC.EMPNO as JOIN_KEY, SRC.* from stg_emp_vw SRC
UNION ALL
SELECT NULL as JOIN_KEY, SRC.* from stg_emp_vw SRC
INNER JOIN  training.EMP_SCD2 tgt ON tgt.EMPNO = SRC.EMPNO
WHERE tgt.ETL_CHECKSUM != src.ETL_CHECKSUM AND tgt.CURRENT_FLAG = 'Y'
) SOURCE
ON TARGET.EMPNO = SOURCE.JOIN_KEY
WHEN MATCHED AND TARGET.ETL_CHECKSUM != SOURCE.ETL_CHECKSUM 
THEN
  UPDATE SET
  TARGET.CURRENT_FLAG = 'N',
  TARGET.EXPIRATION_DATE = CURRENT_DATE()
WHEN NOT MATCHED THEN
  INSERT
  (
EMPNO,
ENAME,
JOB,
MGR,
HIREDATE,
SAL,
COMM,
DEPTNO,
SK_EMPNO,
EFFECTIVE_DATE,
EXPIRATION_DATE,
CURRENT_FLAG,
ETL_CHECKSUM
  )
  VALUES
  (
SOURCE.EMPNO,
SOURCE.ENAME,
SOURCE.JOB,
SOURCE.MGR,
SOURCE.HIREDATE,
SOURCE.SAL,
SOURCE.COMM,
SOURCE.DEPTNO,
" ",
--row_number() OVER (ORDER BY EMPNO ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
--A query operator contains one or more unsupported expressions. Consider to rewrite it to avoid window functions, 
--Do not use the Window functions in the merge statement, instead use it in USING SOURCE SQL
CURRENT_DATE(),
"9999-12-31",
 "Y",
 SOURCE.ETL_CHECKSUM 
  )


num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
30,14,0,16


In [0]:
%sql
select * from training.EMP_SCD2

EMPNO,ENAME,JOB,MGR,HIREDATE,SAL,COMM,DEPTNO,sk_EMPNO,EFFECTIVE_DATE,EXPIRATION_DATE,CURRENT_FLAG,ETL_CHECKSUM
7902,FORD,ANALYST,7566.0,,3000,8.0,20,17,2024-05-10,9999-12-31,Y,13de86bab8df5bc781803112b898d595
7782,CLARK,MANAGER,7839.0,,2450,4.0,10,18,2024-05-10,9999-12-31,Y,f910c9735baa9e9bce84c695b023882f
7698,BLAKE,MANAGER,7839.0,,2850,5.0,30,19,2024-05-10,9999-12-31,Y,ee5493135780100e8c2365c1ef46c859
7654,MARTIN,SALESMAN,7698.0,,1250,1400.0,30,20,2024-05-10,9999-12-31,Y,85ff118e74b3f1f657bb207a2954679e
7521,WARD,SALESMAN,7698.0,,1250,500.0,30,21,2024-05-10,9999-12-31,Y,532164d9a9a24675575fa207fdbfd590
7876,ADAMS,CLERK,7788.0,,1100,9.0,20,22,2024-05-10,9999-12-31,Y,c59e22513124c442c90c0f3d21d18ef4
7369,SMITH,CLERK,7902.0,,800,9.0,20,23,2024-05-10,9999-12-31,Y,686dab84b1783ca9d3af3a684a3719a7
7900,JAMES,CLERK,7698.0,,950,6.0,30,24,2024-05-10,9999-12-31,Y,9c7db86ec7eadd29076cf2afb493cd2e
7566,JONES,MANAGER,7839.0,,2975,6.0,20,25,2024-05-10,9999-12-31,Y,b3d1b9898a4938fcc6598356dc56a6e9
7934,MILLER,ANALYST,7782.0,,1300,9.0,10,26,2024-05-10,9999-12-31,Y,3e5efa499cfbccefb656b33bb5a52f91


In [0]:
%sql
WITH CTE AS(
SELECT ((ROW_NUMBER() OVER (PARTITION BY CAST(EMP_SCD2.SK_EMPNO AS INT) ORDER BY EMP_SCD2.SK_EMPNO)) + MAX_SK)  AS  SK_EMPNO1
       ,EMPNO
FROM training.EMP_SCD2
CROSS JOIN (
  SELECT IFNULL(max(SK_EMPNO),0) AS MAX_SK FROM training.EMP_SCD2 
) MaxSK_View
WHERE EMP_SCD2.SK_EMPNO IS NULL
ORDER BY 1
)
--select *from CTE

MERGE INTO training.EMP_SCD2
USING CTE
ON CTE.EMPNO = EMP_SCD2.EMPNO
WHEN MATCHED AND EMP_SCD2.SK_EMPNO IS NULL
  THEN UPDATE SET 
    EMP_SCD2.SK_EMPNO = CTE.SK_EMPNO1

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
16,16,0,0


In [0]:
%sql
truncate table training.EMP_SCD2