In [0]:
from pyspark.sql.functions import *

## Create Flag Param to identify Initial/INCR

In [0]:
dbutils.widgets.text('incr_flag','0')

In [0]:
incr_flag = dbutils.widgets.get('incr_flag')

### Getting Watermark values

In [0]:
load_start_dt = spark.table("sales_catalog.gold.gold_watermark") \
          .filter("table_name = 'dim_date'") \
          .select("last_watermark") \
          .collect()[0][0]
load_end_dt = spark.sql('''select max(Date_ID)
                        from parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                     ''')\
                    .collect()[0][0]
print(f"Loading data from {load_start_dt} to {load_end_dt}")


Loading data from DT00000 to DT01245


## Creating Dimension Dealer

### Creating dim_dealer
**Fetching relative columns from Silver layer**

In [0]:
df_src = spark.sql(f'''select distinct Date_ID
                        from parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                        WHERE Date_ID > '{load_start_dt}' AND Date_ID <= '{load_end_dt}'
                     ''')
display(df_src)


Date_ID
DT00528
DT00319
DT00599
DT00411
DT00102
DT00290
DT00478
DT00593
DT00269
DT00539


**Creating Schema for Initial run**

In [0]:
if spark.catalog.tableExists('sales_catalog.gold.dim_date'):
    df_sink = spark.sql('''
                                SELECT Dim_date_key, Date_ID 
                                FROM sales_catalog.gold.dim_date
                            ''')
else:
    df_sink = spark.sql('''
                          SELECT 1 AS Dim_date_key, Date_ID
                          FROM parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                          WHERE 1=0
                          ''')
display(df_sink)

Dim_date_key,Date_ID


**Filtering new records and updated records**

In [0]:
df_filter = df_src.join(df_sink, ['Date_ID'], 'left')\
                              .select(df_src['Date_ID'],df_sink['Dim_date_key'])
display(df_filter)

Date_ID,Dim_date_key
DT00528,
DT00319,
DT00599,
DT00411,
DT00102,
DT00290,
DT00478,
DT00593,
DT00269,
DT00539,


**new records**

In [0]:
df_new_rec = df_filter.filter(df_filter.Dim_date_key.isNull())\
                                .select(col('Date_ID'))
display(df_new_rec)

Date_ID
DT00528
DT00319
DT00599
DT00411
DT00102
DT00290
DT00478
DT00593
DT00269
DT00539


**updated records**

In [0]:
df_updated_rec = df_filter.filter(df_filter.Dim_date_key.isNotNull())
display(df_updated_rec)

Date_ID,Dim_date_key


**Create Surrogate Keys**

In [0]:
if incr_flag == '0':
  max_val=0
else:
  max_val_df = spark.sql('''
                        SELECT max(Dim_date_key) as max_val
                        FROM sales_catalog.gold.dim_date
                      ''')
  max_val = max_val_df.collect()[0]['max_val']


In [0]:
df_new_rec = df_new_rec.withColumn('Dim_date_key', max_val + monotonically_increasing_id() + 1)
display(df_new_rec)

Date_ID,Dim_date_key
DT00528,1
DT00319,2
DT00599,3
DT00411,4
DT00102,5
DT00290,6
DT00478,7
DT00593,8
DT00269,9
DT00539,10


**Combining new and updated records**

In [0]:
df_date = df_new_rec.unionByName(df_updated_rec)
display(df_date)

Date_ID,Dim_date_key
DT00528,1
DT00319,2
DT00599,3
DT00411,4
DT00102,5
DT00290,6
DT00478,7
DT00593,8
DT00269,9
DT00539,10


**SCD Type-1 (UPSERT)**

In [0]:
from delta.tables import DeltaTable

In [0]:
# INCR run
if spark.catalog.tableExists('sales_catalog.gold.dim_date'):
    delta_tbl = DeltaTable.forPath(spark, 'abfss://gold@dlsaleslakehouse.dfs.core.windows.net/dim_date')
    delta_tbl.alias('trg').merge(df_date.alias('src'), 'trg.Dim_date_key = src.Dim_date_key')\
        .whenMatchedUpdateAll()\
        .whenNotMatchedInsertAll()\
        .execute()

# initial run
else:
    df_date.write.format('delta')\
        .mode('overwrite')\
        .option('path','abfss://gold@dlsaleslakehouse.dfs.core.windows.net/dim_date')\
        .saveAsTable('sales_catalog.gold.dim_date')

In [0]:
%sql
select * from sales_catalog.gold.dim_date

Date_ID,Dim_date_key
DT00528,1
DT00319,2
DT00599,3
DT00411,4
DT00102,5
DT00290,6
DT00478,7
DT00593,8
DT00269,9
DT00539,10


### Updating watermark table values

In [0]:
spark.sql(f"""
    UPDATE sales_catalog.gold.gold_watermark
    SET last_watermark = '{load_end_dt}',
        updated_at = current_timestamp()
    WHERE table_name = 'dim_date'
""")


DataFrame[num_affected_rows: bigint]