In [0]:
from pyspark.sql.functions import *

### Getting Watermark values

In [0]:
load_start_dt = spark.table("sales_catalog.gold.gold_watermark") \
          .filter("table_name = 'dim_dealer'") \
          .select("last_watermark") \
          .collect()[0][0]
load_end_dt = spark.sql('''select max(Date_ID)
                        from parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                     ''')\
                    .collect()[0][0]
print(f"Loading data from {load_start_dt} to {load_end_dt}")


Loading data from DT00000 to DT01245


## Create Flag Param to identify Initial/INCR

In [0]:
dbutils.widgets.text('incr_flag','0')

In [0]:
incr_flag = dbutils.widgets.get('incr_flag')

## Creating Dimension Dealer

### Creating dim_dealer
**Fetching relative columns from Silver layer**

In [0]:
df_src = spark.sql(f'''select distinct Dealer_ID , DealerName
                        from parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                        WHERE Date_ID > '{load_start_dt}' AND Date_ID <= '{load_end_dt}'
                     ''')
display(df_src)


Dealer_ID,DealerName
DLR0095,Isuzu Motors
DLR0124,McLaren Motors
DLR0245,Jennings Ford Automobile Dealership
DLR0171,Samsung Motors
DLR0052,e.GO Mobile Motors
DLR0175,Saturn Motors
DLR0080,Hindustan Motors
DLR0263,Auto-Union Motors
DLR0066,Freightliner Motors
DLR0258,


**Creating Schema for Initial run**

In [0]:
if spark.catalog.tableExists('sales_catalog.gold.dim_dealer'):
    df_sink = spark.sql('''
                                SELECT Dim_dealer_key, Dealer_ID , DealerName
                                FROM sales_catalog.gold.dim_dealer
                            ''')
else:
    df_sink = spark.sql('''
                          SELECT 1 AS Dim_dealer_key, Dealer_ID , DealerName
                          FROM parquet.`abfss://silver@dlsaleslakehouse.dfs.core.windows.net/sales`
                          WHERE 1=0
                          ''')
display(df_sink)

Dim_dealer_key,Dealer_ID,DealerName


**Filtering new records and updated records**

In [0]:
df_filter = df_src.join(df_sink, ['Dealer_ID'], 'left')\
                              .select(df_src['Dealer_ID'],df_src['DealerName'],df_sink['Dim_dealer_key'])
display(df_filter)

Dealer_ID,DealerName,Dim_dealer_key
DLR0095,Isuzu Motors,
DLR0124,McLaren Motors,
DLR0245,Jennings Ford Automobile Dealership,
DLR0171,Samsung Motors,
DLR0052,e.GO Mobile Motors,
DLR0175,Saturn Motors,
DLR0080,Hindustan Motors,
DLR0263,Auto-Union Motors,
DLR0066,Freightliner Motors,
DLR0258,,


**new records**

In [0]:
df_new_rec = df_filter.filter(df_filter.Dim_dealer_key.isNull())\
                                .select(col('Dealer_ID'),col('DealerName'))
display(df_new_rec)

Dealer_ID,DealerName
DLR0095,Isuzu Motors
DLR0124,McLaren Motors
DLR0245,Jennings Ford Automobile Dealership
DLR0171,Samsung Motors
DLR0052,e.GO Mobile Motors
DLR0175,Saturn Motors
DLR0080,Hindustan Motors
DLR0263,Auto-Union Motors
DLR0066,Freightliner Motors
DLR0258,


**updated records**

In [0]:
df_updated_rec = df_filter.filter(df_filter.Dim_dealer_key.isNotNull())
display(df_updated_rec)

Dealer_ID,DealerName,Dim_dealer_key


**Create Surrogate Keys**

In [0]:
if incr_flag == '0':
  max_val=0
else:
  max_val_df = spark.sql('''
                        SELECT max(Dim_dealer_key) as max_val
                        FROM sales_catalog.gold.dim_dealer
                      ''')
  max_val = max_val_df.collect()[0]['max_val']


In [0]:
df_new_rec = df_new_rec.withColumn('Dim_dealer_key', max_val + monotonically_increasing_id() + 1)
display(df_new_rec)

Dealer_ID,DealerName,Dim_dealer_key
DLR0095,Isuzu Motors,1
DLR0124,McLaren Motors,2
DLR0245,Jennings Ford Automobile Dealership,3
DLR0171,Samsung Motors,4
DLR0052,e.GO Mobile Motors,5
DLR0175,Saturn Motors,6
DLR0080,Hindustan Motors,7
DLR0263,Auto-Union Motors,8
DLR0066,Freightliner Motors,9
DLR0258,,10


**Combining new and updated records**

In [0]:
df_dealer = df_new_rec.unionByName(df_updated_rec)
display(df_dealer)

Dealer_ID,DealerName,Dim_dealer_key
DLR0095,Isuzu Motors,1
DLR0124,McLaren Motors,2
DLR0245,Jennings Ford Automobile Dealership,3
DLR0171,Samsung Motors,4
DLR0052,e.GO Mobile Motors,5
DLR0175,Saturn Motors,6
DLR0080,Hindustan Motors,7
DLR0263,Auto-Union Motors,8
DLR0066,Freightliner Motors,9
DLR0258,,10


**SCD Type-1 (UPSERT)**

In [0]:
from delta.tables import DeltaTable

In [0]:
# INCR run
if spark.catalog.tableExists('sales_catalog.gold.dim_dealer'):
    delta_tbl = DeltaTable.forPath(spark, 'abfss://gold@dlsaleslakehouse.dfs.core.windows.net/dim_dealer')
    delta_tbl.alias('trg').merge(df_dealer.alias('src'), 'trg.Dim_dealer_key = src.Dim_dealer_key')\
        .whenMatchedUpdateAll()\
        .whenNotMatchedInsertAll()\
        .execute()

# initial run
else:
    df_dealer.write.format('delta')\
        .mode('overwrite')\
        .option('path','abfss://gold@dlsaleslakehouse.dfs.core.windows.net/dim_dealer')\
        .saveAsTable('sales_catalog.gold.dim_dealer')

In [0]:
%sql
select * from sales_catalog.gold.dim_dealer

Dealer_ID,DealerName,Dim_dealer_key
DLR0095,Isuzu Motors,1
DLR0124,McLaren Motors,2
DLR0245,Jennings Ford Automobile Dealership,3
DLR0171,Samsung Motors,4
DLR0052,e.GO Mobile Motors,5
DLR0175,Saturn Motors,6
DLR0080,Hindustan Motors,7
DLR0263,Auto-Union Motors,8
DLR0066,Freightliner Motors,9
DLR0258,,10


### Updating watermark table values

In [0]:
spark.sql(f"""
    UPDATE sales_catalog.gold.gold_watermark
    SET last_watermark = '{load_end_dt}',
        updated_at = current_timestamp()
    WHERE table_name = 'dim_dealer'
""")


DataFrame[num_affected_rows: bigint]