In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

# Create parameters

In [0]:
# Declare the notebook's text widgets (job parameters). Every widget uses the
# placeholder default '0' until a caller supplies a real value.
for widget_name in ('storage_account', 'year', 'month', 'day'):
    dbutils.widgets.text(widget_name, '0')

In [0]:
# Read the current widget values; each is returned as a string and is later
# interpolated into the storage paths.
storage_account, year, month, day = (
    dbutils.widgets.get(widget_name)
    for widget_name in ('storage_account', 'year', 'month', 'day')
)

# Read data from silver

In [0]:
# Load the silver-layer Parquet files for the requested year/month/day folder.
# No 'inferSchema' option is passed: Parquet files embed their own schema and the
# Parquet reader does not recognise that option (it is meant for text sources
# such as CSV/JSON), so setting it was a misleading no-op.
df_silver = (
    spark.read
    .format('parquet')
    .load(f'abfss://silver@{storage_account}.dfs.core.windows.net/transformed_data/{year}/{month}/{day}/')
)

In [0]:
# Render the silver DataFrame in the notebook output for a visual sanity check.
df_silver.display()

Sales_Person_ID,Sales_Person,Country,Product_ID,Product,Date,Revenue,Boxes_Shipped,First_Name,Last_Name,Revenue_Per_Box,Date_Key,Year,Quarter,Month,Day,Start_Of_Year,Start_Of_Quarter,Start_Of_Month
16,Karlen McCaffrey,United States,19,Raspberry Choco,2022-04-15,14749,354,Karlen,McCaffrey,41.66,20220415,2022,2,4,15,2022-01-01,2022-04-01,2022-04-01
24,Van Tuxwell,Australia,14,Milk Bars,2022-06-20,7910,87,Van,Tuxwell,90.92,20220620,2022,2,6,20,2022-01-01,2022-04-01,2022-06-01
6,Ches Bonnell,New Zealand,14,Milk Bars,2022-08-17,4389,126,Ches,Bonnell,34.83,20220817,2022,3,8,17,2022-01-01,2022-07-01,2022-08-01
25,Wilone O'Kielt,Australia,12,Fruit & Nut Bars,2022-06-15,392,102,Wilone,O'Kielt,3.84,20220615,2022,2,6,15,2022-01-01,2022-04-01,2022-06-01
23,Roddy Speechley,Canada,16,Orange Choco,2022-02-09,8148,85,Roddy,Speechley,95.86,20220209,2022,1,2,9,2022-01-01,2022-01-01,2022-02-01
18,Madelene Upcott,India,15,Mint Chip Choco,2022-08-24,3836,71,Madelene,Upcott,54.03,20220824,2022,3,8,24,2022-01-01,2022-07-01,2022-08-01
2,Barr Faughny,New Zealand,1,50% Dark Bites,2022-06-23,4557,308,Barr,Faughny,14.8,20220623,2022,2,6,23,2022-01-01,2022-04-01,2022-06-01
1,Andria Kimpton,United Kingdom,1,50% Dark Bites,2022-08-24,2653,314,Andria,Kimpton,8.45,20220824,2022,3,8,24,2022-01-01,2022-07-01,2022-08-01
24,Van Tuxwell,United States,4,99% Dark & Pure,2022-07-28,12586,6,Van,Tuxwell,2097.67,20220728,2022,3,7,28,2022-01-01,2022-07-01,2022-07-01
17,Kelci Walkden,India,5,After Nines,2022-01-07,1687,520,Kelci,Walkden,3.24,20220107,2022,1,1,7,2022-01-01,2022-01-01,2022-01-01


# Create dimension table

### Select necessary columns

In [0]:
# Columns that make up the date dimension, in the order they are selected.
dim_date_columns = [
    'Date_Key',
    'Date',
    'Year',
    'Quarter',
    'Month',
    'Day',
    'Start_Of_Year',
    'Start_Of_Quarter',
    'Start_Of_Month',
]

# Project the date attributes out of the silver data and drop duplicate rows
# so each distinct combination of these columns appears once.
df_total = df_silver.select(*dim_date_columns).distinct()

In [0]:
# Render the de-duplicated date dimension rows in the notebook output.
df_total.display()

Date_Key,Date,Year,Quarter,Month,Day,Start_Of_Year,Start_Of_Quarter,Start_Of_Month
20220509,2022-05-09,2022,2,5,9,2022-01-01,2022-04-01,2022-05-01
20220801,2022-08-01,2022,3,8,1,2022-01-01,2022-07-01,2022-08-01
20220221,2022-02-21,2022,1,2,21,2022-01-01,2022-01-01,2022-02-01
20220107,2022-01-07,2022,1,1,7,2022-01-01,2022-01-01,2022-01-01
20220628,2022-06-28,2022,2,6,28,2022-01-01,2022-04-01,2022-06-01
20220207,2022-02-07,2022,1,2,7,2022-01-01,2022-01-01,2022-02-01
20220831,2022-08-31,2022,3,8,31,2022-01-01,2022-07-01,2022-08-01
20220311,2022-03-11,2022,1,3,11,2022-01-01,2022-01-01,2022-03-01
20220110,2022-01-10,2022,1,1,10,2022-01-01,2022-01-01,2022-01-01
20220629,2022-06-29,2022,2,6,29,2022-01-01,2022-04-01,2022-06-01


### Data quality checks

In [0]:
# Build the DQX engine on top of a workspace client created with default settings.
workspace_client = WorkspaceClient()
dq_engine = DQEngine(workspace_client)

# Load the check definitions for this table from the YAML file in the workspace.
checks_path = '/pipeline_project/check/checks_gold_dim_date.yml'
checks = dq_engine.load_checks_from_workspace_file(workspace_path=checks_path)

### Apply checks and verify the result

In [0]:
# Run the loaded checks against the dimension rows; the result carries the
# original columns plus the '_errors' and '_warnings' reporting columns.
df_check = dq_engine.apply_checks_by_metadata(df_total, checks)

# Count the rows whose '_errors' column is populated.
rows_with_errors = df_check.select('_errors').where(F.col('_errors').isNotNull())
error_count = rows_with_errors.count()

# Stop the notebook before the write step if any row failed a check.
assert error_count == 0, f'{error_count} errors found in the data'

In [0]:
# Render the checked rows, including the '_errors' and '_warnings' columns.
df_check.display()

Date_Key,Date,Year,Quarter,Month,Day,Start_Of_Year,Start_Of_Quarter,Start_Of_Month,_errors,_warnings
20220103,2022-01-03,2022,1,1,3,2022-01-01,2022-01-01,2022-01-01,,
20220104,2022-01-04,2022,1,1,4,2022-01-01,2022-01-01,2022-01-01,,
20220105,2022-01-05,2022,1,1,5,2022-01-01,2022-01-01,2022-01-01,,
20220107,2022-01-07,2022,1,1,7,2022-01-01,2022-01-01,2022-01-01,,
20220110,2022-01-10,2022,1,1,10,2022-01-01,2022-01-01,2022-01-01,,
20220111,2022-01-11,2022,1,1,11,2022-01-01,2022-01-01,2022-01-01,,
20220112,2022-01-12,2022,1,1,12,2022-01-01,2022-01-01,2022-01-01,,
20220113,2022-01-13,2022,1,1,13,2022-01-01,2022-01-01,2022-01-01,,
20220114,2022-01-14,2022,1,1,14,2022-01-01,2022-01-01,2022-01-01,,
20220117,2022-01-17,2022,1,1,17,2022-01-01,2022-01-01,2022-01-01,,


### Upsert data into the dimension table; create the table if it does not exist

In [0]:
# Fully qualified name of the target date dimension table.
dim_date_table = 'sales_catalog.gold.dim_date'

if not spark.catalog.tableExists(dim_date_table):
    # The table is not registered yet: write the rows as a Delta table at the
    # gold container path and register it under the catalog name.
    (
        df_total.write
        .format('delta')
        .mode('overwrite')
        .option('path', f'abfss://gold@{storage_account}.dfs.core.windows.net/dim_date')
        .saveAsTable(dim_date_table)
    )
else:
    # The table exists: merge on Date_Key, updating every column of matching
    # rows and inserting rows whose key is not present in the target.
    delta_table = DeltaTable.forName(spark, dim_date_table)
    (
        delta_table.alias('trg')
        .merge(
            source=df_total.alias('src'),
            condition='trg.Date_Key = src.Date_Key',
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )