# CREATE FLAG PARAMETER

In [0]:
# Declare a notebook text widget named "incremental_flag" whose default value is "0".
dbutils.widgets.text("incremental_flag", "0")

In [0]:
# Read the widget's current value; the recorded cell output shows it arrives as a str,
# so it must be converted before any numeric comparison.
incremental_flag = dbutils.widgets.get("incremental_flag")
print(type(incremental_flag))

<class 'str'>


# CREATING DIMENSION MODEL

### Filtering new records and old records

### Fetch Relevant Column

In [0]:
# Distinct Date_ID values present in the silver car-sales parquet data; these are
# the business keys the date dimension is built from.
src_date_query = '''
    SELECT DISTINCT Date_ID
    FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
'''
df_src_date = spark.sql(src_date_query)
df_src_date.display()

Date_ID
DT00528
DT00319
DT00599
DT00411
DT00593
DT00102
DT00290
DT00478
DT00539
DT00269


### Create Surrogate Key 

In [0]:
# Load the (date_key, Date_ID) pairs that are already stored in the gold dimension.
dim_date_exists = spark.catalog.tableExists("cars_catalog.gold.dim_date")

if not dim_date_exists:
    # No dimension table yet: the always-false WHERE clause produces an empty
    # frame that still carries the date_key / Date_ID columns, so the join
    # further down works unchanged on the very first run.
    df_existing_date = spark.sql('''
        SELECT 1 as date_key, Date_ID
        FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
        WHERE 1=0
    ''')
else:
    # Dimension table is present: read the keys assigned by earlier runs.
    df_existing_date = spark.sql('''
        SELECT date_key, Date_ID
        FROM cars_catalog.gold.dim_date
    ''')

**Join source records to the existing keys and split them into existing vs. new records**

In [0]:
# Left-join the source Date_IDs to the stored keys: a Date_ID with no match in
# the dimension comes back with a NULL date_key.
df_joined_date = (
    df_src_date
    .join(df_existing_date, on="Date_ID", how="left")
    .select(df_src_date["Date_ID"], df_existing_date["date_key"])
)

# Rows that already own a surrogate key keep both columns ...
key_present = df_joined_date["date_key"].isNotNull()
df_existing_only = df_joined_date.where(key_present)

# ... while unmatched rows are reduced to the bare Date_ID, ready for key assignment.
key_missing = df_joined_date["date_key"].isNull()
df_new_only = df_joined_date.where(key_missing).select("Date_ID")


### dim_date sink - initial and incremental

**Fetch the max surrogate key from the existing table**

In [0]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Widget for incremental run. The parsed flag stays available as an int for any
# cell that reads it, but it no longer decides the key offset (see below).
dbutils.widgets.text("incremental_flag", "0")
incremental_flag = int(dbutils.widgets.get("incremental_flag"))

# Starting offset for the new surrogate keys.
#
# The offset must follow what is actually stored in the dimension, not the flag:
# df_new_only was derived by joining against the existing table whenever that
# table exists, so numbering from 0 while the table holds rows would hand out
# keys that collide with stored ones, and the merge on date_key would then
# overwrite existing rows.
if spark.catalog.tableExists("cars_catalog.gold.dim_date"):
    max_df = spark.sql("SELECT MAX(date_key) as max_key FROM cars_catalog.gold.dim_date")
    max_key = max_df.first()["max_key"]
    # MAX() over an empty table is NULL (None here); adding None to row_number()
    # would turn every new key into NULL, so fall back to 0.
    max_date_key = max_key if max_key is not None else 0
else:
    max_date_key = 0

# Assign surrogate key to new Date_IDs: number them 1..n in Date_ID order and
# shift by the current maximum so they continue after the stored keys.
# The window has no partitioning, which is acceptable for a small dimension.
window_spec = Window.orderBy("Date_ID")
df_new_with_keys = df_new_only.withColumn(
    "date_key", row_number().over(window_spec) + max_date_key
).select("date_key", "Date_ID")



In [0]:
# Both halves are projected to the same column order because union() matches
# columns by position, not by name.
dim_date_columns = ["date_key", "Date_ID"]

df_existing_clean = df_existing_only.select(*dim_date_columns)

# Previously keyed rows followed by the rows that just received a key.
df_final_date = df_existing_clean.union(df_new_with_keys)
df_final_date.display()



date_key,Date_ID
1,DT00001
2,DT00002
3,DT00003
4,DT00004
5,DT00005
6,DT00006
7,DT00007
8,DT00008
9,DT00009
10,DT00010


# SCD TYPE 1 - UPSERT 

In [0]:
from delta.tables import DeltaTable

gold_path_date = "abfss://gold@datalakecarsrj.dfs.core.windows.net/dim_date"

if not spark.catalog.tableExists("cars_catalog.gold.dim_date"):
    # Initial load: write the frame as a Delta table at the gold path and
    # register it in the catalog under cars_catalog.gold.dim_date.
    (
        df_final_date.write.format("delta")
        .mode("overwrite")
        .option("path", gold_path_date)
        .saveAsTable("cars_catalog.gold.dim_date")
    )
else:
    # Subsequent loads: SCD type 1 upsert keyed on the surrogate key. Matching
    # rows are overwritten with the source values, unmatched rows are inserted.
    delta_tbl = DeltaTable.forPath(spark, gold_path_date)
    upsert = delta_tbl.alias("trg").merge(
        df_final_date.alias("src"),
        "trg.date_key = src.date_key",
    )
    upsert.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()




In [0]:
%sql
-- Inspect the date dimension after the upsert.
SELECT *
FROM cars_catalog.gold.dim_date;

date_key,Date_ID
1,DT00001
2,DT00002
3,DT00003
4,DT00004
5,DT00005
6,DT00006
7,DT00007
8,DT00008
9,DT00009
10,DT00010
