# CREATE FLAG PARAMETER

In [0]:
# Declare the notebook text widget "incremental_flag" with a default value of "0".
dbutils.widgets.text("incremental_flag", "0")

In [0]:
# Read the current widget value. The printed type below shows it is returned
# as a str, so it must be cast before any numeric comparison.
incremental_flag = dbutils.widgets.get("incremental_flag")
print(type(incremental_flag))

<class 'str'>


# CREATING DIMENSION MODEL

### Filtering new records and old records

### Fetch Relevant Columns

In [0]:
# Read the distinct (Branch_ID, BranchName) pairs straight from the silver-layer
# parquet files; these are the candidate rows for the branch dimension.
df_src_branch = spark.sql('''
    SELECT DISTINCT Branch_ID, BranchName
    FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
''')
# Render the result for a visual sanity check.
df_src_branch.display()

Branch_ID,BranchName
BR1152,Marcos Motors
BR0819,Honda Motors
BR2324,Blankinship Motor Company Building
BR1833,Spyker Motors
BR0721,GMC Motors
BR1403,Noble Motors
BR2103,ZIL Motors
BR0005,AC Cars Motors
BR0783,Hillman Motors
BR0614,Ford Australia Motors


### Load Existing Dimension Rows (Source of Existing Surrogate Keys)

In [0]:
# Build df_existing_branch: the rows (with their surrogate keys) already present
# in the gold dimension, or an empty frame with the same columns on the first run.
if not spark.catalog.tableExists("cars_catalog.gold.dim_branch"):
    # First run: no dimension table yet. "WHERE 1=0" returns zero rows but keeps
    # the column layout (branch_key, Branch_ID, BranchName) for the later join.
    df_existing_branch = spark.sql('''
        SELECT 1 as branch_key, Branch_ID, BranchName
        FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
        WHERE 1=0
    ''')
else:
    # Subsequent runs: take the current content of the dimension table.
    df_existing_branch = spark.sql('''
        SELECT branch_key, Branch_ID, BranchName
        FROM cars_catalog.gold.dim_branch
    ''')

**Left-join the source to the existing dimension and split the rows into existing (surrogate key found) and new (no surrogate key yet)**

In [0]:
# Left-join source branches to the existing dimension on the business key.
# Branch_ID and BranchName are taken from the source side, branch_key from the
# existing side, so branch_key is NULL for branches not yet in the dimension.
df_joined = (
    df_src_branch
    .join(df_existing_branch, on="Branch_ID", how="left")
    .select(
        df_src_branch["Branch_ID"],
        df_src_branch["BranchName"],
        df_existing_branch["branch_key"],
    )
)

# Rows that already own a surrogate key.
df_existing_only = df_joined.where(df_joined["branch_key"].isNotNull())

# Rows without a surrogate key; the NULL key column is dropped because a new
# key is generated for them later.
df_new_only = (
    df_joined
    .where(df_joined["branch_key"].isNull())
    .select("Branch_ID", "BranchName")
)

### dim_branch sink - initial and incremental 

**Fetch the max surrogate key from the existing table**

In [0]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Run-mode flag. Widget values are returned as strings, so cast to int.
# The widget is re-declared here so this cell also works when run on its own.
dbutils.widgets.text("incremental_flag", "0")
incremental_flag = int(dbutils.widgets.get("incremental_flag"))

# The offset for new surrogate keys must depend on what is already stored in the
# dimension table, NOT on the flag: df_new_only only contains branches that are
# missing from the table, so starting again at 1 while the table exists would
# hand out keys that are already in use.
if spark.catalog.tableExists("cars_catalog.gold.dim_branch"):
    # COALESCE guards against an existing-but-empty table, where MAX() is NULL
    # and would otherwise produce a None offset.
    max_df = spark.sql(
        "SELECT COALESCE(MAX(branch_key), 0) AS max_key FROM cars_catalog.gold.dim_branch"
    )
    max_branch_key = max_df.collect()[0]["max_key"]
else:
    # First load: keys start at 1.
    max_branch_key = 0

# Generate new surrogate keys: a dense 1..n sequence shifted by the current max.
# Ordering by both columns keeps the assignment deterministic even if the same
# Branch_ID appears with more than one BranchName.
# NOTE: a window without partitionBy pulls all rows into one partition; this is
# only appropriate while the set of new dimension rows stays small.
window_spec = Window.orderBy("Branch_ID", "BranchName")
df_new_with_keys = df_new_only.withColumn(
    "branch_key", row_number().over(window_spec) + max_branch_key
).select("branch_key", "Branch_ID", "BranchName")




In [0]:
# Bring the already-keyed rows into the same column order as df_new_with_keys;
# union() matches columns by position, so the order here matters.
df_existing_only_clean = df_existing_only.select(
    "branch_key",
    "Branch_ID",
    "BranchName",
)

# Final dimension content for this run: existing rows followed by new rows.
df_final_branch = df_existing_only_clean.union(df_new_with_keys)
df_final_branch.display()



branch_key,Branch_ID,BranchName
1,BR0001,AC Cars Motors
2,BR0003,AC Cars Motors
3,BR0004,AC Cars Motors
4,BR0005,AC Cars Motors
5,BR0006,AC Cars Motors
6,BR0008,AC Cars Motors
7,BR0009,AC Cars Motors
8,BR0010,AC Cars Motors
9,BR0011,Acura Motors
10,BR0012,Acura Motors


# SCD TYPE 1 - UPSERT 

In [0]:
from delta.tables import DeltaTable

# Storage location of the gold-layer branch dimension (Delta format).
gold_path = "abfss://gold@datalakecarsrj.dfs.core.windows.net/dim_branch"

if spark.catalog.tableExists("cars_catalog.gold.dim_branch"):
    # Incremental run: SCD Type 1 upsert into the existing Delta table.
    delta_tbl = DeltaTable.forPath(spark, gold_path)
    (
        delta_tbl.alias("trg")
        # Match on the natural key. The surrogate key is generated inside this
        # notebook run, so matching on it would let any key collision overwrite
        # a completely different branch.
        .merge(df_final_branch.alias("src"), "trg.Branch_ID = src.Branch_ID")
        # Type 1: overwrite the descriptive attribute only; a surrogate key that
        # has been issued is never rewritten.
        .whenMatchedUpdate(set={"BranchName": "src.BranchName"})
        # Branches not yet in the dimension are inserted with their new key.
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # Initial run: create the table at gold_path and register it in the catalog.
    df_final_branch.write.format("delta") \
        .mode("overwrite") \
        .option("path", gold_path) \
        .saveAsTable("cars_catalog.gold.dim_branch")



In [0]:
%sql
-- Inspect the branch dimension after the upsert.
SELECT * FROM cars_catalog.gold.dim_branch;

branch_key,Branch_ID,BranchName
1,BR0001,AC Cars Motors
2,BR0003,AC Cars Motors
3,BR0004,AC Cars Motors
4,BR0005,AC Cars Motors
5,BR0006,AC Cars Motors
6,BR0008,AC Cars Motors
7,BR0009,AC Cars Motors
8,BR0010,AC Cars Motors
9,BR0011,Acura Motors
10,BR0012,Acura Motors
