# CREATE FLAG PARAMETER

In [0]:
# Declare the "incremental_flag" text widget with a default value of "0".
dbutils.widgets.text("incremental_flag", "0")

In [0]:
# Read the current value of the "incremental_flag" widget and print its
# Python type (the recorded output shows it comes back as a str).
incremental_flag = dbutils.widgets.get("incremental_flag")
print(type(incremental_flag))

<class 'str'>


# CREATING DIMENSION MODEL

### Fetch Relevant Columns

### dim_model sink - initial and incremental 

### Filtering new records and old records

### Create Surrogate Key 

**Fetch the max surrogate key from the existing table**

**Create surrogate key column and ADD the max surrogate key** 

In [0]:
# Source side of the dimension: the distinct (Model_ID, Model_Category) pairs
# read directly from the carsales parquet data in the silver container.
df_src = spark.sql('''
    SELECT DISTINCT Model_ID, `Model_Category`
    FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
''')

In [0]:
# Sink side of the dimension. Handle the "table not created yet" case first so
# the common path (table exists) reads as the fall-through branch.
if not spark.catalog.tableExists("cars_catalog.gold.dim_model"):
    # No dimension table yet: build an empty DataFrame that only carries the
    # expected columns (WHERE 1=0 filters out every row).
    df_sink = spark.sql('''
        SELECT 1 as dim_model_key, Model_ID, `Model_Category`
        FROM parquet.`abfss://silver@datalakecarsrj.dfs.core.windows.net/carsales`
        WHERE 1=0
    ''')
else:
    # Dimension table exists: read the current keys and attributes from it.
    df_sink = spark.sql("""
        SELECT dim_model_key, Model_ID, Model_Category
        FROM cars_catalog.gold.dim_model
    """)

In [0]:
# Left-join source rows to the sink on Model_ID, keeping the source attributes
# and whatever surrogate key the sink already holds (NULL when there is none).
df_filter = (
    df_src.join(df_sink, on="Model_ID", how="left")
          .select(df_src["Model_ID"], df_src["Model_Category"], df_sink["dim_model_key"])
)

# Rows that already have a surrogate key in the sink.
df_filter_old = df_filter.where(df_filter["dim_model_key"].isNotNull())

# Rows with no key yet; drop the all-NULL key column so a fresh one can be added.
df_filter_new = (
    df_filter.where(df_filter["dim_model_key"].isNull())
             .select("Model_ID", "Model_Category")
)

In [0]:
# Re-declare the widget so this cell also works when run on its own, then read
# the flag as an int (the widget value comes back as a str).
dbutils.widgets.text("incremental_flag", "0")
incremental_flag = int(dbutils.widgets.get("incremental_flag"))

# Offset added to row_number() when generating surrogate keys for new rows.
#
# The offset is derived from the table itself rather than from the flag:
#   * If the table already exists, new keys must always continue after the
#     current maximum. Resetting to 0 because the flag was left at its default
#     ("0") would hand out keys 1..n again, and the later MERGE on
#     dim_model_key would then overwrite unrelated existing rows.
#   * MAX() over an empty table is NULL; COALESCE turns that into 0 so the
#     generated keys are never NULL.
if spark.catalog.tableExists("cars_catalog.gold.dim_model"):
    max_value_df = spark.sql(
        "SELECT COALESCE(MAX(dim_model_key), 0) as max_key FROM cars_catalog.gold.dim_model"
    )
    max_value = max_value_df.collect()[0]["max_key"]
else:
    # Initial load: no table yet, so keys start at 1.
    max_value = 0

In [0]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Number the new rows in Model_ID order. The window has no partitionBy, so the
# numbering is a single global sequence over all new rows.
window_spec = Window.orderBy("Model_ID")

# New surrogate key = position in that sequence + the current maximum key.
df_new_with_keys = df_filter_new.select(
    (row_number().over(window_spec) + max_value).alias("dim_model_key"),
    "Model_ID",
    "Model_Category",
)




In [0]:
# Preview the new rows together with their freshly generated surrogate keys.
display(df_new_with_keys)



dim_model_key,Model_ID,Model_Category
1,Acu-M59,Acu
2,Acu-M60,Acu
3,Acu-M61,Acu
4,Acu-M62,Acu
5,Acu-M63,Acu
6,Agr-M111,Agr
7,Agr-M112,Agr
8,Agr-M113,Agr
9,Aud-M227,Aud
10,Aud-M228,Aud


In [0]:
from pyspark.sql import DataFrame

# Combine rows that already had a key (df_filter_old) with the rows that just
# received one (df_new_with_keys). unionByName matches columns by name, so the
# differing column order of the two inputs does not matter.
df_final: DataFrame = df_filter_old.unionByName(df_new_with_keys)




In [0]:
# Preview the combined set of old and new dimension rows.
display(df_final)



Model_ID,Model_Category,dim_model_key
Acu-M59,Acu,1
Acu-M60,Acu,2
Acu-M61,Acu,3
Acu-M62,Acu,4
Acu-M63,Acu,5
Agr-M111,Agr,6
Agr-M112,Agr,7
Agr-M113,Agr,8
Aud-M227,Aud,9
Aud-M228,Aud,10


# SCD TYPE - 1 (UPSERT)

In [0]:
from delta.tables import DeltaTable
# Storage location in the gold container for the dim_model Delta table.
path = "abfss://gold@datalakecarsrj.dfs.core.windows.net/dim_model"

In [0]:
if spark.catalog.tableExists("cars_catalog.gold.dim_model"):
    # Incremental run: the table already exists, so upsert into it.
    # Rows whose dim_model_key matches are overwritten with the incoming
    # values; rows with an unseen key are inserted.
    delta_tbl = DeltaTable.forPath(spark, path)

    (
        delta_tbl.alias("trg")
        .merge(df_final.alias("src"), "trg.dim_model_key = src.dim_model_key")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # Initial run: no table yet, so write the data to `path` as Delta and
    # register it under the catalog name.
    (
        df_final.write.format("delta")
        .mode("overwrite")
        .option("path", path)
        .saveAsTable("cars_catalog.gold.dim_model")
    )



In [0]:
%sql
-- Inspect the rows of the dimension table.
select * from cars_catalog.gold.dim_model; 

Model_ID,Model_Category,dim_model_key
Acu-M59,Acu,1
Acu-M60,Acu,2
Acu-M61,Acu,3
Acu-M62,Acu,4
Acu-M63,Acu,5
Agr-M111,Agr,6
Agr-M112,Agr,7
Agr-M113,Agr,8
Aud-M227,Aud,9
Aud-M228,Aud,10
