In [0]:
%sql
-- CREATE DIMENSION TABLES

-- Customer dimension with SCD Type 2 bookkeeping columns.
-- The trailing three columns are the ones the notebook's scd_type2_merge
-- helper reads and writes (effective_start_date, effective_end_date, is_active).
CREATE TABLE IF NOT EXISTS real_time_projects.ecommerce_historical.dim_customer
(
  -- Business key used by the merge to match source rows to dimension rows.
  customer_id STRING,
  customer_unique_id STRING,
  customer_zip_code_prefix STRING,
  customer_city STRING,
  customer_state STRING,

  -- SCD Type 2 bookkeeping.
  effective_start_date DATE,
  effective_end_date DATE,
  -- 'Y' for the current version of a key, 'N' once the version is closed.
  -- Spelled in lower case to match the column name the merge code uses;
  -- the previous mixed-case spelling only resolved while Spark ran in
  -- case-insensitive mode.
  is_active STRING
)
USING DELTA;

-- Product dimension with SCD Type 2 bookkeeping columns.
-- NOTE: the "lenght" spelling in two column names is kept on purpose; the
-- merge call in this notebook refers to the columns by exactly these names.
CREATE TABLE IF NOT EXISTS real_time_projects.ecommerce_historical.dim_product
(
  -- Business key used by the merge to match source rows to dimension rows.
  product_id STRING,
  product_category_name STRING,
  product_name_lenght STRING,
  product_description_lenght STRING,
  product_photos_qty STRING,
  product_weight_g STRING,
  product_length_cm STRING,
  product_height_cm STRING,
  product_width_cm STRING,

  -- SCD Type 2 bookkeeping.
  effective_start_date DATE,
  effective_end_date DATE,
  -- 'Y' for the current version of a key, 'N' once the version is closed.
  -- Spelled in lower case to match the column name the merge code uses;
  -- the previous mixed-case spelling only resolved while Spark ran in
  -- case-insensitive mode.
  is_active STRING
)
USING DELTA;

In [0]:
from pyspark.sql.functions import current_date, lit
from delta.tables import DeltaTable

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import current_date, lit

def scd_type2_merge(
    spark,
    source_df,
    target_table,
    business_key,
    tracked_columns,
    target_columns,
    start_date_col="effective_start_date",
    end_date_col="effective_end_date",
    active_col="is_active",
    open_end_date="9999-12-31"
):
    """Apply a Slowly Changing Dimension Type 2 merge to a Delta table.

    For every source row, in a single MERGE:

    * a business key with no active row in the target is inserted as a new
      active version;
    * a business key whose active target row differs in any tracked column
      has that row closed (end date set to the current date, active flag set
      to 'N') AND a new active version is inserted;
    * a business key whose active row is unchanged is left untouched.

    Args:
        spark: SparkSession used to resolve ``target_table``.
        source_df: DataFrame holding the latest snapshot of the entity.
            Presumably one row per business key; a key appearing more than
            once would make several source rows match the same target row.
        target_table: Name of the Delta dimension table to merge into.
        business_key: Column identifying the entity in both source and target.
        tracked_columns: Columns compared (null-safely) to detect a change.
        target_columns: Columns populated on insert, each taken from the
            same-named column of the staged source.
        start_date_col: Name of the version start-date column.
        end_date_col: Name of the version end-date column.
        active_col: Name of the 'Y'/'N' active-flag column.
        open_end_date: End date given to open (active) versions.

    Raises:
        ValueError: If ``tracked_columns`` is empty, since no change
            condition can be built from it.
    """
    # Function-scope import: `expr` is not among the notebook's cell-level imports.
    from pyspark.sql.functions import expr

    if not tracked_columns:
        raise ValueError("tracked_columns must contain at least one column name")

    # Helper column that exists only in the staged source, never in the target.
    merge_key_col = "__scd_merge_key"

    # Source rows decorated with the SCD-2 bookkeeping values of a new version.
    scd_df = (
        source_df
        .withColumn(active_col, lit("Y"))
        .withColumn(start_date_col, current_date())
        # Cast so the staged end date is a DATE like the start date.
        .withColumn(end_date_col, lit(open_end_date).cast("date"))
    )

    delta_target = DeltaTable.forName(spark, target_table)

    # Null-safe comparison: a plain `<>` evaluates to NULL when either side is
    # NULL, so a change to or from NULL would never be detected.
    change_condition = " OR ".join(
        f"NOT (t.{name} <=> s.{name})" for name in tracked_columns
    )

    # Every source row keyed by its business key: these rows either expire a
    # changed active row, insert a brand-new key, or do nothing.
    keyed_df = scd_df.withColumn(merge_key_col, scd_df[business_key])
    key_type = keyed_df.schema[merge_key_col].dataType

    # Source rows whose active target version changed, re-staged with a NULL
    # merge key. A NULL key can never match, so these rows take the insert
    # branch and become the new active version in the same MERGE that closes
    # the old one.
    new_versions_df = (
        keyed_df.alias("s")
        .join(
            delta_target.toDF().alias("t"),
            expr(f"t.{business_key} = s.{business_key} AND t.{active_col} = 'Y'"),
            "inner",
        )
        .where(expr(change_condition))
        .select("s.*")
        .withColumn(merge_key_col, lit(None).cast(key_type))
    )

    staged_df = keyed_df.unionByName(new_versions_df)

    # Only the active version of a key takes part in matching.
    merge_condition = f"""
        t.{business_key} = s.{merge_key_col}
        AND t.{active_col} = 'Y'
    """

    # Values applied to an active row that is being closed.
    update_set = {
        end_date_col: "current_date()",
        active_col: "'N'"
    }

    # Insert only the target columns (the helper merge key is never written).
    insert_set = {name: f"s.{name}" for name in target_columns}

    (
        delta_target.alias("t")
        .merge(staged_df.alias("s"), merge_condition)
        .whenMatchedUpdate(
            condition=change_condition,
            set=update_set
        )
        .whenNotMatchedInsert(values=insert_set)
        .execute()
    )


In [0]:
# Load the customer source table.
customer_df = spark.table("real_time_projects.ecommerce_historical.customers")

# Attributes compared between source and target to detect a changed customer.
customer_tracked_columns = [
    "customer_unique_id",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
]

# Columns written on insert: business key, tracked attributes, SCD bookkeeping.
customer_target_columns = (
    ["customer_id"]
    + customer_tracked_columns
    + ["effective_start_date", "effective_end_date", "is_active"]
)

# Merge the customer rows into dim_customer.
scd_type2_merge(
    spark=spark,
    source_df=customer_df,
    target_table="real_time_projects.ecommerce_historical.dim_customer",
    business_key="customer_id",
    tracked_columns=customer_tracked_columns,
    target_columns=customer_target_columns,
)


In [0]:
# Load the product source table.
product_df = spark.table("real_time_projects.ecommerce_historical.products")

# Attributes compared between source and target to detect a changed product.
product_tracked_columns = [
    "product_category_name",
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

# Columns written on insert: business key, tracked attributes, SCD bookkeeping.
product_target_columns = (
    ["product_id"]
    + product_tracked_columns
    + ["effective_start_date", "effective_end_date", "is_active"]
)

# Merge the product rows into dim_product.
scd_type2_merge(
    spark=spark,
    source_df=product_df,
    target_table="real_time_projects.ecommerce_historical.dim_product",
    business_key="product_id",
    tracked_columns=product_tracked_columns,
    target_columns=product_target_columns,
)


In [0]:
%sql

-- dim_customer: table-level write settings, then a layout pass.

-- Turn on the two Delta auto-optimize table properties.
ALTER TABLE real_time_projects.ecommerce_historical.dim_customer
  SET TBLPROPERTIES (
    delta.autoOptimize.optimizeWrite = true,
    delta.autoOptimize.autoCompact = true
  );

-- Compact the table and Z-order it by the business key used in joins.
OPTIMIZE real_time_projects.ecommerce_historical.dim_customer
  ZORDER BY (customer_id);

-- dim_product: table-level write settings, then a layout pass.

-- Turn on the two Delta auto-optimize table properties.
ALTER TABLE real_time_projects.ecommerce_historical.dim_product
  SET TBLPROPERTIES (
    delta.autoOptimize.optimizeWrite = true,
    delta.autoOptimize.autoCompact = true
  );

-- Compact the table and Z-order it by the business key used in joins.
OPTIMIZE real_time_projects.ecommerce_historical.dim_product
  ZORDER BY (product_id);
