# **Step-by-Step Guide to SCD Types 1, 2, and 3**

**Step 1: Setup - Generate Sample Data Using Faker in Python**

Run this Python code in a notebook cell to generate fake customer data and save it as a Delta table.

In [0]:
%pip install faker

from faker import Faker
import pandas as pd

fake = Faker()

# Build 100 synthetic customer records with ids 1..100. Newlines in the
# generated address are flattened to ", " so each address is a single line.
data = [
    {
        "customer_id": customer_id,
        "name": fake.name(),
        "address": fake.address().replace('\n', ', '),
        "phone": fake.phone_number(),
    }
    for customer_id in range(1, 101)
]

df = pd.DataFrame(data)

# Write the baseline as a Delta table; "overwrite" replaces any earlier run.
baseline_sdf = spark.createDataFrame(df)
baseline_writer = baseline_sdf.write.format("delta").mode("overwrite")
baseline_writer.saveAsTable("customer_dim_base")

# Render the freshly written table through SQL.
baseline_query = "SELECT * FROM customer_dim_base"
display(spark.sql(baseline_query))

Step 2: Create initial dimension table with SCD Type 1 (Overwrite)

In [0]:
%sql
-- Create (or rebuild) the SCD Type 1 dimension as a full copy of the baseline.
-- Type 1 keeps no history, so the table has exactly the baseline's columns.
CREATE OR REPLACE TABLE customer_dim_scd1 AS
SELECT * FROM customer_dim_base;

-- Show the starting state before any updates are merged in.
SELECT * FROM customer_dim_scd1;

Step 3: Simulate an update source with changed addresses for some customers

Run Python to generate updated data with address changes

In [0]:
import random

# Build the "incoming" feed: every customer appears once, and roughly one in
# five receives a freshly generated address. Name and phone are copied from
# the baseline DataFrame unchanged.
updated_data = []
for customer_id in range(1, 101):
    baseline_row = df.loc[customer_id - 1]  # df rows are 0-based, ids are 1-based
    has_moved = random.random() < 0.2
    address = (
        fake.address().replace('\n', ', ')
        if has_moved
        else baseline_row['address']
    )
    updated_data.append({
        "customer_id": customer_id,
        "name": baseline_row['name'],
        "address": address,
        "phone": baseline_row['phone'],
    })

updated_df = pd.DataFrame(updated_data)

# Expose the feed to SQL cells as a session-scoped temporary view.
updates_sdf = spark.createDataFrame(updated_df)
updates_sdf.createOrReplaceTempView("customer_dim_updates")

# Side-by-side old/new values for the customers whose address changed.
changed_rows_sql = """
    SELECT 
        base.customer_id,
        base.name AS old_name,
        base.address AS old_address,
        base.phone AS old_phone,
        updates.name AS new_name,
        updates.address AS new_address,
        updates.phone AS new_phone
    FROM customer_dim_base base
    INNER JOIN customer_dim_updates updates
        ON base.customer_id = updates.customer_id
    WHERE base.address <> updates.address
"""
display(spark.sql(changed_rows_sql))

Step 4: SCD Type 1 update logic: overwrite existing rows

In [0]:
%sql
-- SCD Type 1 merge: overwrite changed attributes in place, keeping no history.
MERGE INTO customer_dim_scd1 AS target
USING customer_dim_updates AS source
ON target.customer_id = source.customer_id
-- Use null-safe comparisons: `<>` evaluates to NULL (not TRUE) when either
-- side is NULL, so a value changing to or from NULL would be skipped.
WHEN MATCHED AND (
  target.name IS DISTINCT FROM source.name OR
  target.address IS DISTINCT FROM source.address OR
  target.phone IS DISTINCT FROM source.phone
) THEN
  UPDATE SET
    name = source.name,
    address = source.address,
    phone = source.phone
-- Customers present only in the source are new; add them as-is. This mirrors
-- the WHEN NOT MATCHED branch of the SCD Type 3 merge later in this notebook.
WHEN NOT MATCHED THEN
  INSERT (customer_id, name, address, phone)
  VALUES (source.customer_id, source.name, source.address, source.phone);

Step 5: Create base SCD Type 2 table with versioning columns

In [0]:
%sql
-- SCD Type 2 dimension: one row per version of a customer. The three trailing
-- columns carry the versioning metadata that the later load and merge cells
-- populate.
CREATE OR REPLACE TABLE customer_dim_scd2 (
  -- NOTE(review): customer_dim_base is written from a pandas DataFrame of
  -- Python ints; confirm its customer_id type is compatible with INT here.
  customer_id INT,
  name STRING,
  address STRING,
  phone STRING,
  -- Date this version became effective.
  start_date DATE,
  -- Date this version was superseded; open versions are loaded with 9999-12-31.
  end_date DATE,
  -- TRUE for the open (latest) version; the merge sets it to FALSE on close.
  current_flag BOOLEAN
)
USING DELTA;

Step 6: Insert initial records for SCD Type 2

In [0]:
%sql
-- Seed the SCD Type 2 table: every baseline customer becomes an open version
-- that starts today, has the sentinel end date, and is flagged as current.
INSERT INTO customer_dim_scd2
SELECT
  customer_id,
  name,
  address,
  phone,
  current_date() AS start_date,
  DATE('9999-12-31') AS end_date,
  TRUE AS current_flag
FROM customer_dim_base;

Step 7: SCD Type 2 MERGE logic for change detection and history preservation

In [0]:
%sql
-- SCD Type 2 merge: close the current version of every changed customer and
-- insert a new current version, preserving the old row as history.
--
-- A single MERGE can apply only one action per source row, so each changed
-- customer is staged twice:
--   * merge_key = customer_id -> matches the target and closes the open row
--   * merge_key = NULL        -> never matches, so it falls through to INSERT
-- customer_id is carried as its own column in both branches; inserting
-- merge_key instead would store NULL as the id of every new version.
MERGE INTO customer_dim_scd2 AS target
USING (
  -- Branch 1: every incoming row, keyed so it can match existing customers.
  -- Rows for brand-new customers find no match and are inserted directly.
  SELECT
    customer_id AS merge_key,
    customer_id,
    name,
    address,
    phone,
    current_date() AS effective_date
  FROM customer_dim_updates

  UNION ALL

  -- Branch 2: only customers whose current version differs from the incoming
  -- row, with a NULL key to force the NOT MATCHED (insert) path.
  SELECT
    NULL AS merge_key,
    u.customer_id,
    u.name,
    u.address,
    u.phone,
    current_date() AS effective_date
  FROM customer_dim_updates u
  INNER JOIN customer_dim_scd2 t
    ON u.customer_id = t.customer_id
  WHERE t.current_flag = TRUE
    AND (
      -- Null-safe: `<>` would yield NULL and hide changes to or from NULL.
      t.name IS DISTINCT FROM u.name OR
      t.address IS DISTINCT FROM u.address OR
      t.phone IS DISTINCT FROM u.phone
    )
) AS staged_updates
ON target.customer_id = staged_updates.merge_key

-- Close the open version; historical rows (current_flag = FALSE) are left alone.
WHEN MATCHED AND target.current_flag = TRUE AND (
  target.name IS DISTINCT FROM staged_updates.name OR
  target.address IS DISTINCT FROM staged_updates.address OR
  target.phone IS DISTINCT FROM staged_updates.phone
) THEN
  UPDATE SET
    end_date = staged_updates.effective_date,
    current_flag = FALSE

-- Insert the new open version (or a brand-new customer) with its real id.
WHEN NOT MATCHED THEN
  INSERT (
    customer_id, name, address, phone, start_date, end_date, current_flag
  )
  VALUES (
    staged_updates.customer_id,
    staged_updates.name,
    staged_updates.address,
    staged_updates.phone,
    staged_updates.effective_date,
    DATE('9999-12-31'),
    TRUE
  )
;

In [0]:
# Show every version row in the SCD Type 2 table after the merge.
scd2_rows = spark.sql("SELECT * FROM customer_dim_scd2")
display(scd2_rows)

Step 8: SCD Type 3 example: Limited history with old value columns

Start with a base SCD3 table including old address and phone

In [0]:
%sql
-- SCD Type 3 dimension: one row per customer, with a single "previous value"
-- column beside each tracked attribute (address and phone).
CREATE OR REPLACE TABLE customer_dim_scd3 (
  customer_id INT,
  name STRING,
  address STRING,
  -- Previous address; the initial load leaves this NULL.
  old_address STRING,
  phone STRING,
  -- Previous phone; the initial load leaves this NULL.
  old_phone STRING
)
USING DELTA;

Step 9: Insert initial data for SCD3

In [0]:
%sql
-- Seed the SCD Type 3 table from the baseline. No customer has a prior value
-- yet, so both "old" columns start as NULL.
INSERT INTO customer_dim_scd3
SELECT
  customer_id,
  name,
  address,
  NULL AS old_address,
  phone,
  NULL AS old_phone
FROM customer_dim_base;

Step 10: SCD Type 3 Merge example: update current + store old values on change

In [0]:
%sql
-- SCD Type 3 merge: keep the current value plus one previous value for each
-- tracked attribute (address, phone).
-- NOTE(review): name is neither compared nor updated for matched rows;
-- confirm that ignoring name changes is intended.
MERGE INTO customer_dim_scd3 AS target
USING customer_dim_updates AS source
ON target.customer_id = source.customer_id
WHEN MATCHED AND (
    (target.address IS DISTINCT FROM source.address) OR
    (target.phone IS DISTINCT FROM source.phone)
) THEN
  UPDATE SET
    -- Shift a value into its old_* column only when that attribute actually
    -- changed. Otherwise an address-only change would overwrite old_phone
    -- with the unchanged current phone and lose the stored previous phone
    -- (and vice versa).
    old_address = CASE
      WHEN target.address IS DISTINCT FROM source.address THEN target.address
      ELSE target.old_address
    END,
    address = source.address,
    old_phone = CASE
      WHEN target.phone IS DISTINCT FROM source.phone THEN target.phone
      ELSE target.old_phone
    END,
    phone = source.phone
-- New customers have no history yet, so their old_* columns start as NULL.
WHEN NOT MATCHED THEN
  INSERT (customer_id, name, address, old_address, phone, old_phone)
  VALUES (source.customer_id, source.name, source.address, NULL, source.phone, NULL);

In [0]:
# Show the SCD Type 3 table, including the old_address / old_phone columns.
scd3_rows = spark.sql("SELECT * FROM customer_dim_scd3")
display(scd3_rows)

Drop all tables and clean up resources

In [0]:
# Delta tables created by this notebook. Every entry needs its own trailing
# comma: adjacent string literals are silently concatenated by Python, which
# previously merged two names into one nonexistent table and left both
# customer_dim_scd1 and customer_dim_scd2 undropped.
tables_to_drop = [
    "customer_dim_base",
    "customer_dim_scd1",
    "customer_dim_scd2",
    "customer_dim_scd3",
]

# customer_dim_updates was registered with createOrReplaceTempView, so it is a
# temporary view rather than a table and is removed through the catalog API.
temp_views_to_drop = ["customer_dim_updates"]

for tbl in tables_to_drop:
    spark.sql(f"DROP TABLE IF EXISTS {tbl}")

for view_name in temp_views_to_drop:
    spark.catalog.dropTempView(view_name)

# Confirm deletion: an empty result means none of the objects remain. The IN
# list is built from the lists above so the two can never drift apart; the
# names are hard-coded constants, so interpolating them is safe.
dropped_names = ", ".join(
    f"'{name}'" for name in tables_to_drop + temp_views_to_drop
)
display(spark.sql(f"""
    SELECT *
    FROM information_schema.tables
    WHERE table_name IN ({dropped_names})
"""))