In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Base folders for the silver and gold layers. Both end with a slash so a
# dataset name can be appended by plain string concatenation.
silver_path, gold_path = (
    "/mnt/mock_prajwal/Mock2/silver/",
    "/mnt/mock_prajwal/Mock2/gold/",
)

In [0]:
# Read the FF_Customer_Details_Day1 Delta dataset from the silver folder and
# print its schema.
customer_details_location = silver_path + "FF_Customer_Details_Day1"
df = (
    spark.read
    .format("delta")
    .load(customer_details_location)
)
df.printSchema()

In [0]:
# Row count of the FF_Customer_Details_Day1 frame loaded above (this is a Spark
# action, so it triggers a read of the dataset).
df.count()

In [0]:
# Read the CustContact Delta dataset from the silver folder and print its schema.
cust_contact_location = silver_path + "CustContact"
df_df = (
    spark.read
    .format("delta")
    .load(cust_contact_location)
)
df_df.printSchema()

In [0]:
# Left join on customer_id: every row of df is kept, and the columns coming
# from df_df are null where no matching customer_id exists.
joined_df = df.join(df_df, "customer_id", "left")

display(joined_df)

In [0]:
from pyspark.sql.functions import col

# Columns carried forward from the joined frame, in output order.
dimension_columns = [
    "customer_id",
    "customer_name",
    "customer_dob",
    "customer_marital_status",
    "gender",
    "email",
    "phone_number",
    "country",
    "region",
    "state",
    "city",
    "postal_code",
]

# One row per customer_id. dropDuplicates does not let us choose which of
# several rows with the same customer_id is kept.
deduplicated_customers = (
    joined_df
    .select(*[col(column_name) for column_name in dimension_columns])
    .dropDuplicates(["customer_id"])
)

# Number the rows 1..N in customer_id order. The window has no partitionBy, so
# Spark evaluates it over the whole frame as a single group.
customer_id_order = Window.orderBy("customer_id")
df_selected = deduplicated_customers.withColumn(
    "customer_key", row_number().over(customer_id_order)
)

display(df_selected)

In [0]:
# Print the column names and types of the de-duplicated, keyed customer frame.
df_selected.printSchema()

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# (column name, Spark type, nullable) for every Dim_Customer column: the
# business attributes first, then the surrogate key and the audit columns.
# The same columns, in the same order, appear in the CREATE TABLE statement below.
dim_customer_fields = [
    ("customer_id", IntegerType(), True),
    ("customer_name", StringType(), True),
    ("customer_dob", DateType(), True),
    ("customer_marital_status", StringType(), True),
    ("gender", StringType(), True),
    ("email", StringType(), True),
    ("phone_number", StringType(), True),
    ("country", StringType(), True),
    ("region", StringType(), True),
    ("state", StringType(), True),
    ("city", StringType(), True),
    ("postal_code", StringType(), True),
    ("customer_key", IntegerType(), False),
    ("start_date", TimestampType(), False),
    ("end_date", TimestampType(), True),
    ("is_active", BooleanType(), False),
    ("last_modified", TimestampType(), False),
]

schema = StructType(
    [
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in dim_customer_fields
    ]
)

# Make sure the database exists before the table is registered in it.
create_schema_sql = "CREATE SCHEMA IF NOT EXISTS Prajwal_Mock"
spark.sql(create_schema_sql)

# Register the Delta table at a fixed storage location. IF NOT EXISTS makes the
# statement a no-op when the table is already registered.
create_dim_customer_sql = """
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Dim_Customer (
        customer_id INT,
        customer_name STRING,
        customer_dob DATE,
        customer_marital_status STRING,
        gender STRING,
        email STRING,
        phone_number STRING,
        country STRING,
        region STRING,
        state STRING,
        city STRING,
        postal_code STRING,
        customer_key INT NOT NULL,
        start_date TIMESTAMP NOT NULL,
        end_date TIMESTAMP,
        is_active BOOLEAN NOT NULL,
        last_modified TIMESTAMP NOT NULL
    )
        USING DELTA 
    LOCATION "/mnt/mock_prajwal/Mock2/gold/Dim_Customer_test"
"""
spark.sql(create_dim_customer_sql)

In [0]:
# Type-2 style load of Prajwal_Mock.Dim_Customer from df_selected:
#   1. give every source row a candidate surrogate key above the current maximum,
#   2. expire (end-date and deactivate) active target rows whose tracked attributes changed,
#   3. append every source row that has no active target row after step 2.

# Load target table
target_df = spark.read.table("Prajwal_Mock.Dim_Customer")

# Highest surrogate key already stored; 0 when the table is empty or the
# column is missing.
if "customer_key" in target_df.columns:
    max_key = target_df.agg(coalesce(F.max("customer_key"), lit(0))).collect()[0][0]
else:
    max_key = 0

# Number the source rows 1..N in customer_id order and shift them past max_key.
# The window has no partitionBy, so it is evaluated over the whole frame at once.
window_spec = Window.orderBy("customer_id")

source_df_keyed = (
    df_selected
    .withColumn("rn", row_number().over(window_spec))
    .withColumn("customer_key", col("rn") + lit(max_key))
    .drop("rn")
)

# Add audit columns to source: a new row starts now, is open-ended and active.
source_df_audit_col = (
    source_df_keyed
    .withColumn("start_date", current_timestamp())
    .withColumn("end_date", lit(None).cast("timestamp"))
    .withColumn("is_active", lit(True))
    .withColumn("last_modified", current_timestamp())
)

# Convert target to DeltaTable
target_table = DeltaTable.forName(spark, "Prajwal_Mock.Dim_Customer")

# Merge condition (on business key and active rows)
merge_condition = "target.customer_id = source.customer_id AND target.is_active = true"

# Attributes whose change closes the current version of a customer.
# NOTE(review): email, phone_number, country, region, state and postal_code are
# not compared, so a change in only those columns produces no new version.
tracked_columns = [
    "customer_name",
    "city",
    "customer_marital_status",
    "gender",
    "customer_dob",
]

# `<=>` is the null-safe equality operator. A plain `!=` evaluates to NULL when
# either side is NULL, so a change from or to NULL would never be detected.
change_condition = " OR ".join(
    f"NOT (target.{column_name} <=> source.{column_name})"
    for column_name in tracked_columns
)

# Perform Expiration: close the active version of every changed customer.
(
    target_table.alias("target")
    .merge(source_df_audit_col.alias("source"), merge_condition)
    .whenMatchedUpdate(
        condition=change_condition,
        set={
            "end_date": current_timestamp(),
            "is_active": lit(False),
            "last_modified": current_timestamp(),
        },
    )
    .execute()
)

# Rows to insert: every source customer without an active target row. After
# the expiration above that covers both customers never seen before and
# customers whose previous version was just closed.
updated_target_df = (
    spark.read.table("Prajwal_Mock.Dim_Customer")
    .filter("is_active=true")
    .select("customer_id")
)

# localCheckpoint() computes the rows now and cuts the lineage. Without it
# insert_df stays a lazy anti-join against the table being appended to, and any
# later action on it (display, count, another write) would re-run the join
# after the append and see different rows.
insert_df = (
    source_df_audit_col
    .join(updated_target_df, on="customer_id", how="left_anti")
    .localCheckpoint()
)

insert_df.write.format("delta").mode("append").save("/mnt/mock_prajwal/Mock2/gold/Dim_Customer_test")

In [0]:
# Show the keyed source rows with their audit columns.
display(source_df_audit_col) # author's note: 3 records (presumably the row count seen in an earlier run)

In [0]:
# Show the customer_id values that currently have an active row in the target table.
display(updated_target_df) # author's note: 1130 (presumably the row count seen in an earlier run)

In [0]:
# Show the source rows selected for insertion (no active target row for their customer_id).
display(insert_df)

In [0]:
# NOTE(review): same call as the cell directly above; looks like a leftover duplicate.
display(insert_df)

In [0]:
# Show the full contents of the registered Dim_Customer table through SQL.
display(spark.sql("select * from Prajwal_Mock.Dim_Customer"))

In [0]:
# Show the Delta files at the Dim_Customer_test location directly by path; this
# is the same path given as LOCATION when the table was created.
display(spark.read.format("delta").load("/mnt/mock_prajwal/Mock2/gold/Dim_Customer_test"))

In [0]:
# Show the registered Dim_Customer table through the DataFrame reader.
display(spark.read.table("Prajwal_Mock.Dim_Customer"))

In [0]:
# Replace whatever is stored at the gold Dim_Customer path with the rows of insert_df.
# NOTE(review): "overwrite" discards the existing contents of that path, and
# insert_df holds only the rows selected for insertion in this run, not the
# whole dimension. Confirm that this is intended.
dim_customer_output_location = "/mnt/mock_prajwal/Mock2/gold/Dim_Customer"
(
    insert_df.write
    .format("delta")
    .mode("overwrite")
    .save(dim_customer_output_location)
)
display(insert_df)

In [0]:
# Read back the Delta dataset at the gold Dim_Customer path and show it.
# NOTE(review): this rebinds `df`, which earlier held the silver
# FF_Customer_Details_Day1 frame; re-running an earlier cell after this one
# would pick up the gold data instead.
dim_customer_location = "/mnt/mock_prajwal/Mock2/gold/Dim_Customer"
df = (
    spark.read
    .format("delta")
    .load(dim_customer_location)
)
display(df)

In [0]:
# Expired rows: inactive, end-dated, and modified within the last day. The
# one-day window is relative to the time this cell runs, so it can also include
# rows expired by earlier runs inside that window.
expired_rows_filter = "is_active = false AND end_date IS NOT NULL AND last_modified >= current_timestamp() - interval 1 day"
updated_rows = target_table.toDF().filter(expired_rows_filter)

# Inserted rows are taken to be the frame that was appended to the table.
inserted_rows = insert_df

# Count both sets (updated first, then inserted).
updated_count, inserted_count = updated_rows.count(), inserted_rows.count()

# Show the rows themselves, followed by a one-row summary of the two counts.
display(updated_rows)
display(inserted_rows)
count_summary = spark.createDataFrame(
    [(updated_count, inserted_count)],
    schema=["Updated Rows Count", "Inserted Rows Count"],
)
display(count_summary)

In [0]:
# Remove Prajwal_Mock.Dim_Customer from the catalog if it is registered.
# NOTE(review): the table was created with an explicit LOCATION, so the Delta
# files under that path presumably remain after the drop. Confirm.
spark.sql("DROP TABLE IF EXISTS Prajwal_Mock.Dim_Customer")

In [0]:
def _load_delta(location):
    """Return a DataFrame over the Delta dataset stored at ``location``."""
    return spark.read.format("delta").load(location)


# Row count of the gold Dim_Customer dataset.
gold_count = _load_delta("/mnt/mock_prajwal/Mock2/gold/Dim_Customer").count()

# Silver customer-detail datasets for day 0 and day 1, and their row counts.
silver_df_day0 = _load_delta(silver_path + "FF_Customer_Details_Day0")
silver_df_day1 = _load_delta(silver_path + "FF_Customer_Details_Day1")

silver_count_day0 = silver_df_day0.count()
silver_count_day1 = silver_df_day1.count()

# One-row comparison of the three counts.
layer_counts = spark.createDataFrame(
    [(silver_count_day0, silver_count_day1, gold_count)],
    schema=["Silver Layer Day 0 Count", "Silver Layer Day 1 Count", "Gold Layer Count"],
)
display(layer_counts)

In [0]:
# Load the gold Dim_Customer Delta dataset by path and show its contents.
gold_df = (
    spark.read
    .format("delta")
    .load("/mnt/mock_prajwal/Mock2/gold/Dim_Customer")
)
display(gold_df)