In [0]:
# Duplicate Key Check Before Merge
# Before inserting into dim_product, check stg_dim_product for duplicate or inconsistent records to prevent conflicts.
# If any rows are returned, multiple records exist for the same product_id.
# Deduplicate or validate those records before merging.

from pyspark.sql import functions as F

# Read the Silver-layer product details (swap in your own staging path if needed).
product_df = spark.read.load(
    "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverPDetails",
    format="delta",
)

# Count rows per product_id, then keep only the ids that occur more than once.
rows_per_product = product_df.groupBy("product_id").agg(F.count("*").alias("count"))
duplicate_product_ids = rows_per_product.where(F.col("count") > 1)

# Print each duplicated id with its row count, without truncating values.
duplicate_product_ids.show(truncate=False)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id
from delta.tables import DeltaTable

# Paths
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverPDetails"
# NOTE(review): gold_path has no trailing "/", so gold_path + "ProductDimension"
# in the write/read/register cells resolves to ".../GoldProductDimension" (a
# sibling of the Gold folder), presumably not the intended ".../Gold/ProductDimension".
# Left unchanged here because the catalog table is created with IF NOT EXISTS and
# would keep pointing at the old location; fix together with the table LOCATION.
gold_path = "/mnt/Prajwal/Retail_sales_usecase/Gold"

# Start date stamped on every row of this initial load.
EFFECTIVE_START_DATE = "2025-04-12"

# Define the schema for the Product Dimension (without 'is_high_value_flag' and 'ingestion_time_formatted')
# NOTE(review): this schema is only declared; nothing in this cell applies it, so
# any extra Silver columns are carried through as-is — confirm whether a
# select/cast to this schema was intended.
product_schema = StructType([
    StructField("product_id", IntegerType()),       # Business Key (Natural Key)
    StructField("product_name", StringType()),
    StructField("category", StringType()),
    StructField("brand", StringType()),
    StructField("in_stock", IntegerType()),         # Assuming in_stock is integer
    StructField("ingestion_time", TimestampType())  # Include the ingestion_time field from Silver schema
])

# Load the Silver data in this cell (via silver_path) rather than relying on a
# product_df left over from an earlier cell, so the cell also works on a fresh
# kernel and re-running it always starts from the unmodified Silver rows.
product_silver_df = spark.read.format("delta").load(silver_path)

# Apply transformations: add the SCD bookkeeping columns.
product_df = (
    product_silver_df
    .withColumn("Effective_Start_Dt", F.lit(EFFECTIVE_START_DATE).cast("date"))
    .withColumn("Effective_End_Dt", F.lit(None).cast("date"))  # open-ended: no end date yet
    .withColumn("Is_Active", F.lit("Y"))
    .withColumn("status", F.lit("NI - Newly Inserted"))  # NI = Newly Inserted
)

display(product_df)

In [0]:
# Add Surrogate Key.
# monotonically_increasing_id() returns a 64-bit value with the partition ID in
# the upper 31 bits and the per-partition record number in the lower 33 bits.
# Casting that to IntegerType discards the partition bits, so rows from
# different partitions receive the same key (or the cast fails under ANSI mode).
# row_number() over a global ordering yields a unique, dense 1..N integer key.
from pyspark.sql.window import Window

# An un-partitioned window pulls all rows into a single partition; acceptable
# for a dimension-sized table, but revisit if the product data grows large.
product_key_window = Window.orderBy("product_id")

product_df = product_df.withColumn(
    "Product_Key",
    F.row_number().over(product_key_window).cast(IntegerType()),
)

In [0]:
# Persist the product dimension as a Delta dataset under the Gold path prefix,
# replacing whatever was stored there before.
product_df.write.save(
    path=gold_path + "ProductDimension",
    format="delta",
    mode="overwrite",
)

In [0]:
# Read the Delta dataset back from the Gold location and show it for inspection.
product_gold_df = spark.read.load(gold_path + "ProductDimension", format="delta")
display(product_gold_df)

In [0]:
# Expose the Gold Delta files as retail.productdimension in the catalog.
# IF NOT EXISTS makes this a no-op when the table is already registered.
spark.sql(
    "CREATE TABLE IF NOT EXISTS retail.productdimension "
    f"USING DELTA LOCATION '{gold_path}ProductDimension'"
)

print("Product data saved to Gold layer and table registered.")

In [0]:
%sql
-- Row count of the registered product dimension table.
select count(*) from retail.productdimension