In [0]:
# Location of the Silver-layer Customer Insights Delta table.
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverCInsights"

# Read the Silver Delta table and render it in the cell output.
source_df = spark.read.load(silver_path, format="delta")
display(source_df)

In [0]:
# Replace customer_id with an integer-typed version of itself.
source_df = source_df.withColumn(
    "customer_id",
    source_df.customer_id.cast("integer"),
)


In [0]:
"""
/* Referential integrity check for dim_customer_insights: ensure that every customer_id exists in dim_customer before merging.

The check identifies orphaned records in stg_dim_customer_insights, i.e. rows whose customer_id does not exist in dim_customer.

If any rows are returned, stg_dim_customer_insights contains customer_id values that have no matching entry in dim_customer.

*/

--  Investigate the Missing Customer
/* Possible reasons:
The customer was never inserted into dim_customer.
The customer was deleted from dim_customer but still exists in insights.
The customer exists with is_current = FALSE (historical record only).
*/ """

In [0]:
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverCInsights"
gold_path = "/mnt/Prajwal/Retail_sales_usecase/GoldCustomerDimension"

# Silver Customer Insights rows as prepared by the preceding cells.
silver_customer_insights = source_df
# Gold customer dimension, used as the reference set of known customer_id values.
gold_customer = spark.read.format("delta").load(gold_path)

# Orphan check: keep only the Silver rows whose customer_id has no match in
# the Gold customer table. A left_anti join returns exactly those rows (with
# the Silver columns only), so there is no need to build the full left join
# and then filter on a NULL right-hand key.
missing_customers = silver_customer_insights.join(
    gold_customer,
    on="customer_id",
    how="left_anti",
).select("customer_id")

display(missing_customers)

In [0]:
# Safe option: ignore orphaned records.
# A left_semi join keeps only the Silver rows whose customer_id also appears
# in the Gold customer table, and returns the Silver columns only.
valid_customers_df = silver_customer_insights.join(gold_customer, "customer_id", "left_semi")

# Report how many Silver rows survived the referential-integrity filter.
print(
    "Total valid records in Customer Insights which has matching entry in Customer Gold table:",
    valid_customers_df.count(),
)


In [0]:
# Customer Insights Gold Table

from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, BooleanType
from delta.tables import DeltaTable
from pyspark.sql.functions import current_timestamp, lit

# Schema of the Customer Insights dimension. Every field is nullable and there
# is deliberately no ingestion_timestamp column.
insights_schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("shopping_frequency", IntegerType(), True)
    .add("total_spent", DoubleType(), True)
    .add("lifetime_value", DoubleType(), True)
    .add("page_views", IntegerType(), True)
    .add("time_spent_online", DoubleType(), True)
    .add("cart_dropout_rate", DoubleType(), True)
    .add("avg_order_gap_days", DoubleType(), True)
    .add("total_orders", IntegerType(), True)
    .add("customer_segment", StringType(), True)
    .add("start_date", TimestampType(), True)
    .add("end_date", TimestampType(), True)
    .add("is_active", BooleanType(), True)
)

In [0]:
# Derive a numeric code from the categorical order_frequency label:
#   "Seldom" -> 1, "Monthly" -> 12, "Weekly" -> 52.
# Any other value (including NULL) falls through to 0.
silver_customer_insights = valid_customers_df.withColumn(
    "order_frequency_numeric",
    F.when(valid_customers_df["order_frequency"] == "Seldom", F.lit(1))
    .when(valid_customers_df["order_frequency"] == "Monthly", F.lit(12))
    .when(valid_customers_df["order_frequency"] == "Weekly", F.lit(52))
    .otherwise(F.lit(0)),
)

In [0]:
# Calculate avg_order_gap_days as 365 divided by the numeric order frequency.
# order_frequency_numeric is 0 for every unmapped frequency label, so the
# division is guarded: with ANSI mode enabled Spark raises a divide-by-zero
# error, and with it disabled the result is NULL. The guard yields NULL in
# both modes instead of failing the job.
silver_customer_insights = silver_customer_insights.withColumn(
    "avg_order_gap_days",
    F.when(
        F.col("order_frequency_numeric") > 0,
        365 / F.col("order_frequency_numeric"),
    ),
)

# Derive total_orders as order_frequency_numeric * 12.
# NOTE(review): the original comment said "assuming 12 orders per year for each
# frequency", but order_frequency_numeric already maps Monthly -> 12 and
# Weekly -> 52, so the extra factor of 12 looks suspicious. Formula left
# unchanged; confirm the intended business rule.
silver_customer_insights = silver_customer_insights.withColumn(
    "total_orders",
    F.col("order_frequency_numeric") * 12,
)

In [0]:
# Assign customer segments based on behavior. Rules are evaluated in order:
#   1. order_frequency_numeric > 10        -> "High Frequency"
#   2. average_order_value (double) > 1000 -> "High Value"
#   3. everything else                     -> "Low Value"
source_df = silver_customer_insights.withColumn(
    "customer_segment",
    F.when(F.col("order_frequency_numeric") > 10, F.lit("High Frequency"))
     .when(F.col("average_order_value").cast("double") > 1000, F.lit("High Value"))
     .otherwise(F.lit("Low Value"))
)

# Populate total_spent and lifetime_value as doubles. These two statements were
# previously duplicated verbatim; the second copy was a no-op and is removed.
# NOTE(review): total_spent is filled from average_order_value, not from an
# actual spend total -- confirm this proxy is intended.
source_df = source_df.withColumn("total_spent", F.col("average_order_value").cast("double"))
source_df = source_df.withColumn("lifetime_value", F.col("customer_lifetime_value").cast("double"))

display(source_df)

In [0]:
%python
from pyspark.sql.functions import current_timestamp, lit
from pyspark.sql.types import TimestampType
from delta.tables import DeltaTable

# Load Gold dimension table
# NOTE(review): gold_path is the Gold *customer* dimension path, and the merge
# source below (gold_customer) is a DataFrame loaded from that same gold_path
# in an earlier cell. As written, this cell merges the customer dimension into
# itself; neither source_df (the derived insights) nor insights_schema is used
# here. This looks copied from the customer-dimension notebook -- confirm the
# intended target table and source DataFrame before running.
dim_table = DeltaTable.forPath(spark, gold_path)

# SCD Type 2 merge condition: match on customer_id and ensure the existing record is active
merge_condition = "dim.customer_id = src.customer_id AND dim.Is_Active = true"

# Define actions for update and insert during the merge process
# Matched rows are closed out: end date set to now, active flag cleared.
update_action = {
    "Effective_End_Dt": current_timestamp(),
    "Is_Active": lit(False)
}

# Unmatched source rows are inserted as new active versions: start date set to
# now, end date NULL, every other attribute copied from the source row.
insert_action = {
    "customer_id": "src.customer_id",
    "first_name": "src.first_name",
    "last_name": "src.last_name",
    "email": "src.email",
    "gender": "src.gender",
    "address": "src.address",
    "city": "src.city",
    "state": "src.state",
    "country": "src.country",
    "zipcode": "src.zipcode",
    "contact_no": "src.contact_no",
    "registration_date": "src.registration_date",
    "membership_status": "src.membership_status",
    "ingest_time": "src.ingest_time",
    "Effective_Start_Dt": current_timestamp(),
    "Effective_End_Dt": lit(None).cast(TimestampType()),
    "Is_Active": lit(True),
    "status": "src.status",
    "surrogate_key": "src.surrogate_key"
}

# Perform the SCD Type 2 merge operation
# NOTE(review): whenMatchedUpdate has no change-detection condition, so every
# active target row that has a matching source customer_id is expired, whether
# or not any attribute changed. No replacement version is inserted for a
# matched row, because the insert clause only fires for unmatched source rows.
# With the source being the target table itself, this would presumably
# deactivate all currently active customers (or fail if several source rows
# match one target row) -- verify before executing.
dim_table.alias("dim").merge(
    source=gold_customer.alias("src"),
    condition=merge_condition
).whenMatchedUpdate(set=update_action) \
 .whenNotMatchedInsert(values=insert_action) \
 .execute()

# Show the result after merge schema true
# Re-reads the table at gold_path so the post-merge state is displayed.
# NOTE(review): mergeSchema is normally a write-side Delta option; it presumably
# has no effect on this read -- confirm and drop if so.
display(spark.read.format("delta").option("mergeSchema", "true").load(gold_path))