In [0]:
# Make sure the target schema 'retail' is present before any table is registered in it.
# IF NOT EXISTS keeps this cell safe to re-run.
create_schema_stmt = "CREATE SCHEMA IF NOT EXISTS retail"
spark.sql(create_schema_stmt)

In [0]:
# Import necessary functions and types
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Define the Silver (source) and Gold (target) paths.
# silver_path points at the Delta folder that is read as the input of this notebook.
# gold_path is a folder prefix: it ends with "/" because table folder names
# (e.g. "CustomerDimension") are appended to it by plain string concatenation.
silver_path = "/mnt/Prajwal/Retail_sales_usecase/Silver/SilverCDetails"
gold_path = "/mnt/Prajwal/Retail_sales_usecase/gold/"

In [0]:
# Load the Silver customer details (Delta), then in a single chained expression:
#   1. cast customer_id to an integer, and
#   2. remove the ingestion_time column.
# NOTE(review): the column dropped here is "ingestion_time", while the dimension
# schema defined later in this notebook ends with a field named "ingest_time".
# Dropping a column that does not exist is a silent no-op in Spark, so confirm
# which name the Silver table actually carries.
silver_df = (
    spark.read.format("delta")
    .load(silver_path)
    .withColumn("customer_id", F.col("customer_id").cast(IntegerType()))
    .drop("ingestion_time")
)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType

# Schema for the Customer Dimension initial load (14 nullable columns).
# customer_id is the business key and the only integer column; every other
# column is declared as a string. registration_date is declared as a string
# here as well.
_string_columns = [
    "first_name",
    "last_name",
    "email",
    "gender",
    "address",
    "city",
    "state",
    "country",
    "zipcode",
    "contact_no",
    "registration_date",
    "membership_status",
    "ingest_time",
]

schemaa = StructType(
    [StructField("customer_id", IntegerType(), True)]  # business key
    + [StructField(column_name, StringType(), True) for column_name in _string_columns]
)



In [0]:
# Build the initial Customer Dimension frame from the Silver data.
#
# The Silver columns are mapped onto `schemaa` BY POSITION (same contract as the
# previous `spark.createDataFrame(silver_df.rdd, schemaa)`), but the rename and
# typing are now done with DataFrame operations:
#   * `toDF(...)` renames the columns positionally to the schema's field names
#     (and raises immediately if the column counts differ);
#   * each column is then cast to the data type declared in `schemaa`.
# This avoids converting the DataFrame to an RDD and back, which serialises
# every row through Python and relies on RDD APIs that are not available on
# every Databricks cluster access mode.
typed_columns = [
    F.col(field.name).cast(field.dataType).alias(field.name)
    for field in schemaa.fields
]

customer_df = (
    silver_df.toDF(*schemaa.fieldNames())
    .select(*typed_columns)
    # registration_date arrives as text in MM/dd/yyyy form and is parsed to a date.
    .withColumn("registration_date", F.to_date(F.col("registration_date"), "MM/dd/yyyy"))
    # SCD bookkeeping columns for the initial load.
    # NOTE(review): the effective start date is a hard-coded literal - confirm
    # that it is the intended load date before re-running this notebook.
    .withColumn("Effective_Start_Dt", F.lit("2025-04-04").cast("date"))
    .withColumn("Effective_End_Dt", F.lit(None).cast("date"))  # null end date on every row
    .withColumn("Is_Active", F.lit("Y"))
    .withColumn("status", F.lit("NI - Newly Inserted"))  # NI = Newly Inserted
)

In [0]:
# Attach a surrogate key column to the dimension rows.
# monotonically_increasing_id() yields unique, increasing 64-bit values, but
# they are not consecutive.
surrogate_key_col = F.monotonically_increasing_id()
customer_df = customer_df.withColumn("surrogate_key", surrogate_key_col)

In [0]:
# Persist the initial load as a Delta table under the Gold path.
# mode("overwrite") replaces whatever is already stored at that location.
(
    customer_df.write
    .format("delta")
    .mode("overwrite")
    .save(f"{gold_path}CustomerDimension")
)

In [0]:
# Read the Delta data back from the Gold location and show it, to confirm
# what was actually written.
customer_gold_df = spark.read.load(f"{gold_path}CustomerDimension", format="delta")
display(customer_gold_df)

In [0]:
# Expose the Delta files stored under the Gold path as the catalog table
# retail.CustomerDim. IF NOT EXISTS leaves an already-registered table untouched.
customer_dim_location = gold_path + "CustomerDimension"
spark.sql(
    "CREATE TABLE IF NOT EXISTS retail.CustomerDim "
    f"USING DELTA LOCATION '{customer_dim_location}'"
)

print("Initial Customer data saved to Gold layer and table registered.")

In [0]:
# Query the dimension through its catalog name (rather than by path) and show it.
customer_dimension_df = spark.read.table("retail.CustomerDim")
display(customer_dimension_df)

In [0]:
%sql

-- Row count of the registered dimension table
select count(*) from retail.CustomerDim;

In [0]:
# Inspect the column names of the in-memory dimension DataFrame
# (read-only check; nothing is written by this cell).
customer_df.columns