In [0]:
# Import necessary functions and types
from pyspark.sql import functions as F
from pyspark.sql.types import *
from delta.tables import DeltaTable
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.window import Window

In [0]:
# Input (Silver customer source) and output (Gold root) storage paths.
silver_path = "/mnt/Prajwal/Capstone_Project/silver/bankcustomer_source_clone1"
gold_path = "/mnt/Prajwal/Capstone_Project/Gold_clone/"

# Read the customer source stored in Delta format and preview it.
delta_reader = spark.read.format("delta")
df_cust = delta_reader.load(silver_path)
display(df_cust)

In [0]:
# Add a surrogate key (Customer_Key) numbered 1..N to the customer DataFrame.
#
# Step 1: tag each row with monotonically_increasing_id(). The id is a 64-bit
# value that encodes the partition id in its upper bits, so it is kept as-is and
# NOT narrowed to a 32-bit integer: narrowing can overflow (an error under ANSI
# mode, silent wrap-around otherwise) and would make rows from different
# partitions tie on the ordering column.
customer_df = df_cust.withColumn("Customer_Key", monotonically_increasing_id())

# Step 2: replace the temporary id with a dense row number. row_number() already
# starts at 1, so no "+ 1" adjustment is needed.
# NOTE: a window without partitionBy moves all rows into a single partition.
window_spec = Window.orderBy("Customer_Key")
customer_df = customer_df.withColumn("Customer_Key", F.row_number().over(window_spec))

# NOTE(review): keys are assigned before the later dropDuplicates(["customer_id"])
# step, so the keys that survive de-duplication may not be contiguous.
display(customer_df)

In [0]:
# Keep a single row per customer_id; rows sharing a customer_id are collapsed.
dedup_columns = ["customer_id"]
customer_df = customer_df.dropDuplicates(dedup_columns)
display(customer_df)

In [0]:
# Attach the record-tracking columns: every row starts today, has no end date,
# and is flagged as active with status "New".
start_date_col = F.current_date().cast("date")
open_end_date_col = F.lit(None).cast("date")

customer_df = (
    customer_df
    .withColumn("Start_date", start_date_col)
    .withColumn("End_date", open_end_date_col)
    .withColumn("Is_Active", F.lit("Y"))
    .withColumn("status", F.lit("New"))
)
display(customer_df)

In [0]:
# Columns of the customer dimension, in their stored order.
dim_customer_columns = [
    'Customer_Key',
    'customer_id',
    'name',
    'city',
    'phone_no',
    'area_code',
    'maritial_status',
    'gender',
    'DOB',
    'age',
    'email',
    'Start_date',
    'End_date',
    'Is_Active',
    'status',
]
customer_df = customer_df.select(*dim_customer_columns)

# Write the dimension in Delta format to the Gold location.
# mode("overwrite") replaces whatever data is already stored there; it does not append.
dim_writer = customer_df.write.format("delta").mode("overwrite")
dim_writer.save(gold_path + "dim_customer")


In [0]:
# Re-read the stored customer dimension, drop rows that are identical across
# every column, write the result back over the same location, and preview it.
dim_customer_path = gold_path + "dim_customer"
dim_customer_df = spark.read.format("delta").load(dim_customer_path).dropDuplicates()
dim_customer_df.write.format("delta").mode("overwrite").save(dim_customer_path)
display(dim_customer_df)

In [0]:
# Remove any existing table definition so the CREATE TABLE below starts clean.
drop_customer_dim_sql = "DROP TABLE IF EXISTS Prajwal.customer_dim"
spark.sql(drop_customer_dim_sql)

In [0]:
# Create the Prajwal.customer_dim table (if it does not exist) over the Delta
# files of the customer dimension. The location is derived from gold_path, the
# same expression used when the dimension is written, so the table definition
# cannot drift from the write path.
dim_customer_location = gold_path + "dim_customer"
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal.customer_dim
    USING DELTA
    LOCATION '{dim_customer_location}'
""")

# Display the contents of the table
display(spark.sql("SELECT * FROM Prajwal.customer_dim"))

In [0]:
# Show the table contents sorted by the surrogate key.
ordered_dim_query = "SELECT * FROM Prajwal.customer_dim order by Customer_Key"
ordered_dim_df = spark.sql(ordered_dim_query)
display(ordered_dim_df)