In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id, col, count, row_number, lit, current_timestamp, coalesce, max
from pyspark.sql.window import Window
from delta.tables import DeltaTable

In [0]:
# Root folders of the two storage layers used by this notebook. Both end with
# "/" so a table folder name can be appended by plain string concatenation.
gold_path = "/mnt/mock_prajwal/Mock2/gold/"      # layer this notebook writes to
silver_path = "/mnt/mock_prajwal/Mock2/silver/"  # layer this notebook reads from

In [0]:
# Load the silver-layer customer details Delta folder; `df` is the source for
# the sales-rep dimension built in the following cell.
df = (
    spark.read
    .format("delta")
    .load(silver_path + "FF_Customer_Details_Day0")
)

# Print the column names and types of the loaded data.
df.printSchema()

In [0]:
# Build the sales-rep dimension: one row per sales_rep_id, plus a sequential
# surrogate key (1, 2, 3, ...) assigned in ascending sales_rep_id order.
# NOTE(review): if one sales_rep_id appears with several sales_rep_name values,
# this code does not control which name survives dropDuplicates - confirm that
# upstream guarantees a single name per id.
sales_rep_dim = (
    df.select("sales_rep_id", "sales_rep_name")
    .dropDuplicates(["sales_rep_id"])
    .withColumn(
        "sales_rep_key",
        # The window has no partitionBy, so the numbering runs over the whole
        # dimension as a single ordered sequence.
        F.row_number().over(Window.orderBy(F.col("sales_rep_id"))),
    )
)

display(sales_rep_dim)

In [0]:
# Define the sales schema. It lists the same three columns, types and
# nullability as the CREATE TABLE statement below; this cell only defines it.
sales_schema = StructType([
    StructField("sales_rep_id", IntegerType(), True),
    StructField("sales_rep_name", StringType(), True),
    StructField("sales_rep_key", IntegerType(), False)
])

# Create the schema if it does not exist
spark.sql("CREATE SCHEMA IF NOT EXISTS Prajwal_Mock")

# Create the table if it does not exist.
# LOCATION is derived from gold_path instead of repeating the literal folder,
# so the registered table and the folder the dimension is saved to
# (gold_path + "Dim_Sales_rep") cannot drift apart when the gold root changes.
# Because of IF NOT EXISTS, an already-registered table keeps its old location.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS Prajwal_Mock.Dim_Sales_Rep (
        sales_rep_id INTEGER,
        sales_rep_name STRING,
        sales_rep_key INTEGER NOT NULL
    )
    USING DELTA 
    LOCATION "{gold_path}Dim_Sales_rep"
""")

In [0]:
# # Check schema for sales_rep_dim DataFrame
# sales_rep_dim_schema = sales_rep_dim.schema

# # # Check schema for silver_df DataFrame
# # silver_df = spark.read.format("delta").load(silver_path + "FF_Customer_Details_Day0")
# # silver_df_schema = silver_df.schema

# # Check schema for Prajwal_Mock.Dim_Sales_Rep table
# dim_sales_rep_schema = spark.sql("DESCRIBE Prajwal_Mock.Dim_Sales_Rep")

# # Display schemas
# display(sales_rep_dim_schema)
# # display(silver_df_schema)
# display(dim_sales_rep_schema)

In [0]:
# Persist the dimension as Delta data under the gold layer. "overwrite" mode
# replaces whatever the folder held before; the mergeSchema option is passed
# through to the Delta writer unchanged.
(
    sales_rep_dim.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(gold_path + "Dim_Sales_rep")
)

In [0]:
# Reconcile the silver source with what is actually stored in the gold layer.

# Silver layer: total rows in the source Delta folder.
silver_df = spark.read.format("delta").load(silver_path + "FF_Customer_Details_Day0")
silver_count = silver_df.count()

# The dimension keeps one row per sales_rep_id, so the silver figure that is
# directly comparable to the gold row count is the number of distinct ids
# (a null id, if present, counts as one value in both figures).
silver_rep_count = silver_df.select("sales_rep_id").distinct().count()

# Gold layer: count the rows persisted in the Delta folder rather than the
# in-memory sales_rep_dim DataFrame, so the check reflects the completed write
# and does not re-run the de-duplication / row_number lineage.
gold_count = spark.read.format("delta").load(gold_path + "Dim_Sales_rep").count()

# Display counts; the last column should equal "Gold Layer Count".
display(spark.createDataFrame(
    [(silver_count, gold_count, silver_rep_count)],
    ["Silver Layer Count", "Gold Layer Count", "Silver Distinct Sales Rep Count"],
))

In [0]:
# spark.sql("DROP TABLE IF EXISTS Prajwal_Mock.Dim_Sales_Rep")