In [0]:
from pyspark.sql import functions as f
from delta.tables import DeltaTable


In [0]:
%run /Workspace/Databricks_FMCG/setup/utilities


In [0]:
# Notebook parameters as (widget name, default value, label shown in the UI).
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data_Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)


In [0]:
# Read the widget values; they are used to build the table names and source path below.
catalog, data_source = (
    dbutils.widgets.get(widget_key) for widget_key in ("catalog", "data_source")
)
print("Catalog name:", catalog, " Data Source name:", data_source)

In [0]:
# Glob pattern matching every CSV file under this data source's S3 prefix.
base_path=f's3://fmcg-child-sports-data/{data_source}/*.csv'
print(base_path)

### **Bronze Table Creation**

In [0]:
# First look at the files with default reader settings (no header or schema
# options), which is why the table structure is not well defined here.
df = spark.read.csv(base_path)
display(df.limit(10))

In [0]:
# Structured read: take column names from the header row, infer column types,
# and tag every row with the load time plus its source file name and size.
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df = (
    csv_reader.load(base_path)
    .withColumn("read_timestamp", f.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)
display(df.limit(10))

In [0]:
# Check the inferred column types of the freshly read DataFrame.
df.printSchema()

In [0]:
# Persist the ingested rows as the bronze table, replacing any previous contents.
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(bronze_table)
)


### **Silver Table Creation**

In [0]:
# Load the stored bronze table; it is the input for the silver transformations.
df_bronze = spark.table(f"{catalog}.{bronze_schema}.{data_source}")
df_bronze.show(10)

In [0]:
# Data cleaning, step 1: list every customer_id that occurs more than once.
df_dups = (
    df_bronze
    .groupBy("customer_id")
    .agg(f.count("*").alias("count"))
    .where(f.col("count") > 1)
)
display(df_dups)

In [0]:
# Keep a single row per customer_id and report the row count on either side.
rows_before = df_bronze.count()
df_silver = df_bronze.dropDuplicates(["customer_id"])
rows_after = df_silver.count()
print("before dropping the duplicates:", rows_before)
print("after dropping the duplicates:", rows_after)



In [0]:
# Find customer names that carry leading or trailing whitespace.
has_padding = f.col("customer_name") != f.trim("customer_name")
df_trim = df_silver.where(has_padding)
display(df_trim)

In [0]:
# Strip leading/trailing whitespace from customer_name.
df_silver = df_silver.withColumn("customer_name", f.trim(f.col("customer_name")))

# Preview the cleaned result. df_trim was built from the DataFrame as it was
# before this trim, so re-displaying it would only show the old padded names.
display(df_silver.limit(10))

In [0]:
# Sanity check: no customer_name should differ from its trimmed form any more,
# so this is expected to come back empty.
untrimmed_names = df_silver.where(
    f.col("customer_name") != f.trim(f.col("customer_name"))
)
display(untrimmed_names)


In [0]:
# Inspect the distinct city values to spot misspellings that need correcting.
df_silver.select("city").distinct().show()

In [0]:
# Canonical city name -> misspellings to be corrected to it.
city_variants = {
    "Bengaluru": ("Bengaluruu", "Bengalore"),
    "Hyderabad": ("Hyderabadd", "Hyderbad"),
    "New Delhi": ("NewDelhi", "NewDheli", "NewDelhee"),
}

# Flattened lookup (misspelling -> canonical name), the shape DataFrame.replace takes.
city_mapping = {
    typo: canonical
    for canonical, typos in city_variants.items()
    for typo in typos
}

# Only the canonical spellings count as valid city values.
allowed = list(city_variants)
# Replace known misspellings with the canonical name, then null out any city
# that is still not in the allowed list. A null city stays null, because a
# `when` without `otherwise` yields null for rows that do not match.
is_known_city = f.col("city").isin(allowed)
df_silver = (
    df_silver
    .replace(city_mapping, subset=["city"])
    .withColumn("city", f.when(is_known_city, f.col("city")))
)

In [0]:
# Review the silver rows as they stand at this point.
display(df_silver)

In [0]:
# Re-check the distinct city values present in the silver DataFrame.
df_silver.select('city').distinct().show()


In [0]:
# Capitalize the first letter of each word in customer_name.
# initcap returns null for a null input, so no explicit null guard is needed.
df_silver = df_silver.withColumn("customer_name", f.initcap(f.col("customer_name")))

# Pass the DataFrame itself to display(): .show() prints as a side effect and
# returns None, so display(df.show()) would be asked to render None.
display(df_silver.select("customer_name"))

In [0]:
# Customers whose city is to be supplied through the business fix; show their
# current rows (presumably with a null city - verify in the output).
null_customer_names = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Primefuel Nutrition',
    'Recovery Lane',
]
(
    df_silver
    .where(f.col("customer_name").isin(null_customer_names))
    .show(truncate=False)
)

In [0]:
# City corrections confirmed by the business team, keyed by customer_id.
customer_city_fix = {
    789403: "New Delhi",   # Sprintx Nutrition
    789420: "Bengaluru",   # Zenathlete Foods
    789521: "Hyderabad",   # Primefuel Nutrition
    789603: "Hyderabad",   # Recovery Lane
}

# One (customer_id, fixed_city) row per correction, ready to join on customer_id.
fix_rows = list(customer_city_fix.items())
df_fix = spark.createDataFrame(fix_rows, schema=["customer_id", "fixed_city"])

display(df_fix)

In [0]:
# Left-join the corrections and use them only where city is null; an existing
# city value always wins over fixed_city. The helper column is dropped afterwards.
df_with_fix = df_silver.join(df_fix, on="customer_id", how="left")
resolved_city = f.coalesce(f.col("city"), f.col("fixed_city"))
df_silver = df_with_fix.withColumn("city", resolved_city).drop("fixed_city")


In [0]:
# Show the same four customers again to check the city column after the join.
null_customer_names = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Primefuel Nutrition',
    'Recovery Lane',
]
affected_customers = df_silver.where(f.col("customer_name").isin(null_customer_names))
affected_customers.show(truncate=False)

In [0]:
# Convert the customer_id type to string.
df_silver = df_silver.withColumn("customer_id", f.col("customer_id").cast("string"))

# printSchema() prints the schema itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df_silver.printSchema()

In [0]:
# Standardize the customer details to match the parent data.

# Final customer label: "CustomerName-City", or "CustomerName-Unknown" when city is null.
customer_label = f.concat_ws(
    "-",
    "customer_name",
    f.coalesce(f.col("city"), f.lit("Unknown")),
)
df_silver = df_silver.withColumn("customer", customer_label)

# Static attributes aligned with the parent data model; the same literal is
# written to every row, in this column order.
static_attributes = {
    "market": "India",
    "platform": "Sports Bar",
    "channel": "Acquisition",
}
for attribute_name, attribute_value in static_attributes.items():
    df_silver = df_silver.withColumn(attribute_name, f.lit(attribute_value))

In [0]:
# Preview a few rows of the silver DataFrame.
display(df_silver.limit(5))

In [0]:
# Persist the cleaned customers as the silver Delta table, replacing any
# previous contents.
silver_table = f"{catalog}.{silver_schema}.{data_source}"
silver_writer = (
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
silver_writer.saveAsTable(silver_table)


### **Gold Table Creation**

In [0]:
# Columns published to the gold layer, in output order.
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

In [0]:
# Publish the child customer dimension to the gold layer, replacing any
# previous contents.
# - The result is deliberately not assigned: saveAsTable() returns None, so
#   rebinding df_gold to it would discard the DataFrame.
# - The change-data-feed option uses the "delta."-prefixed key, matching the
#   bronze and silver writes in this notebook.
(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
)

## Merging the Child Data Source into the Parent Dimension

In [0]:
# Resolve both tables from the notebook parameters instead of hard-coded
# "fmcg.gold" names, so the merge reads the same sb_dim_{data_source} table
# that the gold write in this notebook produces.
# NOTE(review): assumes the parent dim_customers table lives in the same
# catalog and gold schema as the child table - confirm.
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_customers")

# Child customers reshaped for the merge: customer_id is exposed as
# customer_code, the column the merge condition matches on.
df_child_cust = (
    spark.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
    .select(
        f.col("customer_id").alias("customer_code"),
        "customer",
        "market",
        "platform",
        "channel",
    )
)

In [0]:
# Upsert the child customers into the parent dimension, matching on
# customer_code: matched rows are updated from the source, unmatched source
# rows are inserted.
upsert = (
    delta_table.alias("target")
    .merge(
        source=df_child_cust.alias("source"),
        condition="target.customer_code = source.customer_code",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
upsert.execute()