In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run ../01_init/03_utilities

In [0]:
# Notebook parameters, declared as (widget name, default value, display label).
for _widget_name, _default_value, _widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data Source"),
):
    dbutils.widgets.text(_widget_name, _default_value, _widget_label)

In [0]:
# Resolve the notebook parameters from their widgets.
data_source = dbutils.widgets.get("data_source")
catalog = dbutils.widgets.get("catalog")

# Glob that matches every CSV file under the data source's S3 prefix.
base_path = f"s3://xxxxx/{data_source}/*.csv"

In [0]:
# CSV reader: the first line of each file is the header, column types are inferred.
raw_csv_reader = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
)

# Load every file matched by base_path, stamp each row with the read time and
# keep the source file name and size next to the data columns.
df = (
    raw_csv_reader.load(base_path)
    .withColumn("read_timestamp", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:

# Store the raw rows as the bronze Delta table, overwriting what is there.
# NOTE(review): bronze_schema is not defined in this notebook; presumably it
# comes from the %run utilities notebook — confirm.
(
    df.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "True")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")
)

In [0]:
# Read the bronze table back so the cleaning steps start from the stored rows.
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source};"
df_bronze = spark.sql(bronze_query)

In [0]:
# Collapse rows that share a customer_id into a single row.
unique_key_columns = ["customer_id"]
df_silver = df_bronze.dropDuplicates(unique_key_columns)

In [0]:
# Remove the padding around customer names.
trimmed_customer_name = F.trim(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", trimmed_customer_name)

In [0]:
# Misspelled city values mapped to the canonical spelling.
city_mapping = {
    "Bengaluruu": "Bengaluru",
    "Bengalore": "Bengaluru",
    "Hyderabadd": "Hyderabad",
    "Hyderbad": "Hyderabad",
    "NewDelhee": "New Delhi",
    "NewDelhi": "New Delhi",
    "NewDheli": "New Delhi",
}

# Canonical city names; every other value ends up as NULL.
allowed = ["Bengaluru", "Hyderabad", "New Delhi"]

# Correct the known misspellings first, then keep a city only when it is one
# of the canonical names. A `when` with no `otherwise` evaluates to NULL for
# rows that do not match, which also covers rows whose city is already NULL.
city_column = F.col("city")
df_silver = df_silver.replace(city_mapping, subset=["city"]).withColumn(
    "city",
    F.when(city_column.isin(allowed), city_column),
)

In [0]:
# Capitalise the first letter of each word in the customer name.
# initcap gives NULL for a NULL input, so NULL names stay NULL.
capitalised_name = F.initcap(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", capitalised_name)


In [0]:
# Customer names kept for reference.
# NOTE(review): nothing in the visible cells reads this list — confirm it is
# still needed.
null_customer_names = [
    "Sprintx Nutrition",
    "Zenathlete Foods",
    "Primefuel Nutrition",
    "Recovery Lane",
]

In [0]:
# Manually assigned cities, keyed by customer id.
customer_city_fix = {
    789403: "New Delhi",
    789420: "Bengaluru",
    789521: "Hyderabad",
    789603: "Hyderabad",
}

# Lookup frame with one (customer_id, fixed_city) row per manual assignment.
df_fix = spark.createDataFrame(
    list(customer_city_fix.items()),
    ["customer_id", "fixed_city"],
)

In [0]:
# Attach the manual assignments by customer_id (left join keeps every customer),
# take fixed_city only where city is NULL, then drop the helper column.
silver_with_fixes = df_silver.join(df_fix, on="customer_id", how="left")
resolved_city = F.coalesce(F.col("city"), F.col("fixed_city"))
df_silver = silver_with_fixes.withColumn("city", resolved_city).drop("fixed_city")

In [0]:
# From here on customer_id is carried as a string.
customer_id_as_string = F.col("customer_id").cast("string")
df_silver = df_silver.withColumn("customer_id", customer_id_as_string)

In [0]:
# Build the display label "<customer_name>-<city>" and attach the constant
# dimension attributes. A NULL city is shown as "Unknown", the same spelling
# the placeholder customer row uses further down in this notebook.
df_silver = (
  df_silver
  .withColumn(
    "customer",
    F.concat_ws("-", "customer_name", F.coalesce(F.col("city"), F.lit("Unknown")))
  )
  .withColumn("market", F.lit("India"))
  .withColumn("platform", F.lit("Sports Bar"))
  .withColumn("channel", F.lit("Acquisition"))
)


In [0]:
# Store the cleaned rows as the silver Delta table, overwriting what is there;
# mergeSchema is enabled on the write.
# NOTE(review): silver_schema is not defined in this notebook; presumably it
# comes from the %run utilities notebook — confirm.
(
    df_silver.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")
)


In [0]:
# Read the silver table back and keep only the columns of the gold dimension.
df_silver_to_gold = spark.sql(f"SELECT * FROM {catalog}.{silver_schema}.{data_source}")
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver_to_gold.select(*gold_columns)


In [0]:
# Store the projection as the gold dimension table sb_dim_<data_source>,
# overwriting what is there.
# NOTE(review): gold_schema is not defined in this notebook; presumably it
# comes from the %run utilities notebook — confirm.
(
    df_gold.write
    .format("delta")
    .option("delta.enableChangeDataFeed", "true")
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")
)


In [0]:
# Merge target: the shared customer dimension table.
# NOTE(review): assumes dim_customers lives in {catalog}.{gold_schema}, the
# same schema this notebook writes its gold table to — confirm.
delta_table = DeltaTable.forName(spark, f"{catalog}.{gold_schema}.dim_customers")

# Merge source: the gold table written earlier in this notebook, addressed with
# the same catalog / gold_schema / data_source values used for that write so
# the widget parameters are honoured. customer_id is exposed as customer_code
# to line up with the merge key.
df_child_customers = spark.table(f"{catalog}.{gold_schema}.sb_dim_{data_source}").select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "market",
    "platform",
    "channel"
)

In [0]:
# Single placeholder row: code 999999 with "Unknown" attributes.
unknown_columns = ["customer_code", "customer", "market", "platform", "channel"]
unknown_row = ("999999", "Unknown Customer", "Unknown", "Unknown", "Unknown")

# Cast the placeholder's code to whatever type the child frame uses for
# customer_code, so both sides of the union agree on the column type.
customer_code_type = df_child_customers.schema["customer_code"].dataType
df_unknown = spark.createDataFrame([unknown_row], unknown_columns).withColumn(
    "customer_code",
    F.col("customer_code").cast(customer_code_type),
)

# Append the placeholder row to the child customers, matching columns by name.
df_child_customers = df_child_customers.unionByName(df_unknown)

In [0]:
# Upsert into the target keyed on customer_code: rows that match are updated
# with all source columns, source rows without a match are inserted.
customer_merge = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
customer_merge.whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()