# Read bronze tables


In [2]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 4, Finished, Available, Finished)

In [3]:
bronze_customers = spark.table("bronze_customers")
bronze_agents    = spark.table("bronze_agents")
bronze_policies  = spark.table("bronze_policies")
bronze_premiums  = spark.table("bronze_premiums")
bronze_claims    = spark.table("bronze_claims")

display(bronze_customers)

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 6f733324-eb93-43b3-9b12-6d392e064096)

# Transform Customers

In [4]:
silver_customers = (
    bronze_customers
    .filter(F.col("customer_id").isNotNull())
    .withColumn("join_date", F.to_date("join_date"))
    .dropDuplicates(["customer_id"])
    .select(
        "customer_id",
        "full_name",
        "gender",
        "age",
        "city",
        "state",
        "risk_segment",
        "preferred_agent_id",
        "join_date"
    )
)

display(silver_customers)
print("Silver Customers:", silver_customers.count())

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 00323eef-860d-43bb-aea8-43e6571e3501)

Silver Customers: 50000


In [5]:
(silver_customers
 .write.format("delta")
 .mode("overwrite")
 .saveAsTable("silver_customers"))

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 7, Finished, Available, Finished)

# Transform Agents

In [6]:
silver_agents = (
    bronze_agents
    .filter(F.col("agent_id").isNotNull())
    .withColumn("join_date", F.to_date("join_date"))
    .dropDuplicates(["agent_id"])
    .select(
        "agent_id",
        "agent_name",
        "region",
        "channel",
        "join_date"
    )
)

display(silver_agents)

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e77d083f-5db9-4c51-b5d9-e783a853a357)

In [8]:
(silver_agents
 .write.format("delta")
 .mode("overwrite")
 .saveAsTable("silver_agents"))

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 10, Finished, Available, Finished)

# Transform Policies

In [9]:
silver_policies = (
    bronze_policies
    .filter(F.col("policy_id").isNotNull())
    .withColumn("start_date", F.to_date("start_date"))
    .withColumn("end_date", F.to_date("end_date"))
    .dropDuplicates(["policy_id"])
    .select(
        "policy_id",
        "customer_id",
        "agent_id",
        "policy_type",
        "plan_name",
        "policy_status",
        "start_date",
        "end_date",
        "sum_insured",
        "deductible",
        "premium_frequency"
    )
)

display(silver_policies)

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a149f4c9-c3b3-4f07-86bc-d6bd5458465f)

In [10]:
(silver_policies
 .write.format("delta")
 .mode("overwrite")
 .saveAsTable("silver_policies"))

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 12, Finished, Available, Finished)

# Transform Premiums

In [11]:
silver_premiums = (
    bronze_premiums
    .filter(F.col("payment_id").isNotNull())
    .withColumn("payment_date", F.to_date("payment_date"))
    .withColumn("premium_amount", F.col("premium_amount").cast("double"))
    .dropDuplicates(["payment_id"])
    .select(
        "payment_id",
        "policy_id",
        "customer_id",
        "agent_id",
        "payment_date",
        "premium_amount",
        "payment_status",
        "payment_method",
        "currency"
    )
)

display(silver_premiums)

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 13, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 7bb93316-a887-4273-b02a-3d2c5ee42602)

In [12]:
(silver_premiums
 .write.format("delta")
 .mode("overwrite")
 .saveAsTable("silver_premiums"))

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 14, Finished, Available, Finished)

# Transform claims

In [13]:
silver_claims = (
    bronze_claims
    .filter(F.col("claim_id").isNotNull())
    .withColumn("claim_date", F.to_date("claim_date"))
    .withColumn("claim_amount", F.col("claim_amount").cast("double"))
    .withColumn("approved_amount", F.col("approved_amount").cast("double"))
    .dropDuplicates(["claim_id"])
    .select(
        "claim_id",
        "policy_id",
        "customer_id",
        "agent_id",
        "claim_date",
        "claim_status",
        "claim_reason",
        "claim_amount",
        "approved_amount",
        "days_to_settle"
    )
)

display(silver_claims)

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 15, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ae6526a8-27ea-4684-84fc-b41d11d73967)

In [14]:
(silver_claims
 .write.format("delta")
 .mode("overwrite")
 .saveAsTable("silver_claims"))

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 16, Finished, Available, Finished)

# Quality Checks

In [15]:
print("DQ CHECKS")

print("Customers null IDs:", silver_customers.filter("customer_id IS NULL").count())
print("Policies null IDs:", silver_policies.filter("policy_id IS NULL").count())
print("Premiums <= 0:", silver_premiums.filter("premium_amount <= 0").count())
print("Claims negative:", silver_claims.filter("claim_amount < 0").count())

StatementMeta(, d5fa5699-1848-4187-ba00-8d0c43aed288, 17, Finished, Available, Finished)

DQ CHECKS
Customers null IDs: 0
Policies null IDs: 0
Premiums <= 0: 0
Claims negative: 0
