In [0]:
# Databricks notebook source - changed
import dlt
from pyspark.sql.types import StructType, StructField, StringType, IntegerType


In [0]:
# Explicit schema for the bank-customers data. Each (column name, Spark type)
# pair below is expanded into a nullable StructField, in the order listed.
bank_schema = StructType([
    StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ("age", IntegerType),
        ("job", StringType),
        ("marital", StringType),
        ("education", StringType),
        ("default", StringType),
        ("balance", IntegerType),
        ("housing", StringType),
        ("loan", StringType),
        ("contact", StringType),
        ("day", IntegerType),
        ("month", StringType),
        ("duration", IntegerType),
        ("campaign", IntegerType),
        ("pdays", IntegerType),
        ("previous", IntegerType),
        ("poutcome", StringType),
        ("deposit", StringType),
    )
])



In [0]:
# COMMAND ----------

@dlt.table(
    name="bronze_bank_customers",
    comment="Raw CSV data ingested from S3 bucket",
)
def bronze_bank_customers():
    """Bronze layer: stream the raw bank-customer CSV files from S3.

    Returns:
        A streaming DataFrame read through the ``cloudFiles`` source, parsed
        as CSV with a header row and typed with ``bank_schema``.
    """
    source_path = "s3://aws-databricks-data-bucket-atin/data/bank-customers-raw"

    csv_stream = spark.readStream.format("cloudFiles")
    csv_stream = csv_stream.option("cloudFiles.format", "csv")
    # The first line of each file is a header row, not data.
    csv_stream = csv_stream.option("header", "true")
    # Explicit schema: column types are fixed up front rather than inferred.
    csv_stream = csv_stream.schema(bank_schema)
    return csv_stream.load(source_path)



In [0]:
# COMMAND ----------

@dlt.table(
    name="silver_bank_customers",
    comment="Cleaned and filtered customer data"
)
# NOTE(review): "valid_age" keeps only customers older than 50. That reads
# like a business filter rather than a validity rule -- confirm the threshold.
@dlt.expect_or_drop("valid_age", "age IS NOT NULL AND age > 50")
# Drops rows with a NULL or negative balance. The explicit IS NOT NULL guard
# makes the constraint evaluate to FALSE (never NULL) for missing balances, so
# they are dropped regardless of how NULL constraint results are treated.
@dlt.expect_or_drop("non_negative_balance", "balance IS NOT NULL AND balance >= 0")
# Warn-only: rows with an unexpected has_deposit value are kept but reported.
@dlt.expect("valid_deposit_value", "has_deposit IN ('yes', 'no')")
def silver_bank_customers():
    """Silver layer: bronze customers with ``deposit`` renamed to ``has_deposit``.

    Row cleaning is expressed entirely through the expectations on this table
    instead of a ``filter`` in the body. Previously the body pre-filtered on
    ``age IS NOT NULL AND balance >= 0``, which meant the
    ``non_negative_balance`` expectation could never observe a violation. The
    rows written are the same as before (non-null age > 50 and non-null
    balance >= 0); the difference is that dropped rows are now evaluated by
    the expectations rather than removed before they run.

    Returns:
        The ``bronze_bank_customers`` DataFrame with the ``deposit`` column
        renamed to ``has_deposit``.
    """
    bronze = dlt.read("bronze_bank_customers")
    return bronze.withColumnRenamed("deposit", "has_deposit")



In [0]:
# COMMAND ----------

@dlt.table(
    name="gold_deposit_summary",
    comment="Aggregated number of deposits by job"
)
@dlt.expect("valid_job_not_null", "job IS NOT NULL AND job != 'unknown'")
@dlt.expect("valid_deposit_count", "num_deposits >= 0")
def gold_deposit_summary():
    """Gold layer: number of customers with a deposit, per job.

    Reads ``silver_bank_customers``, keeps the rows whose ``has_deposit`` is
    ``"yes"``, counts them per ``job`` as ``num_deposits``, and sorts the
    result by that count, largest first.
    """
    from pyspark.sql.functions import col, count

    silver = dlt.read("silver_bank_customers")
    with_deposit = silver.where(col("has_deposit") == "yes")
    per_job = with_deposit.groupBy("job").agg(count("*").alias("num_deposits"))
    # Descending sort on the aggregated count.
    return per_job.orderBy(col("num_deposits").desc())
