In [0]:
# Databricks notebook: MAS610_Transform
# Author: Saritha 
# Purpose: Transform Accounts, Loans, Collateral into MAS610 Fact_Exposure (Bronze→Silver→Gold)
# ------------------------------------------------------------------------------

from pyspark.sql import SparkSession, functions as F, types as T
from datetime import datetime

# -------------------------------------------------------
# 🔹 1. Notebook Parameters (ADF-friendly)
# -------------------------------------------------------
# Register the two path widgets together with their default locations, so the
# notebook can run with the defaults or with values supplied by a caller.
for widget_name, default_path in (
    ("input_dir", "/mnt/raw/mas610_ex2"),
    ("output_dir", "/mnt/gold/mas610"),
):
    dbutils.widgets.text(widget_name, default_path)

# Resolve the effective widget values used by every later section.
input_dir = dbutils.widgets.get("input_dir")
output_dir = dbutils.widgets.get("output_dir")

# Obtain the Spark session under this job's application name.
session_builder = SparkSession.builder.appName("MAS610_Transform")
spark = session_builder.getOrCreate()

# Echo the resolved locations, one per line.
print(f"üìÇ Input Path : {input_dir}", f"üíæ Output Path: {output_dir}", sep="\n")

# -------------------------------------------------------
# 🔹 2. Read Bronze Layer Files
# -------------------------------------------------------
def _read_bronze_csv(file_name):
    """Load one header-bearing CSV file from ``input_dir`` as a DataFrame.

    No schema is supplied and schema inference is not requested, so typing is
    left to the later cleaning steps.
    """
    return spark.read.csv(f"{input_dir}/{file_name}", header=True)


accounts_df = _read_bronze_csv("accounts.csv")
loans_df = _read_bronze_csv("loans.csv")
collateral_df = _read_bronze_csv("collateral.csv")

# Each count() below is a separate Spark action executed only for this log line.
print(f"‚úÖ Loaded {accounts_df.count()} Accounts, {loans_df.count()} Loans, {collateral_df.count()} Collateral records")

# -------------------------------------------------------
# 🔹 3. Silver Layer – Clean & Standardize
# -------------------------------------------------------
# Target type for each loan column that is cast in place; withColumn on an
# existing column keeps that column's position in the schema.
loan_column_types = {
    "loan_id": "string",
    "customer_id": "string",
    "notional": "double",
}

loans_typed = loans_df
for column_name, target_type in loan_column_types.items():
    loans_typed = loans_typed.withColumn(
        column_name, F.col(column_name).cast(target_type)
    )

# Upper-case the currency code and fall back to "USD" when it is missing.
loans_silver = loans_typed.withColumn(
    "currency",
    F.coalesce(F.upper(F.col("currency")), F.lit("USD")),
)

# Keep only the account attributes that are joined onto loans later on.
account_columns = ["customer_id", "account_type", "branch_code", "region"]
accounts_silver = accounts_df.select(*account_columns)

# Project the three collateral columns, casting the value to double as part of
# the same projection so the column keeps its name and position.
collateral_value_as_double = (
    F.col("collateral_value").cast("double").alias("collateral_value")
)
collateral_silver = collateral_df.select(
    "loan_id",
    "collateral_type",
    collateral_value_as_double,
)

# -------------------------------------------------------
# 🔹 4. Gold Layer – Join & Enrich → Fact_Exposure
# -------------------------------------------------------
# Single definition of "secured", shared by risk_weight and secured_flag.
# Previously risk_weight treated any non-null collateral_value as secured while
# secured_flag required a value > 0, so a loan with zero (or negative)
# collateral received the 50% weight but secured_flag = 0. The predicate is
# NULL for loans without a collateral row, which falls through to otherwise().
is_secured = F.col("collateral_value") > 0

# NOTE(review): both left joins keep every loan row. If a customer_id can match
# several account rows, or a loan_id several collateral rows, the loan is
# repeated and its EAD/RWA counted more than once -- confirm the grain of the
# source files.
fact_exposure = (
    loans_silver
    .join(accounts_silver, "customer_id", "left")
    .join(collateral_silver, "loan_id", "left")
    .withColumn(
        "risk_weight",
        F.when(is_secured, F.lit(50.0))   # secured loan
         .otherwise(F.lit(100.0))         # unsecured loan
    )
    # EAD is taken directly from the loan notional.
    .withColumn("EAD", F.col("notional"))
    # risk_weight is expressed in percent, hence the division by 100.
    .withColumn("RWA", F.round(F.col("EAD") * F.col("risk_weight") / 100, 2))
    .withColumn("secured_flag", F.when(is_secured, 1).otherwise(0))
    # Run date captured once on the driver and stored as a yyyy-MM-dd string.
    .withColumn("reporting_date", F.lit(datetime.today().strftime("%Y-%m-%d")))
    .withColumn("source_system", F.lit("Databricks_Manual"))
)

# Preview a handful of rows.
display(fact_exposure.limit(5))

# -------------------------------------------------------
# 🔹 5. Data-Quality Validation (Quick checks)
# -------------------------------------------------------
# Compute all three DQ metrics in one aggregation so the join plan behind
# fact_exposure is executed once instead of once per metric.
# count(when(cond, 1)) counts only the rows where cond is true, which matches
# filter(cond).count().
risk_weight_out_of_range = (F.col("risk_weight") < 0) | (F.col("risk_weight") > 150)
dq_metrics = fact_exposure.agg(
    F.count(F.lit(1)).alias("total_count"),
    F.count(F.when(F.col("customer_id").isNull(), 1)).alias("null_customers"),
    F.count(F.when(risk_weight_out_of_range, 1)).alias("bad_weights"),
).first()

total_count = dq_metrics["total_count"]
null_customers = dq_metrics["null_customers"]
bad_weights = dq_metrics["bad_weights"]

print(f"üîç Total Records: {total_count}")
print(f"‚ùó Null Customer_IDs: {null_customers}")
print(f"‚ö†Ô∏è Out-of-Range Risk Weights: {bad_weights}")

if null_customers > 0 or bad_weights > 0:
    print("‚ùå DQ Validation Failed ‚Äî please review inputs.")
    # Stop here: without raising, the notebook would go on to overwrite the
    # Gold outputs with data that failed validation and still finish as a
    # successful run.
    raise ValueError(
        f"DQ validation failed: {null_customers} null customer_id row(s), "
        f"{bad_weights} out-of-range risk_weight row(s)"
    )
else:
    print("‚úÖ DQ Validation Passed.")

# -------------------------------------------------------
# 🔹 6. Persist Outputs (Delta + JSON + Parquet)
# -------------------------------------------------------
# (format, destination) pairs, written in this order; each destination is
# replaced if it already exists.
output_targets = (
    ("delta", f"{output_dir}/fact_exposure.delta"),
    ("json", f"{output_dir}/json"),
    ("parquet", f"{output_dir}/parquet"),
)
for output_format, destination in output_targets:
    fact_exposure.write.format(output_format).mode("overwrite").save(destination)

print("üíæ Output written successfully to Delta, JSON, and Parquet formats.")

# -------------------------------------------------------
# 🔹 7. Summary Query (MAS 610 Validation)
# -------------------------------------------------------
# Per-currency totals plus the ratio of total RWA to total EAD, in percent.
total_ead = F.sum("EAD")
total_rwa = F.sum("RWA")

# Null out a zero divisor so a currency whose total EAD is 0 yields a NULL
# ratio. With ANSI mode enabled the plain division would raise a
# divide-by-zero error; with ANSI mode off the result was already NULL, so
# existing output is unchanged.
nonzero_total_ead = F.when(total_ead != 0, total_ead)

summary = (
    fact_exposure
    .groupBy("currency")
    .agg(
        total_ead.alias("total_ead"),
        total_rwa.alias("total_rwa"),
        F.round(total_rwa / nonzero_total_ead * 100, 2).alias("avg_risk_pct")
    )
)
display(summary)

print("üèÅ MAS610 Transformation Completed Successfully.")
