# 02_Silver_Feature_Engineering

In [0]:
# ============================================
# 02_Silver_Feature_Engineering.ipynb
# --------------------------------------------
# Purpose:
#   Clean Bronze data and engineer
#   cost-aware ML features for risk prediction
#   and decision optimization.
#
# Silver Principles:
#   - Schema enforcement
#   - Feature creation
#   - Business-aware transformations
#
# Output:
#   - silver_cost_aware_features
# ============================================

### Configuration & Imports

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, log1p
)

In [0]:
# Reuse the Spark session already attached to this notebook. When no session
# is active, getActiveSession() returns None, which would make every later
# `spark.*` call fail with an AttributeError; fall back to the builder so the
# notebook still gets a usable session in that case.
spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

# Catalog and schema names; they match the prefix of the fully qualified
# Bronze/Silver table names defined in the next cell.
CATALOG = "cost_aware_capstone"
SCHEMA = "risk_decisioning"

In [0]:
# Fully qualified <catalog>.<schema>.<table> names. They are built from
# CATALOG and SCHEMA so the namespace is defined in exactly one place and
# changing those constants actually retargets the notebook.
BRONZE_TABLE = f"{CATALOG}.{SCHEMA}.bronze_cost_aware_cases"
SILVER_TABLE = f"{CATALOG}.{SCHEMA}.silver_cost_aware_features"

### Read Bronze Data

In [0]:
# Load the Bronze table as a DataFrame, then print its column names and types
# so the inputs to the cleaning and feature steps are visible.
bronze_df = spark.read.table(BRONZE_TABLE)
bronze_df.printSchema()

In [0]:
# Preview only a handful of Bronze rows instead of asking the notebook to
# render the whole table; 5 rows matches the LIMIT used for the Silver preview
# at the end of this notebook.
display(bronze_df.limit(5))

### Data Cleaning

In [0]:
# Keep only rows where all three numeric fields are strictly positive.
# Rows where any of these fields is NULL are dropped as well, because a
# comparison against NULL never evaluates to true.
# NOTE(review): fraud_loss_if_missed is not range-checked here even though it
# is log-transformed in the feature step — confirm that is intended.
has_positive_amount = col("transaction_amount") > 0
has_positive_investigation_cost = col("investigation_cost") > 0
has_positive_account_age = col("account_age_days") > 0

clean_df = bronze_df.where(
    has_positive_amount
    & has_positive_investigation_cost
    & has_positive_account_age
)

### Feature Engineering

#### Log-scaled monetary features (`log1p` transform to stabilize ML training)

In [0]:
# log1p(x) = log(1 + x); applied to the two monetary columns.
log_amount_expr = log1p(col("transaction_amount"))
log_missed_loss_expr = log1p(col("fraud_loss_if_missed"))

# Start the feature DataFrame from the cleaned rows and append both columns.
feature_df = clean_df.withColumn("log_transaction_amount", log_amount_expr)
feature_df = feature_df.withColumn("log_fraud_loss_if_missed", log_missed_loss_expr)

#### Risk intensity feature

In [0]:
# Weights of the linear combination that forms behavioral_risk_score.
# NOTE(review): the origin of these weights is not shown in this notebook —
# confirm where they come from before tuning them.
VELOCITY_WEIGHT = 0.4
UNUSUAL_LOCATION_WEIGHT = 2.5
DEVICE_CHANGE_WEIGHT = 2.0

# Weighted sum of the 24h transaction velocity and the two flag columns.
# A NULL in any of the three inputs makes the score NULL for that row.
behavioral_risk_expr = (
    col("tx_velocity_24h") * VELOCITY_WEIGHT
    + col("unusual_location_flag") * UNUSUAL_LOCATION_WEIGHT
    + col("device_change_flag") * DEVICE_CHANGE_WEIGHT
)

feature_df = feature_df.withColumn("behavioral_risk_score", behavioral_risk_expr)

#### Cost-aware feature

In [0]:
# Blend weights for expected_loss_proxy; the two weights sum to 1.0.
# NOTE(review): the rationale for the 5% / 95% split is not shown here —
# confirm with the owner of the cost model.
TRANSACTION_AMOUNT_WEIGHT = 0.05
MISSED_FRAUD_LOSS_WEIGHT = 0.95

# Weighted blend of the transaction amount and the loss incurred if fraud
# is missed.
expected_loss_expr = (
    col("transaction_amount") * TRANSACTION_AMOUNT_WEIGHT
    + col("fraud_loss_if_missed") * MISSED_FRAUD_LOSS_WEIGHT
)

feature_df = feature_df.withColumn("expected_loss_proxy", expected_loss_expr)

#### Final Silver Feature Selection

In [0]:
# Columns written to the Silver table, grouped by role. Order is preserved in
# the resulting DataFrame.
# NOTE(review): log_fraud_loss_if_missed is added to feature_df earlier in this
# notebook but is not part of this selection — confirm that is intentional.
key_columns = ["case_id"]

ml_feature_columns = [
    "log_transaction_amount",
    "tx_velocity_24h",
    "unusual_location_flag",
    "device_change_flag",
    "account_age_days",
    "behavioral_risk_score",
    "expected_loss_proxy",
]

cost_columns = [
    "investigation_cost",
    "fraud_loss_if_missed",
]

# The Bronze label column is exposed under the shorter name "label".
label_column = col("label_fraud").alias("label")

silver_df = feature_df.select(
    *key_columns,
    *ml_feature_columns,
    *cost_columns,
    label_column,
)

-----

### Write Silver Delta Table

In [0]:
# Persist the Silver features as a Delta table, replacing any existing
# contents of SILVER_TABLE.
# NOTE(review): no schema-overwrite option is set, so an overwrite with a
# changed column set may be rejected by Delta — confirm this is the desired
# safeguard.
silver_df.write.saveAsTable(SILVER_TABLE, format="delta", mode="overwrite")

### Validation & Preview

In [0]:
# Read the row count back from the table that was just written.
row_count_query = f"""
    SELECT COUNT(*) AS silver_row_count
    FROM {SILVER_TABLE}
"""
silver_row_count_df = spark.sql(row_count_query)
display(silver_row_count_df)

In [0]:
# Show a five-row sample of the persisted Silver table.
preview_query = f"""
    SELECT *
    FROM {SILVER_TABLE}
    LIMIT 5
"""
silver_preview_df = spark.sql(preview_query)
display(silver_preview_df)

----