# In [0]:
# ============================================================================
# Transaction Fraud Detection ETL Pipeline
# Production-Grade Banking Fraud Analytics
# ============================================================================

from pyspark.sql.functions import (
    col, when, lit, concat, abs as spark_abs, round as spark_round,
    current_timestamp, hour, dayofweek, stddev, avg, count
)
from pyspark.sql import DataFrame
import logging

# Obtain this module's logger and request INFO-level root logging; the two
# calls are independent of each other.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Announce the run start in the log stream and as a console banner.
logger.info("Starting Fraud Detection ETL Pipeline")
print("\n=== FRAUD DETECTION ETL PIPELINE ===")

# ============================================================================
# STAGE 1: EXTRACT - Load transaction data
# ============================================================================
print("\nSTAGE 1: EXTRACT")

# Inline sample rows, one tuple per transaction, in the column order given to
# createDataFrame below:
# (transaction_id, customer_id, timestamp, amount, channel, country)
transaction_data = [
    (1001, 101, "2024-01-15 10:30:00", 5000, "Online", "US"),
    (1002, 102, "2024-01-15 14:45:00", 15000, "ATM", "IN"),
    (1003, 101, "2024-01-15 22:30:00", 25000, "Online", "UK"),
    (1004, 103, "2024-01-15 03:15:00", 50000, "Online", "CN"),
    (1005, 102, "2024-01-15 11:00:00", 3000, "POS", "IN"),
    (1006, 104, "2024-01-15 23:45:00", 40000, "Online", "NG"),
]

# `spark` is not defined in this script; it is expected to be supplied by the
# hosting notebook/session. Column types are inferred from the Python values.
transactions_df = spark.createDataFrame(
    transaction_data,
    ["transaction_id", "customer_id", "timestamp", "amount", "channel", "country"]
)

# count() is a Spark action that launches a job, so run it once and reuse the
# result for both the log record and the console message.
extracted_count = transactions_df.count()
logger.info("Extracted %d transactions", extracted_count)
print(f"Extracted {extracted_count} transaction records")

# ============================================================================
# STAGE 2: TRANSFORM - Fraud risk scoring
# ============================================================================
print("\nSTAGE 2: TRANSFORM")

# Rule inputs, named once so the scoring steps below read declaratively.
high_risk_countries = ["CN", "NG", "BR", "RU"]

# NOTE(review): `hour > 22` matches only 23:00-23:59 in the evening, so a
# transaction at 22:xx is NOT treated as unusual -- confirm this is intended.
outside_usual_hours = (col("hour") < 6) | (col("hour") > 22)
online_and_over_10k = (col("channel") == "Online") & (col("amount") > 10000)

# Weighted sum of the four 0/1 indicator columns (weights add up to 100).
weighted_score = (
    col("is_unusual_hour") * 25 +
    col("is_high_amount") * 30 +
    col("is_high_risk_country") * 25 +
    col("is_online_high_value") * 20
)

# Score bands: >= 50 is HIGH, >= 30 is MEDIUM, everything else is LOW.
risk_band = (
    when(col("fraud_score") >= 50, "HIGH")
    .when(col("fraud_score") >= 30, "MEDIUM")
    .otherwise("LOW")
)

# Derived columns in dependency order: each step may reference columns added
# by earlier steps, so the sequence must be preserved.
scoring_steps = [
    ("transaction_datetime", col("timestamp").cast("timestamp")),
    ("hour", hour(col("transaction_datetime"))),
    ("is_unusual_hour", when(outside_usual_hours, 1).otherwise(0)),
    ("is_high_amount", when(col("amount") > 20000, 1).otherwise(0)),
    (
        "is_high_risk_country",
        when(col("country").isin(high_risk_countries), 1).otherwise(0),
    ),
    ("is_online_high_value", when(online_and_over_10k, 1).otherwise(0)),
    ("fraud_score", weighted_score),
    ("fraud_flag", risk_band),
    ("processing_timestamp", current_timestamp()),
]

fraud_scored_df = transactions_df
for column_name, expression in scoring_steps:
    fraud_scored_df = fraud_scored_df.withColumn(column_name, expression)

logger.info("Fraud scoring completed")
print(f"Fraud scoring applied to {fraud_scored_df.count()} transactions")

# ============================================================================
# STAGE 3: LOAD - Fraud analytics and reports
# ============================================================================
print("\nSTAGE 3: LOAD")

# Per-risk-level report: transaction count, mean amount and mean fraud score
# for each fraud_flag value, stamped with the current timestamp.
summary_metrics = [
    count("*").alias("transaction_count"),
    avg("amount").alias("avg_transaction_amount"),
    avg("fraud_score").alias("avg_fraud_score"),
]
fraud_summary = (
    fraud_scored_df
    .groupBy("fraud_flag")
    .agg(*summary_metrics)
    .withColumn("processing_date", current_timestamp())
)

logger.info("Fraud summary report created")
print("\nFraud Detection Summary:")
fraud_summary.show()

# Detail report: only rows flagged HIGH, reduced to the identifying columns.
is_high_risk = col("fraud_flag") == "HIGH"
report_columns = [
    "transaction_id", "customer_id", "amount", "fraud_score", "country"
]
high_risk_txns = fraud_scored_df.filter(is_high_risk).select(*report_columns)

print(f"\nHigh-risk transactions detected: {high_risk_txns.count()}")
print("\nHigh-Risk Transactions:")
high_risk_txns.show()

# Final status banner on stdout, followed by the matching log record.
print("\n=== PIPELINE STATUS: SUCCESS ===")
logger.info("Fraud Detection ETL Pipeline completed successfully")