# 02_silver_notebook_saiba

**Purpose**: Migrate Saiba Data container from Alteryx ETL Process V3 to Fabric PySpark.

**Source**: `APAC_CRM_Analytics_LH.src_Saiba_crb` (Bronze)

**Output**: `APAC_Reporting_LH.clean_saiba_chloe` (Silver)

**Reference Tables** (all in `APAC_CRM_Analytics_LH`):
- `ref_Chloe_saiba_product_mapping` — Product / LOB mapping
- `ref_Chloe_insurer_mapping` — Insurer mapping (shared)
- `ref_Chloe_asia_currency_mapping` — Currency exchange rates (INR → USD)

**Alteryx Tool Mapping**:
| Cell | Alteryx Tools |
| :--- | :--- |
| Cell 2 | Input (183), Filter (185) |
| Cell 3 | Select (214), Formula (184) — 15 expressions, Cleanse (281), Select (186) |
| Cell 4 | N/A — single stream, no union |
| Cell 5 | Product Join (187/188), Insurer Join (190), Currency Join (191), Currency Formula (192), Final Select (193) |
| Cell 6 | Output to Silver |

In [None]:
# =============================================================================
# Cell 1: Setup & Configuration
# =============================================================================
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, DateType, IntegerType, LongType, FloatType, DecimalType
from pyspark.sql.utils import AnalysisException

# Lakehouse names: Bronze holds the source and reference tables, Silver
# receives the cleaned output.
BRONZE_LH = "APAC_CRM_Analytics_LH"
SILVER_LH = "APAC_Reporting_LH"


def _qualified(lakehouse, table):
    """Return the two-part `<lakehouse>.<table>` identifier."""
    return ".".join((lakehouse, table))


# Source (Bronze) and target (Silver) tables
SOURCE_TABLE = _qualified(BRONZE_LH, "src_Saiba_crb")
TARGET_TABLE = _qualified(SILVER_LH, "clean_saiba_chloe")

# Reference tables (all stored in the Bronze lakehouse)
REF_PRODUCT = _qualified(BRONZE_LH, "ref_Chloe_saiba_product_mapping")
REF_INSURER = _qualified(BRONZE_LH, "ref_Chloe_insurer_mapping")
REF_CURRENCY = _qualified(BRONZE_LH, "ref_Chloe_asia_currency_mapping")

In [None]:
# =============================================================================
# Cell 2: Load Bronze Data + Filter + Data Type Check
# =============================================================================
# Read the Bronze source table. An AnalysisException from spark.sql is
# re-raised with the table name in the message; the original error is attached
# as the explicit cause so the underlying Spark diagnostic is not lost.
try:
    df = spark.sql(f"SELECT * FROM {SOURCE_TABLE}")
except AnalysisException as e:
    raise Exception(f"ERROR: Source table {SOURCE_TABLE} not found.") from e

# Filter 185: Department != "EB" (exclude Employee Benefit).
# A plain `!=` evaluates to NULL when Department is NULL, and filter() discards
# rows whose predicate is NULL — so rows with no department would silently
# disappear together with the EB rows. The null-safe comparison removes only
# the rows whose Department is exactly "EB".
if "Department" in df.columns:
    df = df.filter(~F.col("Department").eqNullSafe("EB"))
    print("Applied Department != 'EB' filter")
else:
    print("WARNING: No 'Department' column found — check column names")

# Diagnostics for the filtered source: schema, column list, a 3-row preview
# and the row count (the count triggers a Spark job).
print("=== SOURCE SCHEMA ===")
df.printSchema()

_source_columns = df.columns
print("\n=== SOURCE COLUMNS ===")
print(_source_columns)

_source_preview = df.limit(3)
print("\n=== SAMPLE DATA (first 3 rows) ===")
display(_source_preview)

_source_rows = df.count()
print(f"\n=== ROW COUNT: {_source_rows} ===")

In [None]:
# =============================================================================
# Cell 3: Transformation Logic
#   - Select 214: CustCode is cast to string. Cust.Vertical is not dropped
#     explicitly in this cell; it is simply absent from the Final Select
#     column list in Cell 5.
#   - Formula 184: 15 expressions
#   - Cleanse 281: uppercase Insurer Name (handled by Formula)
#   - Select 186: BizType → TRANSACTION TYPE
# =============================================================================

# --- Step 1: Select 214 — CustCode becomes a string ---
_cust_code_as_text = F.col("CustCode").cast(StringType())
df = df.withColumn("CustCode", _cust_code_as_text)

# --- Step 2: Normalise data types BEFORE the formulas run ---
# Date columns first, then the two amount columns. Every name is backtick-quoted
# because "Brok Prem." contains a dot.
for _date_name in ("EntryDate", "StartDate", "ExpiryDate"):
    df = df.withColumn(_date_name, F.col(f"`{_date_name}`").cast(DateType()))

for _amount_name in ("Brok Prem.", "Brokerage"):
    df = df.withColumn(_amount_name, F.col(f"`{_amount_name}`").cast(DoubleType()))

# --- Step 3: Formula 184 — 15 expressions (order matters) ---
# The steps are applied strictly in list order: SYSTEM ID must exist before
# PARTY ID (WTW) / CLIENT ID (WTW) copy it, and FINAL YEAR before CCYYEAR.
# "Null" / "NuLL" below are literal text values, not SQL NULLs.
_entry_date = F.col("EntryDate")
_system_id = F.col("`SYSTEM ID`")

_formula_184_steps = [
    # 1-2: constant source / country tags (NEW)
    ("DATA SOURCE", F.lit("Saiba")),
    ("REVENUE COUNTRY", F.lit("India")),
    # 3: Policy Type = TRIM(UPPER(Policy Type)) (MODIFY)
    ("Policy Type", F.upper(F.trim(F.col("`Policy Type`")))),
    # 4: Insurer Name = UPPER(TRIM(Insurer)) (NEW — built from the Insurer column)
    ("Insurer Name", F.upper(F.trim(F.col("Insurer")))),
    # 5: DUNS NUMBER placeholder (NEW)
    ("DUNS NUMBER", F.lit("UNKNOWN_SAIBA")),
    # 6: SYSTEM ID = "Saiba-" + CustCode (NEW)
    ("SYSTEM ID", F.concat(F.lit("Saiba-"), F.col("CustCode"))),
    # 7: PARTY ID (WTW) = SYSTEM ID (NEW)
    ("PARTY ID (WTW)", _system_id),
    # 8-9: description placeholders (NEW)
    ("REINSURANCE DESCRIPTION", F.lit("Null")),
    ("POLICY DESCRIPTION", F.lit("NuLL")),
    # 10: CLIENT ID (WTW) = SYSTEM ID (NEW)
    ("CLIENT ID (WTW)", _system_id),
    # 11: FINAL YEAR = YEAR(EntryDate), stored as a string (NEW)
    ("FINAL YEAR", F.year(_entry_date).cast(StringType())),
    # 12: FINAL DATE = EntryDate, kept as a Date (NEW)
    ("FINAL DATE", _entry_date.cast(DateType())),
    # 13: BUSINESS TYPE placeholder (NEW)
    ("BUSINESS TYPE", F.lit("Null")),
    # 14: INSURER COUNTRY = "India" (NEW)
    ("INSURER COUNTRY", F.lit("India")),
    # 15: CCYYEAR = "INR-" + FINAL YEAR (NEW)
    ("CCYYEAR", F.concat(F.lit("INR-"), F.col("`FINAL YEAR`"))),
]

for _target_name, _expression in _formula_184_steps:
    df = df.withColumn(_target_name, _expression)

# --- Step 4: Select 186 — BizType is renamed to TRANSACTION TYPE ---
_old_name, _new_name = "BizType", "TRANSACTION TYPE"
df = df.withColumnRenamed(_old_name, _new_name)

# Show the post-formula schema and a 3-row preview.
print("=== SCHEMA AFTER FORMULAS ===")
df.printSchema()

_formula_preview = df.limit(3)
print("\n=== SAMPLE (first 3 rows) ===")
display(_formula_preview)

In [None]:
# =============================================================================
# Cell 4: Union & Unification — SKIPPED (single stream)
# =============================================================================
# There is a single Saiba data stream, so this cell performs no union and no
# column renaming; it only records that the step was skipped.
_cell4_notice = "Cell 4: Skipped — single stream, no union required."
print(_cell4_notice)

In [None]:
# =============================================================================
# Cell 5: Reference Joins + Currency Conversion + Final Select
# =============================================================================

def _read_reference(table_name):
    """Return every column of the given reference table as a DataFrame."""
    return spark.sql(f"SELECT * FROM {table_name}")


# --- Load reference tables ---
# On an AnalysisException a warning is printed and the error is re-raised
# unchanged, so the cell still fails.
try:
    df_product = _read_reference(REF_PRODUCT)
    df_insurer = _read_reference(REF_INSURER)
    df_currency = _read_reference(REF_CURRENCY)
except AnalysisException as e:
    print(f"WARNING: Reference table not found — {e}")
    raise

# --- Reference joins (Tools 187 / 190 / 191) ---
# All three joins follow the same pattern: project the reference table to a
# normalised (UPPER + TRIM) join key plus the columns to bring in, left-join it
# onto the main stream, then drop the helper key.
#
# Normalising the reference key can make two distinct raw keys collide
# (e.g. "Fire " and "FIRE"). A left join against a non-unique key duplicates
# source rows and therefore inflates premium / brokerage, so each reference is
# reduced to one row per normalised key before joining.

def _normalised_reference(ref_df, key_column, key_alias, carried_columns, label):
    """Project a reference table to a unique normalised key plus carried columns.

    ref_df          -- reference DataFrame as loaded from the lakehouse
    key_column      -- name of the raw key column in ref_df
    key_alias       -- temporary name given to the UPPER(TRIM(key)) column
    carried_columns -- reference columns to bring into the main stream
    label           -- human-readable name used in the warning message

    Returns a DataFrame with exactly one row per normalised key. When several
    rows share a key, one of them is kept (which one is not defined) and a
    warning is printed so the reference table can be corrected.
    """
    projected = ref_df.select(
        F.upper(F.trim(F.col(f"`{key_column}`"))).alias(key_alias),
        *[F.col(f"`{name}`") for name in carried_columns],
    )
    # NULL keys never match in an equi-join, so they are ignored by the check.
    duplicated_keys = (
        projected
        .filter(F.col(key_alias).isNotNull())
        .groupBy(key_alias)
        .count()
        .filter(F.col("count") > 1)
        .count()
    )
    if duplicated_keys:
        print(
            f"WARNING: {label} reference has {duplicated_keys} duplicated join "
            "key(s) after UPPER/TRIM — keeping one row per key"
        )
    return projected.dropDuplicates([key_alias])


def _left_join_on_key(left_df, ref_df, left_column, key_alias):
    """Left-join ref_df onto left_df on UPPER(TRIM(left_column)); drop the helper key."""
    condition = F.trim(F.upper(left_df[f"`{left_column}`"])) == ref_df[key_alias]
    return left_df.join(ref_df, condition, "left").drop(key_alias)


# --- Join 1: Product Mapping (Tool 187) ---
# Key: Policy Type (already TRIM+UPPER'd in Cell 3)
# Brings in: Lvl2 Product Mapping, GLoBs, GLOBS SPLIT P&C
df_product_ref = _normalised_reference(
    df_product,
    "Policy Type",
    "_product_join_key",
    ["Lvl2 Product Mapping", "GLoBs", "GLOBS SPLIT P&C"],
    "Product",
)
df = _left_join_on_key(df, df_product_ref, "Policy Type", "_product_join_key")

# --- Join 2: Insurer Mapping (Tool 190) ---
# Key: Insurer (ORIGINAL source column, NOT Insurer Name)
# Brings in: MAPPED_INSURER, Lloyd's Asia or Lloyd's London
df_insurer_ref = _normalised_reference(
    df_insurer,
    "Insurer",
    "_insurer_join_key",
    ["MAPPED_INSURER", "Lloyd's Asia or Lloyd's London"],
    "Insurer",
)
df = _left_join_on_key(df, df_insurer_ref, "Insurer", "_insurer_join_key")

# --- Join 3: Currency Mapping (Tool 191) ---
# Key: CCYYEAR (= "INR-" + YEAR(EntryDate))
# Brings in: Value (exchange rate for USD conversion)
df_currency_ref = _normalised_reference(
    df_currency,
    "CCYYEAR",
    "_currency_join_key",
    ["Value"],
    "Currency",
)
df = _left_join_on_key(df, df_currency_ref, "CCYYEAR", "_currency_join_key")

# --- Currency Formula (Tool 192) ---
# Each USD amount is the INR amount multiplied by the joined exchange-rate
# column `Value`, stored as a double. Rows with no currency match have a NULL
# Value and therefore NULL USD amounts.
# NOTE(review): assumes Value is a multiplier from INR to USD — confirm
# against the currency reference table.
_exchange_rate = F.col("Value")
_usd_targets = (
    ("PREMIUM (USD)", "Brok Prem."),
    ("BROKERAGE (USD)", "Brokerage"),
)
for _usd_name, _inr_name in _usd_targets:
    _converted = F.col(f"`{_inr_name}`") * _exchange_rate
    df = df.withColumn(_usd_name, _converted.cast(DoubleType()))

# --- Final Select: 28 columns with PascalCase aliases ---
# Each entry is (source column, target type, output alias); the output keeps
# this order. Source names are backtick-quoted when referenced because several
# contain spaces, parentheses, dots or apostrophes.
_TEXT, _NUMBER, _DAY = StringType(), DoubleType(), DateType()

_final_columns = [
    ("BROKERAGE (USD)", _NUMBER, "BrokerageUsd"),
    ("BUSINESS TYPE", _TEXT, "BusinessType"),
    ("CLIENT ID (WTW)", _TEXT, "ClientIdWtw"),
    ("CustName", _TEXT, "ClientName"),
    ("DATA SOURCE", _TEXT, "DataSource"),
    ("Department", _TEXT, "Department"),
    ("DUNS NUMBER", _TEXT, "DunsNumber"),
    ("EntryDate", _DAY, "InvoiceDate"),
    ("ExpiryDate", _DAY, "ExpiryDate"),
    ("FINAL DATE", _DAY, "FinalDate"),
    ("GLoBs", _TEXT, "Globs"),
    ("GLOBS SPLIT P&C", _TEXT, "GlobsSplitPc"),
    ("INSURER COUNTRY", _TEXT, "InsurerCountry"),
    ("Insurer Name", _TEXT, "InsurerName"),
    ("Lloyd's Asia or Lloyd's London", _TEXT, "Lloyds"),
    ("Lvl2 Product Mapping", _TEXT, "SubProductClass"),
    ("MAPPED_INSURER", _TEXT, "InsurerMapping"),
    ("PARTY ID (WTW)", _TEXT, "PartyIdWtw"),
    ("POLICY DESCRIPTION", _TEXT, "PolicyDescription"),
    ("Policy Type", _TEXT, "SystemProductId"),
    ("PolicyNo", _TEXT, "InvoicePolicyNumber"),
    ("PREMIUM (USD)", _NUMBER, "PremiumUsd"),
    ("REINSURANCE DESCRIPTION", _TEXT, "ReinsuranceDescription"),
    ("REVENUE COUNTRY", _TEXT, "RevenueCountry"),
    ("RMName", _TEXT, "AccountHandler"),
    ("StartDate", _DAY, "InceptionDate"),
    ("SYSTEM ID", _TEXT, "SystemId"),
    ("TRANSACTION TYPE", _TEXT, "TransactionType"),
]

df_final = df.select(
    *[
        F.col(f"`{_source}`").cast(_dtype).alias(_alias)
        for _source, _dtype, _alias in _final_columns
    ]
)

# Diagnostics for the Silver-shaped frame: schema, row count (triggers a Spark
# job) and a 5-row preview.
print("=== FINAL SCHEMA ===")
df_final.printSchema()

_final_rows = df_final.count()
print(f"\n=== FINAL ROW COUNT: {_final_rows} ===")

_final_preview = df_final.limit(5)
print("\n=== FINAL SAMPLE (first 5 rows) ===")
display(_final_preview)

In [None]:
# =============================================================================
# Cell 6: Write to Silver
# =============================================================================
# The target Delta table is fully replaced on every run, schema included
# (mode "overwrite" with overwriteSchema = true).
print(f"Writing to {TARGET_TABLE}...")
_silver_writer = (
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
_silver_writer.saveAsTable(TARGET_TABLE)

# Read the table back to report what was actually persisted.
_written = spark.table(TARGET_TABLE)
_written_rows = _written.count()
_written_width = len(_written.columns)
print(f"Success. Rows written: {_written_rows}")
print(f"Columns: {_written_width}")
display(_written.limit(5))