# 02_silver_notebook_eclipse

**Purpose**: Migrate the Eclipse data pipeline from the Alteryx ETL workflow to Fabric PySpark.

**Sources**:
- `APAC_CRM_Analytics_LH.src_eclipse_crb` (Bronze — Asia Stream)
- `APAC_CRM_Analytics_LH.src_eclipse_london` (Bronze — London Stream)

**Output**: `APAC_Reporting_LH.clean_eclipse_chloe` (Silver)

**Reference Tables** (all in `APAC_CRM_Analytics_LH`):
- `ref_Chloe_insurer_mapping` — Insurer mapping (shared)
- `ref_Chloe_eclipse_product_mapping` — Product / LOB mapping
- `ref_Chloe_Transaction_type_mapping` — Transaction type mapping

**Alteryx Tool Mapping**:
| Cell | Alteryx Tools |
| :--- | :--- |
| Cell 2 | Input Asia (src_eclipse_crb), Input London (src_eclipse_london), stream filters |
| Cell 3 | Formula Asia (Tool 93), Formula London (Tool 134) |
| Cell 4 | Union, Select (Tool 131 — renames), Cleanse |
| Cell 5 | Insurer Join (Tools 101/102), Product Join (Tools 139/140), TransType Join, Final Select |
| Cell 6 | Output to Silver |

In [None]:
# =============================================================================
# Cell 1: Setup & Configuration
# =============================================================================
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, DateType, IntegerType, LongType, FloatType, DecimalType
from pyspark.sql.utils import AnalysisException
import re

# Lakehouse names (bronze holds the sources and reference tables, silver the output)
BRONZE_LH = "APAC_CRM_Analytics_LH"
SILVER_LH = "APAC_Reporting_LH"

# Source and target tables, qualified as <lakehouse>.<table>
SOURCE_ASIA = BRONZE_LH + ".src_eclipse_crb"
SOURCE_LONDON = BRONZE_LH + ".src_eclipse_london"
TARGET_TABLE = SILVER_LH + ".clean_eclipse_chloe"

# Reference (mapping) tables, all stored in the bronze lakehouse
REF_INSURER = BRONZE_LH + ".ref_Chloe_insurer_mapping"
REF_PRODUCT = BRONZE_LH + ".ref_Chloe_eclipse_product_mapping"
REF_TRANS = BRONZE_LH + ".ref_Chloe_Transaction_type_mapping"

In [None]:
# =============================================================================
# Cell 2: Load Bronze Data
# =============================================================================

# --- 1. Asia Stream (Eclipse Recurring Report) ---
# Only the table load sits inside the try: an AnalysisException raised further
# down (for example by a missing LegalEntity column) must not be reported as a
# missing table.
try:
    df_asia = spark.sql(f"SELECT * FROM {SOURCE_ASIA}")
except AnalysisException as err:
    # Chain the original error so the underlying Spark message stays visible.
    raise Exception(f"ERROR: Source table {SOURCE_ASIA} not found.") from err

# Legal entities that are removed from the Asia stream.
excluded_legal_entities = [
    "PT. Willis Reinsurance Brokers Indonesia",
    "Willis Towers Watson Taiwan Limited",
]

# `isin` evaluates to NULL when LegalEntity is NULL, and filter() drops rows
# whose predicate is NULL, so a plain `~isin(...)` silently discards rows that
# have no legal entity. Coalescing the membership test to False keeps them.
# NOTE(review): assumes the Alteryx filter keeps rows with a null LegalEntity — confirm.
is_excluded_entity = F.coalesce(
    F.col("LegalEntity").isin(excluded_legal_entities),
    F.lit(False),
)

df_asia = (
    df_asia
    .withColumn("Origin_Stream", F.lit("Asia"))
    .filter(~is_excluded_entity)
)

print("=== ASIA SOURCE SCHEMA ===")
df_asia.printSchema()
print(f"\n=== ASIA ROW COUNT: {df_asia.count()} ===")

# --- 2. London Stream (Combined MIR files) ---
# Only the table load sits inside the try so that unrelated analysis errors
# (for example a missing column used by the filters below) are not reported as
# a missing table.
try:
    df_london = spark.sql(f"SELECT * FROM {SOURCE_LONDON}")
except AnalysisException as err:
    # Chain the original error so the underlying Spark message stays visible.
    raise Exception(f"ERROR: Source table {SOURCE_LONDON} not found.") from err

df_london = df_london.withColumn("Origin_Stream", F.lit("London"))

# London Filters (Source.Name based logic)
# exclude_12046: countries removed from rows whose Source.Name contains "12046".
# exclude_12047: despite its name this is the list of countries that are KEPT
# for rows whose Source.Name contains "12047"; every other country is removed.
exclude_12046 = ["China", "Hong Kong", "India", "Japan", "Malaysia", "New Zealand", "Philippines", "Republic of Korea", "Singapore", "Taiwan", "Thailand"]
exclude_12047 = ["Bahrain", "Cyprus", "Georgia", "Jordan", "Kazakhstan", "Kuwait", "Oman", "Qatar", "Turkey", "United Arab Emirates"]

# Each test is coalesced to False so a NULL Source.Name or UWCountry gives a
# definite True/False answer. Without this the combined condition can be NULL,
# and filter(~NULL) drops the row even though it matched no exclusion rule.
# NOTE(review): assumes the Alteryx filters treat a null value as "not in the
# list" / "does not contain" — confirm against the workflow.
false_lit = F.lit(False)
source_name = F.col("`Source.Name`")
uw_country = F.col("UWCountry")

is_12046_file = F.coalesce(source_name.contains("12046"), false_lit)
is_12047_file = F.coalesce(source_name.contains("12047"), false_lit)
country_in_12046_list = F.coalesce(uw_country.isin(exclude_12046), false_lit)
country_in_12047_list = F.coalesce(uw_country.isin(exclude_12047), false_lit)

# Condition: (Contains 12046 AND Country in list A) OR (Contains 12047 AND Country NOT in list B)
cond_exclude = (
    (is_12046_file & country_in_12046_list) |
    (is_12047_file & ~country_in_12047_list)
)
df_london = df_london.filter(~cond_exclude)

print("=== LONDON SOURCE SCHEMA ===")
df_london.printSchema()
print(f"\n=== LONDON ROW COUNT: {df_london.count()} ===")

In [None]:
# =============================================================================
# Cell 3: Transformation Logic (Replicating Alteryx Tools 93 & 134)
#   - Asia: REVENUE COUNTRY logic, BuTeam, FinalDate, PolicyDescription
#   - London: hardcoded REVENUE COUNTRY, FinalDate, PolicyDescription
# =============================================================================

# --- ASIA LOGIC (Alteryx Tool 93) ---
# REVENUE COUNTRY Logic: the first matching rule wins. The expression reads the
# derived BuTeam column, so it has to be applied after BuTeam has been added.
cond_rev_country_asia = (
    F.when(F.col("BuTeam") == "Retail+Commercial", "Singapore")
    .when(F.col("BuTeam") == "Retail+Construction", "Singapore")
    .when(F.col("BuTeam").contains("Retail+Client Service Team"), "Singapore")
    .when(F.col("LegalEntity").contains("Hong Kong"), "Hong Kong")
    .when(F.col("LegalEntity").contains("Philippines"), "Philippines")
    .otherwise("Regional Specialism")
)

# --- Expressions shared by both streams ---
# Prefer the Willis Party ID, fall back to InsuredID.
client_id = F.coalesce(F.col("Willis Party ID"), F.col("InsuredID"))
uw_clean = F.upper(F.trim(F.col("UW")))
final_date = (
    F.when(F.col("TransRef") == "INVOICE DATE", F.col("CreatedDate"))
    .otherwise(F.col("InceptionDate"))
)
# A real SQL NULL typed as string. F.lit("null") would instead store the
# four-character text "null", which downstream null checks do not recognise.
null_string = F.lit(None).cast("string")
policy_description = (
    F.when(F.col("BusinessType") == "Reinsurance", F.lit("Reinsurance"))
    .otherwise(null_string)
)

df_asia_trans = (df_asia
    .withColumn("DataSource", F.lit("Eclipse"))
    .withColumn("ClientId", client_id)
    .withColumn("BuTeam", F.concat(F.col("BusinessUnit"), F.lit("+"), F.col("Team")))
    .withColumn("RevenueCountry", cond_rev_country_asia)
    .withColumn("ProductsToBeMapped", F.upper(F.concat(F.col("BusinessUnit"), F.lit("+"), F.col("Team"), F.lit("+"), F.col("ClassOfBusiness"))))
    .withColumn("UwClean", uw_clean)
    .withColumn("FinalDate", final_date)
    .withColumn("PolicyDescription", policy_description)
    .withColumn("ReinsuranceDescription", null_string)
)

# --- LONDON LOGIC (Alteryx Tool 134) ---
# London rows get a fixed revenue country and map products on ClassOfBusiness alone.
df_london_trans = (df_london
    .withColumn("DataSource", F.lit("Eclipse"))
    .withColumn("ClientId", client_id)
    .withColumn("RevenueCountry", F.lit("London Placements"))
    .withColumn("ProductsToBeMapped", F.trim(F.upper(F.col("ClassOfBusiness"))))
    .withColumn("UwClean", uw_clean)
    .withColumn("FinalDate", final_date)
    .withColumn("PolicyDescription", policy_description)
    .withColumn("ReinsuranceDescription", null_string)
)

print("=== SCHEMA AFTER FORMULAS ===")
df_asia_trans.printSchema()

In [None]:
# =============================================================================
# Cell 4: Union & Unification (Replicating Alteryx Union + Select Tool 131)
# =============================================================================

# Stack both streams by column name; a column that exists in only one stream
# is filled with nulls for the other (allowMissingColumns=True).
df_unified = df_asia_trans.unionByName(df_london_trans, allowMissingColumns=True)

# Common Final Transform (Alteryx Select Tool 131 Rename)
# Any existing column called "Department" (in any letter case) is removed first
# so that renaming "Team" to "Department" cannot create an ambiguous duplicate.
stale_department_columns = [
    existing for existing in df_unified.columns if existing.lower() == "department"
]
for stale_column in stale_department_columns:
    df_unified = df_unified.drop(stale_column)

# (source column, silver column) pairs, applied in this order.
column_renames = [
    ("Team", "Department"),
    ("PolicyRef", "InvoicePolicyNumber"),
    ("Account Handler", "AccountHandler"),
    ("CreatedDate", "InvoiceDate"),
    ("Insured", "ClientName"),
    ("UW", "InsurerName"),
    ("UWCountry", "InsurerCountry"),
    ("GrossBkgeUSDPlan", "BrokerageUsd"),
    ("GrossPremNonTtyUSDPlan", "PremiumUsd"),
    ("ClassOfBusiness", "SystemProductId"),
    ("Willis Party ID", "PartyIdWtw"),
    ("Dun and Bradstreet No", "DunsNumber"),
    ("Revenue Type", "TransactionType"),
]

df_renamed = df_unified
for source_column, silver_column in column_renames:
    df_renamed = df_renamed.withColumnRenamed(source_column, silver_column)

print("=== SCHEMA AFTER UNION & RENAME ===")
df_renamed.printSchema()
unified_row_count = df_renamed.count()
print(f"\n=== UNIFIED ROW COUNT: {unified_row_count} ===")

In [None]:
# =============================================================================
# Cell 5: Reference Joins + Final Select
# =============================================================================

# --- Load reference tables ---
# The three mapping tables are read in full. If Spark cannot analyse one of the
# queries, the error is printed and then re-raised unchanged.
try:
    df_insurer = spark.sql("SELECT * FROM " + REF_INSURER)
    df_product = spark.sql("SELECT * FROM " + REF_PRODUCT)
    df_trans = spark.sql("SELECT * FROM " + REF_TRANS)
except AnalysisException as analysis_error:
    print(f"WARNING: Reference table not found — {analysis_error}")
    raise

# --- Join 1: Insurer Mapping (Tools 101/102) ---
# Key: InsurerName == Insurer, both sides compared upper-cased and trimmed
# Brings in: MAPPED_INSURER, Lloyd's Asia or Lloyd's London
# NOTE(review): this is a left join, so a key that occurs more than once in the
# mapping repeats the matching source rows — presumably keys are unique; confirm.
insurer_join_key = F.upper(F.trim(F.col("Insurer"))).alias("_insurer_join_key")
df_insurer_ref = df_insurer.select(
    insurer_join_key,
    F.col("MAPPED_INSURER"),
    F.col("`Lloyd's Asia or Lloyd's London`"),
)

insurer_match = (
    F.trim(F.upper(df_renamed["InsurerName"])) == df_insurer_ref["_insurer_join_key"]
)
df = df_renamed.join(df_insurer_ref, insurer_match, "left")
# The helper key is only needed for matching.
df = df.drop("_insurer_join_key")

# --- Join 2: Product Mapping (Tools 139/140) ---
# Key: ProductsToBeMapped == Filter Fac Product, both sides compared upper-cased and trimmed
# Brings in: Level 2 Mapping, GLOBS, GLOBS SPLIT P&C
# NOTE(review): this is a left join, so a key that occurs more than once in the
# mapping repeats the matching source rows — presumably keys are unique; confirm.
product_join_key = F.upper(F.trim(F.col("`Filter Fac Product`"))).alias("_product_join_key")
df_product_ref = df_product.select(
    product_join_key,
    F.col("`Level 2 Mapping`"),
    F.col("GLOBS"),
    F.col("`GLOBS SPLIT P&C`"),
)

product_match = (
    F.trim(F.upper(df["ProductsToBeMapped"])) == df_product_ref["_product_join_key"]
)
df = df.join(df_product_ref, product_match, "left")
# The helper key is only needed for matching.
df = df.drop("_product_join_key")

# --- Join 3: Transaction Type ---
# Key: TransactionType == TransType, both sides compared upper-cased and trimmed
# NOTE(review): the Final Select below takes TransactionType from the source
# data and never reads the joined TransType column, so this join currently adds
# nothing to the output. Confirm whether the mapped value should be used instead.
df_trans_ref = (
    df_trans
    .select(
        F.upper(F.trim(F.col("TransType"))).alias("_trans_join_key"),
        F.col("TransType"),
    )
    # Keep one row per normalised key. Because no joined column reaches the
    # output, a repeated key (e.g. values differing only in case or padding)
    # could only multiply source rows and inflate the brokerage/premium totals.
    .dropDuplicates(["_trans_join_key"])
)

df = df.join(
    df_trans_ref,
    F.trim(F.upper(df["TransactionType"])) == df_trans_ref["_trans_join_key"],
    "left"
).drop("_trans_join_key")

# --- Final Select: PascalCase aliases ---
# Each entry is (column in df, target Spark type, output column name).
final_column_spec = [
    ("BrokerageUsd", DoubleType(), "BrokerageUsd"),
    ("BusinessType", StringType(), "BusinessType"),
    ("ClientId", StringType(), "ClientIdWtw"),
    ("ClientName", StringType(), "ClientName"),
    ("DataSource", StringType(), "DataSource"),
    ("Department", StringType(), "Department"),
    ("DunsNumber", StringType(), "DunsNumber"),
    ("InvoiceDate", DateType(), "InvoiceDate"),
    ("ExpiryDate", DateType(), "ExpiryDate"),
    ("FinalDate", DateType(), "FinalDate"),
    ("GLOBS", StringType(), "Globs"),
    ("`GLOBS SPLIT P&C`", StringType(), "GlobsSplitPc"),
    ("InsurerCountry", StringType(), "InsurerCountry"),
    ("InsurerName", StringType(), "InsurerName"),
    ("`Lloyd's Asia or Lloyd's London`", StringType(), "Lloyds"),
    ("`Level 2 Mapping`", StringType(), "SubProductClass"),
    ("MAPPED_INSURER", StringType(), "InsurerMapping"),
    ("PartyIdWtw", StringType(), "PartyIdWtw"),
    ("PolicyDescription", StringType(), "PolicyDescription"),
    ("SystemProductId", StringType(), "SystemProductId"),
    ("InvoicePolicyNumber", StringType(), "InvoicePolicyNumber"),
    ("PremiumUsd", DoubleType(), "PremiumUsd"),
    ("ReinsuranceDescription", StringType(), "ReinsuranceDescription"),
    ("RevenueCountry", StringType(), "RevenueCountry"),
    ("AccountHandler", StringType(), "AccountHandler"),
    ("InceptionDate", DateType(), "InceptionDate"),
    ("InsuredID", StringType(), "SystemId"),
    ("TransactionType", StringType(), "TransactionType"),
]

# Cast every column to its target type and give it its output name.
df_final = df.select(*[
    F.col(source_column).cast(target_type).alias(output_name)
    for source_column, target_type, output_name in final_column_spec
])

print("=== FINAL SCHEMA ===")
df_final.printSchema()
final_row_count = df_final.count()
print(f"\n=== FINAL ROW COUNT: {final_row_count} ===")
print("\n=== FINAL SAMPLE (first 5 rows) ===")
display(df_final.limit(5))

In [None]:
# =============================================================================
# Cell 6: Write to Silver
# =============================================================================

# --- Standardize column order across all silver notebooks ---
# The output columns are re-selected in this fixed (alphabetical) order before writing.
STANDARD_COLUMNS = [
    "AccountHandler",
    "BrokerageUsd",
    "BusinessType",
    "ClientIdWtw",
    "ClientName",
    "DataSource",
    "Department",
    "DunsNumber",
    "ExpiryDate",
    "FinalDate",
    "Globs",
    "GlobsSplitPc",
    "InceptionDate",
    "InsurerCountry",
    "InsurerMapping",
    "InsurerName",
    "InvoiceDate",
    "InvoicePolicyNumber",
    "Lloyds",
    "PartyIdWtw",
    "PolicyDescription",
    "PremiumUsd",
    "ReinsuranceDescription",
    "RevenueCountry",
    "SubProductClass",
    "SystemId",
    "SystemProductId",
    "TransactionType",
]
df_final = df_final.select(*STANDARD_COLUMNS)

print(f"Writing to {TARGET_TABLE}...")
# Replace the table contents and allow the stored schema to be overwritten too.
delta_writer = (
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(TARGET_TABLE)

# Read the table back to report what was actually written.
written_table = spark.table(TARGET_TABLE)
print(f"Success. Rows written: {written_table.count()}")
print(f"Columns: {len(written_table.columns)}")
display(written_table.limit(5))