# 02_silver_notebook_gswin

**Purpose**: Migrate the GSwin Data container from the Alteryx ETL Process V3 workflow to Fabric PySpark.

**Source**: `APAC_CRM_Analytics_LH.src_gswin_crb` (Bronze)

**Output**: `APAC_Reporting_LH.clean_gswin_chloe` (Silver)

**Reference Tables** (all in `APAC_CRM_Analytics_LH`):
- `ref_Chloe_gswin_product_mapping` — Product / LOB mapping
- `ref_Chloe_insurer_mapping` — Insurer mapping (shared)
- `ref_Chloe_gswin_insurer_mapping` — Insurer country mapping

**Alteryx Tool Mapping**:
| Cell | Alteryx Tools |
| :--- | :--- |
| Cell 1 | N/A — setup & configuration |
| Cell 2 | Input (171), Select (173), Filter (174) |
| Cell 3 | Formula (172) — 13 expressions |
| Cell 4 | N/A — single stream, no union |
| Cell 5 | Product Join (175/176), Insurer Join (177), Insurer Country Join (178/179), Final Select (180) |
| Cell 6 | Output to Silver |

In [None]:
# =============================================================================
# Cell 1: Setup & Configuration
# =============================================================================
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, DateType, IntegerType, LongType, FloatType, DecimalType
from pyspark.sql.utils import AnalysisException

def _qualify(lakehouse, table):
    """Return the ``<lakehouse>.<table>`` name used in Spark SQL."""
    return f"{lakehouse}.{table}"


# Lakehouses: Bronze holds the source and reference tables, Silver the output.
BRONZE_LH = "APAC_CRM_Analytics_LH"
SILVER_LH = "APAC_Reporting_LH"

# Input (Bronze) and output (Silver) tables of this notebook.
SOURCE_TABLE = _qualify(BRONZE_LH, "src_gswin_crb")
TARGET_TABLE = _qualify(SILVER_LH, "clean_gswin_chloe")

# Mapping tables joined in Cell 5; all of them live in the Bronze Lakehouse.
REF_PRODUCT = _qualify(BRONZE_LH, "ref_Chloe_gswin_product_mapping")
REF_INSURER = _qualify(BRONZE_LH, "ref_Chloe_insurer_mapping")
REF_INSURER_COUNTRY = _qualify(BRONZE_LH, "ref_Chloe_gswin_insurer_mapping")

In [None]:
# =============================================================================
# Cell 2: Load Bronze Data + Filter + Data Type Check
# =============================================================================
# Reads the Bronze source table, applies the CRB filter when the column exists,
# and prints schema / columns / sample / row count for a manual type check.
try:
    df = spark.sql(f"SELECT * FROM {SOURCE_TABLE}")
except AnalysisException as exc:
    # Chain the Spark error explicitly so whatever Spark actually reported
    # stays attached to the friendlier message in the traceback.
    raise Exception(
        f"ERROR: Source table {SOURCE_TABLE} not found. Check Lakehouse name and table."
    ) from exc

# Filter: Willis Line = "CRB" (safety — Bronze name suggests pre-filtered)
if "Willis Line" in df.columns:
    df = df.filter(F.col("`Willis Line`") == "CRB")
    print("Applied Willis Line = CRB filter")
else:
    print("No 'Willis Line' column found — Bronze table likely pre-filtered to CRB")

# Diagnostics only: nothing below changes `df`.
print("=== SOURCE SCHEMA ===")
df.printSchema()
print("\n=== SOURCE COLUMNS ===")
print(df.columns)
print("\n=== SAMPLE DATA (first 3 rows) ===")
display(df.limit(3))
print(f"\n=== ROW COUNT: {df.count()} ===")

In [None]:
# =============================================================================
# Cell 3: Transformation Logic — Formula Tool 172 (13 expressions)
# =============================================================================

# --- Step 1: Force correct data types BEFORE transformations ---
# Bronze columns: Inception Date, Expiry Date, Issue Date (dates)
#                 Premium USD, Total Brokerage in USD (numerics)
# The numerics are cast FIRST and coalesced SECOND. If Bronze delivers them as
# strings, coalescing before the cast lets unparseable text (e.g. "" or "N/A")
# become NULL *after* the 0.0 default was applied (non-ANSI cast semantics),
# so the column would not be null-safe. For numeric inputs the result is the
# same as before.
df = (df
    .withColumn("Inception Date", F.col("`Inception Date`").cast(DateType()))
    .withColumn("Expiry Date", F.col("`Expiry Date`").cast(DateType()))
    .withColumn("Issue Date", F.col("`Issue Date`").cast(DateType()))
    .withColumn("Premium USD",
        F.coalesce(F.col("`Premium USD`").cast(DoubleType()), F.lit(0.0)))
    .withColumn("Total Brokerage in USD",
        F.coalesce(F.col("`Total Brokerage in USD`").cast(DoubleType()), F.lit(0.0)))
)

# --- Step 2: Formula expressions (order matters — sequential evaluation) ---
# Later expressions read columns created by earlier ones (SYSTEM ID → PARTY ID
# → CLIENT ID, REINSURANCE DESCRIPTION → POLICY DESCRIPTION), so keep the order.
df = (df
    # 1. DATA SOURCE = "GSwin" (NEW)
    .withColumn("DATA SOURCE", F.lit("GSwin"))
    # 2. REVENUE COUNTRY = "Vietnam" (NEW)
    .withColumn("REVENUE COUNTRY", F.lit("Vietnam"))
    # 3. Policy Type Name = TRIM(UPPER(Policy Type Name)) (MODIFY)
    .withColumn("Policy Type Name", F.upper(F.trim(F.col("`Policy Type Name`"))))
    # 4. Insurer Name = TRIM(UPPER(Insurer Name)) (MODIFY)
    .withColumn("Insurer Name", F.upper(F.trim(F.col("`Insurer Name`"))))
    # 5. InsurerID = TRIM(UPPER(InsurerID)) (MODIFY)
    .withColumn("InsurerID", F.upper(F.trim(F.col("InsurerID"))))
    # 6. DUNS NUMBER = "UNKNOWN_GSWIN" (NEW)
    .withColumn("DUNS NUMBER", F.lit("UNKNOWN_GSWIN"))
    # 7. SYSTEM ID = "GSWin-" + ClientID (NEW)
    # NOTE(review): concat() yields NULL when ClientID is NULL, which then flows
    # into PARTY ID (WTW) and CLIENT ID (WTW) — confirm this matches Alteryx.
    .withColumn("SYSTEM ID", F.concat(F.lit("GSWin-"), F.col("ClientID")))
    # 8. PARTY ID (WTW) = SYSTEM ID (NEW)
    .withColumn("PARTY ID (WTW)", F.col("`SYSTEM ID`"))
    # 9. REINSURANCE DESCRIPTION: IF Client Type = "Reinsurance" THEN "Reinsurance" ELSE "null" (NEW)
    # NOTE(review): the ELSE branch writes the four-character string "null",
    # not a SQL NULL — kept as specified above; confirm it is intended.
    .withColumn("REINSURANCE DESCRIPTION",
        F.when(F.col("`Client Type`") == "Reinsurance", "Reinsurance").otherwise("null"))
    # 10. POLICY DESCRIPTION = REINSURANCE DESCRIPTION (NEW)
    .withColumn("POLICY DESCRIPTION", F.col("`REINSURANCE DESCRIPTION`"))
    # 11. CLIENT ID (WTW): IF PARTY ID IS NULL THEN SYSTEM ID ELSE PARTY ID (NEW)
    .withColumn("CLIENT ID (WTW)",
        F.when(F.col("`PARTY ID (WTW)`").isNull(), F.col("`SYSTEM ID`"))
         .otherwise(F.col("`PARTY ID (WTW)`")))
    # 12. FINAL DATE = Inception Date (copy as Date) (NEW)
    .withColumn("FINAL DATE", F.col("`Inception Date`").cast(DateType()))
    # 13. BUSINESS TYPE = "UNKNOWN_GSWIN" (NEW)
    .withColumn("BUSINESS TYPE", F.lit("UNKNOWN_GSWIN"))
)

# Diagnostics only: nothing below changes `df`.
print("=== SCHEMA AFTER FORMULAS ===")
df.printSchema()
print("\n=== SAMPLE (first 3 rows) ===")
display(df.limit(3))

In [None]:
# =============================================================================
# Cell 4: Union & Unification — SKIPPED (single stream)
# =============================================================================
# Nothing to do here: GSwin arrives as a single data stream (Eclipse, by
# contrast, is split into Asia and London), so there is nothing to union and
# no columns to rename at this point.
# The pipeline continues with the reference joins in Cell 5.
print("Cell 4: Skipped — single stream, no union required.")

In [None]:
# =============================================================================
# Cell 5: Reference Joins + Final Select
# =============================================================================

# --- Load reference tables ---
# The three mapping tables are read in a fixed order (product, insurer,
# insurer country). Any Spark analysis error is reported and then re-raised,
# so the cell stops instead of joining against a missing table.
try:
    df_product, df_insurer, df_insurer_country = [
        spark.sql(f"SELECT * FROM {ref_table}")
        for ref_table in (REF_PRODUCT, REF_INSURER, REF_INSURER_COUNTRY)
    ]
except AnalysisException as load_error:
    print(f"WARNING: Reference table not found — {load_error}")
    raise

# --- Join 1: Product Mapping ---
# Key: Policy Type Name (already TRIM+UPPER'd in Cell 3)
# Brings in: Level 2 Mapping, GLOBs, GLOBS SPLIT P&C
# distinct(): normalising the reference key with TRIM+UPPER can collapse
# several reference rows onto one key; rows that are identical after the
# projection would otherwise duplicate every matching GSwin row in the left
# join and inflate the premium / brokerage totals.
# NOTE(review): a key that maps to *different* values still fans out — that
# has to be resolved in the reference table itself.
df_product_ref = df_product.select(
    F.upper(F.trim(F.col("`Policy Type Name`"))).alias("_product_join_key"),
    F.col("`Level 2 Mapping`"),
    F.col("GLOBs"),
    F.col("`GLOBS SPLIT P&C`")
).distinct()

# Left join keeps every GSwin row; unmatched rows get NULL mapping columns.
df = df.join(
    df_product_ref,
    F.trim(F.upper(df["`Policy Type Name`"])) == df_product_ref["_product_join_key"],
    "left"
).drop("_product_join_key")

# --- Join 2: Insurer Mapping ---
# Key: Insurer Name (TRIM+UPPER'd) → Insurer
# Brings in: MAPPED_INSURER, Lloyd's Asia or Lloyd's London
# distinct(): normalising the reference key with TRIM+UPPER can collapse
# several reference rows onto one key; rows that are identical after the
# projection would otherwise duplicate every matching GSwin row in the left
# join and inflate the premium / brokerage totals.
# NOTE(review): a key that maps to *different* values still fans out — that
# has to be resolved in the reference table itself.
df_insurer_ref = df_insurer.select(
    F.upper(F.trim(F.col("Insurer"))).alias("_insurer_join_key"),
    F.col("MAPPED_INSURER"),
    F.col("`Lloyd's Asia or Lloyd's London`")
).distinct()

# Left join keeps every GSwin row; unmatched rows get NULL mapping columns.
df = df.join(
    df_insurer_ref,
    F.trim(F.upper(df["`Insurer Name`"])) == df_insurer_ref["_insurer_join_key"],
    "left"
).drop("_insurer_join_key")

# --- Join 3: Insurer Country Mapping ---
# Key: InsurerID (TRIM+UPPER'd)
# Brings in: INSURER COUNTRY
# distinct(): normalising the reference key with TRIM+UPPER can collapse
# several reference rows onto one key; rows that are identical after the
# projection would otherwise duplicate every matching GSwin row in the left
# join and inflate the premium / brokerage totals.
# NOTE(review): a key that maps to *different* values still fans out — that
# has to be resolved in the reference table itself.
df_country_ref = df_insurer_country.select(
    F.upper(F.trim(F.col("InsurerID"))).alias("_country_join_key"),
    F.col("`INSURER COUNTRY`")
).distinct()

# Left join keeps every GSwin row; unmatched rows get a NULL INSURER COUNTRY.
df = df.join(
    df_country_ref,
    F.trim(F.upper(df["InsurerID"])) == df_country_ref["_country_join_key"],
    "left"
).drop("_country_join_key")

# --- Final Select: 28 columns with PascalCase aliases ---
# Column name mapping: Bronze actual → Alteryx output → PascalCase
# Each spec is (source column, target type, Silver alias, null default).
# A default of None means the column is only cast; the two USD amounts are
# coalesced to 0.0 before the cast for null safety. List order = output order.
_final_column_specs = [
    ("`CLIENT ID (WTW)`", StringType(), "ClientIdWtw", None),
    ("`Client Name`", StringType(), "ClientName", None),
    ("`DATA SOURCE`", StringType(), "DataSource", None),
    ("`DUNS NUMBER`", StringType(), "DunsNumber", None),
    ("`Expiry Date`", DateType(), "ExpiryDate", None),
    ("`FINAL DATE`", DateType(), "FinalDate", None),
    ("GLOBs", StringType(), "Globs", None),
    ("`GLOBS SPLIT P&C`", StringType(), "GlobsSplitPc", None),
    ("`Inception Date`", DateType(), "InceptionDate", None),
    ("`INSURER COUNTRY`", StringType(), "InsurerCountry", None),
    ("`Insurer Name`", StringType(), "InsurerName", None),
    ("`Issue Date`", DateType(), "InvoiceDate", None),
    ("`Level 2 Mapping`", StringType(), "SubProductClass", None),
    ("`Lloyd's Asia or Lloyd's London`", StringType(), "Lloyds", None),
    ("ManagerID", StringType(), "AccountHandler", None),
    ("MAPPED_INSURER", StringType(), "InsurerMapping", None),
    ("`New/ _Renew`", StringType(), "TransactionType", None),
    ("`PARTY ID (WTW)`", StringType(), "PartyIdWtw", None),
    ("`POLICY DESCRIPTION`", StringType(), "PolicyDescription", None),
    ("`Policy Number`", StringType(), "InvoicePolicyNumber", None),
    ("`Policy Type`", StringType(), "Department", None),
    ("`Policy Type Name`", StringType(), "SystemProductId", None),
    ("`Premium USD`", DoubleType(), "PremiumUsd", 0.0),
    ("`REINSURANCE DESCRIPTION`", StringType(), "ReinsuranceDescription", None),
    ("`REVENUE COUNTRY`", StringType(), "RevenueCountry", None),
    ("`SYSTEM ID`", StringType(), "SystemId", None),
    ("`Total Brokerage in USD`", DoubleType(), "BrokerageUsd", 0.0),
    ("`BUSINESS TYPE`", StringType(), "BusinessType", None),
]

# Turn every spec into `[coalesce(]col[, default)].cast(type).alias(name)`.
_final_exprs = []
for _source, _dtype, _alias, _default in _final_column_specs:
    _expr = F.col(_source)
    if _default is not None:
        _expr = F.coalesce(_expr, F.lit(_default))
    _final_exprs.append(_expr.cast(_dtype).alias(_alias))

df_final = df.select(*_final_exprs)

# Diagnostics only: schema, row count and a small sample of the Silver frame.
print("=== FINAL SCHEMA ===")
df_final.printSchema()
_final_row_count = df_final.count()
print(f"\n=== FINAL ROW COUNT: {_final_row_count} ===")
print("\n=== FINAL SAMPLE (first 5 rows) ===")
_final_sample = df_final.limit(5)
display(_final_sample)

In [None]:
# =============================================================================
# Cell 6: Write to Silver
# =============================================================================
# Replaces the Silver table (data and schema) with df_final as a Delta table,
# then reads it back to report the row count, column count and a sample.
print(f"Writing to {TARGET_TABLE}...")
silver_writer = (
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable(TARGET_TABLE)

written_rows = spark.table(TARGET_TABLE).count()
print(f"Success. Rows written: {written_rows}")
written_columns = spark.table(TARGET_TABLE).columns
print(f"Columns: {len(written_columns)}")
display(spark.table(TARGET_TABLE).limit(5))