# Arias Silver Notebook
Migration of the Alteryx Arias workflow to Fabric PySpark.
- **Source**: `src_arias_crb` in `APAC_CRM_Analytics_LH` (Japanese column names)
- **Output**: `clean_arias` in `APAC_Reporting_LH`

In [None]:
# Cell 1: Setup & Configuration
# -----------------------------
from pyspark.sql.functions import col, when, trim, upper, lit, current_date, coalesce, isnan, count, concat, expr, size, collect_set, regexp_replace, to_date, year
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, DoubleType, DateType, StringType, LongType, FloatType, DecimalType
from pyspark.sql.utils import AnalysisException

# Helper: Cleanse DataFrame
def cleanse_dataframe(df):
    """Return ``df`` with null-filled numeric columns and normalized strings.

    Numeric columns (integer, long, float, double, decimal) have nulls
    replaced with 0.  String columns have nulls replaced with '', tabs,
    newlines and carriage returns removed, whitespace runs collapsed to a
    single space, surrounding spaces trimmed, and the result upper-cased.
    Columns of any other type pass through unchanged.  Column names and
    column order are preserved.

    Args:
        df: Spark DataFrame to cleanse.

    Returns:
        A new DataFrame with the same columns, cleansed as described above.
    """
    print("Applying cleansing transformations...")

    numeric_types = (IntegerType, DoubleType, LongType, FloatType, DecimalType)

    def _cleansed_column(field):
        """Build the cleansed expression for one schema field."""
        # Backticks stop names containing dots (e.g. "Invoice No.") from
        # being parsed as nested-field references.
        column = F.col(f"`{field.name}`")

        if isinstance(field.dataType, numeric_types):
            return F.coalesce(column, F.lit(0)).alias(field.name)

        if isinstance(field.dataType, StringType):
            text = F.coalesce(column, F.lit(''))
            text = F.regexp_replace(text, r'[\t\n\r]', '')
            text = F.regexp_replace(text, r'\s+', ' ')
            # Trim LAST: trimming before the control characters are stripped
            # leaves a stray leading/trailing space behind (e.g. "foo \n").
            return F.upper(F.trim(text)).alias(field.name)

        return column

    # A single projection instead of one withColumn per column keeps the
    # query plan flat regardless of how many columns the source has.
    cleansed_df = df.select(*[_cleansed_column(f) for f in df.schema.fields])

    print("Cleansing finished.")
    return cleansed_df

In [None]:
# Cell 2: Load Bronze Data & Rename Columns
# -----------------------------------------
# Source column name -> English name used by the rest of the notebook.
# Renames are applied in the order listed.
_ARIAS_RENAMES = {
    "請求書番号": "Invoice No.",
    "保険始期": "From",
    "保険終期": "To",
    "契約者名": "Name of Client",
    "保険会社名": "Insurer",
    "保険種類": "Class of Insurance",
    "保険料": "Premium",
    "手数料（税抜_": "Full Commission",
    "収益認識日": "Premium Receipt/Paid Date",
    "Policy No": "Policy No.",
    "チーム名": "Team",
    "ＡＥ名": "A/E",
    "Recurring": "Recurring/Non-Recurring",
    "６分類": "6分類",
}

# DDL for the empty stand-in frame used when the bronze table is unavailable.
_EMPTY_ARIAS_DDL = ", ".join([
    "`Invoice No.` string",
    "`From` string",
    "`To` string",
    "`Name of Client` string",
    "`Insurer` string",
    "`Class of Insurance` string",
    "`Premium` double",
    "`Full Commission` double",
    "`Premium Receipt/Paid Date` string",
    "`Policy No.` string",
    "`Team` string",
    "`A/E` string",
    "`Recurring/Non-Recurring` string",
    "`6分類` string",
])

try:
    # Load src_arias_crb
    df_arias = spark.sql("SELECT * FROM APAC_CRM_Analytics_LH.src_arias_crb")

    # DEBUG: dump schema, column list and a few rows to inspect source types
    print("=== SOURCE SCHEMA ===")
    df_arias.printSchema()
    print("\n=== SOURCE COLUMNS ===")
    print(df_arias.columns)
    print("\n=== SAMPLE DATA (first 3 rows) ===")
    display(df_arias.limit(3))

    # Rename Japanese columns to English, one mapping entry at a time
    renamed = df_arias
    for source_name, english_name in _ARIAS_RENAMES.items():
        renamed = renamed.withColumnRenamed(source_name, english_name)
    df_arias_renamed = renamed

    # DEBUG: confirm the renamed schema
    print("\n=== RENAMED SCHEMA ===")
    df_arias_renamed.printSchema()

except AnalysisException:
    # Fall back to an empty frame with the expected columns so later cells still run.
    print("WARNING: src_arias_crb not found in APAC_CRM_Analytics_LH. Creating empty dummy DF.")
    df_arias_renamed = spark.createDataFrame([], schema=_EMPTY_ARIAS_DDL)

In [None]:
# Cell 3: Transformation Logic (Alteryx Tool 147 - Arias Data Formula)
# --------------------------------------------------------------------

def _parse_flexible_date(column_name):
    """Build a date expression for ``column_name``.

    Tries, in order: the value as text in ``yyyyMMdd`` form, the value as text
    in ``yyyy-MM-dd`` form, then a direct cast to DateType.  The first
    non-null result wins.
    """
    as_text = col(column_name).cast(StringType())
    return coalesce(
        to_date(as_text, "yyyyMMdd"),
        to_date(as_text, "yyyy-MM-dd"),
        col(column_name).cast(DateType()),
    )


# 0. Force correct types BEFORE any transformation
# Premium and Full Commission may arrive as strings from the Excel source
df_typed = df_arias_renamed
for amount_column in ("Premium", "Full Commission"):
    df_typed = df_typed.withColumn(
        amount_column, col(f"`{amount_column}`").cast(DoubleType())
    )

# 1. Date Conversion
# Source dates may be yyyyMMdd numeric, yyyy-MM-dd string, or already DateType
_DATE_SOURCES = (
    ("InvoiceDate", "Premium Receipt/Paid Date"),
    ("InceptionDate", "From"),
    ("ExpiryDate", "To"),
)
df_transformed = df_typed
for target_name, source_name in _DATE_SOURCES:
    df_transformed = df_transformed.withColumn(
        target_name, _parse_flexible_date(source_name)
    )
df_transformed = df_transformed.withColumn("FinalDate", col("InceptionDate"))

# DEBUG: compare raw values with their parsed dates, plus the typed amounts
print("=== DATE PARSING CHECK ===")
_debug_columns = (
    "Premium Receipt/Paid Date", "InvoiceDate",
    "From", "InceptionDate",
    "To", "ExpiryDate",
    "Premium", "Full Commission",
)
df_transformed.select(*[col(name) for name in _debug_columns]).show(5, truncate=False)

# 2. Hardcoded Fields (Alteryx Formula Tool 147)
# NOTE(review): "null" below is a four-character string literal, not SQL NULL —
# confirm this matches the Alteryx formula.
_CONSTANT_FIELDS = {
    "DataSource": "Arias",
    "RevenueCountry": "Japan",
    "Segment": "null",
    "InsurerCountry": "JAPAN",
    "DunsNumber": "UNKNOWN-ARIAS",
    "BusinessType": "UNKNOWN",
    "PartyIdWtw": "UNKNOWN-ARIAS",
    "Ccy": "JPY",
    "ReinsuranceDescription": "null",
    "PolicyDescription": "null",
    "Department": "null",
}
for field_name, constant in _CONSTANT_FIELDS.items():
    df_transformed = df_transformed.withColumn(field_name, lit(constant))

# "R" -> RENEWAL, "N" -> NEW, anything else passes through unchanged
recurring_flag = col("`Recurring/Non-Recurring`")
transaction_type = (
    when(recurring_flag == "R", lit("RENEWAL"))
    .when(recurring_flag == "N", lit("NEW"))
    .otherwise(recurring_flag)
)
df_transformed = df_transformed.withColumn("TransactionType", transaction_type)

# 3. Derived Fields (Alteryx Formula Tool 147)
# CLIENT ID = [Name of Client]
# SYSTEM ID = [CLIENT ID] = [Name of Client]
# INCEPTION YEAR = DateTimeYear([INCEPTION DATE])
# CCYYEAR = "JPY" + "-" + [INCEPTION YEAR]
client_name = col("Name of Client")
inception_year = year(col("InceptionDate")).cast(StringType())
df_transformed = (
    df_transformed
    .withColumn("ClientIdWtw", client_name)
    .withColumn("SystemId", client_name)
    .withColumn("InceptionYear", inception_year)
    .withColumn("Ccyyear", concat(lit("JPY-"), col("InceptionYear")))
)

# 4. Rename remaining columns
_FINAL_RENAMES = {
    "Name of Client": "ClientName",
    "Insurer": "InsurerName",
    "Class of Insurance": "SystemProductId",
    "A/E": "AccountHandler",
    "Policy No.": "InvoicePolicyNumber",
}
for old_name, new_name in _FINAL_RENAMES.items():
    df_transformed = df_transformed.withColumnRenamed(old_name, new_name)

# 5. Cleansing (before joins to ensure matches)
df_cleansed = cleanse_dataframe(df_transformed)

In [None]:
# Cell 4: Reference Joins
# -----------------------
# Enriches df_cleansed with currency, product and insurer reference data via
# left joins.  If the reference tables cannot be resolved, every join is
# skipped and placeholder columns are added instead ("Mock Mode"), so both
# paths produce the same set of column names.

# 1. Load Reference Tables
try:
    ref_currency = spark.sql("SELECT * FROM APAC_CRM_Analytics_LH.ref_Chloe_asia_currency_mapping")
    ref_product = spark.sql("SELECT * FROM APAC_CRM_Analytics_LH.ref_Chloe_arias_product_mapping")
    ref_insurer = spark.sql("SELECT * FROM APAC_CRM_Analytics_LH.ref_Chloe_insurer_mapping")
except AnalysisException as exc:
    # Only "table/view cannot be resolved" style failures switch to mock mode;
    # anything else (interrupts, cluster errors) should surface instead of
    # silently producing placeholder data.  All-or-nothing: one missing table
    # disables all three joins.
    print("Reference tables missing. Skipping joins (Mock Mode).")
    print(f"  Cause: {exc}")
    ref_currency = None
    ref_product = None
    ref_insurer = None

# 2. Join Currency (on Ccyyear)
# Both sides are trimmed/upper-cased, matching the product and insurer joins.
if ref_currency is not None:
    df_joined_1 = df_cleansed.join(
        ref_currency,
        F.trim(F.upper(df_cleansed["Ccyyear"])) == F.trim(F.upper(ref_currency["CCYYEAR"])),
        "left"
    ).select(df_cleansed["*"], ref_currency["Value"].alias("Ccyvalue"))
else:
    # Mock mode: placeholder rate of 1.0
    df_joined_1 = df_cleansed.withColumn("Ccyvalue", lit(1.0))

# 3. Join Product (on SystemProductId == Class of Insurance)
if ref_product is not None:
    df_joined_2 = df_joined_1.join(
        ref_product,
        F.trim(F.upper(df_joined_1["SystemProductId"])) == F.trim(F.upper(ref_product["Class of Insurance"])),
        "left"
    ).select(
        df_joined_1["*"],
        ref_product["Lvl 2 Mapping"].alias("SubProductClass"),
        ref_product["GLOBs"].alias("Globs"),
        ref_product["GLOBS SPLIT P&C"].alias("GlobsSplitPnc")
    )
else:
    # Typed null placeholders (a bare lit(None) would be an untyped column)
    df_joined_2 = df_joined_1 \
        .withColumn("SubProductClass", lit(None).cast(StringType())) \
        .withColumn("Globs", lit(None).cast(StringType())) \
        .withColumn("GlobsSplitPnc", lit(None).cast(StringType()))

# 4. Join Insurer (on InsurerName == Insurer)
if ref_insurer is not None:
    # The Lloyd's region column is optional in the reference table.
    if "Lloyd's Asia or Lloyd's London" in ref_insurer.columns:
        lloyds_region = ref_insurer["Lloyd's Asia or Lloyd's London"].alias("LloydsAsiaOrLondon")
    else:
        lloyds_region = lit(None).cast(StringType()).alias("LloydsAsiaOrLondon")

    df_joined_3 = df_joined_2.join(
        ref_insurer,
        F.trim(F.upper(df_joined_2["InsurerName"])) == F.trim(F.upper(ref_insurer["Insurer"])),
        "left"
    ).select(
        df_joined_2["*"],
        ref_insurer["MAPPED_INSURER"].alias("InsurerMapping"),
        ref_insurer["LLOYDS"].alias("Lloyds"),
        lloyds_region
    )
else:
    # Includes LloydsAsiaOrLondon so the schema matches the joined branch.
    df_joined_3 = df_joined_2 \
        .withColumn("InsurerMapping", lit(None).cast(StringType())) \
        .withColumn("Lloyds", lit(None).cast(StringType())) \
        .withColumn("LloydsAsiaOrLondon", lit(None).cast(StringType()))

In [None]:
# Cell 5: Final Calculations & Selection
# --------------------------------------

# Conversion rate; rows with no matched rate use 0, so their USD amounts are 0.
fx_rate = coalesce(col("Ccyvalue"), lit(0))

df_calculated = (
    df_joined_3
    .withColumn("PremiumUsd", col("Premium") * fx_rate)
    .withColumn("BrokerageUsd", col("Full Commission") * fx_rate)
)

# Target schema (PascalCase, no spaces): output column -> type, in output order.
_STR, _DATE, _DBL = StringType(), DateType(), DoubleType()
_FINAL_COLUMNS = [
    ("ClientName", _STR),
    ("InsurerName", _STR),
    ("SystemProductId", _STR),
    ("AccountHandler", _STR),
    ("InvoicePolicyNumber", _STR),
    ("TransactionType", _STR),
    ("InvoiceDate", _DATE),
    ("InceptionDate", _DATE),
    ("ExpiryDate", _DATE),
    ("DataSource", _STR),
    ("RevenueCountry", _STR),
    ("Department", _STR),
    ("ClientIdWtw", _STR),
    ("InsurerCountry", _STR),
    ("DunsNumber", _STR),
    ("BusinessType", _STR),
    ("SystemId", _STR),
    ("PartyIdWtw", _STR),
    ("ReinsuranceDescription", _STR),
    ("PolicyDescription", _STR),
    ("SubProductClass", _STR),
    ("Globs", _STR),
    ("GlobsSplitPnc", _STR),
    ("InsurerMapping", _STR),
    ("Lloyds", _STR),
    ("PremiumUsd", _DBL),
    ("BrokerageUsd", _DBL),
    ("FinalDate", _DATE),
    ("Segment", _STR),
]

# Project exactly the target columns, each cast to its declared type.
df_final = df_calculated.select(
    *[col(column_name).cast(data_type) for column_name, data_type in _FINAL_COLUMNS]
)

In [None]:
# Cell 6: Write to Silver
# -----------------------
# Overwrites the silver Delta table (data and schema) with df_final, then
# reports the row count and a sample from the table that was written.
target_table = "APAC_Reporting_LH.clean_arias"

print(f"Writing to {target_table}...")
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(target_table)
)

# Read back from the written table rather than calling actions on df_final:
# df_final is not cached, so count()/display() on it would each re-run the
# full source read and all joins, and could report a number that differs
# from what actually landed in the table.
df_written = spark.table(target_table)

print(f"Success. Rows Processed: {df_written.count()}")
display(df_written.limit(10))