In [0]:
%sql
-- Switch to the corerep catalog, then make sure the transform_layer schema exists in it.
USE CATALOG corerep;
CREATE SCHEMA IF NOT EXISTS transform_layer;

In [0]:
# Read the raw GL balances table and render it for a quick visual check.
df_transform_gl_account = spark.read.table("corerep.raw_data.raw_gl_balances")
df_transform_gl_account.display()

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F, Window

def calculate_general_ledger(df_transform_gl_account: DataFrame) -> DataFrame:
    """Aggregate GL balances per period and derive a running closing balance.

    Steps:
      1. Select the columns used downstream, casting ``ParentSystemId`` to
         bigint and ``GLBalanceKey`` / ``LedgerCurrency`` to string.
      2. Drop rows whose ``ParentSystemId`` or ``GLBalanceKey`` is null.
      3. Sum debits and credits per (ParentSystemId, GLBalanceKey, LedgerId,
         TransactionCurrency, LedgerCurrency, PeriodYear, PeriodNum) and derive
         ``Net_Movement`` (debits minus credits) and ``PeriodKey``
         (``PeriodYear * 100 + PeriodNum``).
      4. For each ``GLBalanceKey`` keep only the rows that belong to its
         highest ``ParentSystemId``.
      5. Accumulate ``Net_Movement`` in ascending ``PeriodKey`` order into
         ``Closing_Balance``; the running total restarts for every
         ``PeriodYear`` because the year is part of the window partition.

    Args:
        df_transform_gl_account: DataFrame that provides the columns
            ParentSystemId, GLBalanceKey, LedgerId, TransactionCurrency,
            LedgerCurrency, PeriodYear, PeriodNum, TransCurrPeriodNetDr and
            TransCurrPeriodNetCr.

    Returns:
        DataFrame with the seven grouping columns plus TransCurrPeriodNetDr,
        TransCurrPeriodNetCr, Net_Movement, PeriodKey and Closing_Balance.
    """
    # Keys shared by every period of one balance within a fiscal year, and
    # the same keys extended with the period number (the aggregation grain).
    year_keys = [
        "ParentSystemId",
        "GLBalanceKey",
        "LedgerId",
        "TransactionCurrency",
        "LedgerCurrency",
        "PeriodYear",
    ]
    period_keys = year_keys + ["PeriodNum"]

    # 1) Select and type the columns used downstream.
    df = df_transform_gl_account.select(
        F.col("ParentSystemId").cast("bigint").alias("ParentSystemId"),
        F.col("GLBalanceKey").cast("string").alias("GLBalanceKey"),
        F.col("LedgerId"),
        F.col("TransactionCurrency"),
        F.col("LedgerCurrency").cast("string").alias("LedgerCurrency"),
        F.col("PeriodYear"),
        F.col("PeriodNum"),
        F.col("TransCurrPeriodNetDr"),
        F.col("TransCurrPeriodNetCr"),
    )

    # 2) Basic data quality filter: both identifiers must be present.
    df = df.filter(
        F.col("ParentSystemId").isNotNull() & F.col("GLBalanceKey").isNotNull()
    )

    # 3) Aggregate to one row per balance and period.
    total_dr = F.sum("TransCurrPeriodNetDr")
    total_cr = F.sum("TransCurrPeriodNetCr")
    df = df.groupBy(*period_keys).agg(
        total_dr.alias("TransCurrPeriodNetDr"),
        total_cr.alias("TransCurrPeriodNetCr"),
        # sum() yields NULL when a group has no non-null values on one side;
        # treat that side as zero so the net movement is not lost to NULL.
        (
            F.coalesce(total_dr, F.lit(0)) - F.coalesce(total_cr, F.lit(0))
        ).alias("Net_Movement"),
    )
    # PeriodKey is derived after the aggregation because groupBy/agg only
    # keeps the grouping keys and the aggregated columns.
    df = df.withColumn("PeriodKey", F.col("PeriodYear") * 100 + F.col("PeriodNum"))

    # 4) De-duplication: per GLBalanceKey keep the rows of the highest
    #    ParentSystemId. rank() (rather than row_number()) keeps every period
    #    row of that system; row_number() would pick a single arbitrary row
    #    among the ties and drop all other periods.
    # NOTE(review): assumes the highest ParentSystemId is the "latest" source
    # for a GLBalanceKey - confirm against the upstream data model.
    by_system_desc = Window.partitionBy("GLBalanceKey").orderBy(
        F.col("ParentSystemId").desc()
    )
    df = (
        df.withColumn("_system_rank", F.rank().over(by_system_desc))
          .filter(F.col("_system_rank") == 1)
          .drop("_system_rank")
    )

    # 5) Closing balance: running total of Net_Movement from the first period
    #    of the year up to and including the current period.
    running_within_year = (
        Window.partitionBy(*year_keys)
        .orderBy(F.col("PeriodKey").asc())
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    df = df.withColumn(
        "Closing_Balance",
        F.sum("Net_Movement").over(running_within_year),
    )

    return df


In [0]:
# Read the raw GL balances that feed the transformation.
df_transform_gl_account = spark.read.table("corerep.raw_data.raw_gl_balances")

# Run the general-ledger transformation on the raw balances.
df_general_ledger = calculate_general_ledger(df_transform_gl_account)

# Render the transformed ledger so it can be inspected.
df_general_ledger.display()