In [1]:
from dotenv import load_dotenv
import os
from pathlib import Path

CURRENT_DIRECTORY_NOTEBOOK = None


def intitate_notebook():
    load_dotenv()
    global CURRENT_DIRECTORY_NOTEBOOK
    if CURRENT_DIRECTORY_NOTEBOOK is None:
        os.chdir(os.getenv("PROJECT_BASE_PATH"))
        CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
        print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)
    else:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )


intitate_notebook()

Current directory for notebook:  /Users/shirshmall/Personal_Drive/Credit_Risk_MLOps_Project


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

In [3]:
spark = (
    SparkSession.builder.appName("PostgresETL")
    .master("local[1]")  # Limit to 1 core
    .config("spark.jars", "setup_files/postgresql-42.7.5.jar")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "100")
    .config("spark.sql.debug.maxToStringFields", 50)
    .getOrCreate()
)


username = "data_source_user"
password = "data_source_user_password"
host = "0.0.0.0"  # "172.17.0.1"
port = "5435"
database = "data_source_db"


jdbc_url = f"jdbc:postgresql://{host}:{port}/{database}"
properties = {"user": username, "password": password, "driver": "org.postgresql.Driver"}

25/07/26 23:53:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/26 23:53:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
prev_df = spark.read.option("failFast", "true").jdbc(
    url=jdbc_url, table="previous_application", properties=properties
)
pos_df = spark.read.option("failFast", "true").jdbc(
    url=jdbc_url, table="pos_cash_balance", properties=properties
)
cc_df = spark.read.option("failFast", "true").jdbc(
    url=jdbc_url, table="credit_card_balance", properties=properties
)
inst_df = spark.read.option("failFast", "true").jdbc(
    url=jdbc_url, table="installments_payments", properties=properties
)

In [5]:
# pos_df.printSchema()
# pos_df.show(5, truncate=False)

In [6]:
# for df in [prev_df, pos_df, cc_df, inst_df]:
#     print(df.count())
# for column_name in df.columns:
#     null_count = df.filter(col(column_name).isNull()).count()
#     if null_count > 0:
#         print(f"Column Name - {column_name}  |  Null Values Count: {null_count}")
#     unique_values_count = df.select(countDistinct(col(column_name))).collect()[0][0]
#     if unique_values_count < 25:
#         print(
#             f"Column Name - {column_name}  |  Unique Values: {[item[0] for item in df.select(column_name).distinct().collect()]}"
#         )

In [7]:
# get_column_summary(df=pos_df, column_name="name_contract_status")

In [None]:
# ===========================
# PREVIOUS APPLICATION FEATURES
# ===========================


def create_prevapp_features(prev_df):
    """Create features from previous_application table"""

    # Basic counts and aggregations
    prevapp_basic = prev_df.groupBy("sk_id_curr").agg(
        # PrevApp_Count
        F.count("*").alias("prevapp_count"),
        # PrevApp_Approved_Count
        F.sum(
            F.when(F.col("name_contract_status") == "Approved", 1).otherwise(0)
        ).alias("prevapp_approved_count"),
        # PrevApp_Mean_Credit_Amount
        F.mean("amt_credit").alias("prevapp_mean_credit_amount"),
        # PrevApp_ApprovalRate
        F.mean(
            F.when(F.col("name_contract_status") == "Approved", 1.0).otherwise(0.0)
        ).alias("prevapp_approval_rate"),
        # PrevApp_LastAppRecency (max days_decision - closest to 0)
        F.max("days_decision").alias("prevapp_last_app_recency"),
        # For PrevApp_AvgAmtApprovedRatio - only for approved applications
        F.mean(
            F.when(
                F.col("name_contract_status") == "Approved",
                F.col("amt_credit")
                / F.when(
                    F.col("amt_application") != 0, F.col("amt_application")
                ).otherwise(F.lit(-1.0)),
            ).otherwise(-1.0)
        ).alias("prevapp_avg_amt_approved_ratio"),
        # PrevApp_WeightedLoanTerm
        F.sum(F.col("cnt_payment") / (1 + F.abs(F.col("days_decision")))).alias(
            "prevapp_weighted_loan_term_numerator"
        ),
        F.sum(1 / (1 + F.abs(F.col("days_decision")))).alias(
            "prevapp_weighted_loan_term_denominator"
        ),
    )

    # Calculate weighted loan term
    prevapp_basic = prevapp_basic.withColumn(
        "prevapp_weighted_loan_term",
        F.when(
            F.col("prevapp_weighted_loan_term_denominator") != 0,
            F.col("prevapp_weighted_loan_term_numerator")
            / F.col("prevapp_weighted_loan_term_denominator"),
        ).otherwise(-1.0),
    ).drop(
        "prevapp_weighted_loan_term_numerator", "prevapp_weighted_loan_term_denominator"
    )

    # PrevApp_MeanApplicationGap
    window_gap = Window.partitionBy("sk_id_curr").orderBy("days_decision")
    prev_with_lag = prev_df.withColumn(
        "prev_days_decision", F.lag("days_decision").over(window_gap)
    )

    app_gaps = (
        prev_with_lag.filter(F.col("prev_days_decision").isNotNull())
        .groupBy("sk_id_curr")
        .agg(
            F.mean(F.abs(F.col("days_decision") - F.col("prev_days_decision"))).alias(
                "prevapp_mean_application_gap"
            )
        )
    )

    # PrevApp_Streak_Refused
    window_streak = Window.partitionBy("sk_id_curr").orderBy(
        F.col("days_decision").desc()
    )

    prev_with_row = prev_df.withColumn("row_num", F.row_number().over(window_streak))

    refused_streak = (
        prev_with_row.withColumn(
            "is_refused",
            F.when(F.col("name_contract_status") == "Refused", 1).otherwise(0),
        )
        .withColumn(
            "cumsum_not_refused", F.sum(1 - F.col("is_refused")).over(window_streak)
        )
        .filter(F.col("cumsum_not_refused") == 0)
        .groupBy("sk_id_curr")
        .agg(F.count("*").alias("prevapp_streak_refused"))
    )

    # PrevApp_ProductSwitchRate
    window_switch = Window.partitionBy("sk_id_curr").orderBy("days_decision")
    prev_with_switch = prev_df.withColumn(
        "prev_product", F.lag("name_product_type").over(window_switch)
    )

    product_switches = (
        prev_with_switch.filter(F.col("prev_product").isNotNull())
        .groupBy("sk_id_curr")
        .agg(
            F.mean(
                F.when(
                    F.col("name_product_type") != F.col("prev_product"), 1.0
                ).otherwise(0.0)
            ).alias("prevapp_product_switch_rate")
        )
    )

    # Combine all features
    result = prevapp_basic
    result = result.join(app_gaps, "sk_id_curr", "left")
    result = result.join(refused_streak, "sk_id_curr", "left")
    result = result.join(product_switches, "sk_id_curr", "left")

    # Fill 0 for refused streak if no refused applications
    result = result.fillna(0, subset=["prevapp_streak_refused"])

    return result


# ===========================
# POS CASH BALANCE FEATURES
# ===========================


def create_pos_features(pos_df):
    """Create features from pos_cash_balance table"""

    # Basic aggregations
    pos_basic = pos_df.groupBy("sk_id_curr").agg(
        # POS_Active_Count - count unique SK_ID_PREV where last status is Active
        F.countDistinct("sk_id_prev").alias("pos_total_loans"),
        # POS_Mean_Months_Balance
        F.mean("months_balance").alias("pos_mean_months_balance"),
        # POS_Avg_DPD
        F.mean("sk_dpd").alias("pos_avg_dpd"),
        # POS_DelinqTrend (will calculate separately)
        F.collect_list(F.struct("months_balance", "sk_dpd")).alias("dpd_history"),
        # POS_PaymentRatio_Avg
        F.mean(
            F.when(
                (F.col("cnt_instalment_future") != 0)
                & F.col("cnt_instalment_future").isNotNull(),
                F.col("cnt_instalment") / F.col("cnt_instalment_future"),
            ).otherwise(-1.0)
        ).alias("pos_payment_ratio_avg"),
        # POS_LastDelinqRecency
        F.min(
            F.when(F.col("sk_dpd") > 0, F.col("months_balance")).otherwise(-1.0)
        ).alias("pos_last_delinq_recency"),
        # For volatility calculation
        F.stddev(
            F.when(
                (F.col("cnt_instalment") != 0) & F.col("cnt_instalment").isNotNull(),
                (F.col("cnt_instalment") - F.col("cnt_instalment_future"))
                / F.col("cnt_instalment"),
            ).otherwise(-1.0)
        ).alias("pos_utilization_volatility"),
    )

    # Calculate delinquency trend using UDF
    @F.udf(returnType=DoubleType())
    def calculate_dpd_trend(dpd_history):
        if not dpd_history or len(dpd_history) < 2:
            return None

        # Extract months and dpd values
        data = [
            (row["months_balance"], row["sk_dpd"])
            for row in dpd_history
            if row["sk_dpd"] is not None
        ]

        if len(data) < 2:
            return None

        # Simple linear regression
        n = len(data)
        sum_x = sum(d[0] for d in data)
        sum_y = sum(d[1] for d in data)
        sum_xy = sum(d[0] * d[1] for d in data)
        sum_x2 = sum(d[0] * d[0] for d in data)

        denominator = n * sum_x2 - sum_x * sum_x
        if denominator == 0:
            return None

        slope = (n * sum_xy - sum_x * sum_y) / denominator
        return float(slope)

    pos_basic = pos_basic.withColumn(
        "pos_delinq_trend", calculate_dpd_trend("dpd_history")
    ).drop("dpd_history")

    # POS_Active_Count - need to get last status per loan
    window_last = Window.partitionBy("sk_id_curr", "sk_id_prev").orderBy(
        F.col("months_balance").desc()
    )
    pos_last_status = pos_df.withColumn(
        "row_num", F.row_number().over(window_last)
    ).filter(F.col("row_num") == 1)

    active_counts = pos_last_status.groupBy("sk_id_curr").agg(
        F.sum(F.when(F.col("name_contract_status") == "Active", 1).otherwise(0)).alias(
            "pos_active_count"
        )
    )

    # POS_DelinqStreak_Max
    window_streak = Window.partitionBy("sk_id_curr", "sk_id_prev").orderBy(
        "months_balance"
    )

    pos_with_streak = pos_df.withColumn(
        "is_delinq", F.when(F.col("sk_dpd") > 0, 1).otherwise(0)
    ).withColumn(
        "grp",
        F.sum(F.when(F.col("is_delinq") == 0, 1).otherwise(0)).over(window_streak),
    )

    delinq_streaks = (
        pos_with_streak.filter(F.col("is_delinq") == 1)
        .groupBy("sk_id_curr", "sk_id_prev", "grp")
        .agg(F.count("*").alias("streak_length"))
        .groupBy("sk_id_curr")
        .agg(F.max("streak_length").alias("pos_delinq_streak_max"))
    )

    # POS_Seasonality_DelinqSpike
    pos_seasonal = pos_df.withColumn(
        "quarter",
        F.when(F.col("months_balance") % 12 >= -3, "Q4")
        .when(F.col("months_balance") % 12 >= -6, "Q3")
        .when(F.col("months_balance") % 12 >= -9, "Q2")
        .otherwise("Q1"),
    )

    seasonal_dpd = (
        pos_seasonal.groupBy("sk_id_curr", "quarter")
        .agg(F.mean("sk_dpd").alias("mean_dpd"))
        .groupBy("sk_id_curr")
        .pivot("quarter")
        .agg(F.first("mean_dpd"))
    )

    seasonal_dpd = seasonal_dpd.withColumn(
        "pos_seasonality_delinq_spike",
        F.coalesce(F.col("Q4"), F.lit(0)) - F.coalesce(F.col("Q1"), F.lit(0)),
    ).select("sk_id_curr", "pos_seasonality_delinq_spike")

    # Combine all features
    result = pos_basic
    result = result.join(active_counts, "sk_id_curr", "left")
    result = result.join(delinq_streaks, "sk_id_curr", "left")
    result = result.join(seasonal_dpd, "sk_id_curr", "left")

    return result


# ===========================
# CREDIT CARD BALANCE FEATURES
# ===========================


def create_cc_features(cc_df):
    """Create features from credit_card_balance table"""

    # Basic aggregations
    cc_basic = cc_df.groupBy("sk_id_curr").agg(
        # CC_Num_Cards
        F.countDistinct("sk_id_prev").alias("cc_num_cards"),
        # CC_Avg_Balance
        F.mean("amt_balance").alias("cc_avg_balance"),
        # CC_Utilization_Mean/Max
        F.mean(
            F.when(
                (F.col("amt_credit_limit_actual") > 0)
                & F.col("amt_credit_limit_actual").isNotNull(),
                F.col("amt_balance") / F.col("amt_credit_limit_actual"),
            ).otherwise(-1.0)
        ).alias("cc_utilization_mean"),
        F.max(
            F.when(
                (F.col("amt_credit_limit_actual") > 0)
                & F.col("amt_credit_limit_actual").isNotNull(),
                F.col("amt_balance") / F.col("amt_credit_limit_actual"),
            ).otherwise(-1.0)
        ).alias("cc_utilization_max"),
        # CC_PaymentCoverageRatio
        F.mean(
            F.when(
                (F.col("amt_inst_min_regularity") > 0)
                & F.col("amt_inst_min_regularity").isNotNull(),
                F.col("amt_payment_total_current") / F.col("amt_inst_min_regularity"),
            ).otherwise(-1.0)
        ).alias("cc_payment_coverage_ratio"),
        # CC_LatestCycleOverLimitFlag
        F.max(
            F.when(
                F.col("amt_balance") > F.col("amt_credit_limit_actual"), 1
            ).otherwise(0)
        ).alias("cc_latest_cycle_over_limit_flag"),
        # CC_Volatility_Payment
        F.stddev("amt_payment_total_current").alias("cc_volatility_payment"),
        # For trend calculation
        F.collect_list(F.struct("months_balance", "amt_balance")).alias(
            "balance_history"
        ),
    )

    # Calculate drawing trend
    @F.udf(returnType=DoubleType())
    def calculate_balance_trend(balance_history):
        if not balance_history or len(balance_history) < 2:
            return None

        # Extract months and balance values
        data = [
            (row["months_balance"], row["amt_balance"])
            for row in balance_history
            if row["amt_balance"] is not None and row["months_balance"] is not None
        ]

        if len(data) < 2:
            return None

        # Simple linear regression
        n = len(data)
        sum_x = sum(d[0] for d in data)
        sum_y = sum(d[1] for d in data)
        sum_xy = sum(d[0] * d[1] for d in data)
        sum_x2 = sum(d[0] * d[0] for d in data)

        denominator = n * sum_x2 - sum_x * sum_x
        if denominator == 0:
            return None

        slope = (n * sum_xy - sum_x * sum_y) / denominator
        return float(slope)

    cc_basic = cc_basic.withColumn(
        "cc_drawing_trend", calculate_balance_trend("balance_history")
    ).drop("balance_history")

    # CC_DPD_Counts - sum all CNT_DRAWINGS_ATM_* columns
    cc_dpd = cc_df.groupBy("sk_id_curr").agg(
        F.sum("cnt_drawings_atm_current").alias("cc_dpd_atm_sum"),
        F.sum("cnt_drawings_other_current").alias("cc_dpd_other_sum"),
        F.sum("cnt_drawings_pos_current").alias("cc_dpd_pos_sum"),
    )

    # CC_RecentHighUtilFlag - check last 3 months
    window_recent = Window.partitionBy("sk_id_curr", "sk_id_prev").orderBy(
        F.col("months_balance").desc()
    )

    cc_recent = cc_df.withColumn("row_num", F.row_number().over(window_recent)).filter(
        F.col("row_num") <= 3
    )

    recent_high_util = cc_recent.groupBy("sk_id_curr").agg(
        F.max(
            F.when(
                (F.col("amt_credit_limit_actual") > 0)
                & F.col("amt_credit_limit_actual").isNotNull(),
                F.when(
                    F.col("amt_balance") / F.col("amt_credit_limit_actual") > 0.8, 1
                ).otherwise(0),
            ).otherwise(0)
        ).alias("cc_recent_high_util_flag")
    )

    # Combine all features
    result = cc_basic
    result = result.join(cc_dpd, "sk_id_curr", "left")
    result = result.join(recent_high_util, "sk_id_curr", "left")

    return result


# ===========================
# INSTALLMENTS PAYMENTS FEATURES
# ===========================


def create_inst_features(inst_df):
    """Create features from installments_payments table"""

    # Calculate delay for each installment
    inst_with_delay = inst_df.withColumn(
        "delay",
        F.when(
            F.col("days_entry_payment").isNotNull()
            & F.col("days_instalment").isNotNull(),
            F.col("days_entry_payment") - F.col("days_instalment"),
        ).otherwise(-1.0),
    )

    # Basic aggregations
    inst_basic = inst_with_delay.groupBy("sk_id_curr").agg(
        # Inst_Mean_Delay
        F.mean("delay").alias("inst_mean_delay"),
        # Inst_Delay_Mean/Max
        F.mean("delay").alias("inst_delay_mean"),
        F.max("delay").alias("inst_delay_max"),
        # Inst_LatePayment_Count
        F.sum(F.when(F.col("delay") > 0, 1).otherwise(0)).alias(
            "inst_latepayment_count"
        ),
        # Inst_EarlyPay_Ratio
        F.mean(F.when(F.col("delay") < 0, 1.0).otherwise(0.0)).alias(
            "inst_earlypay_ratio"
        ),
        # Inst_Overpayment_Rate
        F.mean(
            F.when(
                (F.col("amt_payment") > F.col("amt_instalment"))
                & F.col("amt_payment").isNotNull()
                & F.col("amt_instalment").isNotNull(),
                1.0,
            ).otherwise(0.0)
        ).alias("inst_overpayment_rate"),
        # Inst_PaidCoverage
        F.mean(
            F.when(
                (F.col("amt_instalment") > 0) & F.col("amt_instalment").isNotNull(),
                F.col("amt_payment") / F.col("amt_instalment"),
            ).otherwise(-1.0)
        ).alias("inst_paid_coverage"),
        # For weighted delay calculation
        F.sum(
            F.when(F.col("delay") > 0, F.col("delay")).otherwise(0)
            / (1 + F.abs(F.col("days_instalment")))
        ).alias("inst_weighted_delay_numerator"),
        F.sum(1 / (1 + F.abs(F.col("days_instalment")))).alias(
            "inst_weighted_delay_denominator"
        ),
        # Total count for calculations
        F.count("*").alias("total_installments"),
    )

    # Calculate weighted delay
    inst_basic = inst_basic.withColumn(
        "inst_weighted_delay",
        F.when(
            F.col("inst_weighted_delay_denominator") > 0,
            F.col("inst_weighted_delay_numerator")
            / F.col("inst_weighted_delay_denominator"),
        ).otherwise(-1.0),
    ).drop("inst_weighted_delay_numerator", "inst_weighted_delay_denominator")

    # Inst_LateStreak_Max
    window_streak = Window.partitionBy("sk_id_curr", "sk_id_prev").orderBy(
        "days_instalment"
    )

    inst_with_streak = inst_with_delay.withColumn(
        "is_late", F.when(F.col("delay") > 0, 1).otherwise(0)
    ).withColumn(
        "grp", F.sum(F.when(F.col("is_late") == 0, 1).otherwise(0)).over(window_streak)
    )

    late_streaks = (
        inst_with_streak.filter(F.col("is_late") == 1)
        .groupBy("sk_id_curr", "sk_id_prev", "grp")
        .agg(F.count("*").alias("streak_length"))
        .groupBy("sk_id_curr")
        .agg(F.max("streak_length").alias("inst_late_streak_max"))
    )

    # Inst_Seasonal_Effect
    inst_seasonal = inst_with_delay.withColumn(
        "month", F.month(F.expr("date_add('2018-01-01', days_instalment)"))
    ).withColumn(
        "quarter",
        F.when(F.col("month").isin(10, 11, 12), "Q4")
        .when(F.col("month").isin(7, 8, 9), "Q3")
        .when(F.col("month").isin(4, 5, 6), "Q2")
        .otherwise("Q1"),
    )

    seasonal_delay = (
        inst_seasonal.groupBy("sk_id_curr", "quarter")
        .agg(F.mean("delay").alias("mean_delay"))
        .groupBy("sk_id_curr")
        .pivot("quarter")
        .agg(F.first("mean_delay"))
    )

    seasonal_delay = seasonal_delay.withColumn(
        "inst_seasonal_effect",
        F.coalesce(F.col("Q4"), F.lit(0)) - F.coalesce(F.col("Q2"), F.lit(0)),
    ).select("sk_id_curr", "inst_seasonal_effect")

    # Combine all features
    result = inst_basic
    result = result.join(late_streaks, "sk_id_curr", "left")
    result = result.join(seasonal_delay, "sk_id_curr", "left")

    return result


# ===========================
# CROSS-TABLE INTERACTION FEATURES
# ===========================


def create_cross_features(prev_df, pos_df, cc_df, inst_df):
    """Create cross-table interaction features"""

    # A. previous_application ↔ POS_CASH_balance

    # Get unique SK_ID_PREV from each table
    prev_ids = prev_df.select("sk_id_curr", "sk_id_prev").distinct()
    pos_ids = pos_df.select("sk_id_curr", "sk_id_prev").distinct()
    cc_ids = cc_df.select("sk_id_curr", "sk_id_prev").distinct()
    # inst_ids = inst_df.select("sk_id_curr", "sk_id_prev").distinct()

    # PrevApp_POS_ConversionRate
    pos_conversion = (
        prev_ids.join(pos_ids, ["sk_id_curr", "sk_id_prev"], "left")
        .groupBy("sk_id_curr")
        .agg(
            F.sum(F.when(pos_ids["sk_id_prev"].isNotNull(), 1).otherwise(0)).alias(
                "pos_converted"
            ),
            F.count("*").alias("total_prev_apps"),
        )
        .withColumn(
            "prevapp_pos_conversion_rate",
            F.col("pos_converted") / F.col("total_prev_apps"),
        )
        .select("sk_id_curr", "prevapp_pos_conversion_rate")
    )

    # PrevApp_CC_ConversionRate
    cc_conversion = (
        prev_ids.join(cc_ids, ["sk_id_curr", "sk_id_prev"], "left")
        .groupBy("sk_id_curr")
        .agg(
            F.sum(F.when(cc_ids["sk_id_prev"].isNotNull(), 1).otherwise(0)).alias(
                "cc_converted"
            ),
            F.count("*").alias("total_prev_apps"),
        )
        .withColumn(
            "prevapp_cc_conversion_rate",
            F.col("cc_converted") / F.col("total_prev_apps"),
        )
        .select("sk_id_curr", "prevapp_cc_conversion_rate")
    )

    # PrevApp_POS_DPD_Mean
    prev_pos_join = prev_df.select("sk_id_curr", "sk_id_prev").join(
        pos_df.select("sk_id_prev", "sk_dpd"), "sk_id_prev"
    )

    pos_dpd_mean = prev_pos_join.groupBy("sk_id_curr").agg(
        F.mean("sk_dpd").alias("prevapp_pos_dpd_mean")
    )

    # PrevApp_to_POS_LagDays_Mean
    prev_decision = prev_df.select("sk_id_curr", "sk_id_prev", "days_decision")
    pos_months = pos_df.groupBy("sk_id_curr", "sk_id_prev").agg(
        F.min("months_balance").alias("min_months_balance")
    )

    lag_days = (
        prev_decision.join(pos_months, ["sk_id_curr", "sk_id_prev"])
        .withColumn(
            "lag_days", -F.col("min_months_balance") * 30 - F.col("days_decision")
        )
        .groupBy("sk_id_curr")
        .agg(F.mean("lag_days").alias("prevapp_to_pos_lagdays_mean"))
    )

    # PrevApp_CC_Utilization_Mean
    prev_cc_join = prev_df.select("sk_id_curr", "sk_id_prev").join(
        cc_df.select("sk_id_prev", "amt_balance", "amt_credit_limit_actual"),
        "sk_id_prev",
    )

    cc_util_mean = prev_cc_join.groupBy("sk_id_curr").agg(
        F.mean(
            F.when(
                (F.col("amt_credit_limit_actual") > 0)
                & F.col("amt_credit_limit_actual").isNotNull(),
                F.col("amt_balance") / F.col("amt_credit_limit_actual"),
            ).otherwise(-1.0)
        ).alias("prevapp_cc_utilization_mean")
    )

    # PrevApp_CC_PaymentCoverage_Mean
    prev_cc_payment = prev_df.select("sk_id_curr", "sk_id_prev").join(
        cc_df.select(
            "sk_id_prev", "amt_payment_total_current", "amt_inst_min_regularity"
        ),
        "sk_id_prev",
    )

    cc_payment_mean = prev_cc_payment.groupBy("sk_id_curr").agg(
        F.mean(
            F.when(
                (F.col("amt_inst_min_regularity") > 0)
                & F.col("amt_inst_min_regularity").isNotNull(),
                F.col("amt_payment_total_current") / F.col("amt_inst_min_regularity"),
            ).otherwise(-1.0)
        ).alias("prevapp_cc_payment_coverage_mean")
    )

    # PrevApp_Inst features (for approved applications only)
    approved_prev = prev_df.filter(F.col("name_contract_status") == "Approved").select(
        "sk_id_curr", "sk_id_prev"
    )

    inst_with_delay = inst_df.withColumn(
        "delay",
        F.when(
            F.col("days_entry_payment").isNotNull()
            & F.col("days_instalment").isNotNull(),
            F.col("days_entry_payment") - F.col("days_instalment"),
        ).otherwise(-1.0),
    )

    prev_inst_join = approved_prev.join(
        inst_with_delay.select("sk_id_prev", "delay"), "sk_id_prev"
    )

    # PrevApp_Inst_Delay_Avg
    inst_delay_avg = prev_inst_join.groupBy("sk_id_curr").agg(
        F.mean("delay").alias("prevapp_inst_delay_avg")
    )

    # PrevApp_Inst_EarlyPay_Rate
    inst_earlypay = prev_inst_join.groupBy("sk_id_curr").agg(
        F.mean(F.when(F.col("delay") < 0, 1.0).otherwise(0.0)).alias(
            "prevapp_inst_earlypay_rate"
        )
    )

    # PrevApp_Inst_LateStreak_Max
    window_inst_streak = Window.partitionBy(
        approved_prev["sk_id_curr"], "sk_id_prev"
    ).orderBy("days_instalment")

    inst_streak_calc = (
        approved_prev.join(inst_df, "sk_id_prev")
        .withColumn(
            "delay",
            F.when(
                F.col("days_entry_payment").isNotNull()
                & F.col("days_instalment").isNotNull(),
                F.col("days_entry_payment") - F.col("days_instalment"),
            ).otherwise(-1.0),
        )
        .withColumn("is_late", F.when(F.col("delay") > 0, 1).otherwise(0))
        .withColumn(
            "grp",
            F.sum(F.when(F.col("is_late") == 0, 1).otherwise(0)).over(
                window_inst_streak
            ),
        )
    )

    inst_late_streak = (
        inst_streak_calc.filter(F.col("is_late") == 1)
        .groupBy(approved_prev["sk_id_curr"], "sk_id_prev", "grp")
        .agg(F.count("*").alias("streak_length"))
        .groupBy(approved_prev["sk_id_curr"])
        .agg(F.max("streak_length").alias("prevapp_inst_late_streak_max"))
    )

    # Combine all cross features
    result = pos_conversion
    result = result.join(cc_conversion, "sk_id_curr", "left")
    result = result.join(pos_dpd_mean, "sk_id_curr", "left")
    result = result.join(lag_days, "sk_id_curr", "left")
    result = result.join(cc_util_mean, "sk_id_curr", "left")
    result = result.join(cc_payment_mean, "sk_id_curr", "left")
    result = result.join(inst_delay_avg, "sk_id_curr", "left")
    result = result.join(inst_earlypay, "sk_id_curr", "left")
    result = result.join(inst_late_streak, "sk_id_curr", "left")

    return result


# ===========================
# MAIN FEATURE ENGINEERING FUNCTION
# ===========================


def create_all_features(prev_df, pos_df, cc_df, inst_df):
    """
    Main function to create all features from all tables.
    Returns a single DataFrame with sk_id_curr and all features.
    """

    # Create features from each table
    print("Creating previous application features...")
    prevapp_features = create_prevapp_features(prev_df)
    prevapp_features.show(5)

    print("Creating POS cash balance features...")
    pos_features = create_pos_features(pos_df)
    pos_features.show(5)

    print("Creating credit card balance features...")
    cc_features = create_cc_features(cc_df)
    cc_features.show(5)

    print("Creating installments payments features...")
    inst_features = create_inst_features(inst_df)
    inst_features.show(5)

    print("Creating cross-table interaction features...")
    cross_features = create_cross_features(prev_df, pos_df, cc_df, inst_df)
    cross_features.show(5)

    # Get all unique SK_ID_CURR values
    all_ids = prev_df.select("sk_id_curr").distinct()
    all_ids = all_ids.union(pos_df.select("sk_id_curr").distinct())
    all_ids = all_ids.union(cc_df.select("sk_id_curr").distinct())
    all_ids = all_ids.union(inst_df.select("sk_id_curr").distinct())
    all_ids = all_ids.distinct()

    # Join all features
    print("Joining all features...")
    result = all_ids
    result = result.join(prevapp_features, "sk_id_curr", "left")
    result = result.join(pos_features, "sk_id_curr", "left")
    result = result.join(cc_features, "sk_id_curr", "left")
    result = result.join(inst_features, "sk_id_curr", "left")
    result = result.join(cross_features, "sk_id_curr", "left")

    return result

In [9]:
# Create all features
all_features_df = create_all_features(prev_df, pos_df, cc_df, inst_df)

Creating previous application features...


                                                                                

+----------+-------------+----------------------+--------------------------+---------------------+------------------------+------------------------------+--------------------------+----------------------------+----------------------+---------------------------+
|sk_id_curr|prevapp_count|prevapp_approved_count|prevapp_mean_credit_amount|prevapp_approval_rate|prevapp_last_app_recency|prevapp_avg_amt_approved_ratio|prevapp_weighted_loan_term|prevapp_mean_application_gap|prevapp_streak_refused|prevapp_product_switch_rate|
+----------+-------------+----------------------+--------------------------+---------------------+------------------------+------------------------------+--------------------------+----------------------------+----------------------+---------------------------+
|    100360|            3|                     2|                  101100.0|   0.6666666666666666|                    -101|           0.34400000000000003|         11.92056074766355|                       565.5|    

                                                                                

+----------+---------------+-----------------------+-----------+---------------------+-----------------------+--------------------------+----------------+----------------+---------------------+----------------------------+
|sk_id_curr|pos_total_loans|pos_mean_months_balance|pos_avg_dpd|pos_payment_ratio_avg|pos_last_delinq_recency|pos_utilization_volatility|pos_delinq_trend|pos_active_count|pos_delinq_streak_max|pos_seasonality_delinq_spike|
+----------+---------------+-----------------------+-----------+---------------------+-----------------------+--------------------------+----------------+----------------+---------------------+----------------------------+
|    100002|              1|                  -10.0|        0.0|    1.885420856109693|                   -1.0|       0.23447143077964075|             0.0|               1|                 NULL|                         0.0|
|    100003|              3|    -43.785714285714285|        0.0|   2.0991805813234388|                   -1.

                                                                                

+----------+------------+------------------+-------------------+------------------+-------------------------+-------------------------------+---------------------+------------------+--------------+----------------+--------------+------------------------+
|sk_id_curr|cc_num_cards|    cc_avg_balance|cc_utilization_mean|cc_utilization_max|cc_payment_coverage_ratio|cc_latest_cycle_over_limit_flag|cc_volatility_payment|  cc_drawing_trend|cc_dpd_atm_sum|cc_dpd_other_sum|cc_dpd_pos_sum|cc_recent_high_util_flag|
+----------+------------+------------------+-------------------+------------------+-------------------------+-------------------------------+---------------------+------------------+--------------+----------------+--------------+------------------------+
|    100006|           1|               0.0|                0.0|               0.0|                     -1.0|                              0|                  0.0|               0.0|          NULL|            NULL|          NULL|      

                                                                                

+----------+-------------------+-------------------+--------------+----------------------+-------------------+---------------------+------------------+------------------+-------------------+--------------------+--------------------+
|sk_id_curr|    inst_mean_delay|    inst_delay_mean|inst_delay_max|inst_latepayment_count|inst_earlypay_ratio|inst_overpayment_rate|inst_paid_coverage|total_installments|inst_weighted_delay|inst_late_streak_max|inst_seasonal_effect|
+----------+-------------------+-------------------+--------------+----------------------+-------------------+---------------------+------------------+------------------+-------------------+--------------------+--------------------+
|    100360|-22.692307692307693|-22.692307692307693|         -11.0|                     0|                1.0|                  0.0|               1.0|                13|                0.0|                NULL|   4.333333333333332|
|    100565|              -15.8|              -15.8|         -12.0| 

                                                                                

+----------+---------------------------+--------------------------+--------------------+---------------------------+---------------------------+--------------------------------+----------------------+--------------------------+----------------------------+
|sk_id_curr|prevapp_pos_conversion_rate|prevapp_cc_conversion_rate|prevapp_pos_dpd_mean|prevapp_to_pos_lagdays_mean|prevapp_cc_utilization_mean|prevapp_cc_payment_coverage_mean|prevapp_inst_delay_avg|prevapp_inst_earlypay_rate|prevapp_inst_late_streak_max|
+----------+---------------------------+--------------------------+--------------------+---------------------------+---------------------------+--------------------------------+----------------------+--------------------------+----------------------------+
|    100360|         0.6666666666666666|                       0.0|                 0.0|                     1341.5|                       NULL|                            NULL|   -22.692307692307693|                       1.0|  

In [10]:
all_features_df.show(5)

25/07/26 23:56:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/26 23:58:57 WARN DAGScheduler: Broadcasting large task binary with size 1085.2 KiB
[Stage 285:>                                                        (0 + 1) / 1]

+----------+-------------+----------------------+--------------------------+---------------------+------------------------+------------------------------+--------------------------+----------------------------+----------------------+---------------------------+---------------+-----------------------+-----------+---------------------+-----------------------+--------------------------+----------------+----------------+---------------------+----------------------------+------------+------------------+-------------------+------------------+-------------------------+-------------------------------+---------------------+----------------+--------------+----------------+--------------+------------------------+-------------------+-------------------+--------------+----------------------+-------------------+---------------------+------------------+------------------+-------------------+--------------------+--------------------+---------------------------+--------------------------+--------------

                                                                                

In [11]:
row_count = all_features_df.count()
row_count

                                                                                

291057

In [12]:
for column_name in all_features_df.columns:
    null_count = all_features_df.filter(col(column_name).isNull()).count()
    if null_count > 0:
        print(
            f"Column Name - {column_name}  |  Null Values Count: {(null_count / row_count) * 100} %"
        )

                                                                                

Column Name - prevapp_weighted_loan_term  |  Null Values Count: 0.14258375507203055 %


                                                                                

Column Name - prevapp_mean_application_gap  |  Null Values Count: 18.04904194023851 %


                                                                                

Column Name - prevapp_product_switch_rate  |  Null Values Count: 18.04904194023851 %


                                                                                

Column Name - pos_total_loans  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_mean_months_balance  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_avg_dpd  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_payment_ratio_avg  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_last_delinq_recency  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_utilization_volatility  |  Null Values Count: 1.4976447912264608 %


                                                                                

Column Name - pos_delinq_trend  |  Null Values Count: 1.4983319418533139 %


                                                                                

Column Name - pos_active_count  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - pos_delinq_streak_max  |  Null Values Count: 82.32614230202331 %


                                                                                

Column Name - pos_seasonality_delinq_spike  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - cc_num_cards  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_avg_balance  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_utilization_mean  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_utilization_max  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_payment_coverage_ratio  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_latest_cycle_over_limit_flag  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - cc_volatility_payment  |  Null Values Count: 73.44128469681196 %


                                                                                

Column Name - cc_drawing_trend  |  Null Values Count: 73.44128469681196 %


                                                                                

Column Name - cc_dpd_atm_sum  |  Null Values Count: 81.81455866033114 %


                                                                                

Column Name - cc_dpd_other_sum  |  Null Values Count: 81.81455866033114 %


                                                                                

Column Name - cc_dpd_pos_sum  |  Null Values Count: 81.81455866033114 %


                                                                                

Column Name - cc_recent_high_util_flag  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - inst_mean_delay  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_delay_mean  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_delay_max  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_latepayment_count  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_earlypay_ratio  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_overpayment_rate  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_paid_coverage  |  Null Values Count: 0.5699914449746957 %


                                                                                

Column Name - total_installments  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_weighted_delay  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - inst_late_streak_max  |  Null Values Count: 48.405638758043956 %


                                                                                

Column Name - inst_seasonal_effect  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - prevapp_pos_dpd_mean  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - prevapp_to_pos_lagdays_mean  |  Null Values Count: 1.4052230319147108 %


                                                                                

Column Name - prevapp_cc_utilization_mean  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - prevapp_cc_payment_coverage_mean  |  Null Values Count: 73.22380152341293 %


                                                                                

Column Name - prevapp_inst_delay_avg  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - prevapp_inst_earlypay_rate  |  Null Values Count: 0.567242842467283 %


                                                                                

Column Name - prevapp_inst_late_streak_max  |  Null Values Count: 48.405638758043956 %


***

In [13]:
# final_features_df.to_parquet(
#     path="notebooks/training_initial_model/initial_processed_data/previous_applications.parquet",
#     index=False,
# )

In [None]:
target_db = "processed_data_staging"
target_url = f"jdbc:postgresql://{host}:{port}/{target_db}"

target_properties = {
    "user": username,
    "password": password,
    "driver": "org.postgresql.Driver",
}

all_features_df.write.jdbc(
    url=target_url,
    table="previous_applications",
    mode="overwrite",
    properties=target_properties,
)

***