In [1]:
from dotenv import load_dotenv
import os
from pathlib import Path

# Keep whatever an earlier run of this cell stored. Unconditionally resetting the
# value to None would make the "already set" branch below unreachable and would
# re-run os.chdir on every execution of the cell.
CURRENT_DIRECTORY_NOTEBOOK = globals().get("CURRENT_DIRECTORY_NOTEBOOK")


def intitate_notebook():
    """Load the ``.env`` file and make the project root the working directory.

    The project root is read from the ``PROJECT_BASE_PATH`` environment variable
    (after ``load_dotenv()`` has run). The switch happens only the first time;
    later calls just report the directory that was recorded in the module-level
    ``CURRENT_DIRECTORY_NOTEBOOK``.

    Raises:
        RuntimeError: if ``PROJECT_BASE_PATH`` is missing or empty, instead of
            the opaque ``TypeError`` that ``os.chdir(None)`` would raise.
    """
    load_dotenv()
    global CURRENT_DIRECTORY_NOTEBOOK

    if CURRENT_DIRECTORY_NOTEBOOK is not None:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )
        return

    project_base_path = os.getenv("PROJECT_BASE_PATH")
    if not project_base_path:
        raise RuntimeError(
            "PROJECT_BASE_PATH is not set; define it in the environment or in "
            "the .env file before running this notebook."
        )

    os.chdir(project_base_path)
    # Record the resolved directory only after chdir succeeded.
    CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
    print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)


# Switch the working directory to PROJECT_BASE_PATH before anything else runs;
# later cells use project-relative paths (e.g. "setup_files/...") and import from `src`.
intitate_notebook()

Current directory for notebook:  /Users/shirshmall/Personal_Drive/Credit_Risk_MLOps_Project


In [2]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import expr, when, col
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from functools import reduce

In [3]:
from src.data.cleaning import assign_category_labels, assign_grouped_category_labels

In [4]:
# Connection settings for the source PostgreSQL database.
# Values are taken from the environment (populated by load_dotenv() in the first
# cell) so real credentials never have to be written into the notebook; the
# literals below are only the previous local-development defaults.
username = os.getenv("DATA_SOURCE_DB_USER", "data_source_user")
password = os.getenv("DATA_SOURCE_DB_PASSWORD", "data_source_user_password")
host = os.getenv("DATA_SOURCE_DB_HOST", "0.0.0.0")  # "172.17.0.1"
port = os.getenv("DATA_SOURCE_DB_PORT", "5435")
database = os.getenv("DATA_SOURCE_DB_NAME", "data_source_db")

# The JDBC driver jar path is relative to the project root, which is why the
# working directory has to be set before this cell runs.
spark = (
    SparkSession.builder.appName("PostgresETL")
    .config("spark.jars", "setup_files/postgresql-42.7.5.jar")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "100")
    .getOrCreate()
)

jdbc_url = f"jdbc:postgresql://{host}:{port}/{database}"
properties = {"user": username, "password": password, "driver": "org.postgresql.Driver"}

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/27 00:51:16 WARN Utils: Your hostname, Shirshs-MacBook-Air-2.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.106 instead (on interface en0)
25/07/27 00:51:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/07/27 00:51:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Read the raw `application_train` table from the source database over JDBC.
# NOTE(review): "failFast" is passed through as a reader option exactly as before;
# confirm the JDBC source actually honours it.
df = (
    spark.read.option("failFast", "true")
    .jdbc(jdbc_url, "application_train", properties=properties)
)
df

DataFrame[main_split_id: bigint, sk_id_curr: bigint, target: smallint, name_contract_type: string, code_gender: string, flag_own_car: string, flag_own_realty: string, cnt_children: smallint, amt_income_total: double, amt_credit: float, amt_annuity: float, amt_goods_price: float, name_type_suite: string, name_income_type: string, name_education_type: string, name_family_status: string, name_housing_type: string, region_population_relative: float, days_birth: smallint, days_employed: int, days_registration: float, days_id_publish: smallint, own_car_age: smallint, flag_mobil: smallint, flag_emp_phone: smallint, flag_work_phone: smallint, flag_cont_mobile: smallint, flag_phone: smallint, flag_email: smallint, occupation_type: string, cnt_fam_members: smallint, region_rating_client: smallint, region_rating_client_w_city: smallint, weekday_appr_process_start: string, hour_appr_process_start: smallint, reg_region_not_live_region: smallint, reg_region_not_work_region: smallint, live_region_not

In [6]:
# Peek at the first ten raw rows.
df.show(n=10)

25/07/27 00:51:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 0:>                                                          (0 + 1) / 1]

+-------------+----------+------+------------------+-----------+------------+---------------+------------+----------------+----------+-----------+---------------+---------------+--------------------+--------------------+--------------------+-----------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+---------------+--------------------+---------------------------+--------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-----------------+------------+------------+------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-

                                                                                

In [7]:
# Sentinel stored in a feature whenever it cannot be computed for a row.
_MISSING_FEATURE_VALUE = -1.0

# Divisor used to convert the DAYS_* columns into years.
_DAYS_PER_YEAR = 365.2425


def _ratio_or_missing(numerator, denominator):
    """Return numerator / denominator, or the sentinel when the denominator is NULL or <= 0."""
    return F.when(
        denominator.isNotNull() & (denominator > 0), numerator / denominator
    ).otherwise(F.lit(_MISSING_FEATURE_VALUE))


def _flag_total(flag_columns):
    """Row-wise integer sum of the given flag columns, treating NULL flags as 0."""
    return reduce(
        lambda acc, name: acc + F.coalesce(F.col(name), F.lit(0)).cast("int"),
        flag_columns,
        F.lit(0),
    )


def _yes_no_flag(column_name):
    """Map "Y" -> 1 and "N" -> 0; anything else (including NULL) becomes the sentinel."""
    return (
        F.when(F.col(column_name) == "Y", 1)
        .when(F.col(column_name) == "N", 0)
        .otherwise(_MISSING_FEATURE_VALUE)
    )


def add_credit_features(df: DataFrame) -> DataFrame:
    """
    Append engineered credit-risk features to an application-level DataFrame.

    Columns are referenced in upper case, so this relies on Spark resolving
    column names case-insensitively (the input frame in this notebook has
    lower-case names).

    Missing-value convention: when a feature's guard fails (NULL or non-positive
    denominator, NULL inputs, non-negative DAYS_* values) the feature is set to
    the sentinel -1.0, not NULL. A NULL *numerator* with a valid denominator
    still yields NULL.

    Added columns: DEBT_INCOME_RATIO, ANNUITY_INCOME_RATIO, CREDIT_GOODS_RATIO,
    INCOME_PER_FAMILY_MEMBER, PHONE_AVAILABILITY, MOBILE_REACHABLE, DOCS_PROVIDED,
    ADDRESS_MISMATCH_COUNT, REGION_RATING_DIFF, EXT_SOURCE_COUNT,
    SOCIAL_DEFAULT_RATE_30, SOCIAL_DEFAULT_RATE_60, EMPLOYMENT_AGE_RATIO,
    INCOME_ANNUITY_BUFFER, EMPLOYMENT_AGE_GAP.
    Overwritten columns: FLAG_OWN_CAR and FLAG_OWN_REALTY ("Y"/"N" -> 1/0/-1).

    Parameters:
    - df: Input Spark DataFrame containing the raw application columns.

    Returns:
    - DataFrame with the columns listed above added/overwritten.
    """
    missing = F.lit(_MISSING_FEATURE_VALUE)

    # Basic ratios and normalized variables
    df = (
        df.withColumn(
            "DEBT_INCOME_RATIO",
            _ratio_or_missing(F.col("AMT_CREDIT"), F.col("AMT_INCOME_TOTAL")),
        )
        .withColumn(
            "ANNUITY_INCOME_RATIO",
            _ratio_or_missing(F.col("AMT_ANNUITY"), F.col("AMT_INCOME_TOTAL")),
        )
        .withColumn(
            "CREDIT_GOODS_RATIO",
            _ratio_or_missing(F.col("AMT_CREDIT"), F.col("AMT_GOODS_PRICE")),
        )
        .withColumn(
            "INCOME_PER_FAMILY_MEMBER",
            _ratio_or_missing(F.col("AMT_INCOME_TOTAL"), F.col("CNT_FAM_MEMBERS")),
        )
    )

    # Contactability & documentation
    document_flags = [f"flag_document_{number}" for number in range(2, 22)]
    df = (
        df.withColumn(
            "PHONE_AVAILABILITY",
            _flag_total(["FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_PHONE"]),
        )
        .withColumn(
            "MOBILE_REACHABLE",
            F.coalesce(F.col("FLAG_CONT_MOBILE"), F.lit(0)).cast("int"),
        )
        .withColumn("DOCS_PROVIDED", _flag_total(document_flags))
    )

    # Address mismatch
    df = df.withColumn(
        "ADDRESS_MISMATCH_COUNT",
        _flag_total(
            [
                "REG_REGION_NOT_LIVE_REGION",
                "REG_REGION_NOT_WORK_REGION",
                "LIVE_REGION_NOT_WORK_REGION",
                "REG_CITY_NOT_LIVE_CITY",
                "REG_CITY_NOT_WORK_CITY",
                "LIVE_CITY_NOT_WORK_CITY",
            ]
        ),
    )

    # Region rating difference
    # NOTE(review): a genuine difference of -1 is indistinguishable from the
    # sentinel used when either rating is NULL — confirm this is acceptable.
    df = df.withColumn(
        "REGION_RATING_DIFF",
        F.when(
            (F.col("REGION_RATING_CLIENT_W_CITY").isNotNull())
            & (F.col("REGION_RATING_CLIENT").isNotNull()),
            F.col("REGION_RATING_CLIENT_W_CITY") - F.col("REGION_RATING_CLIENT"),
        ).otherwise(missing),
    )

    # Ownership & external scores
    # EXT_SOURCE_COUNT is, despite its name, the SUM of the available external
    # scores (NULL scores contribute 0); it is the sentinel only when all three
    # scores are NULL. The name is kept because later steps depend on it.
    df = (
        df.withColumn("FLAG_OWN_CAR", _yes_no_flag("FLAG_OWN_CAR"))
        .withColumn("FLAG_OWN_REALTY", _yes_no_flag("FLAG_OWN_REALTY"))
        .withColumn(
            "EXT_SOURCE_COUNT",
            F.when(
                F.col("EXT_SOURCE_1").isNotNull()
                | F.col("EXT_SOURCE_2").isNotNull()
                | F.col("EXT_SOURCE_3").isNotNull(),
                F.coalesce(F.col("EXT_SOURCE_1"), F.lit(0))
                + F.coalesce(F.col("EXT_SOURCE_2"), F.lit(0))
                + F.coalesce(F.col("EXT_SOURCE_3"), F.lit(0)),
            ).otherwise(missing),
        )
    )

    # Social default rates
    df = df.withColumn(
        "SOCIAL_DEFAULT_RATE_30",
        _ratio_or_missing(
            F.col("DEF_30_CNT_SOCIAL_CIRCLE"), F.col("OBS_30_CNT_SOCIAL_CIRCLE")
        ),
    ).withColumn(
        "SOCIAL_DEFAULT_RATE_60",
        _ratio_or_missing(
            F.col("DEF_60_CNT_SOCIAL_CIRCLE"), F.col("OBS_60_CNT_SOCIAL_CIRCLE")
        ),
    )

    # Employment-to-age ratio: only defined when both DAYS_* values are negative.
    days_birth = F.col("DAYS_BIRTH")
    days_employed = F.col("DAYS_EMPLOYED")
    df = df.withColumn(
        "EMPLOYMENT_AGE_RATIO",
        F.when(
            (days_employed.isNotNull())
            & (days_birth.isNotNull())
            & (days_birth < 0)
            & (days_employed < 0),
            (-days_employed) / (-days_birth),
        ).otherwise(missing),
    )

    # Income-Annuity Buffer Ratio
    df = df.withColumn(
        "INCOME_ANNUITY_BUFFER",
        _ratio_or_missing(
            F.col("AMT_INCOME_TOTAL") - F.col("AMT_ANNUITY"), F.col("AMT_ANNUITY")
        ),
    )

    # Employment-Age Gap
    # Age and employment length in years are kept as plain expressions that are
    # NULL when unavailable. Previously they were materialised as temporary
    # columns holding the -1.0 sentinel, and the sentinel for "employment
    # unknown" leaked into the arithmetic (1 - (-1 / age)) instead of producing
    # the sentinel for the feature itself.
    age_years = F.when(days_birth.isNotNull(), -days_birth / F.lit(_DAYS_PER_YEAR))
    employed_years = F.when(
        days_employed.isNotNull() & (days_employed < 0),
        -days_employed / F.lit(_DAYS_PER_YEAR),
    )
    df = df.withColumn(
        "EMPLOYMENT_AGE_GAP",
        F.when(
            employed_years.isNotNull() & age_years.isNotNull() & (age_years > 0),
            F.lit(1.0) - (employed_years / age_years),
        ).otherwise(missing),
    )

    return df

In [8]:
# Quick look at five rows before imputation.
df.show(n=5)

[Stage 1:>                                                          (0 + 1) / 1]

+-------------+----------+------+------------------+-----------+------------+---------------+------------+----------------+----------+-----------+---------------+---------------+----------------+--------------------+--------------------+-----------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+---------------+--------------------+---------------------------+--------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+-----------------+------------+------------+------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----

                                                                                

In [9]:
def impute_with_linear_regression(
    df: DataFrame,
    feature_col: str,
    label_col: str,
    reg_param=0.3,
    elastic_net_param=0.8,
) -> DataFrame:
    """
    Imputes missing values in `label_col` using a single-feature linear regression
    on `feature_col`.

    The model is fitted on rows where both columns are non-null and the fitted
    line (coefficient * feature + intercept) is written into rows whose label is
    NULL. Rows where the label AND the feature are NULL stay NULL, because the
    prediction itself evaluates to NULL.

    Parameters:
    - df: Input Spark DataFrame
    - feature_col: The column used as a predictor (e.g., "amt_credit")
    - label_col: The target column with missing values to impute (e.g., "amt_annuity")
    - reg_param: Regularization parameter (default 0.3)
    - elastic_net_param: Elastic Net mixing parameter (default 0.8)

    Returns:
    - DataFrame with imputed values in `label_col`

    Raises:
    - ValueError: if no row has both `label_col` and `feature_col` populated.
    """

    # Train only on complete pairs. VectorAssembler rejects NULL inputs with its
    # default handleInvalid="error", so a NULL feature in a labelled row would
    # otherwise abort the fit.
    train_df = df.filter(col(label_col).isNotNull() & col(feature_col).isNotNull())

    # head(1) can stop at the first matching row; a full count() is not needed
    # just to detect an empty training set.
    if not train_df.head(1):
        raise ValueError(
            f"No rows with non-null '{label_col}' and '{feature_col}' to train the model."
        )

    # Assemble feature vector
    assembler = VectorAssembler(inputCols=[feature_col], outputCol="features")

    # Set up linear regression model
    lr = LinearRegression(
        featuresCol="features",
        labelCol=label_col,
        regParam=reg_param,
        elasticNetParam=elastic_net_param,
    )

    # Build and train the pipeline
    pipeline = Pipeline(stages=[assembler, lr])
    model = pipeline.fit(train_df)

    # Extract trained linear regression model (last pipeline stage)
    lr_model = model.stages[-1]
    coefficient = float(lr_model.coefficients[0])
    intercept = float(lr_model.intercept)

    # Apply the fitted line as a plain column expression so the original frame
    # is kept as-is (no "features"/"prediction" helper columns are added).
    df = df.withColumn(
        label_col,
        when(
            col(label_col).isNull(),
            col(feature_col) * lit(coefficient) + lit(intercept),
        ).otherwise(col(label_col)),
    )
    return df

In [10]:
# Fill gaps in the annuity and goods-price amounts, each predicted from the
# credit amount. Order matters only in that both use the same predictor.
for label_to_impute in ("amt_annuity", "amt_goods_price"):
    df = impute_with_linear_regression(
        df=df, feature_col="amt_credit", label_col=label_to_impute
    )

25/07/27 00:51:26 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [11]:
# Replace missing family sizes with the column mean, rounded down to a whole person.
mean_family_row = df.agg(
    F.avg("cnt_fam_members").alias("average_cnt_fam_members")
).first()
average_cnt_fam_members = int(np.floor(mean_family_row["average_cnt_fam_members"]))

df = df.fillna({"cnt_fam_members": average_cnt_fam_members})

In [12]:
# Add the engineered credit features, then bring every column name (including
# the upper-case ones just created) back to lower case.
df = add_credit_features(df)
lowercase_names = [name.lower() for name in df.columns]
df = df.toDF(*lowercase_names)

***

In [13]:
# for col in df.columns:
#     get_column_summary(df=df, column_name=col)
#     print("\n\n")

In [14]:
# from pyspark.ml.feature import StringIndexer

# indexer = StringIndexer(
#     inputCol="name_contract_type", outputCol="name_contract_type_string_indexed"
# )
# model = indexer.fit(df)
# df = model.transform(df)

# from pyspark.sql import Row

# temp_df = spark.createDataFrame(
#     [Row(name_contract_type=item) for item in model.labels]
# )
# temp_df.show()
# yo_indexer = StringIndexer(
#     inputCol="name_contract_type", outputCol="name_contract_type_Index"
# )

# yo_model = yo_indexer.fit(temp_df)
# yo_model.labels == model.labels

In [15]:
# Replace each of these columns by an alphabetically ordered numeric index
# stored in "<column>_string_indexed"; the source column is dropped afterwards.
columns_to_index = (
    "name_contract_type",
    "code_gender",
    "flag_own_car",
    "flag_own_realty",
)
for col_name in columns_to_index:
    indexed_name = col_name + "_string_indexed"
    indexer = StringIndexer(
        inputCol=col_name, outputCol=indexed_name, stringOrderType="alphabetAsc"
    )
    model = indexer.fit(df)
    df = model.transform(df).drop(col_name)

In [16]:
# Encode `name_type_suite` with the project helper using this explicit category
# order, then discard the raw text column.
name_type_suite_categories = [
    "Unaccompanied",
    "Other_B",
    "Other_A",
    "Group of people",
    "Children",
    "Spouse, partner",
    "Family",
]
df = assign_category_labels(
    df=df,
    category_column="name_type_suite",
    categories=name_type_suite_categories,
)
df = df.drop("name_type_suite")

In [17]:
# Group the raw income types into five broader buckets (bucket -> raw values)
# before encoding them with the project helper.
categories_mapping_dict = {
    "Working": ["Working"],
    "Commercial associate": ["Commercial associate", "Businessman"],
    "Pensioner": ["Pensioner"],
    "State servant": ["State servant"],
    "Not Earning": ["Unemployed", "Student", "Maternity leave"],
}

income_type_column = "name_income_type"
df = assign_grouped_category_labels(
    df=df,
    category_column=income_type_column,
    categories_mapping=categories_mapping_dict,
)

# The raw text column is no longer needed once it has been encoded.
df = df.drop(income_type_column)

In [18]:
# Same indexing scheme as for the contract/gender/ownership columns: alphabetical
# StringIndexer into "<column>_string_indexed", then drop the source column.
demographic_columns = (
    "name_education_type",
    "name_family_status",
    "name_housing_type",
)
for col_name in demographic_columns:
    indexed_name = col_name + "_string_indexed"
    indexer = StringIndexer(
        inputCol=col_name, outputCol=indexed_name, stringOrderType="alphabetAsc"
    )
    model = indexer.fit(df)
    df = model.transform(df).drop(col_name)

In [19]:
# A missing car age is encoded with the -1 sentinel.
df = df.na.fill({"own_car_age": -1})

In [20]:
# Encode `occupation_type` with the project helper; the list fixes the category
# order handed to it. The raw column is dropped afterwards.
occupation_categories = [
    "Managers",
    "High skill tech staff",
    "HR staff",
    "Medicine staff",
    "Realty agents",
    "Sales staff",
    "IT staff",
    "Accountants",
    "Private service staff",
    "Core staff",
    "Drivers",
    "Cooking staff",
    "Security staff",
    "Waiters/barmen staff",
    "Cleaning staff",
    "Secretaries",
    "Laborers",
    "Low-skill Laborers",
]
df = assign_category_labels(
    df=df,
    category_column="occupation_type",
    categories=occupation_categories,
)

df = df.drop("occupation_type")

In [21]:
# Encode the application weekday in calendar order (Monday first), then drop
# the raw text column.
weekday_categories = [
    "MONDAY",
    "TUESDAY",
    "WEDNESDAY",
    "THURSDAY",
    "FRIDAY",
    "SATURDAY",
    "SUNDAY",
]
df = assign_category_labels(
    df=df,
    category_column="weekday_appr_process_start",
    categories=weekday_categories,
)

df = df.drop("weekday_appr_process_start")

In [22]:
# Encode `organization_type` with categories ordered from most to least frequent.
# The organization name is used as a secondary sort key: ordering by count alone
# leaves ties in an arbitrary order, so the category order (and therefore the
# labels assigned by the helper) could differ from one run to the next.
# NOTE(review): F.count skips NULLs, so a NULL organization_type group would be
# listed last with count 0 — confirm how assign_category_labels treats it.
organization_type_rows = (
    df.groupBy("organization_type")
    .agg(F.count("organization_type").alias("count"))
    .orderBy(F.col("count").desc(), F.col("organization_type").asc())
    .collect()
)

df = assign_category_labels(
    df=df,
    category_column="organization_type",
    categories=[row["organization_type"] for row in organization_type_rows],
)

df = df.drop("organization_type")

In [23]:
# Columns whose missing values are replaced by the -1 sentinel.
sentinel_fill_columns = [
    "apartments_avg",
    "basementarea_avg",
    "years_beginexpluatation_avg",
    "years_build_avg",
    "commonarea_avg",
    "elevators_avg",
    "entrances_avg",
    "floorsmax_avg",
    "floorsmin_avg",
    "landarea_avg",
    "livingapartments_avg",
    "livingarea_avg",
    "nonlivingapartments_avg",
    "nonlivingarea_avg",
    "apartments_mode",
    "basementarea_mode",
    "years_beginexpluatation_mode",
    "years_build_mode",
    "commonarea_mode",
    "elevators_mode",
    "entrances_mode",
    "floorsmax_mode",
    "floorsmin_mode",
    "landarea_mode",
    "livingapartments_mode",
    "livingarea_mode",
    "nonlivingapartments_mode",
    "nonlivingarea_mode",
    "apartments_medi",
    "basementarea_medi",
    "years_beginexpluatation_medi",
    "years_build_medi",
    "commonarea_medi",
    "elevators_medi",
    "entrances_medi",
    "floorsmax_medi",
    "floorsmin_medi",
    "landarea_medi",
    "livingapartments_medi",
    "livingarea_medi",
    "nonlivingapartments_medi",
    "nonlivingarea_medi",
    "fondkapremont_mode",
    "housetype_mode",
    "totalarea_mode",
    "wallsmaterial_mode",
    "emergencystate_mode",
    "ext_source_1",
    "ext_source_2",
    "ext_source_3",
    "obs_30_cnt_social_circle",
    "def_30_cnt_social_circle",
    "obs_60_cnt_social_circle",
    "def_60_cnt_social_circle",
    "days_last_phone_change",
    "amt_req_credit_bureau_hour",
    "amt_req_credit_bureau_day",
    "amt_req_credit_bureau_week",
    "amt_req_credit_bureau_mon",
    "amt_req_credit_bureau_qrt",
    "amt_req_credit_bureau_year",
]

# Keep the per-column dict form of fillna (as the original per-column calls did)
# rather than `fillna(-1, subset=...)`, so every listed column is handled the
# same way as before; all columns are passed in one call.
df = df.fillna({column_name: -1 for column_name in sentinel_fill_columns})

***

In [24]:
# Inspect five rows of the fully processed frame before it is written out.
df.show(n=5)

[Stage 59:>                                                         (0 + 1) / 1]

+-------------+----------+------+------------+----------------+----------+-----------+---------------+--------------------------+----------+-------------+-----------------+---------------+-----------+----------+--------------+---------------+----------------+----------+----------+---------------+--------------------+---------------------------+-----------------------+--------------------------+--------------------------+---------------------------+----------------------+----------------------+-----------------------+------------+------------+------------+--------------+----------------+---------------------------+---------------+--------------+-------------+-------------+-------------+-------------+------------+--------------------+--------------+-----------------------+-----------------+---------------+-----------------+----------------------------+----------------+---------------+--------------+--------------+--------------+--------------+-------------+---------------------+---------

                                                                                

***

In [26]:
# df = df.toPandas()
# df.to_parquet(
#     path="notebooks/training_initial_model/initial_processed_data/application_train.parquet", index=False
# )

In [None]:
# Persist the processed frame to the staging database on the same PostgreSQL
# server, reusing the source credentials. "overwrite" replaces any existing
# `application_train` table there.
target_db = "processed_data_staging"
target_url = f"jdbc:postgresql://{host}:{port}/{target_db}"

target_properties = dict(
    user=username,
    password=password,
    driver="org.postgresql.Driver",
)

df.write.mode("overwrite").jdbc(
    url=target_url,
    table="application_train",
    properties=target_properties,
)

                                                                                

***