In [None]:
from dotenv import load_dotenv
import os
from pathlib import Path

CURRENT_DIRECTORY_NOTEBOOK = None


def intitate_notebook():
    """Move the notebook's working directory to the project root, once per kernel.

    Calls ``load_dotenv()``, then changes into the directory named by the
    ``PROJECT_BASE_PATH`` environment variable and caches the resulting path in
    the module-level ``CURRENT_DIRECTORY_NOTEBOOK``. When that global is already
    set, the function only reports it, so re-running the cell is harmless.

    Raises:
        RuntimeError: if ``PROJECT_BASE_PATH`` is unset or empty.
    """
    global CURRENT_DIRECTORY_NOTEBOOK
    load_dotenv()

    if CURRENT_DIRECTORY_NOTEBOOK is not None:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )
        return

    project_base_path = os.getenv("PROJECT_BASE_PATH")
    if not project_base_path:
        # Without this guard os.chdir(None) fails with an opaque TypeError.
        raise RuntimeError(
            "PROJECT_BASE_PATH is not set; define it in the environment or in a "
            ".env file before running this notebook."
        )

    os.chdir(project_base_path)
    CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
    print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)


intitate_notebook()

In [None]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from src.data.explore_column_info import get_column_summary
from pyspark.sql.functions import expr, when, col, sum
from pyspark.sql import functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import DataFrame

In [None]:
from src.data.cleaning import assign_category_labels, assign_grouped_category_labels

In [None]:
# Create (or reuse) the Spark session and put the PostgreSQL JDBC driver jar on
# its classpath. The jar path is relative, so it resolves against the working
# directory set by the first cell (os.chdir to PROJECT_BASE_PATH).
spark = (
    SparkSession.builder.appName("PostgresETL")
    .config("spark.jars", "setup_files/postgresql-42.7.5.jar")
    .getOrCreate()
)

In [None]:
# JDBC connection settings for the source PostgreSQL database.
# Each value can be overridden through the environment (or the .env file loaded
# by the first cell); the literals below are only fallbacks that keep the
# notebook working unchanged. Set DATA_SOURCE_DB_PASSWORD outside the notebook
# so real credentials are never committed with it.
username = os.getenv("DATA_SOURCE_DB_USER", "data_source_user")
password = os.getenv("DATA_SOURCE_DB_PASSWORD", "data_source_user_password")
host = os.getenv("DATA_SOURCE_DB_HOST", "172.17.0.1")
port = os.getenv("DATA_SOURCE_DB_PORT", "5435")
database = os.getenv("DATA_SOURCE_DB_NAME", "data_source_db")


jdbc_url = f"jdbc:postgresql://{host}:{port}/{database}"
properties = {"user": username, "password": password, "driver": "org.postgresql.Driver"}

In [None]:
# Load the whole application_train table over JDBC into a Spark DataFrame.
# NOTE(review): "failFast" does not look like a documented JDBC data source
# option — confirm it has any effect here.
df = spark.read.option("failFast", "true").jdbc(
    url=jdbc_url, table="application_train", properties=properties
)
# Bare expression: shows the DataFrame repr as the cell output.
df

In [None]:
def count_nulls(df):
    """Print the number of NULL values found in each column of ``df``.

    Builds one 0/1 "is null" indicator per column, sums them all in a single
    aggregation, and prints one line per column. Returns nothing.
    """
    null_totals = []
    for name in df.columns:
        is_null_flag = when(col(name).isNull(), 1).otherwise(0)
        null_totals.append(sum(is_null_flag).alias(name))

    # The aggregation yields exactly one row holding every per-column total.
    totals_row = df.select(null_totals).collect()[0]

    print("Null values count per column:")
    for name, total in totals_row.asDict().items():
        print(f" - {name}: {total}")

In [None]:
from functools import reduce


def add_credit_features(df: DataFrame) -> DataFrame:
    """Append engineered credit-risk features to an application-level DataFrame.

    Every feature is computed row-wise from columns already present in ``df``.
    No rows are added or removed, and no temporary columns are created.

    Null / edge-case policy:
      * Ratio features are NULL when the denominator is NULL or not positive;
        a NULL numerator also yields NULL.
      * ``SOCIAL_DEFAULT_RATE_30`` / ``SOCIAL_DEFAULT_RATE_60`` are the
        exception: they fall back to ``0.0`` instead of NULL.
      * Count-style features (phones, documents, bureau inquiries, address
        mismatches, housing mobility) treat a NULL input as 0.
      * ``EXT_SOURCE_MEAN``, ``EXT_STD`` and ``RISK_ADJ_EXT`` need all three
        external scores; ``EXT_SOURCE_MIN`` uses ``F.least``, which skips NULLs.
      * ``OWNERSHIP_SCORE`` and ``CAR_AGE_EFFECTIVE`` read the ownership flags
        through ``_ownership_flag`` so that both numeric and 'Y'/'N' flags work.

    Args:
        df: Input DataFrame, expected to hold one row per SK_ID_CURR.

    Returns:
        ``df`` with the additional feature columns appended.
    """
    ext_sources = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
    region_mismatch_flags = [
        "REG_REGION_NOT_LIVE_REGION",
        "REG_REGION_NOT_WORK_REGION",
        "LIVE_REGION_NOT_WORK_REGION",
    ]
    city_mismatch_flags = [
        "REG_CITY_NOT_LIVE_CITY",
        "REG_CITY_NOT_WORK_CITY",
        "LIVE_CITY_NOT_WORK_CITY",
    ]
    bureau_request_counts = [
        "AMT_REQ_CREDIT_BUREAU_HOUR",
        "AMT_REQ_CREDIT_BUREAU_DAY",
        "AMT_REQ_CREDIT_BUREAU_WEEK",
        "AMT_REQ_CREDIT_BUREAU_MON",
        "AMT_REQ_CREDIT_BUREAU_QRT",
        "AMT_REQ_CREDIT_BUREAU_YEAR",
    ]
    document_flags = [f"flag_document_{number}" for number in range(2, 22)]

    # Expressions shared by several features. They only reference input columns,
    # so they can be reused directly instead of being materialised as *_TMP
    # columns and dropped afterwards.
    debt_income_ratio = _safe_ratio("AMT_CREDIT", "AMT_INCOME_TOTAL")
    ext_complete = _all_not_null(ext_sources)
    ext_mean = F.when(
        ext_complete,
        reduce(lambda total, term: total + term, [F.col(n) for n in ext_sources])
        / F.lit(3.0),
    ).otherwise(F.lit(None))
    own_car = _ownership_flag("FLAG_OWN_CAR")
    days_birth = F.col("DAYS_BIRTH")
    days_employed = F.col("DAYS_EMPLOYED")
    days_per_year = F.lit(365.2425)

    # Basic ratios and normalized variables
    df = (
        df.withColumn("DEBT_INCOME_RATIO", debt_income_ratio)
        .withColumn(
            "ANNUITY_INCOME_RATIO", _safe_ratio("AMT_ANNUITY", "AMT_INCOME_TOTAL")
        )
        .withColumn("CREDIT_GOODS_RATIO", _safe_ratio("AMT_CREDIT", "AMT_GOODS_PRICE"))
        .withColumn(
            "INCOME_PER_FAMILY_MEMBER",
            _safe_ratio("AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"),
        )
        .withColumn("CHILDREN_RATIO", _safe_ratio("CNT_CHILDREN", "CNT_FAM_MEMBERS"))
    )

    # Contactability & documentation
    df = (
        df.withColumn(
            "PHONE_AVAILABILITY",
            _sum_null_as_zero(["FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_PHONE"]),
        )
        .withColumn("MOBILE_REACHABLE", _null_as_zero_int("FLAG_CONT_MOBILE"))
        .withColumn("DOCS_PROVIDED", _sum_null_as_zero(document_flags))
    )

    # Credit Bureau inquiries
    df = df.withColumn(
        "BUREAU_INQUIRIES_TOTAL", _sum_null_as_zero(bureau_request_counts)
    ).withColumn(
        "BUREAU_INQUIRIES_1M", _null_as_zero_int("AMT_REQ_CREDIT_BUREAU_MON")
    )

    # Address mismatch
    df = df.withColumn(
        "ADDRESS_MISMATCH_COUNT",
        _sum_null_as_zero(region_mismatch_flags + city_mismatch_flags),
    )

    # Region rating difference
    df = df.withColumn(
        "REGION_RATING_DIFF",
        F.when(
            _all_not_null(["REGION_RATING_CLIENT_W_CITY", "REGION_RATING_CLIENT"]),
            F.col("REGION_RATING_CLIENT_W_CITY") - F.col("REGION_RATING_CLIENT"),
        ).otherwise(F.lit(None)),
    )

    # Ownership & external scores
    df = (
        df.withColumn("OWNERSHIP_SCORE", own_car + _ownership_flag("FLAG_OWN_REALTY"))
        .withColumn("EXT_SOURCE_MEAN", ext_mean)
        .withColumn("EXT_SOURCE_MIN", F.least(*[F.col(n) for n in ext_sources]))
    )

    # Social default rates (0.0, not NULL, when nothing was observed)
    df = df.withColumn(
        "SOCIAL_DEFAULT_RATE_30",
        _safe_ratio(
            "DEF_30_CNT_SOCIAL_CIRCLE", "OBS_30_CNT_SOCIAL_CIRCLE", default=0.0
        ),
    ).withColumn(
        "SOCIAL_DEFAULT_RATE_60",
        _safe_ratio(
            "DEF_60_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", default=0.0
        ),
    )

    # Employment-to-age and car-related ratios. Both day counts must be negative
    # for the ratio to be computed; anything else gives NULL.
    df = df.withColumn(
        "EMPLOYMENT_AGE_RATIO",
        F.when(
            days_employed.isNotNull()
            & days_birth.isNotNull()
            & (days_birth < 0)
            & (days_employed < 0),
            (-days_employed) / (-days_birth),
        ).otherwise(F.lit(None)),
    ).withColumn(
        "CAR_AGE_EFFECTIVE",
        F.when(own_car == 1, F.col("OWN_CAR_AGE")).otherwise(F.lit(0)),
    )

    # Income-Annuity Buffer Ratio
    df = df.withColumn(
        "INCOME_ANNUITY_BUFFER",
        _safe_ratio(F.col("AMT_INCOME_TOTAL") - F.col("AMT_ANNUITY"), "AMT_ANNUITY"),
    )

    # Risk-Adjusted External Score
    df = df.withColumn("RISK_ADJ_EXT", _safe_ratio(ext_mean, debt_income_ratio))

    # External Score Variability (population standard deviation of the 3 scores)
    squared_deviations = [(F.col(n) - ext_mean) ** 2 for n in ext_sources]
    df = df.withColumn(
        "EXT_STD",
        F.when(
            ext_complete,
            F.sqrt(
                reduce(lambda total, term: total + term, squared_deviations)
                / F.lit(3.0)
            ),
        ).otherwise(F.lit(None)),
    )

    # Housing Mobility Score: region-level mismatches weigh 2, city-level 1
    df = df.withColumn(
        "HOUSING_MOBILITY",
        2 * _sum_null_as_zero(region_mismatch_flags)
        + 1 * _sum_null_as_zero(city_mismatch_flags),
    )

    # Employment-Age Gap: 1 - (years employed / age in years)
    age_years = F.when(
        days_birth.isNotNull(), -days_birth / days_per_year
    ).otherwise(F.lit(None))
    employed_years = F.when(
        days_employed.isNotNull() & (days_employed < 0),
        -days_employed / days_per_year,
    ).otherwise(F.lit(None))
    df = df.withColumn(
        "EMPLOYMENT_AGE_GAP",
        F.when(
            employed_years.isNotNull() & age_years.isNotNull() & (age_years > 0),
            F.lit(1.0) - (employed_years / age_years),
        ).otherwise(F.lit(None)),
    )

    return df


def _as_column(value):
    """Return ``value`` as a Column, treating a plain string as a column name."""
    return F.col(value) if isinstance(value, str) else value


def _safe_ratio(numerator, denominator, default=None):
    """numerator / denominator, or ``default`` when the denominator is NULL or <= 0."""
    top = _as_column(numerator)
    bottom = _as_column(denominator)
    return F.when(bottom.isNotNull() & (bottom > 0), top / bottom).otherwise(
        F.lit(default)
    )


def _all_not_null(column_names):
    """Boolean Column that is true only when every named column is non-NULL."""
    return reduce(
        lambda combined, condition: combined & condition,
        [F.col(name).isNotNull() for name in column_names],
    )


def _null_as_zero_int(column_name):
    """The named column cast to int, with NULL replaced by 0 first."""
    return F.coalesce(F.col(column_name), F.lit(0)).cast("int")


def _sum_null_as_zero(column_names):
    """Integer sum of the named columns, counting NULL entries as 0."""
    return reduce(
        lambda total, term: total + term,
        [_null_as_zero_int(name) for name in column_names],
    )


def _ownership_flag(column_name):
    """Ownership flag as a 0/1 int Column for either numeric or 'Y'/'N' storage.

    The notebook later string-indexes flag_own_car / flag_own_realty, which
    suggests they may be stored as text; casting 'Y' straight to int would give
    NULL. 'Y'/'N' (any case, surrounding blanks ignored) map to 1/0; every
    other value follows the numeric path (cast to int, NULL counted as 0).
    """
    raw = F.col(column_name)
    text = F.upper(F.trim(raw.cast("string")))
    return (
        F.when(text == "Y", F.lit(1))
        .when(text == "N", F.lit(0))
        .otherwise(F.coalesce(raw.cast("int"), F.lit(0)))
    )

In [None]:
df = add_credit_features(df)

In [None]:
# Rename every column to its lower-case form (positions are unchanged).
df = df.toDF(*map(str.lower, df.columns))

In [None]:
df.show(5)

***

In [None]:
# for col in df.columns:
#     get_column_summary(df=df, column_name=col)
#     print("\n\n")

In [None]:
col_index = 0
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 1
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 2
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 3
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# from pyspark.ml.feature import StringIndexer

# indexer = StringIndexer(
#     inputCol="name_contract_type", outputCol="name_contract_type_string_indexed"
# )
# model = indexer.fit(df)
# df = model.transform(df)

# from pyspark.sql import Row

# temp_df = spark.createDataFrame(
#     [Row(name_contract_type=item) for item in model.labels]
# )
# temp_df.show()
# yo_indexer = StringIndexer(
#     inputCol="name_contract_type", outputCol="name_contract_type_Index"
# )

# yo_model = yo_indexer.fit(temp_df)
# yo_model.labels == model.labels

In [None]:
col_index = 4
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 5
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 6
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Fit one StringIndexer per listed column and append its
# "<column>_string_indexed" output, with labels ordered alphabetically.
for col_name in (
    "name_contract_type",
    "code_gender",
    "flag_own_car",
    "flag_own_realty",
):
    indexer = StringIndexer(
        stringOrderType="alphabetAsc",
        outputCol=f"{col_name}_string_indexed",
        inputCol=col_name,
    )
    model = indexer.fit(df)
    df = model.transform(df)

In [None]:
col_index = 7
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 8
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 9
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 10
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Regress amt_annuity on amt_credit using only the rows where amt_annuity is
# present, then report the fitted line and its training metrics.
assembler = VectorAssembler(
    outputCol="amt_annuity_lr_features",
    inputCols=["amt_credit"],
)

df_assembled = assembler.transform(df.where(col("amt_annuity").isNotNull()))

lr = LinearRegression(
    labelCol="amt_annuity",
    featuresCol="amt_annuity_lr_features",
    elasticNetParam=0.8,
    regParam=0.3,
)

lrModel = lr.fit(df_assembled)

print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")
print(f"RMSE: {lrModel.summary.rootMeanSquaredError:f}")
print(f"R2 Score: {lrModel.summary.r2:f}")

In [None]:
# Impute missing amt_annuity values from the fitted line
# coefficient * amt_credit + intercept. The parameters are passed as typed
# double literals rather than formatted into a SQL string: Spark SQL parses a
# numeric literal without an exponent as DECIMAL, which would change the
# arithmetic type of the prediction.
coefficient = float(lrModel.coefficients[0])
intercept = float(lrModel.intercept)

df = df.withColumn(
    "amt_annuity",
    # coalesce keeps existing values and only fills the NULL rows.
    F.coalesce(
        F.col("amt_annuity"),
        F.lit(coefficient) * F.col("amt_credit") + F.lit(intercept),
    ),
)

In [None]:
col_index = 11
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Regress amt_goods_price on amt_credit using only the rows where
# amt_goods_price is present, then report the fitted line and its metrics.
assembler = VectorAssembler(
    outputCol="amt_goods_price_lr_features",
    inputCols=["amt_credit"],
)

df_assembled = assembler.transform(df.where(col("amt_goods_price").isNotNull()))

lr = LinearRegression(
    labelCol="amt_goods_price",
    featuresCol="amt_goods_price_lr_features",
    elasticNetParam=0.8,
    regParam=0.3,
)

lrModel = lr.fit(df_assembled)

print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")
print(f"RMSE: {lrModel.summary.rootMeanSquaredError:f}")
print(f"R2 Score: {lrModel.summary.r2:f}")

In [None]:
# Impute missing amt_goods_price values from the fitted line
# coefficient * amt_credit + intercept. The parameters are passed as typed
# double literals rather than formatted into a SQL string: Spark SQL parses a
# numeric literal without an exponent as DECIMAL, which would change the
# arithmetic type of the prediction.
coefficient = float(lrModel.coefficients[0])
intercept = float(lrModel.intercept)

df = df.withColumn(
    "amt_goods_price",
    # coalesce keeps existing values and only fills the NULL rows.
    F.coalesce(
        F.col("amt_goods_price"),
        F.lit(coefficient) * F.col("amt_credit") + F.lit(intercept),
    ),
)

In [None]:
col_index = 12
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Encode name_type_suite with the project helper, passing the categories in an
# explicit order.
# NOTE(review): presumably the list order determines the assigned labels, and
# NULL / unlisted values need separate handling — confirm in src.data.cleaning.
df = assign_category_labels(
    df=df,
    category_column="name_type_suite",
    categories=[
        "Unaccompanied",
        "Other_B",
        "Other_A",
        "Group of people",
        "Children",
        "Spouse, partner",
        "Family",
    ],
)

In [None]:
col_index = 13
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Mapping handed to the grouped-label helper: five group names, each with the
# list of raw category values associated with it.
# NOTE(review): presumably each raw value is folded into its group before
# labelling — confirm against assign_grouped_category_labels.
categories_mapping_dict = {
    "Working": ["Working"],
    "Commercial associate": ["Commercial associate", "Businessman"],
    "Pensioner": ["Pensioner"],
    "State servant": ["State servant"],
    "Not Earning": ["Unemployed", "Student", "Maternity leave"],
}

# NOTE(review): the target column is chosen positionally through `col_index`,
# which is whatever the most recently run summary cell set (13 when the
# notebook runs top to bottom). Running cells out of order changes the column;
# consider passing the column name explicitly.
df = assign_grouped_category_labels(
    df=df,
    category_column=df.columns[col_index],
    categories_mapping=categories_mapping_dict,
)

In [None]:
col_index = 14
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 15
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 16
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Fit one StringIndexer per listed column and append its
# "<column>_string_indexed" output, with labels ordered alphabetically.
for col_name in ("name_education_type", "name_family_status", "name_housing_type"):
    indexer = StringIndexer(
        stringOrderType="alphabetAsc",
        outputCol=f"{col_name}_string_indexed",
        inputCol=col_name,
    )
    model = indexer.fit(df)
    df = model.transform(df)

In [None]:
col_index = 17
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 18
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 19
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 20
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 21
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 21
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 22
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
df = df.fillna({"own_car_age": -1})

In [None]:
col_index = 23
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 24
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 25
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 26
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 27
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 28
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 29
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Encode occupation_type with the project helper, passing the categories in an
# explicit order.
# NOTE(review): presumably the list order determines the assigned labels, and
# NULL / unlisted values need separate handling — confirm in src.data.cleaning.
df = assign_category_labels(
    df=df,
    category_column="occupation_type",
    categories=[
        "Managers",
        "High skill tech staff",
        "HR staff",
        "Medicine staff",
        "Realty agents",
        "Sales staff",
        "IT staff",
        "Accountants",
        "Private service staff",
        "Core staff",
        "Drivers",
        "Cooking staff",
        "Security staff",
        "Waiters/barmen staff",
        "Cleaning staff",
        "Secretaries",
        "Laborers",
        "Low-skill Laborers",
    ],
)

In [None]:
col_index = 30
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Replace missing cnt_fam_members with the column mean rounded down to a whole
# number. A global aggregate returns exactly one row, so the first row's only
# field is the mean.
average_cnt_fam_members = int(
    np.floor(df.agg(F.avg("cnt_fam_members")).first()[0])
)

df = df.fillna({"cnt_fam_members": average_cnt_fam_members})

In [None]:
col_index = 31
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 32
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 33
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Encode weekday_appr_process_start with the project helper, listing the days
# Monday through Sunday.
# NOTE(review): presumably the list order determines the assigned labels —
# confirm in src.data.cleaning.
df = assign_category_labels(
    df=df,
    category_column="weekday_appr_process_start",
    categories=[
        "MONDAY",
        "TUESDAY",
        "WEDNESDAY",
        "THURSDAY",
        "FRIDAY",
        "SATURDAY",
        "SUNDAY",
    ],
)

In [None]:
col_index = 34
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 35
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 36
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 37
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 38
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 39
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 40
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 41
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Encode organization_type using its categories ordered by frequency.
# The column is named explicitly instead of being looked up through
# `col_index`: that variable holds whatever the last summary cell set, so a
# different execution order would rank the values of one column and apply them
# to another.
column_name = "organization_type"
unique_values_count = (
    df.groupBy(column_name)
    .agg(F.count(column_name).alias("count"))
    # The secondary sort on the value keeps the order stable when counts tie.
    .orderBy(F.col("count").desc(), F.col(column_name).asc())
)

# NOTE(review): presumably the order of `categories` determines the assigned
# labels — confirm in src.data.cleaning.
df = assign_category_labels(
    df=df,
    category_column=column_name,
    categories=[value[column_name] for value in unique_values_count.collect()],
)

In [None]:
col_index = 42
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 43
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 44
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Replace NULLs in the three external-score columns with the sentinel -1.
for column_name in ["ext_source_1", "ext_source_2", "ext_source_3"]:
    df = df.fillna({column_name: -1})

In [None]:
col_index = 45
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Replace NULLs with the sentinel -1 in every listed column, one fillna call
# per column. The 42 "<measure>_<stat>" names are generated (all _avg, then
# all _mode, then all _medi); the five remaining *_mode columns follow
# explicitly, so the iteration order matches a hand-written list.
for column_name in [
    f"{measure}_{stat}"
    for stat in ("avg", "mode", "medi")
    for measure in (
        "apartments",
        "basementarea",
        "years_beginexpluatation",
        "years_build",
        "commonarea",
        "elevators",
        "entrances",
        "floorsmax",
        "floorsmin",
        "landarea",
        "livingapartments",
        "livingarea",
        "nonlivingapartments",
        "nonlivingarea",
    )
] + [
    "fondkapremont_mode",
    "housetype_mode",
    "totalarea_mode",
    "wallsmaterial_mode",
    "emergencystate_mode",
]:
    df = df.fillna({column_name: -1})

In [None]:
col_index = 93
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
col_index = 96
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# Replace NULLs in the social-circle counters and days_last_phone_change with
# the sentinel -1, one fillna call per column.
for column_name in [
    "obs_30_cnt_social_circle",
    "def_30_cnt_social_circle",
    "obs_60_cnt_social_circle",
    "def_60_cnt_social_circle",
    "days_last_phone_change",
]:
    df = df.fillna({column_name: -1})

In [None]:
col_index = 97
get_column_summary(df=df, column_name=df.columns[col_index])

In [None]:
# for col_name in df.columns[97:117]:
#     get_column_summary(df=df, column_name=col_name)

# No processing required

In [None]:
# for col_name in df.columns[117:123]:
#     get_column_summary(df=df, column_name=col_name)

In [None]:
# Replace NULLs in the credit-bureau request counters with the sentinel -1.
# The fill must be keyed on this loop's own variable (`col_name`); keying it on
# `column_name`, a leftover from an earlier cell, leaves these six columns
# untouched.
for col_name in [
    "amt_req_credit_bureau_hour",
    "amt_req_credit_bureau_day",
    "amt_req_credit_bureau_week",
    "amt_req_credit_bureau_mon",
    "amt_req_credit_bureau_qrt",
    "amt_req_credit_bureau_year",
]:
    df = df.fillna({col_name: -1})

***

In [None]:
df.show(5)

In [None]:
count_nulls(df)

***

In [None]:
# column_name = df.columns[col_index]
# unique_values_count = (
#     df.groupBy(column_name)
#     .agg(F.count(column_name).alias("count"))
#     .orderBy(F.col("count").desc())
# )

# {value[column_name]: value["count"] for value in unique_values_count.collect()}