In [1]:
from dotenv import load_dotenv
import os
from pathlib import Path

CURRENT_DIRECTORY_NOTEBOOK = None


def intitate_notebook():
    load_dotenv()
    global CURRENT_DIRECTORY_NOTEBOOK
    if CURRENT_DIRECTORY_NOTEBOOK is None:
        os.chdir(os.getenv("PROJECT_BASE_PATH"))
        CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
        print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)
    else:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )


intitate_notebook()

Current directory for notebook:  /workspace


In [2]:
# import shutil
# import os

# def extract_zip(zip_file_path, extract_to_folder):
#     # Check if the provided folder exists, if not, create it
#     if not os.path.exists(extract_to_folder):
#         os.makedirs(extract_to_folder)

#     # Extract the .zip file to the target folder
#     shutil.unpack_archive(zip_file_path, extract_to_folder)
#     print(f"Files extracted to {extract_to_folder}")

# extract_zip(zip_file_path="raw_dataset/home-credit-default-risk.zip", extract_to_folder="raw_dataset/raw_tables")

In [3]:
import pandas as pd
from sqlalchemy import create_engine

# Postgres DB Credentials
username = "data_source_user"
password = "data_source_user_password"
host = "172.17.0.1"
port = "5435"
database = "data_source_db"

connection_uri = f"postgresql://{username}:{password}@{host}:{port}/{database}"
print("Connection URI: ", connection_uri)

# MySQL - SQLAlchemy engine
engine = create_engine(connection_uri)
engine.connect()

Connection URI:  postgresql://data_source_user:data_source_user_password@172.17.0.1:5435/data_source_db


<sqlalchemy.engine.base.Connection at 0xffff72aed400>

In [4]:
import re


def to_snake_case(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    return "_".join(text.lower().split())

In [5]:
import numpy as np

pd.set_option("display.max_columns", 500)


def add_partition_id_column(
    df: pd.DataFrame, n_splits: int = 12, diff_count: int = 5000
):
    df = df.sort_values(by="SK_ID_CURR", ascending=True)
    unique_ids_count = df["SK_ID_CURR"].nunique()
    print("rows_count: ", df.shape[0])
    print("unique_ids_count: ", unique_ids_count)
    if unique_ids_count != df.shape[0]:
        raise ValueError("ID column is not unique")

    split_sizes = []
    for split_size in np.linspace(0, unique_ids_count, n_splits + 1, dtype=int):
        if split_size in (0, unique_ids_count):
            split_sizes.append(int(split_size))
        else:
            split_sizes.append(
                np.random.randint(
                    split_size - int(diff_count / 2), split_size + int(diff_count / 2)
                )
            )

    split_sizes_dict = {}
    for i in range(n_splits):
        split_sizes_dict[i + 1] = (
            split_sizes[i],
            split_sizes[i + 1],
        )

    df["MAIN_SPLIT_ID"] = None

    for partition_id, (start, end) in split_sizes_dict.items():
        df.loc[start:end, "MAIN_SPLIT_ID"] = partition_id

    column_names = df.columns
    df = df[["MAIN_SPLIT_ID"] + column_names.drop(labels=["MAIN_SPLIT_ID"]).to_list()]

    return df

In [6]:
def get_smallest_int_type(col_min, col_max, has_nulls):
    """Determine the smallest integer type that can accommodate the value range"""
    if col_min >= 0:  # Unsigned integers
        if has_nulls:
            if col_max <= 255:
                return "UInt8"
            if col_max <= 65535:
                return "UInt16"
            if col_max <= 4294967295:
                return "UInt32"
            return "UInt64"
        else:
            if col_max <= 255:
                return "uint8"
            if col_max <= 65535:
                return "uint16"
            if col_max <= 4294967295:
                return "uint32"
            return "uint64"
    else:  # Signed integers
        if has_nulls:
            if col_min >= -128 and col_max <= 127:
                return "Int8"
            if col_min >= -32768 and col_max <= 32767:
                return "Int16"
            if col_min >= -2147483648 and col_max <= 2147483647:
                return "Int32"
            return "Int64"
        else:
            if col_min >= -128 and col_max <= 127:
                return "int8"
            if col_min >= -32768 and col_max <= 32767:
                return "int16"
            if col_min >= -2147483648 and col_max <= 2147483647:
                return "int32"
            return "int64"


def optimize_dataframe(df):
    """Optimize DataFrame by downcasting numeric columns efficiently"""
    # Step 1: Convert float columns with integer values to int types
    float_cols = df.select_dtypes(include=["float"]).columns
    converted_to_int = []

    for col in float_cols:
        # Check for non-integer values (including inf and large numbers)
        non_null = df[col].dropna()
        if non_null.empty:  # All-NaN column
            new_dtype = "Int8"
            df[col] = df[col].astype(new_dtype)
            print(f"Column '{col}' (all NaN) converted to {new_dtype}")
            converted_to_int.append(col)
            continue

        # Check for integers using safe method (handles inf/nan)
        is_integer = non_null.apply(
            lambda x: x.is_integer() if not np.isinf(x) else False
        )

        if is_integer.all():
            has_nulls = df[col].isna().any()
            col_min = non_null.min()
            col_max = non_null.max()

            # Handle extreme values
            if col_min < -9223372036854775808 or col_max > 9223372036854775807:
                print(
                    f"Column '{col}' has out-of-int64-range values, skipping conversion"
                )
                continue

            new_dtype = get_smallest_int_type(col_min, col_max, has_nulls)
            df[col] = df[col].astype(new_dtype)
            print(f"Column '{col}' converted from float to {new_dtype}")
            converted_to_int.append(col)

    # Step 2: Downcast integer columns
    int_cols = df.select_dtypes(include=["integer"]).columns
    for col in int_cols:
        has_nulls = df[col].isna().any()
        non_null = df[col].dropna()

        if non_null.empty:  # All-NaN column
            new_dtype = "Int8" if has_nulls else "int8"
            if df[col].dtype.name != new_dtype:
                df[col] = df[col].astype(new_dtype)
                print(f"Column '{col}' (all NaN) downcasted to {new_dtype}")
            continue

        col_min = non_null.min()
        col_max = non_null.max()
        current_dtype = df[col].dtype.name

        # Skip if already optimal
        if has_nulls:
            if current_dtype.startswith("Int") and col_min >= 0:
                if current_dtype == "UInt8" and col_max <= 255:
                    continue
                if current_dtype == "UInt16" and col_max <= 65535:
                    continue
                if current_dtype == "UInt32" and col_max <= 4294967295:
                    continue
            elif current_dtype.startswith("Int"):
                if current_dtype == "Int8" and (col_min >= -128 and col_max <= 127):
                    continue
                if current_dtype == "Int16" and (
                    col_min >= -32768 and col_max <= 32767
                ):
                    continue
                if current_dtype == "Int32" and (
                    col_min >= -2147483648 and col_max <= 2147483647
                ):
                    continue
        else:
            if current_dtype.startswith("uint") and col_min >= 0:
                if current_dtype == "uint8" and col_max <= 255:
                    continue
                if current_dtype == "uint16" and col_max <= 65535:
                    continue
                if current_dtype == "uint32" and col_max <= 4294967295:
                    continue
            elif current_dtype.startswith("int"):
                if current_dtype == "int8" and (col_min >= -128 and col_max <= 127):
                    continue
                if current_dtype == "int16" and (
                    col_min >= -32768 and col_max <= 32767
                ):
                    continue
                if current_dtype == "int32" and (
                    col_min >= -2147483648 and col_max <= 2147483647
                ):
                    continue

        new_dtype = get_smallest_int_type(col_min, col_max, has_nulls)

        if new_dtype != current_dtype:
            df[col] = df[col].astype(new_dtype)
            print(f"Column '{col}' downcasted from {current_dtype} to {new_dtype}")

    # Step 3: Downcast float columns
    float_cols = df.select_dtypes(include=["float"]).columns
    for col in float_cols:
        if col in converted_to_int:  # Skip converted columns
            continue

        original_dtype = df[col].dtype
        non_null = df[col].dropna()

        if non_null.empty:  # All-NaN column
            new_dtype = "float32"
            df[col] = df[col].astype(new_dtype)
            print(f"Column '{col}' (all NaN) downcasted to {new_dtype}")
            continue

        # Check for extreme values
        if (non_null.abs() > 3.4e38).any():
            print(f"Column '{col}' has values >3.4e38, skipping downcast")
            continue

        # Downcast with precision preservation
        downcasted = pd.to_numeric(df[col], downcast="float")
        new_dtype = downcasted.dtype

        if new_dtype != original_dtype:
            df[col] = downcasted
            print(f"Column '{col}' downcasted from {original_dtype} to {new_dtype}")

    return df

### Table - application_train

In [13]:
application_train_df = pd.read_csv("raw_dataset/raw_tables/application_train.csv")

prev_memory_usage = application_train_df.memory_usage(deep=True).sum() / (1024**2)
application_train_df = optimize_dataframe(application_train_df)
memory_usage = application_train_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

application_train_df

Column 'OWN_CAR_AGE' converted from float to UInt8
Column 'CNT_FAM_MEMBERS' converted from float to UInt8
Column 'OBS_30_CNT_SOCIAL_CIRCLE' converted from float to UInt16
Column 'DEF_30_CNT_SOCIAL_CIRCLE' converted from float to UInt8
Column 'OBS_60_CNT_SOCIAL_CIRCLE' converted from float to UInt16
Column 'DEF_60_CNT_SOCIAL_CIRCLE' converted from float to UInt8
Column 'DAYS_LAST_PHONE_CHANGE' converted from float to Int16
Column 'AMT_REQ_CREDIT_BUREAU_HOUR' converted from float to UInt8
Column 'AMT_REQ_CREDIT_BUREAU_DAY' converted from float to UInt8
Column 'AMT_REQ_CREDIT_BUREAU_WEEK' converted from float to UInt8
Column 'AMT_REQ_CREDIT_BUREAU_MON' converted from float to UInt8
Column 'AMT_REQ_CREDIT_BUREAU_QRT' converted from float to UInt16
Column 'AMT_REQ_CREDIT_BUREAU_YEAR' converted from float to UInt8
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'TARGET' downcasted from int64 to uint8
Column 'CNT_CHILDREN' downcasted from int64 to uint8
Column 'DAYS_BIRTH' downcast

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.00,0.0690,0.0833,0.1250,0.0369,0.0202,0.0190,0.0000,0.0000,0.0252,0.0383,0.9722,0.6341,0.0144,0.0000,0.0690,0.0833,0.1250,0.0377,0.0220,0.0198,0.0,0.0000,0.0250,0.0369,0.9722,0.6243,0.0144,0.00,0.0690,0.0833,0.1250,0.0375,0.0205,0.0193,0.0000,0.0000,reg oper account,block of flats,0.0149,"Stone, brick",No,2,2,2,2,-1134,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.7960,0.0605,0.08,0.0345,0.2917,0.3333,0.0130,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.8040,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.0790,0.0554,0.0,0.0000,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.0100,reg oper account,block of flats,0.0714,Block,No,1,0,1,0,-828,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26,1,1,1,1,1,0,Laborers,1,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,-815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,2,0,-617,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,-1106,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,225000.0,Unaccompanied,Working,Secondary / secondary special,Separated,With parents,0.032561,-9327,-236,-8456.0,-1982,,1,1,0,1,0,0,Sales staff,1,1,1,THURSDAY,15,0,0,0,0,0,0,Services,0.145570,0.681632,,0.2021,0.0887,0.9876,0.8300,0.0202,0.22,0.1034,0.6042,0.2708,0.0594,0.1484,0.1965,0.0753,0.1095,0.1008,0.0172,0.9782,0.7125,0.0172,0.0806,0.0345,0.4583,0.0417,0.0094,0.0882,0.0853,0.0,0.0125,0.2040,0.0887,0.9876,0.8323,0.0203,0.22,0.1034,0.6042,0.2708,0.0605,0.1509,0.2001,0.0757,0.1118,reg oper account,block of flats,0.2898,"Stone, brick",No,0,0,0,0,-273,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,225000.0,Unaccompanied,Pensioner,Secondary / secondary special,Widow,House / apartment,0.025164,-20775,365243,-4388.0,-4090,,1,0,0,1,1,0,,1,2,2,MONDAY,8,0,0,0,0,0,0,XNA,,0.115992,,0.0247,0.0435,0.9727,0.6260,0.0022,0.00,0.1034,0.0833,0.1250,0.0579,0.0202,0.0257,0.0000,0.0000,0.0252,0.0451,0.9727,0.6406,0.0022,0.0000,0.1034,0.0833,0.1250,0.0592,0.0220,0.0267,0.0,0.0000,0.0250,0.0435,0.9727,0.6310,0.0022,0.00,0.1034,0.0833,0.1250,0.0589,0.0205,0.0261,0.0000,0.0000,reg oper account,block of flats,0.0214,"Stone, brick",No,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,585000.0,Unaccompanied,Working,Higher education,Separated,House / apartment,0.005002,-14966,-7921,-6737.0,-5150,,1,1,0,1,0,1,Managers,1,3,3,THURSDAY,9,0,0,0,0,1,1,School,0.744026,0.535722,0.218859,0.1031,0.0862,0.9816,0.7484,0.0123,0.00,0.2069,0.1667,0.2083,,0.0841,0.9279,0.0000,0.0000,0.1050,0.0894,0.9816,0.7583,0.0124,0.0000,0.2069,0.1667,0.2083,,0.0918,0.9667,0.0,0.0000,0.1041,0.0862,0.9816,0.7518,0.0124,0.00,0.2069,0.1667,0.2083,,0.0855,0.9445,0.0000,0.0000,reg oper account,block of flats,0.7970,Panel,No,6,0,6,0,-1909,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,319500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.005313,-11961,-4786,-2562.0,-931,,1,1,0,1,0,0,Laborers,2,2,2,WEDNESDAY,9,0,0,0,1,1,0,Business Entity Type 1,,0.514163,0.661024,0.0124,,0.9771,,,,0.0690,0.0417,,,,0.0061,,,0.0126,,0.9772,,,,0.0690,0.0417,,,,0.0063,,,0.0125,,0.9771,,,,0.0690,0.0417,,,,0.0062,,,,block of flats,0.0086,"Stone, brick",No,0,0,0,0,-322,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
application_train_df = add_partition_id_column(application_train_df)
application_train_df.head()

rows_count:  307511
unique_ids_count:  307511


Unnamed: 0,MAIN_SPLIT_ID,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2,2,2,2,-1134,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1,0,1,0,-828,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,-815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,0,2,0,-617,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,1,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,-1106,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
application_train_df["MAIN_SPLIT_ID"].value_counts()

MAIN_SPLIT_ID
6     28318
1     27805
12    27497
10    27328
8     26255
4     26093
5     25178
7     24758
3     24488
9     24480
11    22933
2     22378
Name: count, dtype: int64

In [16]:
application_train_df_column_rename_dict = {}
for item in application_train_df.columns:
    application_train_df_column_rename_dict[item] = to_snake_case(item)

application_train_df.rename(
    columns=application_train_df_column_rename_dict, inplace=True
)

In [17]:
application_train_df.to_sql(
    name="application_train",
    con=engine,
    index=False,
    if_exists="replace",
    # chunksize=1024 * 8,
    # method="multi",
)

111

In [18]:
application_train_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/application_train.parquet",
    index=False,
)

In [7]:
application_train_df = pd.read_parquet(
    "raw_dataset/merged_main_splited_data/application_train.parquet",
    columns=["main_split_id", "sk_id_curr"],
)
application_train_df.head()

Unnamed: 0,main_split_id,sk_id_curr
0,1,100002
1,1,100003
2,1,100004
3,1,100006
4,1,100007


### Table - bureau

In [8]:
bureau_df = pd.read_csv("raw_dataset/raw_tables/bureau.csv")

prev_memory_usage = bureau_df.memory_usage(deep=True).sum() / (1024**2)
bureau_df = optimize_dataframe(bureau_df)
memory_usage = bureau_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

bureau_df

Column 'DAYS_CREDIT_ENDDATE' converted from float to Int32
Column 'DAYS_ENDDATE_FACT' converted from float to Int32
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'SK_ID_BUREAU' downcasted from int64 to uint32
Column 'DAYS_CREDIT' downcasted from int64 to int16
Column 'CREDIT_DAY_OVERDUE' downcasted from int64 to uint16
Column 'CNT_CREDIT_PROLONG' downcasted from int64 to uint8
Column 'DAYS_CREDIT_UPDATE' downcasted from int64 to int32
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

Previous Memory Usage:  472.82484245300293
Memory Usage:  412.2590503692627
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  



Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153,-153,,0,91323.00,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075,,,0,225000.00,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528,,,0,464323.50,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.00,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197,,77674.5,0,2700000.00,,,0.0,Consumer credit,-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1716423,259355,5057750,Active,currency 1,-44,0,-30,,0.0,0,11250.00,11250.0,0.0,0.0,Microloan,-19,
1716424,100044,5057754,Closed,currency 1,-2648,0,-2433,-2493,5476.5,0,38130.84,0.0,0.0,0.0,Consumer credit,-2493,
1716425,100044,5057762,Closed,currency 1,-1809,0,-1628,-970,,0,15570.00,,,0.0,Consumer credit,-967,
1716426,246829,5057770,Closed,currency 1,-1878,0,-1513,-1513,,0,36000.00,0.0,0.0,0.0,Consumer credit,-1508,


In [9]:
bureau_df_column_rename_dict = {}
for item in bureau_df.columns:
    bureau_df_column_rename_dict[item] = to_snake_case(item)

bureau_df.rename(columns=bureau_df_column_rename_dict, inplace=True)

In [10]:
bureau_df = pd.merge(
    application_train_df,
    bureau_df,
    how="inner",
    on="sk_id_curr",
)

bureau_df.sk_id_bureau.isna().sum(), bureau_df.shape

(np.int64(0), (1465325, 18))

In [11]:
bureau_df.to_sql(
    name="bureau",
    con=engine,
    index=False,
    if_exists="replace",
    # chunksize=1024 * 32,
    # method="multi",
)

325

In [12]:
bureau_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/bureau.parquet",
    index=False,
)

In [7]:
bureau_df = pd.read_parquet(
    "raw_dataset/merged_main_splited_data/bureau.parquet",
    columns=["main_split_id", "sk_id_bureau"],
)

#### Bureau Balance

In [8]:
bureau_balance_df = pd.read_csv("raw_dataset/raw_tables/bureau_balance.csv")

prev_memory_usage = bureau_balance_df.memory_usage(deep=True).sum() / (1024**2)
bureau_balance_df = optimize_dataframe(bureau_balance_df)
memory_usage = bureau_balance_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

bureau_balance_df_column_rename_dict = {}
for item in bureau_balance_df.columns:
    bureau_balance_df_column_rename_dict[item] = to_snake_case(item)

bureau_balance_df.rename(columns=bureau_balance_df_column_rename_dict, inplace=True)

bureau_balance_df.head()

Column 'SK_ID_BUREAU' downcasted from int64 to uint32
Column 'MONTHS_BALANCE' downcasted from int64 to int8
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

Previous Memory Usage:  1718.3257884979248
Memory Usage:  1431.938178062439
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  



Unnamed: 0,sk_id_bureau,months_balance,status
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [9]:
bureau_balance_df = pd.merge(
    bureau_df,
    bureau_balance_df,
    how="inner",
    on="sk_id_bureau",
)

In [10]:
bureau_balance_df.isna().sum(), bureau_balance_df.shape

(main_split_id     0
 sk_id_bureau      0
 months_balance    0
 status            0
 dtype: int64,
 (14701612, 4))

In [11]:
bureau_balance_df

Unnamed: 0,main_split_id,sk_id_bureau,months_balance,status
0,1,6158904,-15,C
1,1,6158904,-16,C
2,1,6158904,-17,0
3,1,6158904,-18,0
4,1,6158904,-19,0
...,...,...,...,...
14701607,12,5126337,-72,C
14701608,12,5126337,-73,C
14701609,12,5126337,-74,C
14701610,12,5126337,-75,0


In [12]:
bureau_balance_df.to_sql(
    name="bureau_balance",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 4,
    # method="multi",
)

344612

In [13]:
bureau_balance_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/bureau_balance.parquet",
    index=False,
)

### Table - previous_application

In [8]:
previous_application_df = pd.read_csv("raw_dataset/raw_tables/previous_application.csv")

prev_memory_usage = previous_application_df.memory_usage(deep=True).sum() / (1024**2)
previous_application_df = optimize_dataframe(previous_application_df)
memory_usage = previous_application_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

previous_application_df

Column 'CNT_PAYMENT' converted from float to UInt8
Column 'DAYS_FIRST_DRAWING' converted from float to Int32
Column 'DAYS_FIRST_DUE' converted from float to Int32
Column 'DAYS_LAST_DUE_1ST_VERSION' converted from float to Int32
Column 'DAYS_LAST_DUE' converted from float to Int32
Column 'DAYS_TERMINATION' converted from float to Int32
Column 'NFLAG_INSURED_ON_APPROVAL' converted from float to UInt8
Column 'SK_ID_PREV' downcasted from int64 to uint32
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'HOUR_APPR_PROCESS_START' downcasted from int64 to uint8
Column 'NFLAG_LAST_APPL_IN_DAY' downcasted from int64 to uint8
Column 'DAYS_DECISION' downcasted from int64 to int16
Column 'SELLERPLACE_AREA' downcasted from int64 to int32
Column 'RATE_DOWN_PAYMENT' downcasted from float64 to float32
Column 'RATE_INTEREST_PRIMARY' downcasted from float64 to float32
Column 'RATE_INTEREST_PRIVILEGED' downcasted from float64 to float32
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.000000,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12,middle,POS mobile with interest,365243,-42,300,-42,-37,0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36,low_action,Cash X-Sell: low,365243,-134,916,365243,365243,1
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12,high,Cash X-Sell: high,365243,-271,59,365243,365243,1
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12,middle,Cash X-Sell: middle,365243,-482,-152,-182,-177,1
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24,high,Cash Street: high,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,2300464,352015,Consumer loans,14704.290,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,Y,1,0.000000,,,XAP,Approved,-544,Cash through the bank,XAP,,Refreshed,Furniture,POS,XNA,Stone,43,Furniture,30,low_normal,POS industry with interest,365243,-508,362,-358,-351,0
1670210,2357031,334635,Consumer loans,6622.020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,Y,1,0.340554,,,XAP,Approved,-1694,Cash through the bank,XAP,Unaccompanied,New,Furniture,POS,XNA,Stone,43,Furniture,12,middle,POS industry with interest,365243,-1604,-1274,-1304,-1297,0
1670211,2659632,249544,Consumer loans,11520.855,105237.0,102523.5,10525.5,105237.0,MONDAY,12,Y,1,0.101401,,,XAP,Approved,-1488,Cash through the bank,XAP,"Spouse, partner",Repeater,Consumer Electronics,POS,XNA,Country-wide,1370,Consumer electronics,10,low_normal,POS household with interest,365243,-1457,-1187,-1187,-1181,0
1670212,2785582,400317,Cash loans,18821.520,180000.0,191880.0,,180000.0,WEDNESDAY,9,Y,1,,,,XNA,Approved,-1185,Cash through the bank,XAP,Family,Repeater,XNA,Cash,x-sell,AP+ (Cash loan),-1,XNA,12,low_normal,Cash X-Sell: low,365243,-1155,-825,-825,-817,1


In [9]:
previous_application_df_column_rename_dict = {}
for item in previous_application_df.columns:
    previous_application_df_column_rename_dict[item] = to_snake_case(item)

previous_application_df.rename(
    columns=previous_application_df_column_rename_dict, inplace=True
)

In [10]:
previous_application_df = pd.merge(
    application_train_df,
    previous_application_df,
    how="inner",
    on="sk_id_curr",
)

previous_application_df.sk_id_prev.isna().sum(), previous_application_df.shape

(np.int64(0), (1413701, 38))

In [11]:
previous_application_df.head()

Unnamed: 0,main_split_id,sk_id_curr,sk_id_prev,name_contract_type,amt_annuity,amt_application,amt_credit,amt_down_payment,amt_goods_price,weekday_appr_process_start,hour_appr_process_start,flag_last_appl_per_contract,nflag_last_appl_in_day,rate_down_payment,rate_interest_primary,rate_interest_privileged,name_cash_loan_purpose,name_contract_status,days_decision,name_payment_type,code_reject_reason,name_type_suite,name_client_type,name_goods_category,name_portfolio,name_product_type,channel_type,sellerplace_area,name_seller_industry,cnt_payment,name_yield_group,product_combination,days_first_drawing,days_first_due,days_last_due_1st_version,days_last_due,days_termination,nflag_insured_on_approval
0,1,100002,1038818,Consumer loans,9251.775,179055.0,179055.0,0.0,179055.0,SATURDAY,9,Y,1,0.0,,,XAP,Approved,-606,XNA,XAP,,New,Vehicles,POS,XNA,Stone,500,Auto technology,24,low_normal,POS other with interest,365243,-565,125,-25,-17,0
1,1,100003,1810518,Cash loans,98356.995,900000.0,1035882.0,,900000.0,FRIDAY,12,Y,1,,,,XNA,Approved,-746,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12,low_normal,Cash X-Sell: low,365243,-716,-386,-536,-527,1
2,1,100003,2636178,Consumer loans,64567.665,337500.0,348637.5,0.0,337500.0,SUNDAY,17,Y,1,0.0,,,XAP,Approved,-828,Cash through the bank,XAP,Family,Refreshed,Furniture,POS,XNA,Stone,1400,Furniture,6,middle,POS industry with interest,365243,-797,-647,-647,-639,0
3,1,100003,2396755,Consumer loans,6737.31,68809.5,68053.5,6885.0,68809.5,SATURDAY,15,Y,1,0.100061,,,XAP,Approved,-2341,Cash through the bank,XAP,Family,Refreshed,Consumer Electronics,POS,XNA,Country-wide,200,Consumer electronics,12,middle,POS household with interest,365243,-2310,-1980,-1980,-1976,1
4,1,100004,1564014,Consumer loans,5357.25,24282.0,20106.0,4860.0,24282.0,FRIDAY,5,Y,1,0.212008,,,XAP,Approved,-815,Cash through the bank,XAP,Unaccompanied,New,Mobile,POS,XNA,Regional / Local,30,Connectivity,4,middle,POS mobile without interest,365243,-784,-694,-724,-714,0


In [12]:
previous_application_df.to_sql(
    name="previous_application",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 8,
    # method="multi",
)

78121

In [13]:
previous_application_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/previous_application.parquet",
    index=False,
)

In [7]:
previous_application_df = pd.read_parquet(
    "raw_dataset/merged_main_splited_data/previous_application.parquet",
    columns=["main_split_id", "sk_id_curr", "sk_id_prev"],
)
previous_application_df = previous_application_df[
    ["main_split_id", "sk_id_curr", "sk_id_prev"]
]

#### POC Cash Balance

In [8]:
POS_CASH_balance_df = pd.read_csv("raw_dataset/raw_tables/POS_CASH_balance.csv")

prev_memory_usage = POS_CASH_balance_df.memory_usage(deep=True).sum() / (1024**2)
POS_CASH_balance_df = optimize_dataframe(POS_CASH_balance_df)
memory_usage = POS_CASH_balance_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

POS_CASH_balance_df_column_rename_dict = {}
for item in POS_CASH_balance_df.columns:
    POS_CASH_balance_df_column_rename_dict[item] = to_snake_case(item)

POS_CASH_balance_df.rename(columns=POS_CASH_balance_df_column_rename_dict, inplace=True)

POS_CASH_balance_df.head()

Column 'CNT_INSTALMENT' converted from float to UInt8
Column 'CNT_INSTALMENT_FUTURE' converted from float to UInt8
Column 'SK_ID_PREV' downcasted from int64 to uint32
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'MONTHS_BALANCE' downcasted from int64 to int8
Column 'SK_DPD' downcasted from int64 to uint16
Column 'SK_DPD_DEF' downcasted from int64 to uint16
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

Previous Memory Usage:  1060.945873260498
Memory Usage:  688.962381362915
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  



Unnamed: 0,sk_id_prev,sk_id_curr,months_balance,cnt_instalment,cnt_instalment_future,name_contract_status,sk_dpd,sk_dpd_def
0,1803195,182943,-31,48,45,Active,0,0
1,1715348,367990,-33,36,35,Active,0,0
2,1784872,397406,-32,12,9,Active,0,0
3,1903291,269225,-35,48,42,Active,0,0
4,2341044,334279,-35,36,35,Active,0,0


In [9]:
POS_CASH_balance_df = pd.merge(
    previous_application_df[["main_split_id", "sk_id_prev"]],
    POS_CASH_balance_df,
    how="inner",
    on="sk_id_prev",
)

POS_CASH_balance_df.isna().sum(), POS_CASH_balance_df.shape

(main_split_id                0
 sk_id_prev                   0
 sk_id_curr                   0
 months_balance               0
 cnt_instalment           21834
 cnt_instalment_future    21849
 name_contract_status         0
 sk_dpd                       0
 sk_dpd_def                   0
 dtype: int64,
 (8251754, 9))

In [10]:
POS_CASH_balance_df.to_sql(
    name="pos_cash_balance",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 8,
    # method="multi",
)

POS_CASH_balance_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/pos_cash_balance.parquet",
    index=False,
)

#### Instalments Payment

In [8]:
installments_payments_df = pd.read_csv(
    "raw_dataset/raw_tables/installments_payments.csv"
)

prev_memory_usage = installments_payments_df.memory_usage(deep=True).sum() / (1024**2)
installments_payments_df = optimize_dataframe(installments_payments_df)
memory_usage = installments_payments_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

installments_payments_df_column_rename_dict = {}
for item in installments_payments_df.columns:
    installments_payments_df_column_rename_dict[item] = to_snake_case(item)

installments_payments_df.rename(
    columns=installments_payments_df_column_rename_dict, inplace=True
)

installments_payments_df.head()

Column 'NUM_INSTALMENT_VERSION' converted from float to uint8
Column 'DAYS_INSTALMENT' converted from float to int16
Column 'DAYS_ENTRY_PAYMENT' converted from float to Int16
Column 'SK_ID_PREV' downcasted from int64 to uint32
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'NUM_INSTALMENT_NUMBER' downcasted from int64 to uint16
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

Previous Memory Usage:  830.407901763916
Memory Usage:  415.2040138244629
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  



Unnamed: 0,sk_id_prev,sk_id_curr,num_instalment_version,num_instalment_number,days_instalment,days_entry_payment,amt_instalment,amt_payment
0,1054186,161674,1,6,-1180,-1187,6948.36,6948.36
1,1330831,151639,0,34,-2156,-2156,1716.525,1716.525
2,2085231,193053,2,1,-63,-63,25425.0,25425.0
3,2452527,199697,1,3,-2418,-2426,24350.13,24350.13
4,2714724,167756,1,2,-1383,-1366,2165.04,2160.585


In [9]:
installments_payments_df = pd.merge(
    previous_application_df[["main_split_id", "sk_id_prev"]],
    installments_payments_df,
    how="inner",
    on="sk_id_prev",
)

installments_payments_df.isna().sum(), installments_payments_df.shape

(main_split_id                0
 sk_id_prev                   0
 sk_id_curr                   0
 num_instalment_version       0
 num_instalment_number        0
 days_instalment              0
 days_entry_payment        2212
 amt_instalment               0
 amt_payment               2212
 dtype: int64,
 (10572221, 9))

In [10]:
installments_payments_df.to_sql(
    name="installments_payments",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 8,
    # method="multi",
)

installments_payments_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/installments_payments.parquet",
    index=False,
)

#### Credit Card Balance

In [8]:
credit_card_balance_df = pd.read_csv("raw_dataset/raw_tables/credit_card_balance.csv")

prev_memory_usage = credit_card_balance_df.memory_usage(deep=True).sum() / (1024**2)
credit_card_balance_df = optimize_dataframe(credit_card_balance_df)
memory_usage = credit_card_balance_df.memory_usage(deep=True).sum() / (1024**2)
print(" - " * 25, "\n")
print("Previous Memory Usage: ", prev_memory_usage)
print("Memory Usage: ", memory_usage)
print(" - " * 25, "\n")

credit_card_balance_df_column_rename_dict = {}
for item in credit_card_balance_df.columns:
    credit_card_balance_df_column_rename_dict[item] = to_snake_case(item)

credit_card_balance_df.rename(
    columns=credit_card_balance_df_column_rename_dict, inplace=True
)

credit_card_balance_df.head()

Column 'CNT_DRAWINGS_ATM_CURRENT' converted from float to UInt8
Column 'CNT_DRAWINGS_OTHER_CURRENT' converted from float to UInt8
Column 'CNT_DRAWINGS_POS_CURRENT' converted from float to UInt8
Column 'CNT_INSTALMENT_MATURE_CUM' converted from float to UInt8
Column 'SK_ID_PREV' downcasted from int64 to uint32
Column 'SK_ID_CURR' downcasted from int64 to uint32
Column 'MONTHS_BALANCE' downcasted from int64 to int8
Column 'AMT_CREDIT_LIMIT_ACTUAL' downcasted from int64 to uint32
Column 'CNT_DRAWINGS_CURRENT' downcasted from int64 to uint8
Column 'SK_DPD' downcasted from int64 to uint16
Column 'SK_DPD_DEF' downcasted from int64 to uint16
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  

Previous Memory Usage:  846.3884124755859
Memory Usage:  619.3191833496094
 -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  



Unnamed: 0,sk_id_prev,sk_id_curr,months_balance,amt_balance,amt_credit_limit_actual,amt_drawings_atm_current,amt_drawings_current,amt_drawings_other_current,amt_drawings_pos_current,amt_inst_min_regularity,amt_payment_current,amt_payment_total_current,amt_receivable_principal,amt_recivable,amt_total_receivable,cnt_drawings_atm_current,cnt_drawings_current,cnt_drawings_other_current,cnt_drawings_pos_current,cnt_instalment_mature_cum,name_contract_status,sk_dpd,sk_dpd_def
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0,1,0,1,35,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1,1,0,0,69,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0,0,0,0,30,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1,1,0,0,10,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0,1,0,1,101,Active,0,0


In [9]:
credit_card_balance_df = pd.merge(
    previous_application_df[["main_split_id", "sk_id_prev"]],
    credit_card_balance_df,
    how="inner",
    on="sk_id_prev",
)

credit_card_balance_df.isna().sum(), credit_card_balance_df.shape

(main_split_id                      0
 sk_id_prev                         0
 sk_id_curr                         0
 months_balance                     0
 amt_balance                        0
 amt_credit_limit_actual            0
 amt_drawings_atm_current      521094
 amt_drawings_current               0
 amt_drawings_other_current    521094
 amt_drawings_pos_current      521094
 amt_inst_min_regularity       198155
 amt_payment_current           539103
 amt_payment_total_current          0
 amt_receivable_principal           0
 amt_recivable                      0
 amt_total_receivable               0
 cnt_drawings_atm_current      521094
 cnt_drawings_current               0
 cnt_drawings_other_current    521094
 cnt_drawings_pos_current      521094
 cnt_instalment_mature_cum     198155
 name_contract_status               0
 sk_dpd                             0
 sk_dpd_def                         0
 dtype: int64,
 (2354993, 24))

In [10]:
credit_card_balance_df.to_sql(
    name="credit_card_balance",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 8,
    # method="multi",
)

credit_card_balance_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/credit_card_balance.parquet",
    index=False,
)

### Creating Indexes


In [11]:
from sqlalchemy import text


def run_sql_qeury(engine, sql_query):
    with engine.connect() as connection:
        connection.execute(text(sql_query))

In [None]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_application_train_main_split_id
ON application_train (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_application_train_sk_id_curr
ON application_train (SK_ID_CURR);
""",
)

In [14]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_main_split_id
ON previous_application (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_sk_id_curr
ON previous_application (SK_ID_CURR);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_sk_id_prev
ON previous_application (SK_ID_PREV);
""",
)

In [15]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_main_split_id
ON bureau (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_sk_id_curr
ON bureau (SK_ID_CURR);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_sk_id_bureau
ON bureau (sk_id_bureau);
""",
)

In [16]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_balance_main_split_id
ON bureau (main_split_id);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_balance_sk_id_bureau
ON bureau (sk_id_bureau);
""",
)

In [17]:
for cat_col_name in ["CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE"]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_bureau_{cat_col_name}
    ON bureau ({cat_col_name})
    """,
    )

for cat_col_name in ["OCCUPATION_TYPE", "ORGANIZATION_TYPE"]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_application_train_{cat_col_name}
    ON application_train ({cat_col_name})
    """,
    )

for cat_col_name in [
    "NAME_CASH_LOAN_PURPOSE",
    "NAME_GOODS_CATEGORY",
    "PRODUCT_COMBINATION",
]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_previous_application_{cat_col_name}
    ON previous_application ({cat_col_name})
    """,
    )

In [18]:
for table_name in ["pos_cash_balance", "installments_payments", "credit_card_balance"]:
    for col_name in ["main_split_id", "sk_id_prev", "sk_id_curr"]:
        run_sql_qeury(
            engine=engine,
            sql_query=f"""
CREATE INDEX idx_{table_name}_main_split_id
ON bureau (MAIN_SPLIT_ID);
""",
        )
        print(table_name, " - ", col_name, " - Done")

pos_cash_balance  -  main_split_id  - Done
pos_cash_balance  -  sk_id_prev  - Done
pos_cash_balance  -  sk_id_curr  - Done
installments_payments  -  main_split_id  - Done
installments_payments  -  sk_id_prev  - Done
installments_payments  -  sk_id_curr  - Done
credit_card_balance  -  main_split_id  - Done
credit_card_balance  -  sk_id_prev  - Done
credit_card_balance  -  sk_id_curr  - Done


### Read via PySpark

In [None]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = (
    SparkSession.builder.appName("PostgresETL")
    .config("spark.jars", "setup_files/postgresql-42.7.5.jar")
    .getOrCreate()
)


# spark = SparkSession.builder \
#     .appName("PostgresETL") \
#     .master("local[*]") \
#     .config("spark.jars", "setup_files/postgresql-42.7.5.jar") \
#     .getOrCreate()

In [None]:
# https://chatgpt.com/c/67d1c41a-d580-800c-9df4-60e5b5205bf4

In [None]:
# Database connection properties

username = "data_source_user"
password = "data_source_user_password"
host = "172.17.0.1"
port = "5435"
database = "data_source_db"


jdbc_url = f"jdbc:postgresql://{host}:{port}/{database}"
properties = {"user": username, "password": password, "driver": "org.postgresql.Driver"}

In [None]:
# Step 1: Extract Data
df = spark.read.jdbc(url=jdbc_url, table="application_train", properties=properties)
df

In [None]:
df.head()