In [1]:
from dotenv import load_dotenv
import os
from pathlib import Path

CURRENT_DIRECTORY_NOTEBOOK = None


def intitate_notebook():
    load_dotenv()
    global CURRENT_DIRECTORY_NOTEBOOK
    if CURRENT_DIRECTORY_NOTEBOOK is None:
        os.chdir(os.getenv("PROJECT_BASE_PATH"))
        CURRENT_DIRECTORY_NOTEBOOK = Path(os.getcwd())
        print("Current directory for notebook: ", CURRENT_DIRECTORY_NOTEBOOK)
    else:
        print(
            "Current directory for notebook is already set: ",
            CURRENT_DIRECTORY_NOTEBOOK,
        )


intitate_notebook()

Current directory for notebook:  /workspace


In [2]:
# import shutil
# import os

# def extract_zip(zip_file_path, extract_to_folder):
#     # Check if the provided folder exists, if not, create it
#     if not os.path.exists(extract_to_folder):
#         os.makedirs(extract_to_folder)

#     # Extract the .zip file to the target folder
#     shutil.unpack_archive(zip_file_path, extract_to_folder)
#     print(f"Files extracted to {extract_to_folder}")

# extract_zip(zip_file_path="raw_dataset/home-credit-default-risk.zip", extract_to_folder="raw_dataset/raw_tables")

In [3]:
import pandas as pd
from sqlalchemy import create_engine

In [4]:
import re


def to_snake_case(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    return "_".join(text.lower().split())

### Table - application_train

In [5]:
application_train_df = pd.read_csv("raw_dataset/raw_tables/application_train.csv")
application_train_df

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
307507,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
307508,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
import pandas as pd
import numpy as np


def add_partition_id_column(
    df: pd.DataFrame, n_splits: int = 12, diff_count: int = 5000
):
    df = df.sort_values(by="SK_ID_CURR", ascending=True)
    unique_ids_count = df["SK_ID_CURR"].nunique()
    print("rows_count: ", df.shape[0])
    print("unique_ids_count: ", unique_ids_count)
    if unique_ids_count != df.shape[0]:
        raise ValueError("ID column is not unique")

    split_sizes = []
    for split_size in np.linspace(0, unique_ids_count, n_splits + 1, dtype=int):
        if split_size in (0, unique_ids_count):
            split_sizes.append(int(split_size))
        else:
            split_sizes.append(
                np.random.randint(
                    split_size - int(diff_count / 2), split_size + int(diff_count / 2)
                )
            )

    split_sizes_dict = {}
    for i in range(n_splits):
        split_sizes_dict[i + 1] = (
            split_sizes[i],
            split_sizes[i + 1],
        )

    df["MAIN_SPLIT_ID"] = None

    for partition_id, (start, end) in split_sizes_dict.items():
        df.loc[start:end, "MAIN_SPLIT_ID"] = partition_id

    column_names = df.columns
    df = df[["MAIN_SPLIT_ID"] + column_names.drop(labels=["MAIN_SPLIT_ID"]).to_list()]

    return df

In [7]:
application_train_df = add_partition_id_column(application_train_df)
application_train_df.head()

rows_count:  307511
unique_ids_count:  307511


Unnamed: 0,MAIN_SPLIT_ID,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,...,0,0,0,0,,,,,,
4,1,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
application_train_df["MAIN_SPLIT_ID"].value_counts()

MAIN_SPLIT_ID
7     29657
10    27910
12    26517
8     26049
1     26044
4     25841
6     25651
3     25595
2     25086
11    23972
5     23335
9     21854
Name: count, dtype: int64

In [9]:
# Postgres DB Credentials
username = "data_source_user"
password = "data_source_user_password"
host = "172.17.0.1"
port = "5435"
database = "data_source_db"

connection_uri = f"postgresql://{username}:{password}@{host}:{port}/{database}"
print("Connection URI: ", connection_uri)

# MySQL - SQLAlchemy engine
engine = create_engine(connection_uri)
engine.connect()

Connection URI:  postgresql://data_source_user:data_source_user_password@172.17.0.1:5435/data_source_db


<sqlalchemy.engine.base.Connection at 0xffff55a54d70>

In [10]:
application_train_df_column_rename_dict = {}
for item in application_train_df.columns:
    application_train_df_column_rename_dict[item] = to_snake_case(item)

application_train_df.rename(
    columns=application_train_df_column_rename_dict, inplace=True
)

In [11]:
application_train_df.to_sql(
    name="application_train",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 8,
    # method="multi",
)

9121

In [12]:
application_train_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/application_train.parquet",
    index=False,
)

### Table - bureau

In [13]:
bureau_df = pd.read_csv("raw_dataset/raw_tables/bureau.csv")
bureau_df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [14]:
bureau_df_column_rename_dict = {}
for item in bureau_df.columns:
    bureau_df_column_rename_dict[item] = to_snake_case(item)

bureau_df.rename(columns=bureau_df_column_rename_dict, inplace=True)

In [15]:
bureau_df = pd.merge(
    application_train_df[["main_split_id", "sk_id_curr"]],
    bureau_df,
    how="inner",
    on="sk_id_curr",
)

bureau_df.sk_id_bureau.isna().sum(), bureau_df.shape

(np.int64(0), (1465325, 18))

In [16]:
bureau_df.head()

Unnamed: 0,main_split_id,sk_id_curr,sk_id_bureau,credit_active,credit_currency,days_credit,credit_day_overdue,days_credit_enddate,days_enddate_fact,amt_credit_max_overdue,cnt_credit_prolong,amt_credit_sum,amt_credit_sum_debt,amt_credit_sum_limit,amt_credit_sum_overdue,credit_type,days_credit_update,amt_annuity
0,1,100002,6158904,Closed,currency 1,-1125,0,-1038.0,-1038.0,,0,40761.0,,,0.0,Credit card,-1038,0.0
1,1,100002,6158905,Closed,currency 1,-476,0,,-48.0,,0,0.0,0.0,,0.0,Credit card,-47,
2,1,100002,6158906,Closed,currency 1,-1437,0,-1072.0,-1185.0,0.0,0,135000.0,0.0,0.0,0.0,Consumer credit,-1185,0.0
3,1,100002,6158907,Closed,currency 1,-1121,0,-911.0,-911.0,3321.0,0,19071.0,,,0.0,Consumer credit,-906,0.0
4,1,100002,6158908,Closed,currency 1,-645,0,85.0,-36.0,5043.645,0,120735.0,0.0,0.0,0.0,Consumer credit,-34,0.0


In [17]:
bureau_df.to_sql(
    name="bureau",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 32,
    # method="multi",
)

34325

In [18]:
bureau_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/bureau.parquet",
    index=False,
)

### Table - previous_application

In [19]:
previous_application_df = pd.read_csv("raw_dataset/raw_tables/previous_application.csv")
previous_application_df

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1670209,2300464,352015,Consumer loans,14704.290,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,...,Furniture,30.0,low_normal,POS industry with interest,365243.0,-508.0,362.0,-358.0,-351.0,0.0
1670210,2357031,334635,Consumer loans,6622.020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,...,Furniture,12.0,middle,POS industry with interest,365243.0,-1604.0,-1274.0,-1304.0,-1297.0,0.0
1670211,2659632,249544,Consumer loans,11520.855,105237.0,102523.5,10525.5,105237.0,MONDAY,12,...,Consumer electronics,10.0,low_normal,POS household with interest,365243.0,-1457.0,-1187.0,-1187.0,-1181.0,0.0
1670212,2785582,400317,Cash loans,18821.520,180000.0,191880.0,,180000.0,WEDNESDAY,9,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-1155.0,-825.0,-825.0,-817.0,1.0


In [20]:
previous_application_df_column_rename_dict = {}
for item in previous_application_df.columns:
    previous_application_df_column_rename_dict[item] = to_snake_case(item)

previous_application_df.rename(
    columns=previous_application_df_column_rename_dict, inplace=True
)

In [21]:
previous_application_df = pd.merge(
    application_train_df[["main_split_id", "sk_id_curr"]],
    previous_application_df,
    how="inner",
    on="sk_id_curr",
)

previous_application_df.sk_id_prev.isna().sum(), previous_application_df.shape

(np.int64(0), (1413701, 38))

In [22]:
previous_application_df.head()

Unnamed: 0,main_split_id,sk_id_curr,sk_id_prev,name_contract_type,amt_annuity,amt_application,amt_credit,amt_down_payment,amt_goods_price,weekday_appr_process_start,...,name_seller_industry,cnt_payment,name_yield_group,product_combination,days_first_drawing,days_first_due,days_last_due_1st_version,days_last_due,days_termination,nflag_insured_on_approval
0,1,100002,1038818,Consumer loans,9251.775,179055.0,179055.0,0.0,179055.0,SATURDAY,...,Auto technology,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0
1,1,100003,1810518,Cash loans,98356.995,900000.0,1035882.0,,900000.0,FRIDAY,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
2,1,100003,2636178,Consumer loans,64567.665,337500.0,348637.5,0.0,337500.0,SUNDAY,...,Furniture,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
3,1,100003,2396755,Consumer loans,6737.31,68809.5,68053.5,6885.0,68809.5,SATURDAY,...,Consumer electronics,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0
4,1,100004,1564014,Consumer loans,5357.25,24282.0,20106.0,4860.0,24282.0,FRIDAY,...,Connectivity,4.0,middle,POS mobile without interest,365243.0,-784.0,-694.0,-724.0,-714.0,0.0


In [23]:
previous_application_df.to_sql(
    name="previous_application",
    con=engine,
    index=False,
    if_exists="replace",
    chunksize=1024 * 32,
    # method="multi",
)

4161

In [24]:
previous_application_df.to_parquet(
    path="raw_dataset/merged_main_splited_data/previous_application.parquet",
    index=False,
)

### Creating Indexes


In [25]:
from sqlalchemy import text


def run_sql_qeury(engine, sql_query):
    with engine.connect() as connection:
        connection.execute(text(sql_query))

In [26]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_application_train_main_split_id
ON application_train (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_application_train_sk_id_curr
ON application_train (SK_ID_CURR);
""",
)

In [27]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_main_split_id
ON previous_application (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_sk_id_curr
ON previous_application (SK_ID_CURR);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_previous_application_sk_id_prev
ON previous_application (SK_ID_PREV);
""",
)

In [None]:
run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_main_split_id
ON bureau (MAIN_SPLIT_ID);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_sk_id_curr
ON bureau (SK_ID_CURR);
""",
)

run_sql_qeury(
    engine=engine,
    sql_query="""
CREATE INDEX idx_bureau_sk_id_bureau
ON bureau (sk_id_bureau);
""",
)

In [31]:
for cat_col_name in ["CREDIT_ACTIVE", "CREDIT_CURRENCY", "CREDIT_TYPE"]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_bureau_{cat_col_name}
    ON bureau ({cat_col_name})
    """,
    )

for cat_col_name in ["OCCUPATION_TYPE", "ORGANIZATION_TYPE"]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_application_train_{cat_col_name}
    ON application_train ({cat_col_name})
    """,
    )

for cat_col_name in [
    "NAME_CASH_LOAN_PURPOSE",
    "NAME_GOODS_CATEGORY",
    "PRODUCT_COMBINATION",
]:
    run_sql_qeury(
        engine=engine,
        sql_query=f"""
    CREATE INDEX idx_previous_application_{cat_col_name}
    ON previous_application ({cat_col_name})
    """,
    )

### Read via PySpark

In [32]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = (
    SparkSession.builder.appName("PostgresETL")
    .config("spark.jars", "setup_files/postgresql-42.7.5.jar")
    .getOrCreate()
)


# spark = SparkSession.builder \
#     .appName("PostgresETL") \
#     .master("local[*]") \
#     .config("spark.jars", "setup_files/postgresql-42.7.5.jar") \
#     .getOrCreate()

25/05/03 05:46:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [33]:
# https://chatgpt.com/c/67d1c41a-d580-800c-9df4-60e5b5205bf4

In [34]:
# Database connection properties

username = "data_source_user"
password = "data_source_user_password"
host = "172.17.0.1"
port = "5435"
database = "data_source_db"


jdbc_url = f"jdbc:postgresql://{host}:{port}/{database}"
properties = {"user": username, "password": password, "driver": "org.postgresql.Driver"}

In [35]:
# Step 1: Extract Data
df = spark.read.jdbc(url=jdbc_url, table="application_train", properties=properties)
df

DataFrame[main_split_id: bigint, sk_id_curr: bigint, target: bigint, name_contract_type: string, code_gender: string, flag_own_car: string, flag_own_realty: string, cnt_children: bigint, amt_income_total: double, amt_credit: double, amt_annuity: double, amt_goods_price: double, name_type_suite: string, name_income_type: string, name_education_type: string, name_family_status: string, name_housing_type: string, region_population_relative: double, days_birth: bigint, days_employed: bigint, days_registration: double, days_id_publish: bigint, own_car_age: double, flag_mobil: bigint, flag_emp_phone: bigint, flag_work_phone: bigint, flag_cont_mobile: bigint, flag_phone: bigint, flag_email: bigint, occupation_type: string, cnt_fam_members: double, region_rating_client: bigint, region_rating_client_w_city: bigint, weekday_appr_process_start: string, hour_appr_process_start: bigint, reg_region_not_live_region: bigint, reg_region_not_work_region: bigint, live_region_not_work_region: bigint, reg_

In [36]:
df.head()

25/05/03 05:46:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Row(main_split_id=1, sk_id_curr=100002, target=1, name_contract_type='Cash loans', code_gender='M', flag_own_car='N', flag_own_realty='Y', cnt_children=0, amt_income_total=202500.0, amt_credit=406597.5, amt_annuity=24700.5, amt_goods_price=351000.0, name_type_suite='Unaccompanied', name_income_type='Working', name_education_type='Secondary / secondary special', name_family_status='Single / not married', name_housing_type='House / apartment', region_population_relative=0.018801, days_birth=-9461, days_employed=-637, days_registration=-3648.0, days_id_publish=-2120, own_car_age=None, flag_mobil=1, flag_emp_phone=1, flag_work_phone=0, flag_cont_mobile=1, flag_phone=1, flag_email=0, occupation_type='Laborers', cnt_fam_members=1.0, region_rating_client=2, region_rating_client_w_city=2, weekday_appr_process_start='WEDNESDAY', hour_appr_process_start=10, reg_region_not_live_region=0, reg_region_not_work_region=0, live_region_not_work_region=0, reg_city_not_live_city=0, reg_city_not_work_city=