# Install packages

In [1]:
!pip install /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl --force-reinstall

Processing /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl
Installing collected packages: kaggle-home-credit-risk-model-stability
Successfully installed kaggle-home-credit-risk-model-stability-0.3


In [2]:
!pip install /kaggle/input/polars/polars-0.20.15-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/polars/polars-0.20.15-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.20.3
    Uninstalling polars-0.20.3:
      Successfully uninstalled polars-0.20.3
Successfully installed polars-0.20.15


# Import packages

In [3]:
import polars as pl
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import copy
import lightgbm as lgb
import importlib
import pickle
import argparse

# %load_ext autoreload
# %autoreload 2

import kaggle_home_credit_risk_model_stability.libs as hcr
from kaggle_home_credit_risk_model_stability.libs.env import Env
from kaggle_home_credit_risk_model_stability.libs.input.dataset import Dataset
from kaggle_home_credit_risk_model_stability.libs.input.data_loader import DataLoader
from kaggle_home_credit_risk_model_stability.libs.preprocessor.preprocessor import Preprocessor
from kaggle_home_credit_risk_model_stability.libs.preprocessor.steps import *
from kaggle_home_credit_risk_model_stability.libs.preprocessor.columns_info import ColumnsInfo
from kaggle_home_credit_risk_model_stability.libs.feature_description.feature_description import FreatureDescriptionGetter

from collections import defaultdict
from glob import glob
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin

In [4]:
# Directory roots handed to Env. Later cells write their parquet / pickle
# artifacts under env.output_directory.
# NOTE(review): presumably the first argument is the input root and the second
# the output root - confirm against the Env constructor.
INPUT_ROOT = "/kaggle/input/"
OUTPUT_ROOT = "/kaggle/working/"

env = Env(INPUT_ROOT, OUTPUT_ROOT)

In [5]:
# Tables handed to the DataLoader, in the order they are listed. Names that
# appear only inside comments are currently switched off.
SOURCE_TABLES = [
    "base",
    "static_cb_0",
    "static_0",
    "person_1",
    "tax_registry_a_1",
    "tax_registry_b_1",
    "tax_registry_c_1",
    "credit_bureau_a_2",
    "credit_bureau_a_1",
    "applprev_1",
    #"debitcard_1", "other_1", "deposit_1"
    #"credit_bureau_b_1", "credit_bureau_b_2", <- low amount of data
    #"applprev_2", "person_2"
]

data_loader = DataLoader(env, tables=SOURCE_TABLES)

# Period labels shared by the credit-duration interval keys and by the
# per-institution splits below, which address tables named
# "<period>_active_credit_bureau_a_1" / "<period>_close_credit_bureau_a_1".
# NOTE(review): presumably SplitTableByCreditDurationStep names its output
# tables after the interval keys - keep both in sync.
CREDIT_DURATION_PERIODS = ["short", "medium", "medium_long", "long"]

# Preprocessing pipeline: an ordered mapping of step name -> step object. The
# run log prints "Step: <name>" for each entry in this order.
#
# Every key must be unique. A dict literal keeps a single entry per key (the
# last value wins, at the position of the first occurrence), so a repeated key
# silently removes a step from the pipeline.
preprocessor = Preprocessor({
    #"sort_raw_tables": SortRawTablesStep(),
    "set_column_info_step": SetColumnsInfoStep(),
    "set_types": SetTypesStep(),
    "drop_composite_features": DropCompositeFeaturesStep(),
    "create_day_feature": CreateDayFeatureStep(),
    "process_categorical": ProcessCategoricalStep(),
    #"drop_raw_single_value_columns": DropRawSingleValueColumnsStep(),
    "process_person_table": ProcessPersonTableStep(),
#    "process_applprev_table": ProcessApplprevTableStep(),
    "process_static_0_table": ProcessStatic0TableStep(),
    "process_tax_regestry_a1_table": ProcessTaxRegestryA1TableStep(),
    "process_tax_regestry_b1_table": ProcessTaxRegestryB1TableStep(),
    "process_tax_regestry_c1_table": ProcessTaxRegestryC1TableStep(),
    # "process_credit_burea_1_table": ProcessCreditBureaua1TableStep(
    #     config = {
    #         "active": {
    #             "mask_column": "dateofcredstart_739D",
    #             "financialinstitution_column": "financialinstitution_591M",
    #             "columns": [
    #                 #'annualeffectiverate_63L', 'dpdmax_139P', 'nominalrate_281L', 'numberofoutstandinstls_59L', 'numberofoverdueinstlmaxdat_641D', 'numberofoverdueinstls_725L', 'prolongationcount_599L', 
    #                 'contractsum_5085717L', 'credlmt_935A', 'dateofcredend_289D', 'dateofcredstart_739D', 'dpdmaxdatemonth_89T', 'instlamount_768A', 'lastupdate_1112D', 'monthlyinstlamount_332A', 'numberofinstls_320L', 'numberofoverdueinstlmax_1039L', 'outstandingamount_362A', 'overdueamount_659A', 'overdueamountmax2_14A', 'overdueamountmax2date_1142D', 'overdueamountmax_155A', 'overdueamountmaxdatemonth_365T', 'periodicityofpmts_837L', 'purposeofcred_426M', 'residualamount_856A', 'subjectrole_182M', 'totalamount_996A'
    #             ]
    #         },
    #         "close": {
    #             "mask_column": "dateofcredstart_181D",
    #             "financialinstitution_column": "financialinstitution_382M",
    #             "columns": [
    #                 # 'interestrate_508L', 'annualeffectiverate_199L', 'dateofrealrepmt_138D', 'lastupdate_388D', 'numberofoverdueinstlmaxdat_148D', 'prolongationcount_1120L',
    #                 'credlmt_230A', 'dateofcredend_353D', 'dateofcredstart_181D', 'dpdmax_757P', 'dpdmaxdatemonth_442T', 'instlamount_852A', 'monthlyinstlamount_674A', 'nominalrate_498L', 'numberofinstls_229L', 'numberofoutstandinstls_520L', 'numberofoverdueinstlmax_1151L', 'numberofoverdueinstls_834L', 'outstandingamount_354A', 'overdueamount_31A', 'overdueamountmax2_398A', 'overdueamountmax2date_1002D', 'overdueamountmax_35A', 'overdueamountmaxdatemonth_284T', 'periodicityofpmts_1102L', 'purposeofcred_874M', 'residualamount_488A', 'totalamount_6A', 'subjectrole_93M'
    #             ]
    #         }
    #     },
    #     finantial_institutions = {
    #         "active": ['Home Credit', 'P150_136_157'],
    #         "close": ['P150_136_157', 'P133_127_114']
    #     }
    # ),
    "split_active_close_credit_burea_1_table": SplitActiveCloseCreditBureaua1TableStep(),
    "split_active_credit_bureau_a_1_by_credit_duration": SplitTableByCreditDurationStep(
        table_name = "active_credit_bureau_a_1",
        intervals = {
            "short": [0, 135],
            "medium": [135, 270],
            "medium_long": [270, 450],
            "long": [450, 100000],
        }
    ),
    "split_close_credit_bureau_a_1_by_credit_duration": SplitTableByCreditDurationStep(
        table_name = "close_credit_bureau_a_1",
        intervals = {
            "short": [0, 135],
            "medium": [135, 270],
            "medium_long": [270, 450],
            "long": [450, 100000],
        }
    ),
    # One categorical split per credit-duration period, for active credits...
    **{
        f"split_{period}_active_credit_bureau_a_1_step": SplitTableByCategoricalFeatureStep(f"{period}_active_credit_bureau_a_1", "financialinstitution_591M", [["Home Credit", "P150_136_157"]])
        for period in CREDIT_DURATION_PERIODS
    },
    # ...and for closed credits.
    **{
        f"split_{period}_close_credit_bureau_a_1_step": SplitTableByCategoricalFeatureStep(f"{period}_close_credit_bureau_a_1", "financialinstitution_382M", [["P150_136_157", "P133_127_114"]])
        for period in CREDIT_DURATION_PERIODS
    },
    "process_credit_burea_2_table": ProcessCreditBureaua2TableStep(),
    #"pairwise_diff_raw_dates": PairwiseDateDiffStep(),
    "aggregate_depth_table": AggregateDepthTableStep(),
    "join_table": JoinTablesStep(),
    "merge_chunked_table": MergeChunkedTablesStep(),
    "drop_almost_null_features": DropAlmostNullFeaturesStep(0.99),
    #"generate_age_feature": GenerateAgeFeatureStep(),
    "generate_base_date_diff": GenerateBaseDateDiffStep(base_column="date_decision"),
    "fill_nulls_in_categorical_features": FillNullsInCategoricalFeaturesStep(),
    "reduce_dimention_for_categorical_features": ReduceDimentionForCategoricalFeaturesStep(),
    "reduce_memory_usage": ReduceMemoryUsageStep(),
    **{
        f"create_money_feature_fraction_{base_column}": CreateMoneyFeatureFractionStep(base_column)
        for base_column in ["credamount_770A", "mainoccupationinc_384A"] # "maininc_215A", "annuity_780A"] # mean_amount_416A
    },
    "drop_single_value_features": DropSingleValueFeaturesStep(),
    "drop_variable_enum_features": DropVariableEnumFeaturesStep(),
    #"generate_mismatch_features": GenerateMismatchFeaturesStep(),
    # **{
    #     f"generate_anomaly_feature_{use_w}_{quantile}_{threashold}": GenerateAnomalyFeatureStep(quantile=quantile, threashold=threashold)
    #     for quantile in [0.99, 0.97, 0.95, 0.9, 0.8, 0.7]
    #     for threashold in [3, 2, 1.7, 1.5, 1.3]
    #     for use_w in [True, False]
    # },
    # Final memory-reduction pass over the features created after the first
    # one. It needs its own key: reusing "reduce_memory_usage" collapsed both
    # entries into one, so this trailing pass never ran.
    "reduce_memory_usage_final": ReduceMemoryUsageStep()
})

In [6]:
# Load the training tables in chunks, run them through the preprocessing
# pipeline, then drop the chunk source so only the processed frame and its
# columns_info stay referenced.
# NOTE(review): chunk_size is presumably a row/case count per chunk - confirm
# against DataLoader.load_train_dataset.
train_chunks = data_loader.load_train_dataset(chunk_size=100_000)
gc.collect()

train_df, columns_info = preprocessor.process_train_dataset(train_chunks)

del train_chunks
gc.collect()

Step: set_column_info_step
Step: set_types
Step: drop_composite_features
Step: create_day_feature
Step: process_categorical
Step: process_person_table
Step: process_static_0_table
Step: process_tax_regestry_a1_table
Step: process_tax_regestry_b1_table
Step: process_tax_regestry_c1_table
Step: split_active_close_credit_burea_1_table
Step: split_active_credit_bureau_a_1_by_credit_duration
Step: split_close_credit_bureau_a_1_by_credit_duration
Step: split_short_active_credit_bureau_a_1_step
Step: split_medium_active_credit_bureau_a_1_step
Step: split_medium_long_active_credit_bureau_a_1_step
Step: split_long_active_credit_bureau_a_1_step
Step: split_short_close_credit_bureau_a_1_step
Step: split_medium_close_credit_bureau_a_1_step
Step: split_medium_long_close_credit_bureau_a_1_step
Step: split_long_close_credit_bureau_a_1_step
Step: process_credit_burea_2_table
Step: aggregate_depth_table
Step: join_table
Step: merge_chunked_table
Step: drop_almost_null_features
Step: generate_base_date_

0

In [7]:
# Persist the processed training frame and its columns_info, report the
# frame's estimated in-memory size (bytes / 1024 / 1024), then release it.
train_df.write_parquet(env.output_directory / "train_df.parquet")

# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection of the handle.
with open(env.output_directory / "columns_info.pkl", "wb") as columns_info_file:
    pickle.dump(columns_info, columns_info_file)

print(train_df.estimated_size() / 1024 / 1024)
print(train_df)

del train_df
gc.collect()

9885.453375816345
shape: (1_526_659, 1_773)
┌────────┬──────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ MONTH  ┆ WEEK_NUM ┆ actualdpdt ┆ amtinstpai ┆ … ┆ sumoutstan ┆ totaldebt ┆ totalsett ┆ totinstal │
│ ---    ┆ ---      ┆ olerance_3 ┆ dbefduel24 ┆   ┆ dtotalest_ ┆ _9A/maino ┆ led_863A/ ┆ last1m_45 │
│ i32    ┆ i16      ┆ 44P        ┆ m_4187115A ┆   ┆ 4493215A/m ┆ ccupation ┆ mainoccup ┆ 25188A/ma │
│        ┆          ┆ ---        ┆ ---        ┆   ┆ ai…        ┆ inc_3…    ┆ ation…    ┆ inocc…    │
│        ┆          ┆ f32        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│        ┆          ┆            ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆ f32       │
╞════════╪══════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 201901 ┆ 0        ┆ null       ┆ null       ┆ … ┆ null       ┆ 0.0       ┆ 0.0       ┆ null      │
│ 201901 ┆ 0        ┆ null       ┆ null       ┆

0

In [8]:
# Same flow as for the training data, applied to the test tables: load in
# chunks, preprocess, then drop the chunk source.
# Note that this rebinds columns_info to the value returned for the test run.
test_chunks = data_loader.load_test_dataset(chunk_size=100_000)
gc.collect()

test_df, columns_info = preprocessor.process_test_dataset(test_chunks)

del test_chunks
gc.collect()

Step: set_column_info_step
Step: set_types
Step: drop_composite_features
Step: create_day_feature
Step: process_categorical
Step: process_person_table
Step: process_static_0_table
Step: process_tax_regestry_a1_table
Step: process_tax_regestry_b1_table
Step: process_tax_regestry_c1_table
Step: split_active_close_credit_burea_1_table
Step: split_active_credit_bureau_a_1_by_credit_duration
Step: split_close_credit_bureau_a_1_by_credit_duration
Step: split_short_active_credit_bureau_a_1_step
Step: split_medium_active_credit_bureau_a_1_step
Step: split_medium_long_active_credit_bureau_a_1_step
Step: split_long_active_credit_bureau_a_1_step
Step: split_short_close_credit_bureau_a_1_step
Step: split_medium_close_credit_bureau_a_1_step
Step: split_medium_long_close_credit_bureau_a_1_step
Step: split_long_close_credit_bureau_a_1_step
Step: process_credit_burea_2_table
Step: aggregate_depth_table
Step: join_table
Step: merge_chunked_table
Step: drop_almost_null_features
Step: generate_base_date_

0

In [9]:
# Write the processed test frame to the output directory, report its
# estimated in-memory size (bytes / 1024 / 1024), then release it.
test_parquet_path = env.output_directory / "test_df.parquet"
test_df.write_parquet(test_parquet_path)

test_size_mib = test_df.estimated_size() / 1024 / 1024
print(test_size_mib)
print(test_df)

del test_df
gc.collect()

0.08484935760498047
shape: (10, 1_772)
┌────────┬──────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ MONTH  ┆ WEEK_NUM ┆ actualdpdt ┆ amtinstpai ┆ … ┆ sumoutstan ┆ totaldebt ┆ totalsett ┆ totinstal │
│ ---    ┆ ---      ┆ olerance_3 ┆ dbefduel24 ┆   ┆ dtotalest_ ┆ _9A/maino ┆ led_863A/ ┆ last1m_45 │
│ i32    ┆ i16      ┆ 44P        ┆ m_4187115A ┆   ┆ 4493215A/m ┆ ccupation ┆ mainoccup ┆ 25188A/ma │
│        ┆          ┆ ---        ┆ ---        ┆   ┆ ai…        ┆ inc_3…    ┆ ation…    ┆ inocc…    │
│        ┆          ┆ f32        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│        ┆          ┆            ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆ f32       │
╞════════╪══════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 202201 ┆ 100      ┆ 0.0        ┆ 191767.359 ┆ … ┆ 0.357482   ┆ 0.357482  ┆ 13.41268  ┆ 0.525282  │
│        ┆          ┆            ┆ 375        ┆   ┆ 

0