# Install packages

In [1]:
!pip install /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl --force-reinstall

Processing /kaggle/input/kaggle-home-credit-risk-model-stability-lib/kaggle_home_credit_risk_model_stability-0.3-py3-none-any.whl
Installing collected packages: kaggle-home-credit-risk-model-stability
Successfully installed kaggle-home-credit-risk-model-stability-0.3


In [2]:
!pip install /kaggle/input/polars/polars-0.20.15-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/polars/polars-0.20.15-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.20.3
    Uninstalling polars-0.20.3:
      Successfully uninstalled polars-0.20.3
Successfully installed polars-0.20.15


# Import packages

In [3]:
import polars as pl
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import copy
import lightgbm as lgb
import importlib
import pickle
import argparse

# %load_ext autoreload
# %autoreload 2

import kaggle_home_credit_risk_model_stability.libs as hcr
from kaggle_home_credit_risk_model_stability.libs.env import Env
from kaggle_home_credit_risk_model_stability.libs.input.dataset import Dataset
from kaggle_home_credit_risk_model_stability.libs.input.data_loader import DataLoader
from kaggle_home_credit_risk_model_stability.libs.preprocessor.preprocessor import Preprocessor
from kaggle_home_credit_risk_model_stability.libs.preprocessor.steps import *
from kaggle_home_credit_risk_model_stability.libs.preprocessor.columns_info import ColumnsInfo
from kaggle_home_credit_risk_model_stability.libs.feature_description.feature_description import FreatureDescriptionGetter

from collections import defaultdict
from glob import glob
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin

In [4]:
# Shared environment handle used by the data loader and for writing artefacts below.
# NOTE(review): the two positional arguments are presumably (input root, output root);
# confirm against the Env constructor.
env = Env("/kaggle/input/", "/kaggle/working/")

In [5]:
# Loader for the raw competition tables. The tables listed in the trailing comment are
# deliberately excluded (noted by the author as having a low amount of data).
data_loader = DataLoader(env, tables = [
    "base", "static_cb_0", "static_0", "person_1", "tax_registry_a_1", "tax_registry_b_1", "tax_registry_c_1",
    "credit_bureau_a_2",
    "credit_bureau_a_1",
    "applprev_1",
    #"credit_bureau_b_1", "credit_bureau_b_2", "other_1", "deposit_1", "debitcard_1" <- low amount of data
])

# Preprocessing pipeline: a mapping of step name -> step object.
# NOTE(review): steps are presumably executed in dict insertion order; confirm in Preprocessor.
# Every key must be unique: in a dict literal a repeated key keeps its FIRST position and takes
# the LAST value, so a duplicated step name silently removes one of the two steps.
# Commented-out entries are disabled experiments kept for reference.
preprocessor = Preprocessor({
    #"sort_raw_tables": SortRawTablesStep(),
    "set_column_info_step": SetColumnsInfoStep(),
    "set_types": SetTypesStep(),
    "drop_composite_features": DropCompositeFeaturesStep(),
    "create_day_feature": CreateDayFeatureStep(),
    # #"drop_raw_null_columns": DropRawNullColumns(feature_threashold=0.2, week_threashold=0.9),
    "process_categorical": ProcessCategoricalStep(),
    "process_person_table": ProcessPersonTableStep(),
    "process_applprev_table": ProcessApplprevTableStep(),
    "process_static_0_table": ProcessStatic0TableStep(),
    "process_tax_regestry_a1_table": ProcessTaxRegestryA1TableStep(),
    "process_tax_regestry_b1_table": ProcessTaxRegestryB1TableStep(),
    "process_tax_regestry_c1_table": ProcessTaxRegestryC1TableStep(),
#     "process_credit_burea_1_table": ProcessCreditBureaua1TableStep(
#         config = {
#             "active": {
#                 "mask_column": "dateofcredstart_739D",
#                 "financialinstitution_column": "financialinstitution_591M",
#                 "columns": [
#                     #'annualeffectiverate_63L', 'dpdmax_139P', 'nominalrate_281L', 'numberofoutstandinstls_59L', 'numberofoverdueinstlmaxdat_641D', 'numberofoverdueinstls_725L', 'prolongationcount_599L', 
#                     'contractsum_5085717L', 'credlmt_935A', 'dateofcredend_289D', 'dateofcredstart_739D', 'dpdmaxdatemonth_89T', 'instlamount_768A', 'lastupdate_1112D', 'monthlyinstlamount_332A', 'numberofinstls_320L', 'numberofoverdueinstlmax_1039L', 'outstandingamount_362A', 'overdueamount_659A', 'overdueamountmax2_14A', 'overdueamountmax2date_1142D', 'overdueamountmax_155A', 'overdueamountmaxdatemonth_365T', 'periodicityofpmts_837L', 'purposeofcred_426M', 'residualamount_856A', 'subjectrole_182M', 'totalamount_996A'
#                 ]
#             },
#             "close": {
#                 "mask_column": "dateofcredstart_181D",
#                 "financialinstitution_column": "financialinstitution_382M",
#                 "columns": [
#                     # 'interestrate_508L', 'annualeffectiverate_199L', 'dateofrealrepmt_138D', 'lastupdate_388D', 'numberofoverdueinstlmaxdat_148D', 'prolongationcount_1120L',
#                     'credlmt_230A', 'dateofcredend_353D', 'dateofcredstart_181D', 'dpdmax_757P', 'dpdmaxdatemonth_442T', 'instlamount_852A', 'monthlyinstlamount_674A', 'nominalrate_498L', 'numberofinstls_229L', 'numberofoutstandinstls_520L', 'numberofoverdueinstlmax_1151L', 'numberofoverdueinstls_834L', 'outstandingamount_354A', 'overdueamount_31A', 'overdueamountmax2_398A', 'overdueamountmax2date_1002D', 'overdueamountmax_35A', 'overdueamountmaxdatemonth_284T', 'periodicityofpmts_1102L', 'purposeofcred_874M', 'residualamount_488A', 'totalamount_6A', 'subjectrole_93M'
#                 ]
#             }
#         },
#         finantial_institutions = {
#             "active": ['Home Credit', 'P204_66_73'],
#             "close": ['P133_127_114', 'Home Credit']
#         }
#     ),
    "process_credit_burea_2_table": ProcessCreditBureaua2TableStep(),
    #"split_composite_features": SplitCompositeFeaturesStep(exclude_list="previouscontdistrict_112M", drop_original=True),
    #"reduce_dimention_for_categorical_features": ReduceDimentionForCategoricalFeaturesStep(),
    #"one_hot_encoding": OneHotEncodingStep(),
    #"pairwise_diff_raw_dates": PairwiseDateDiffStep(),
    "aggregate_depth_table": AggregateDepthTableStep(),
    #"drop_equal_columns": DropEqualColumnsStep(),
    #"drop_almost_null_features_respect_to_target": DropAlmostNullFeaturesWithRespectToTargetStep(),
    "join_table": JoinTablesStep(),
    "merge_chunked_table": MergeChunkedTablesStep(),
    "drop_almost_null_features": DropAlmostNullFeaturesStep(),
    #"generate_age_feature": GenerateAgeFeatureStep(),
    "generate_base_date_diff": GenerateBaseDateDiffStep(base_column="date_decision"),
    "fill_nulls_in_categorical_features": FillNullsInCategoricalFeaturesStep(),
    # First memory-reduction pass, before the money-fraction features are generated.
    "reduce_memory_usage": ReduceMemoryUsageStep(),
    # One fraction-generating step per base money column, each registered under its own key.
    **{
        f"create_money_feature_fraction_{base_column}": CreateMoneyFeatureFractionStep(base_column)
        for base_column in ["credamount_770A", "mainoccupationinc_384A"] # "maininc_215A", "annuity_780A"] # mean_amount_416A
    },
    #"generate_mismatch_features": GenerateMismatchFeaturesStep(),
    #"generate_target_distribution_based_on_categorical_features_step": GenerateTargetDistributionBasedOnCategoricalStep(),
    "drop_single_value_features": DropSingleValueFeaturesStep(),
    "drop_variable_enum_features": DropVariableEnumFeaturesStep(),
    # Final memory-reduction pass. It previously reused the key "reduce_memory_usage", which
    # merged it into the earlier entry, so no step was registered at this position.
    "reduce_memory_usage_final": ReduceMemoryUsageStep()
})

In [6]:
# Load the training tables through the data loader (chunk_size=100000) and run the
# preprocessing pipeline on them, producing the processed frame and its column metadata.
train_chunks = data_loader.load_train_dataset(chunk_size=100_000)
gc.collect()
train_df, columns_info = preprocessor.process_train_dataset(train_chunks)

# The loader output is no longer needed once processing has finished; drop the reference
# and trigger a collection to give memory back before the next cell.
del train_chunks
gc.collect()

Drop composite features ['lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastcancelreason_561M', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'previouscontdistrict_112M', 'contaddr_district_15M', 'contaddr_zipcode_807M', 'education_927M', 'empladdr_district_926M', 'empladdr_zipcode_114M', 'language1_981M', 'registaddr_district_1083M', 'registaddr_zipcode_184M', 'cancelreason_3545846M', 'district_544M', 'education_1138M', 'postype_4733339M', 'profession_152M', 'rejectreason_755M', 'rejectreasonclient_4145042M']
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
Generate 901 columns as aggregates
G

0

In [7]:
# Persist the processed training frame and the column metadata to the output directory.
train_df.write_parquet(env.output_directory / "train_df.parquet")

# Open the pickle target in a context manager so the handle is flushed and closed
# deterministically (the bare `open(...)` passed to `pickle.dump` was never closed).
with open(env.output_directory / "columns_info.pkl", "wb") as columns_info_file:
    pickle.dump(columns_info, columns_info_file)

# Report the estimated in-memory size in MiB, then print a preview of the frame.
print(train_df.estimated_size() / 1024 / 1024)
print(train_df)

# The frame is on disk now; release it before the test data is processed.
del train_df
gc.collect()

7971.865206718445
shape: (1_526_659, 1_433)
┌────────┬──────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ MONTH  ┆ WEEK_NUM ┆ actualdpdt ┆ amtinstpai ┆ … ┆ sumoutstan ┆ totaldebt ┆ totalsett ┆ totinstal │
│ ---    ┆ ---      ┆ olerance_3 ┆ dbefduel24 ┆   ┆ dtotalest_ ┆ _9A/maino ┆ led_863A/ ┆ last1m_45 │
│ i32    ┆ i16      ┆ 44P        ┆ m_4187115A ┆   ┆ 4493215A/m ┆ ccupation ┆ mainoccup ┆ 25188A/ma │
│        ┆          ┆ ---        ┆ ---        ┆   ┆ ai…        ┆ inc_3…    ┆ ation…    ┆ inocc…    │
│        ┆          ┆ f32        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│        ┆          ┆            ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆ f32       │
╞════════╪══════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 201901 ┆ 0        ┆ null       ┆ null       ┆ … ┆ null       ┆ 0.0       ┆ 0.0       ┆ null      │
│ 201901 ┆ 0        ┆ null       ┆ null       ┆

0

In [8]:
# Load the test tables through the data loader (chunk_size=100000) and run them through the
# preprocessor's test path. Note that `columns_info` is rebound to the value returned here.
test_chunks = data_loader.load_test_dataset(chunk_size=100_000)
gc.collect()
test_df, columns_info = preprocessor.process_test_dataset(test_chunks)

# The loader output is no longer needed once processing has finished; drop the reference
# and trigger a collection to give memory back.
del test_chunks
gc.collect()

Generate 901 columns as aggregates
Create 127 new date diff columns, with base_column=date_decision
Create 204 new columns as money feature fraction with base 'credamount_770A'
Create 204 new columns as money feature fraction with base 'mainoccupationinc_384A'


0

In [9]:
# Persist the processed test frame to the output directory.
test_df.write_parquet(env.output_directory / "test_df.parquet")

# Report the estimated in-memory size in MiB (bytes / 2**20), then print a preview of the frame.
print(test_df.estimated_size() / (1024 * 1024))
print(test_df)

# The frame is on disk now; release it and reclaim memory.
del test_df
gc.collect()

0.1262683868408203
shape: (10, 1_432)
┌────────┬──────────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ MONTH  ┆ WEEK_NUM ┆ actualdpdt ┆ amtinstpai ┆ … ┆ sumoutstan ┆ totaldebt ┆ totalsett ┆ totinstal │
│ ---    ┆ ---      ┆ olerance_3 ┆ dbefduel24 ┆   ┆ dtotalest_ ┆ _9A/maino ┆ led_863A/ ┆ last1m_45 │
│ i32    ┆ i16      ┆ 44P        ┆ m_4187115A ┆   ┆ 4493215A/m ┆ ccupation ┆ mainoccup ┆ 25188A/ma │
│        ┆          ┆ ---        ┆ ---        ┆   ┆ ai…        ┆ inc_3…    ┆ ation…    ┆ inocc…    │
│        ┆          ┆ f32        ┆ f32        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│        ┆          ┆            ┆            ┆   ┆ f32        ┆ f32       ┆ f32       ┆ f32       │
╞════════╪══════════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 202201 ┆ 100      ┆ 0.0        ┆ 191767.359 ┆ … ┆ 0.357482   ┆ 0.357482  ┆ 13.41268  ┆ 0.525282  │
│        ┆          ┆            ┆ 375        ┆   ┆  

0