# Example Notebook

Welcome to the example notebook for the Home Credit Kaggle competition. The goal of this competition is to determine how likely a customer is to default on an issued loan. The main difference between the [first](https://www.kaggle.com/c/home-credit-default-risk) competition and this one is that your submission is now scored with a custom metric that takes into account how well the model performs in the future: a decline in performance over time is penalized. The goal is therefore to create a model that is stable and keeps performing well in the future.

In this notebook you will see how to:
* Load the data
* Join tables with Polars - a DataFrame library implemented in Rust, designed to be blazingly fast and memory efficient.  
* Create simple aggregation features
* Train a LightGBM model
* Create a submission table

## Load the data

In [1]:
import polars as pl
import gc
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import copy
import lightgbm as lgb

from glob import glob
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostClassifier
from enum import Enum    



In [2]:
!pip install https://github.com/skrydg/kaggle-home-credit-credit-risk-model-stability.git

Collecting https://github.com/skrydg/kaggle-home-credit-credit-risk-model-stability.git
  Downloading https://github.com/skrydg/kaggle-home-credit-credit-risk-model-stability.git
[2K     [32m-[0m [32m239.2 kB[0m [31m7.6 MB/s[0m [33m0:00:00[0m
[?25h[31m  ERROR: Cannot unpack file /tmp/pip-unpack-66dnlily/kaggle-home-credit-credit-risk-model-stability.git (downloaded from /tmp/pip-req-build-j_i9t10m, content-type: text/html; charset=utf-8); cannot detect archive format[0m[31m
[0m[31mERROR: Cannot determine archive format of /tmp/pip-req-build-j_i9t10m[0m[31m
[0m

In [3]:
class Mode(Enum):
    """Run mode of the notebook, consumed by ``DataLoader``.

    ``Train``:   both the train and the "test" dataset are carved out of the
                 training files by splitting on WEEK_NUM.
    ``Predict``: the full training files are used for fitting and the
                 competition test files are used as the test dataset.
    """
    Predict = 0
    Train = 1

# Mode used by the cells below.
MODE = Mode.Predict


In [4]:
class Dataset:
    """Bundle of the base table and the lists of depth-0/1/2 tables."""

    def __init__(self, base, depth_0, depth_1, depth_2):
        self.base = base
        self.depth_0 = depth_0
        self.depth_1 = depth_1
        self.depth_2 = depth_2

    def filter(self, filter_lambda):
        """Replace every table by ``filter_lambda(table)`` and return ``self``.

        The depth lists are updated element by element, so the list objects
        themselves stay the same.
        """
        self.base = filter_lambda(self.base)

        for tables in (self.depth_0, self.depth_1, self.depth_2):
            for position, table in enumerate(tables):
                tables[position] = filter_lambda(table)

        return self

            
class DataLoader:
    """Reads the competition parquet files and packs them into ``Dataset`` objects.

    In ``Mode.Train`` both datasets come from the training files: cases whose
    WEEK_NUM lies in the first ``TRAIN_PERSENT_SIZE`` share of the week range form
    the train dataset, the remaining cases form the test dataset.
    In any other mode the full training files and the real test files are used.
    """
    DATA_PATH = Path("/kaggle/input/home-credit-credit-risk-model-stability")
    TRAIN_DIR = DATA_PATH / "parquet_files/train/"
    TEST_DIR = DATA_PATH / "parquet_files/test/"
    # Share of the WEEK_NUM range (counted from the earliest week) kept for training in Mode.Train.
    TRAIN_PERSENT_SIZE = 0.5

    def __init__(self, mode):
        self.mode = mode

    def load_train_dataset(self) -> "Dataset":
        """Return the training dataset (restricted to the early weeks in Mode.Train)."""
        dataset = Dataset(*self._get_train_data())
        if self.mode == Mode.Train:
            return self._restrict_to_cases(dataset, self._get_train_case_id_set())
        return dataset

    def load_test_dataset(self) -> "Dataset":
        """Return the test dataset (the late weeks of the training files in Mode.Train)."""
        if self.mode == Mode.Train:
            dataset = Dataset(*self._get_train_data())
            return self._restrict_to_cases(dataset, self._get_test_case_id_set())
        return Dataset(*self._get_test_data())

    @staticmethod
    def _restrict_to_cases(dataset, case_id_set):
        """Keep, in every table of ``dataset``, only the rows whose case_id is in ``case_id_set``."""
        return dataset.filter(lambda df: df.filter(df["case_id"].is_in(case_id_set)))

    def _get_train_data(self):
        """Read all training tables; returns ``(base, depth_0, depth_1, depth_2)``."""
        base = self._read_file(self.TRAIN_DIR / "train_base.parquet")
        depth_0 = [
            self._read_files(self.TRAIN_DIR / "train_static_cb_*.parquet"),
            self._read_files(self.TRAIN_DIR / "train_static_0_*.parquet")
        ]
        depth_1 = [
            self._read_files(self.TRAIN_DIR / "train_applprev_1_*.parquet"),
            self._read_file(self.TRAIN_DIR / "train_tax_registry_a_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_tax_registry_b_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_tax_registry_c_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_credit_bureau_b_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_other_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_person_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_deposit_1.parquet"),
            self._read_file(self.TRAIN_DIR / "train_debitcard_1.parquet"),
        ]
        depth_2 = [
            self._read_file(self.TRAIN_DIR / "train_credit_bureau_b_2.parquet"),
        ]
        return base, depth_0, depth_1, depth_2

    def _get_test_data(self):
        """Read all test tables, in the same table order as ``_get_train_data``."""
        base = self._read_file(self.TEST_DIR / "test_base.parquet")
        depth_0 = [
            self._read_files(self.TEST_DIR / "test_static_cb_*.parquet"),
            self._read_files(self.TEST_DIR / "test_static_0_*.parquet")
        ]
        depth_1 = [
            self._read_files(self.TEST_DIR / "test_applprev_1_*.parquet"),
            # Single file, so read it the same way as its training counterpart.
            self._read_file(self.TEST_DIR / "test_tax_registry_a_1.parquet"),
            self._read_file(self.TEST_DIR / "test_tax_registry_b_1.parquet"),
            self._read_file(self.TEST_DIR / "test_tax_registry_c_1.parquet"),
            self._read_file(self.TEST_DIR / "test_credit_bureau_b_1.parquet"),
            self._read_file(self.TEST_DIR / "test_other_1.parquet"),
            self._read_file(self.TEST_DIR / "test_person_1.parquet"),
            self._read_file(self.TEST_DIR / "test_deposit_1.parquet"),
            self._read_file(self.TEST_DIR / "test_debitcard_1.parquet")
        ]
        depth_2 = [
            self._read_file(self.TEST_DIR / "test_credit_bureau_b_2.parquet"),
        ]
        return base, depth_0, depth_1, depth_2

    def _read_week_split(self):
        """Return ``(case_id_info, week_id_threshold)`` for the Mode.Train split.

        ``case_id_info`` holds the case_id and WEEK_NUM columns of the training base
        table; the threshold is the last WEEK_NUM that still belongs to the train part.
        """
        case_id_info = pl.read_parquet(self.TRAIN_DIR / "train_base.parquet", columns=["case_id", "WEEK_NUM"])

        min_week_id = case_id_info["WEEK_NUM"].min()
        max_week_id = case_id_info["WEEK_NUM"].max()
        week_id_threshold = min_week_id + int((max_week_id - min_week_id) * self.TRAIN_PERSENT_SIZE)
        return case_id_info, week_id_threshold

    def _get_train_case_id_set(self):
        """case_id values whose WEEK_NUM is at or below the split threshold."""
        case_id_info, week_id_threshold = self._read_week_split()
        case_id_info = case_id_info.filter(case_id_info["WEEK_NUM"] <= week_id_threshold)
        return case_id_info["case_id"]

    def _get_test_case_id_set(self):
        """case_id values whose WEEK_NUM is above the split threshold."""
        case_id_info, week_id_threshold = self._read_week_split()
        case_id_info = case_id_info.filter(case_id_info["WEEK_NUM"] > week_id_threshold)
        return case_id_info["case_id"]

    def _read_file(self, path):
        """Read one parquet file into a polars DataFrame."""
        return pl.read_parquet(path)

    def _list_files(self, regex_path):
        """Return the paths matching the glob pattern ``regex_path`` in sorted order.

        glob() yields files in arbitrary, filesystem-dependent order; sorting keeps
        the row order of the concatenated table (and with it the "first"/"last"
        aggregations computed later) identical between runs.

        Raises:
            FileNotFoundError: if no file matches the pattern.
        """
        paths = sorted(glob(str(regex_path)))
        if not paths:
            raise FileNotFoundError("No files match pattern: {}".format(regex_path))
        return paths

    def _read_files(self, regex_path):
        """Read every file matching ``regex_path`` and stack them vertically."""
        chunks = [self._read_file(path) for path in self._list_files(regex_path)]

        return pl.concat(chunks, how="vertical_relaxed")

In [5]:
class Aggregator:
    """Builds the polars aggregation expressions used to collapse a depth table.

    Columns are picked by the last character of their name (or, for the group
    counters, by containing "num_group"); every produced expression is aliased
    ``<aggregator name>_<column>``.
    """
    num_aggregators = [pl.max, pl.min, pl.first, pl.last, pl.mean]
    str_aggregators = [pl.max, pl.min, pl.first, pl.last] # n_unique
    group_aggregators = [pl.max, pl.min, pl.first, pl.last]

    @staticmethod
    def _columns_ending_with(df, suffixes):
        """Names of the columns of ``df`` whose last character is in ``suffixes``."""
        return [name for name in df.columns if name[-1] in suffixes]

    @staticmethod
    def _aggregate_each(methods, names):
        """One aliased expression per (method, column) pair, grouped by method."""
        return [
            method(name).alias(f"{method.__name__}_{name}")
            for method in methods
            for name in names
        ]

    @staticmethod
    def num_expr(df):
        """Aggregations for the columns ending in "P" or "A"."""
        names = Aggregator._columns_ending_with(df, ("P", "A"))
        return Aggregator._aggregate_each(Aggregator.num_aggregators, names)

    @staticmethod
    def date_expr(df):
        """Aggregations for the columns ending in "D"."""
        names = Aggregator._columns_ending_with(df, ("D",))
        return Aggregator._aggregate_each(Aggregator.num_aggregators, names)

    @staticmethod
    def str_expr(df):
        """Aggregations for the columns ending in "M", plus their most frequent non-null value."""
        names = Aggregator._columns_ending_with(df, ("M",))

        most_frequent = []
        for name in names:
            non_null = pl.col(name).drop_nulls()
            most_frequent.append(non_null.mode().first().alias(f"mode_{name}"))

        return Aggregator._aggregate_each(Aggregator.str_aggregators, names) + most_frequent

    @staticmethod
    def other_expr(df):
        """Aggregations for the columns ending in "T" or "L"."""
        names = Aggregator._columns_ending_with(df, ("T", "L"))
        return Aggregator._aggregate_each(Aggregator.str_aggregators, names)

    @staticmethod
    def count_expr(df):
        """Aggregations for the columns whose name contains "num_group"."""
        names = [name for name in df.columns if "num_group" in name]
        # A pl.count feature on the first group column was tried earlier and is disabled.
        return Aggregator._aggregate_each(Aggregator.group_aggregators, names)

    @staticmethod
    def get_exprs(df):
        """All expressions for ``df``: numeric, date, string, other, then group counters."""
        builders = (
            Aggregator.num_expr,
            Aggregator.date_expr,
            Aggregator.str_expr,
            Aggregator.other_expr,
            Aggregator.count_expr,
        )
        exprs = []
        for build in builders:
            exprs += build(df)

        return exprs

In [6]:
class SetTypesStep:
    """Casts every column of every table to a type chosen from the training data.

    The column -> type mapping is learned in ``process_train_dataset`` and then
    re-applied unchanged to the test dataset.
    """

    def __init__(self):
        # column name -> polars dtype, filled by process_train_dataset
        self.column_to_type = {}

    def process_train_dataset(self, train_dataset):
        """Learn the target type of every column, then cast the training tables."""
        for df in [train_dataset.base] + train_dataset.depth_0 + train_dataset.depth_1 + train_dataset.depth_2:
            for column in df.columns:
                if column in ("WEEK_NUM", "case_id", "MONTH", "num_group1", "num_group2", "target"):
                    self.column_to_type[column] = pl.Int64
                elif (column[-1] == "D") or (column == "date_decision"):
                    self.column_to_type[column] = pl.Date
                elif (column[-1] in ['M']) or (df[column].dtype == pl.String):
                    self.column_to_type[column] = pl.String
                else:
                    self.column_to_type[column] = pl.Float32
        return self.process(train_dataset)

    def process_test_dataset(self, test_dataset):
        """Cast the test tables with the mapping learned on the training data."""
        return self.process(test_dataset)

    def process(self, dataset):
        """Cast all tables of ``dataset`` in place and return it."""
        assert(type(dataset) is Dataset)
        dataset.base = self.process_tables([dataset.base])[0]
        dataset.depth_0 = self.process_tables(dataset.depth_0)
        dataset.depth_1 = self.process_tables(dataset.depth_1)
        dataset.depth_2 = self.process_tables(dataset.depth_2)
        return dataset

    def process_tables(self, dfs):
        """Cast every column of every frame in ``dfs``; the list is updated in place and returned.

        All casts of one frame are applied in a single ``with_columns`` call instead
        of rebuilding the frame once per column.

        Raises:
            AssertionError: if a frame has a column that was not seen during training.
        """
        for i, df in enumerate(dfs):
            casts = []
            for column in df.columns:
                assert column in self.column_to_type, "Unknown column: {}".format(column)
                casts.append(df[column].cast(self.column_to_type[column]))
            dfs[i] = df.with_columns(casts)
        return dfs
    
class AggregateDepthTableStep:
    """Collapses every depth-1 and depth-2 table to one row per case_id."""

    def process_train_dataset(self, train_dataset):
        return self.process(train_dataset)

    def process_test_dataset(self, test_dataset):
        return self.process(test_dataset)

    def process(self, dataset):
        """Group each depth-1/2 table by case_id using the ``Aggregator`` expressions."""
        assert(type(dataset) is Dataset)
        for tables in (dataset.depth_1, dataset.depth_2):
            for position, table in enumerate(tables):
                grouped = table.group_by("case_id")
                tables[position] = grouped.agg(Aggregator.get_exprs(table))
        return dataset
    
class JoinTablesStep:
    """Left-joins every depth table onto the base table by case_id."""

    def process_train_dataset(self, train_dataset):
        return self.process(train_dataset)

    def process_test_dataset(self, test_dataset):
        return self.process(test_dataset)

    def process(self, dataset):
        """Return the base table joined with depth_0, depth_1 and depth_2 tables, in that order.

        Clashing column names from the i-th joined table receive the suffix ``_i``.
        """
        tables = [*dataset.depth_0, *dataset.depth_1, *dataset.depth_2]
        joined = dataset.base
        for position in range(len(tables)):
            joined = joined.join(
                tables[position],
                on="case_id",
                how="left",
                suffix="_" + str(position),
            )
        return joined
    
class ProcessDatesStep:
    """Turns every date column into a day offset relative to ``date_decision``."""

    def process_train_dataset(self, df):
        return self.process(df)

    def process_test_dataset(self, df):
        return self.process(df)

    def process(self, df):
        """Replace each Date column (except date_decision) by ``column - date_decision`` in whole days."""
        date_columns = [
            name for name in df.columns
            if (name != "date_decision") and (df[name].dtype == pl.Date)
        ]
        for name in date_columns:
            offset = pl.col(name) - pl.col("date_decision")
            df = df.with_columns(offset.dt.total_days().alias(name))
        return df
    
class ProcessCategoricalStep:
    """Converts string columns to polars Enum columns with a fixed vocabulary.

    The vocabulary of each column is the set of non-null values seen in the
    training frame plus the extra category "__UNKNOWN__".
    """

    def __init__(self):
        # column name -> pl.Enum built from the training values
        self.column_to_type = {}

    def process_train_dataset(self, df):
        """Learn one Enum type per string column, then convert ``df``."""
        string_columns = [name for name in df.columns if df[name].dtype == pl.String]
        for name in string_columns:
            series = df[name]
            observed = list(series.filter(~series.is_null()).unique())
            self.column_to_type[name] = pl.Enum(observed + ["__UNKNOWN__"])

        return self.process(df)

    def process_test_dataset(self, df):
        return self.process(df)

    def process(self, df):
        """Map out-of-vocabulary values and nulls to "__UNKNOWN__" and cast to the learned Enum."""
        for name in df.columns:
            if not (df[name].dtype == pl.String):
                continue
            enum_type = self.column_to_type[name]
            outside_vocabulary = ~df[name].is_in(enum_type.categories)
            df = df.with_columns(df[name].set(outside_vocabulary, "__UNKNOWN__"))
            # Nulls are handled separately, after the out-of-vocabulary replacement.
            filled = df[name].fill_null("__UNKNOWN__")
            df = df.with_columns(filled.cast(enum_type))
        return df

class DropColumnsStep:
    """Drops columns that carry (almost) no information.

    The list of columns is decided on the training frame and re-applied to the
    test frame. Dropped are: columns with more than 95% nulls, Enum columns with
    a single distinct value or more than 200 distinct values, and the
    ``date_decision`` and ``MONTH`` columns.
    """

    def __init__(self):
        # names of the columns to drop, filled by process_train_dataset
        self.columns = []

    def process_train_dataset(self, df):
        """Select the columns to drop from ``df`` and return ``df`` without them."""
        # Start from scratch on every fit: appending to the previous selection would
        # duplicate names when the training cell is re-run with the same step object.
        columns = []
        for column in df.columns:
            isnull = df[column].is_null().mean()
            if isnull > 0.95:
                columns.append(column)

        for column in df.columns:
            if df[column].dtype == pl.Enum:
                freq = df[column].n_unique()

                if (freq == 1) or (freq > 200):
                    columns.append(column)
        columns.append("date_decision")
        columns.append("MONTH")

        # dict.fromkeys removes duplicates while keeping the first-seen order.
        self.columns = list(dict.fromkeys(columns))

        print("Columns to drop: {}".format(self.columns))
        return self.process(df)

    def process_test_dataset(self, df):
        """Drop the columns selected on the training frame."""
        return self.process(df)

    def process(self, df):
        """Return ``df`` without the selected columns (each name is dropped once)."""
        return df.drop(list(dict.fromkeys(self.columns)))
    
    
class DropDatesColumnsStep:
    """Removes every column whose name ends with "D"."""

    def process_train_dataset(self, df):
        return self.process(df)

    def process_test_dataset(self, df):
        return self.process(df)

    def process(self, df):
        """Return ``df`` without the columns whose last name character is "D"."""
        date_like = [name for name in df.columns if name[-1] == 'D']
        for name in date_like:
            df = df.drop(name)
        return df
    
class ReduceMemoryUsageStep:
    """Down-casts columns to the smallest type that can hold their values."""

    def process_train_dataset(self, df):
        return self.process(df)

    def process_test_dataset(self, df):
        return self.process(df)

    @staticmethod
    def _smallest_type(column_type, c_min, c_max):
        """Pick the target dtype for a column of type ``column_type`` spanning [c_min, c_max].

        * all-null columns (min/max is None) keep their type;
        * types whose name starts with "Int" get the narrowest of Int8/16/32 whose
          range contains both bounds (limits inclusive), otherwise keep their type;
        * every other type becomes Float32 when both bounds fit, else Float64.
        """
        if (c_min is None) or (c_max is None):
            return column_type

        if str(column_type)[:3] == 'Int':
            candidates = ((np.int8, pl.Int8), (np.int16, pl.Int16), (np.int32, pl.Int32))
            for numpy_type, polars_type in candidates:
                limits = np.iinfo(numpy_type)
                # Inclusive bounds: a value equal to the type limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    return polars_type
            # Nothing narrower fits, so leave the column as it is.
            return column_type

        float_limits = np.finfo(np.float32)
        if float_limits.min <= c_min and c_max <= float_limits.max:
            return pl.Float32
        return pl.Float64

    def process(self, df):
        """Return ``df`` with every non-Enum column cast to its smallest suitable type."""
        casts = []
        for column in df.columns:
            series = df[column]
            column_type = series.dtype
            if column_type == pl.Enum:
                # Enum columns are left untouched.
                continue
            target_type = self._smallest_type(column_type, series.min(), series.max())
            casts.append(series.cast(target_type))
        # One with_columns call instead of rebuilding the frame once per column.
        return df.with_columns(casts)
    
class DropNonImportantFeaturesStep:
    """Keeps only a fixed list of feature columns plus target, WEEK_NUM and case_id."""

    def __init__(self):
        # Hard-coded feature whitelist. NOTE(review): presumably produced by the
        # feature-importance experiment further down in this notebook - confirm.
        self.important_columns = ['dateofbirth_337D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'description_5085714M', 'education_1103M', 'numberofqueries_373L', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtscount_423L', 'pmtssum_45A', 'requesttype_4525192L', 'responsedate_4527233D', 'responsedate_4917613D', 'riskassesment_302T', 'thirdquarter_1082L', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'applicationcnt_361L', 'applicationscnt_867L', 'avgdpdtolclosure24_3658938P', 'avgmaxdpdlast9m_3716943P', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt6m_3712949L', 'cntpmts24_3658933L', 'credtype_322L', 'currdebt_22A', 'datelastinstal40dpd_247D', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'disbursementtype_67L', 'eir_270L', 'homephncnt_628L', 'inittransactioncode_186L', 'interestrate_311L', 'isbidproduct_1095L', 'isdebitcard_729L', 'lastdelinqdate_224D', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'lastst_736L', 'mastercontrelectronic_519L', 'mastercontrexist_109L', 'maxdbddpdtollast12m_3658940P', 'maxdebt4_972A', 'maxdpdinstldate_3546855D', 'maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'maxdpdlast6m_474P', 'maxdpdlast9m_1059P', 'maxdpdtolerance_374P', 'mobilephncnt_593L', 'numincomingpmts_3546848L', 'numinstlallpaidearly3d_817L', 'numinstlsallpaid_934L', 'numinstlswithdpd10_728L', 'numinstpaidearly3d_3546850L', 'numinsttopaygr_769L', 'numinstunpaidmax_3546851L', 'numinstunpaidmaxest_4493212L', 'pctinstlsallpaidlate1d_3546856L', 'pmtnum_254L', 'price_1097A', 'totaldebt_9A', 'twobodfilling_608L', 'validfrom_1069D', 'max_currdebt_94A', 'max_maxdpdtolerance_577P', 'max_outstandingdebt_522A', 'first_actualdpd_943P', 'mean_credacc_actualbalance_314A', 'mean_currdebt_94A', 'mean_maxdpdtolerance_577P', 'mean_outstandingdebt_522A', 'max_employedfrom_700D', 'max_postype_4733339M', 'max_rejectreason_755M', 'max_rejectreasonclient_4145042M', 'min_education_1138M', 'first_education_1138M', 
'last_education_1138M', 'last_rejectreason_755M', 'last_rejectreasonclient_4145042M', 'mode_education_1138M', 'mode_rejectreasonclient_4145042M', 'max_familystate_726L', 'max_pmtnum_8L', 'max_status_219L', 'max_tenor_203L', 'min_credtype_587L', 'min_familystate_726L', 'min_inittransactioncode_279L', 'min_isbidproduct_390L', 'first_familystate_726L', 'first_status_219L', 'last_credtype_587L', 'last_familystate_726L', 'min_num_group1', 'first_num_group1', 'max_amount_4527230A', 'mean_amount_4527230A', 'max_num_group1_3', 'min_num_group1_3', 'min_num_group1_4', 'min_num_group1_5', 'max_classificationofcontr_1114M', 'max_contractst_516M', 'max_contracttype_653M', 'max_periodicityofpmts_997M', 'max_pmtmethod_731M', 'max_purposeofcred_722M', 'max_subjectrole_326M', 'max_subjectrole_43M', 'min_classificationofcontr_1114M', 'min_contractst_516M', 'min_contracttype_653M', 'min_periodicityofpmts_997M', 'min_pmtmethod_731M', 'min_purposeofcred_722M', 'min_subjectrole_326M', 'min_subjectrole_43M', 'first_classificationofcontr_1114M', 'first_contractst_516M', 'first_contracttype_653M', 'first_periodicityofpmts_997M', 'first_purposeofcred_722M', 'first_subjectrole_326M', 'first_subjectrole_43M', 'last_classificationofcontr_1114M', 'last_contracttype_653M', 'last_periodicityofpmts_997M', 'last_pmtmethod_731M', 'last_purposeofcred_722M', 'last_subjectrole_326M', 'last_subjectrole_43M', 'mode_classificationofcontr_1114M', 'mode_purposeofcred_722M', 'mode_subjectrole_326M', 'mode_subjectrole_43M', 'max_birth_259D', 'min_birth_259D', 'first_birth_259D', 'last_birth_259D', 'max_education_927M', 'max_empladdr_district_926M', 'max_empladdr_zipcode_114M', 'max_language1_981M', 'min_education_927M', 'min_language1_981M', 'first_education_927M', 'first_language1_981M', 'last_education_927M', 'last_empladdr_district_926M', 'last_empladdr_zipcode_114M', 'last_language1_981M', 'max_contaddr_matchlist_1032L', 'max_empl_employedtotal_800L', 'max_familystate_447L', 'max_housingtype_772L', 
'max_incometype_1044T', 'max_relationshiptoclient_415T', 'max_relationshiptoclient_642T', 'max_sex_738L', 'min_contaddr_matchlist_1032L', 'min_empl_employedtotal_800L', 'min_familystate_447L', 'min_housetype_905L', 'min_housingtype_772L', 'min_incometype_1044T', 'min_maritalst_703L', 'min_personindex_1023L', 'min_persontype_1072L', 'min_persontype_792L', 'min_relationshiptoclient_415T', 'min_relationshiptoclient_642T', 'min_role_993L', 'min_sex_738L', 'first_contaddr_matchlist_1032L', 'first_contaddr_smempladdr_334L', 'first_empl_employedtotal_800L', 'first_familystate_447L', 'first_housetype_905L', 'first_incometype_1044T', 'first_maritalst_703L', 'first_personindex_1023L', 'first_persontype_1072L', 'first_persontype_792L', 'first_role_993L', 'first_safeguarantyflag_411L', 'first_sex_738L', 'last_contaddr_matchlist_1032L', 'last_empl_industry_691L', 'last_familystate_447L', 'last_housetype_905L', 'last_housingtype_772L', 'last_incometype_1044T', 'last_maritalst_703L', 'last_sex_738L', 'min_num_group1_8', 'first_num_group1_8', 'min_num_group1_9', 'min_num_group1_10']

    def process_train_dataset(self, df):
        return self.process(df)

    def process_test_dataset(self, df):
        return self.process(df)

    def process(self, df):
        """Return ``df`` without the columns that are neither whitelisted nor bookkeeping columns."""
        bookkeeping = ["target", "WEEK_NUM", "case_id"]
        removable = [
            name for name in df.columns
            if (name not in bookkeeping) and (name not in self.important_columns)
        ]
        for name in removable:
            df = df.drop(name)
        return df
    
    
class Preprocessor:
    """Runs the preprocessing steps in a fixed order and reports the time of each one."""

    def __init__(self):
        # Insertion order of this dict is the execution order of the pipeline.
        self.steps = dict(
            set_types=SetTypesStep(),
            aggregate_depth_table=AggregateDepthTableStep(),
            join_table=JoinTablesStep(),
            process_date=ProcessDatesStep(),
            process_categorical=ProcessCategoricalStep(),
            drop_columns=DropColumnsStep(),
            reduce_memory_usage=ReduceMemoryUsageStep(),
            drop_non_important_features=DropNonImportantFeaturesStep(),
            # "drop_dates_columns" (DropDatesColumnsStep) is currently disabled.
        )

    def _run_steps(self, method_name, dataset):
        """Feed ``dataset`` through ``method_name`` of every step, printing each step's duration."""
        for name, step in self.steps.items():
            start = time.time()
            dataset = getattr(step, method_name)(dataset)
            finish = time.time()
            print("Step: {}, execution_time: {}".format(name, finish - start), flush=True)
        return dataset

    def process_train_dataset(self, train_dataset):
        """Fit and apply every step on the training dataset."""
        return self._run_steps("process_train_dataset", train_dataset)

    def process_test_dataset(self, test_dataset):
        """Apply every (already fitted) step to the test dataset."""
        return self._run_steps("process_test_dataset", test_dataset)

In [7]:
# One loader and one preprocessor for the whole notebook: the preprocessor steps
# keep what they learn on the training data (column types, categories, columns to
# drop), so the same object must be used again for the test data.
data_loader = DataLoader(MODE)
preprocessor = Preprocessor()

In [8]:
train_dataset = data_loader.load_train_dataset()

In [9]:
train_dataset.depth_0

[shape: (1_500_476, 53)
 ┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
 │ case_id ┆ assignment ┆ assignmen ┆ assignmen ┆ … ┆ riskasses ┆ riskasses ┆ secondqua ┆ thirdquar │
 │ ---     ┆ date_238D  ┆ tdate_452 ┆ tdate_495 ┆   ┆ ment_302T ┆ ment_940T ┆ rter_766L ┆ ter_1082L │
 │ i64     ┆ ---        ┆ 7235D     ┆ 5616D     ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
 │         ┆ str        ┆ ---       ┆ ---       ┆   ┆ str       ┆ f64       ┆ f64       ┆ f64       │
 │         ┆            ┆ str       ┆ str       ┆   ┆           ┆           ┆           ┆           │
 ╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
 │ 357     ┆ null       ┆ null      ┆ null      ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
 │ 381     ┆ null       ┆ null      ┆ null      ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
 │ 388     ┆ null       ┆ null      ┆ null      ┆ … ┆ null

In [10]:
# Load a fresh Dataset: the preprocessing steps replace the tables inside the
# Dataset object in place, so an already-processed one cannot be fed in again.
train_dataset = data_loader.load_train_dataset()
print("Train dataset week range: {}-{}".format(train_dataset.base["WEEK_NUM"].min(), train_dataset.base["WEEK_NUM"].max()))
gc.collect()
# Fits every preprocessing step on the training data and returns a single joined frame.
train_df = preprocessor.process_train_dataset(train_dataset)
# The raw tables are no longer needed once the joined frame exists.
del train_dataset
gc.collect()

Train dataset week range: 0-91
Step: set_types, execution_time: 16.991347789764404
Step: aggregate_depth_table, execution_time: 55.333702087402344
Step: join_table, execution_time: 11.784667015075684
Step: process_date, execution_time: 7.059737682342529
Step: process_categorical, execution_time: 41.132843255996704
Columns to drop: ['assignmentdate_4955616D', 'dateofbirth_342D', 'for3years_128L', 'for3years_504L', 'for3years_584L', 'formonth_118L', 'formonth_206L', 'formonth_535L', 'forquarter_1017L', 'forquarter_462L', 'forquarter_634L', 'fortoday_1092L', 'forweek_1077L', 'forweek_528L', 'forweek_601L', 'foryear_618L', 'foryear_818L', 'foryear_850L', 'pmtaverage_4955615A', 'pmtcount_4955617L', 'riskassesment_940T', 'clientscnt_136L', 'equalityempfrom_62L', 'interestrategrace_34L', 'isbidproductrequest_292L', 'lastdependentsnum_448L', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrepayingdate_696D', 'maxannuity_4075009A', 'payvacationpostpone_4187118D', 'first_credacc_actualbala

0

# Data visualization

In [11]:
# def bucket_mean_split(x, y, buckets=100):
#     x_buckets = []
#     y_buckets = []

#     x = np.array(x)
#     y = np.array(y)
#     x_quantiles = [np.quantile(x, i / buckets) for i in range(0, buckets + 1)]
#     for i in range(0, buckets):
#         filter = (x >= x_quantiles[i]) & (x <= x_quantiles[i + 1])
#         x_buckets.append(np.mean(x[filter]))
#         y_buckets.append(np.mean(y[filter]))
#     return x_buckets, y_buckets

In [12]:
# train["target"].mean()

In [13]:
# print("Mean target={} where actualdpdtolerance_344P != 0".format(train["target"].filter(train["actualdpdtolerance_344P"] != 0).mean()))
# print("Mean target={} where actualdpdtolerance_344P == 0".format(train["target"].filter(train["actualdpdtolerance_344P"] == 0).mean()))
# print("Mean target={} where actualdpdtolerance_344P is null".format(train["target"].filter(train["actualdpdtolerance_344P"].is_null()).mean()))

In [14]:
# columns = [column for column in train.columns if (train[column].dtype == pl.Float64)]

# figure, axis = plt.subplots(len(columns) // 5 + 1, 5, figsize=(15, 30))
# for index, column in enumerate(columns):
#     current_plot = axis[index // 5, index % 5]
#     x = train[column]
#     y = train["target"]
#     if (x.dtype == pl.String or x.dtype == pl.Boolean):
#         continue
#     filter = x.is_not_null()
#     x = x.filter(filter)
#     y = y.filter(filter)
#     if (x.is_empty()):
#         continue
#     x_bucket, y_bucket = bucket_mean_split(x.to_numpy(), y.to_numpy(), buckets=100)
#     current_plot.plot(x_bucket, y_bucket)
#     current_plot.set_title(column)
# plt.show()

In [15]:
# Model inputs: every column of the training frame except the week number,
# the case identifier and the label.
features = train_df.columns
for reserved_column in ("WEEK_NUM", "case_id", "target"):
    features.remove(reserved_column)

### Feature Importance

In [16]:
def train_rf(X, Y):
    """Fit a LightGBM classifier on (X, Y) and report per-feature importances.

    Despite the name, the model is gradient-boosted trees ("gbdt").

    Args:
        X: polars DataFrame with the feature columns.
        Y: polars Series with the binary target.

    Returns:
        polars DataFrame with one row per feature and the columns ``feature``,
        ``feature_importance`` and ``train_score`` (the training ROC AUC,
        repeated on every row).
    """
    params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": 8,
        "max_bin": 255,
        "learning_rate": 0.05,
        "n_estimators": 200,
        "colsample_bytree": 0.8,
        "colsample_bynode": 0.8,
        "verbose": -1,
        "random_state": 42,
        "device": "gpu",
    }

    # Convert once and reuse for fitting and scoring.
    X_pandas = X.to_pandas()
    Y_pandas = Y.to_pandas()

    model = lgb.LGBMClassifier(**params)
    model.fit(X_pandas, Y_pandas)

    # ROC AUC ranks samples by score, so it needs the predicted probability of the
    # positive class; hard 0/1 labels from predict() collapse the ranking.
    train_score = roc_auc_score(Y_pandas, model.predict_proba(X_pandas)[:, 1])

    # Get feature importances
    return pl.DataFrame({
        "feature": list(X.columns),
        "feature_importance": model.feature_importances_,
        "train_score": train_score
    })

In [17]:
def get_feature_importance(X, Y, count_iter=50, seed=None):
    """Compare real feature importances against importances on a shuffled target.

    Args:
        X: polars DataFrame with the feature columns.
        Y: polars Series with the target.
        count_iter: number of models trained on a shuffled target (default 50).
        seed: optional base seed; iteration ``i`` shuffles with ``seed + i`` so the
            run is reproducible. ``None`` (default) keeps the shuffles unseeded.

    Returns:
        Tuple ``(shuffled_importances, real_importances)``: the first frame stacks
        the ``train_rf`` result of every shuffled run with an extra ``iteration``
        column, the second is the ``train_rf`` result on the real target.
    """
    dfs = []
    for i in range(count_iter):
        gc.collect()

        start_time = time.time()
        iteration_seed = None if seed is None else seed + i
        # sample(fraction=1.0, shuffle=True) returns a shuffled copy; Y itself is untouched.
        shuffled_Y = Y.sample(fraction=1.0, shuffle=True, seed=iteration_seed)
        current_df = train_rf(X, shuffled_Y)
        finish_time = time.time()
        print(f"Finish iteration: {i + 1}/{count_iter}, time: {finish_time - start_time}")
        current_df = current_df.with_columns(pl.lit(i).alias("iteration"))
        dfs.append(current_df)

    return pl.concat(dfs, rechunk=True), train_rf(X, Y)

In [18]:
# shuffled_feature_importance_df, feature_importance_df = get_feature_importance(train_df[features], train_df["target"])
# shuffled_feature_importance_df.write_csv("shuffled_feature_importance_df.csv", separator=",")
# feature_importance_df.write_csv("feature_importance_df.csv", separator=",")

In [19]:
# important_features = []
# for column in feature_importance_df["feature"]:
#     shuffled_feature_importance = shuffled_feature_importance_df.filter(shuffled_feature_importance_df["feature"] == column)["feature_importance"].to_numpy()
#     feature_importance = feature_importance_df.filter(feature_importance_df["feature"] == column)["feature_importance"][0]
    
#     if (np.percentile(shuffled_feature_importance, 90) <= feature_importance):
#         important_features.append(column)
# print(len(important_features), important_features)

# Training

In [20]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the predictions of several already-fitted estimators."""

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def predict(self, X):
        """Element-wise mean of ``predict(X)`` over all estimators."""
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """Element-wise mean of ``predict_proba(X)`` over all estimators."""
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def get_feature_importance(self):
        """Sum of ``feature_importances_`` over all estimators.

        The result length follows the estimators themselves (no dependency on the
        notebook-level ``features`` list) and integer as well as float importances
        are supported. An empty array is returned when there are no estimators.
        """
        importances = [np.asarray(model.feature_importances_) for model in self.estimators]
        if not importances:
            return np.zeros(0)
        return np.sum(importances, axis=0)

In [21]:
# for column in features:
#     if (train_dataset[column].dtype != pl.Categorical) and \
#         (train_dataset[column].dtype != pl.Float64) and \
#         (train_dataset[column].dtype != pl.Float64):
#         print(column, train_dataset[column].dtype)

In [22]:
def train_lgb_model(X_train, Y_train, X_test, Y_test, params=None):
    """Fit a LightGBM binary classifier with early stopping on a validation set.

    Parameters
    ----------
    X_train, Y_train
        Training features and target, passed straight to ``LGBMClassifier.fit``.
    X_test, Y_test
        Validation features and target; used as the only ``eval_set`` entry,
        so early stopping is driven by the AUC measured on this data.
    params : dict, optional
        Hyper-parameters that override (or extend) the defaults below, e.g.
        ``{"device": "cpu"}`` to train on a machine without a GPU. When
        omitted the defaults are used unchanged.

    Returns
    -------
    lgb.LGBMClassifier
        The fitted classifier.
    """
    model_params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": 8,
        "max_bin": 250,
        "learning_rate": 0.05,
        "n_estimators": 1000,
        "colsample_bytree": 0.8,
        "colsample_bynode": 0.8,
        "verbose": -1,
        "random_state": 42,
        "device": "gpu",
    }
    if params:
        # Caller-supplied values win over the defaults.
        model_params.update(params)

    model = lgb.LGBMClassifier(**model_params)

    model.fit(
        X_train, Y_train,
        eval_set=[(X_test, Y_test)],
        # Log the validation metric every 100 iterations and stop once it has
        # not improved for 100 consecutive rounds.
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )

    return model



In [23]:
# Force a garbage-collection pass before the cross-validation cell runs.
gc.collect()

0

In [24]:
# 5-fold cross-validation, stratified on the target and grouped by WEEK_NUM
# (the week column is passed as `groups`, so a week is never split across folds).
weeks = train_df["WEEK_NUM"]
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

fitted_models = []

# One out-of-fold prediction slot per training row.
oof_predicted = np.zeros(train_df.shape[0])

for idx_train, idx_test in cv.split(train_df[features], train_df["target"], groups=weeks):
    # Materialise the fold as pandas objects for LightGBM.
    X_train = train_df[features][idx_train].to_pandas()
    Y_train = train_df["target"][idx_train].to_pandas()

    X_test = train_df[features][idx_test].to_pandas()
    Y_test = train_df["target"][idx_test].to_pandas()

    fit_started = time.time()
    fold_model = train_lgb_model(X_train, Y_train, X_test, Y_test)
    fit_seconds = time.time() - fit_started

    print("fit time: {}".format(fit_seconds))
    fitted_models.append(fold_model)

    # Column 1 of predict_proba is stored as the out-of-fold score.
    oof_predicted[idx_test] = fold_model.predict_proba(X_test)[:, 1]
    gc.collect()

# Final model: average of the per-fold classifiers.
model = VotingModel(fitted_models)
gc.collect()



Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.816374
[200]	valid_0's auc: 0.824666
[300]	valid_0's auc: 0.827106
[400]	valid_0's auc: 0.827725
[500]	valid_0's auc: 0.828164
[600]	valid_0's auc: 0.828623
[700]	valid_0's auc: 0.828964
[800]	valid_0's auc: 0.829091
[900]	valid_0's auc: 0.829324
[1000]	valid_0's auc: 0.829417
Did not meet early stopping. Best iteration is:
[996]	valid_0's auc: 0.829442
fit time: 99.79290318489075




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.818207
[200]	valid_0's auc: 0.826426
[300]	valid_0's auc: 0.828688
[400]	valid_0's auc: 0.829617
[500]	valid_0's auc: 0.829997
[600]	valid_0's auc: 0.830545
[700]	valid_0's auc: 0.830817
[800]	valid_0's auc: 0.830846
[900]	valid_0's auc: 0.831036
[1000]	valid_0's auc: 0.831162
Did not meet early stopping. Best iteration is:
[981]	valid_0's auc: 0.831185
fit time: 94.90766286849976




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.822241
[200]	valid_0's auc: 0.830915
[300]	valid_0's auc: 0.833639
[400]	valid_0's auc: 0.834387
[500]	valid_0's auc: 0.834844
[600]	valid_0's auc: 0.835145
[700]	valid_0's auc: 0.835402
[800]	valid_0's auc: 0.835713
[900]	valid_0's auc: 0.836033
[1000]	valid_0's auc: 0.836119
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.836119
fit time: 92.76177144050598




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.823605
[200]	valid_0's auc: 0.832112
[300]	valid_0's auc: 0.834455
[400]	valid_0's auc: 0.835351
[500]	valid_0's auc: 0.83591
[600]	valid_0's auc: 0.836296
[700]	valid_0's auc: 0.836399
[800]	valid_0's auc: 0.836542
[900]	valid_0's auc: 0.836942
[1000]	valid_0's auc: 0.836997
Did not meet early stopping. Best iteration is:
[980]	valid_0's auc: 0.83701
fit time: 97.38110017776489




Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.817789
[200]	valid_0's auc: 0.826491
[300]	valid_0's auc: 0.828952
[400]	valid_0's auc: 0.829966
[500]	valid_0's auc: 0.83049
[600]	valid_0's auc: 0.830752
[700]	valid_0's auc: 0.830911
[800]	valid_0's auc: 0.831002
[900]	valid_0's auc: 0.831128
Early stopping, best iteration is:
[884]	valid_0's auc: 0.831163
fit time: 91.94811487197876


0

In [25]:
# ROC AUC of the out-of-fold predictions collected during cross-validation,
# measured against the training target.
roc_auc_oof = roc_auc_score(train_df["target"], oof_predicted)
print("CV roc_auc_oof: ", roc_auc_oof)

CV roc_auc_oof:  0.8330398780736885


# Predict

In [26]:
# Drop the training frame and collect garbage before the test data is loaded.
# Cells that read train_df cannot be re-run after this point without rebuilding it.
del train_df
gc.collect()

0

In [27]:
# Load the raw test dataset; data_loader is created outside this cell.
test_dataset = data_loader.load_test_dataset()
# Report the smallest and largest WEEK_NUM found in the test base table.
print("Test dataset week range: {}-{}".format(test_dataset.base["WEEK_NUM"].min(), test_dataset.base["WEEK_NUM"].max()))
gc.collect()
# Run the test data through the preprocessor (created outside this cell);
# the resulting frame is what the prediction cells below read.
test_df = preprocessor.process_test_dataset(test_dataset)
# The raw dataset is not used again below; drop it and collect garbage.
del test_dataset
gc.collect()

Test dataset week range: 92-92
Step: set_types, execution_time: 0.0180966854095459
Step: aggregate_depth_table, execution_time: 0.01849508285522461
Step: join_table, execution_time: 0.010310173034667969
Step: process_date, execution_time: 0.07008981704711914
Step: process_categorical, execution_time: 0.7044830322265625
Step: drop_columns, execution_time: 0.15135598182678223
Step: reduce_memory_usage, execution_time: 0.08845019340515137
Step: drop_non_important_features, execution_time: 0.07697653770446777


0

In [28]:
# Names of the (up to) 200 features with the largest summed importance,
# most important first; ties on importance are ordered by name, descending.
print([name for _, name in sorted(zip(model.get_feature_importance(), features), reverse=True)][:200])

['price_1097A', 'max_employedfrom_700D', 'annuity_780A', 'dateofbirth_337D', 'maxdpdinstldate_3546855D', 'pmtssum_45A', 'disbursedcredamount_1113A', 'numincomingpmts_3546848L', 'lastdelinqdate_224D', 'pmtnum_254L', 'amtinstpaidbefduel24m_4187115A', 'last_birth_259D', 'mean_amount_4527230A', 'pctinstlsallpaidlate1d_3546856L', 'maxdebt4_972A', 'max_amount_4527230A', 'maxdbddpdtollast12m_3658940P', 'datelastinstal40dpd_247D', 'thirdquarter_1082L', 'days360_512L', 'applicationscnt_867L', 'cntpmts24_3658933L', 'numinstlsallpaid_934L', 'validfrom_1069D', 'numinstlswithdpd10_728L', 'mean_maxdpdtolerance_577P', 'max_num_group1_3', 'numinstlallpaidearly3d_817L', 'mean_credacc_actualbalance_314A', 'mobilephncnt_593L', 'maxdpdtolerance_374P', 'interestrate_311L', 'numinstpaidearly3d_3546850L', 'numberofqueries_373L', 'max_birth_259D', 'days180_256L', 'eir_270L', 'pmtaverage_4527227A', 'min_birth_259D', 'avgdpdtolclosure24_3658938P', 'days120_123L', 'first_birth_259D', 'pmtaverage_3A', 'max_pmtnum

In [29]:
# CV roc_auc_oof:  0.8353789451006361 without feature selection ~500 features
# CV roc_auc_oof:  0.8329322959533371 with feature selection ~200 features

In [30]:
def gini_stability(dataset, w_fallingrate=88.0, w_resstd=-0.5):
    """Compute the gini stability score of predictions over time.

    For every ``WEEK_NUM`` the gini (``2 * AUC - 1``) of ``predicted`` against
    ``true`` is computed. A straight line is fitted through the weekly ginis
    in week order; the score combines the mean gini, the slope of that line
    (only when negative) and the standard deviation of the fit residuals.

    Parameters
    ----------
    dataset
        pandas-style DataFrame with columns ``WEEK_NUM``, ``true`` and
        ``predicted``.
    w_fallingrate : float
        Weight applied to ``min(0, slope)``.
    w_resstd : float
        Weight applied to the residual standard deviation.

    Returns
    -------
    float
        ``mean_gini + w_fallingrate * min(0, slope) + w_resstd * std(residuals)``.
        The three components are also printed.
    """
    def _weekly_gini(week_rows):
        return 2 * roc_auc_score(week_rows["true"], week_rows["predicted"]) - 1

    per_week = dataset.sort_values("WEEK_NUM").groupby("WEEK_NUM")[["true", "predicted"]]
    gini_in_time = per_week.apply(_weekly_gini).tolist()

    # Linear trend of the weekly gini over the week position (0, 1, 2, ...).
    week_position = np.arange(len(gini_in_time))
    slope, intercept = np.polyfit(week_position, gini_in_time, 1)
    trend = slope * week_position + intercept

    res_std = np.std(gini_in_time - trend)
    avg_gini = np.mean(gini_in_time)
    # Only a declining trend is penalised; a rising one contributes nothing.
    falling_rate = min(0, slope)

    print(avg_gini, falling_rate, res_std)

    return avg_gini + w_fallingrate * falling_rate + w_resstd * res_std

In [31]:
# Y_train_predicted = model.predict_proba(train_df[features].to_pandas())
# Averaged class probabilities of the fold ensemble for the preprocessed test rows.
Y_test_predicted = model.predict_proba(test_df[features].to_pandas())

In [32]:
# result = pd.DataFrame({
#     "WEEK_NUM": train_df["WEEK_NUM"],
#     "true": train_df["target"],
#     "predicted": Y_train_predicted[:, 1],
# })

# train_result = gini_stability(result)
# print("train_score: {}".format(train_result))

# if MODE == Mode.Train:
#     result = pd.DataFrame({
#         "WEEK_NUM": test_df["WEEK_NUM"],
#         "true": test_df["target"],
#         "predicted": Y_test_predicted[:, 1],
#     })

#     test_result = gini_stability(result)
#     print("test_score: {}".format(test_result))

In [33]:
# Version 1, test_score: 0.583319926590042, public score 0.48
# 100 iters

# Version 2, test_score: 0.6062624701357268, public score 0.517
# 300 iters

# Version 3, test_score: ?, public score 0.545
# 2000 iters

# Version 4, test_score: ?, public score ?
# ? iters, Improve aggregate functions for depth_1, depth_2


# Submission

In [34]:
# Score the test rows with the fold ensemble and write the submission file.
case_id = test_df["case_id"]
X = test_df[features].to_pandas()
Y = model.predict_proba(X)

# One row per case_id (used as the index); the score is column 1 of the
# averaged predict_proba output.
submission = pd.DataFrame(
    {"score": Y[:, 1]},
    index=pd.Index(case_id.to_numpy(), name="case_id"),
)
submission.to_csv("./submission.csv")