In [1]:
!pip install /kaggle/input/scikit-learn-1-4-2-cp310/scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/scikit-learn-1-4-2-cp310/scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.4.2


In [2]:
import os
import gc
import glob
import pickle
import psutil
from time import time

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score 
from scipy.stats import chi2_contingency

In [3]:
def get_execution_time(t):
    """Print the wall-clock seconds elapsed since timestamp `t`.

    Args:
        t: A start timestamp previously obtained from `time()`.
    """
    elapsed = time() - t
    print(f'Total time = {elapsed:.2f}s')


def get_memory():
    """Print the currently available system memory, expressed in GiB."""
    bytes_per_gb = 1024 * 1024 * 1024
    available_gb = psutil.virtual_memory().available / bytes_per_gb
    print(f'Available memory left: {available_gb:.2f}gb')


# Baseline reading before any data is loaded.
get_memory()

Available memory left: 30.20gb


# Data Preparation

In [4]:
# Wall-clock start of the notebook; the last cell reads it to report total runtime.
total_t = time()
SEED = 42  # passed to CatBoost as `random_seed`
MISSING_PERCENTAGE = 0.95  # null *fraction* (0-1) above which filter_columns drops a column
CATEGORIES_MAX = 200  # String columns with more distinct values than this are dropped
DATA_PATH = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/"
# Glob patterns (without the "<split>_" prefix and ".parquet" suffix) keyed by depth.
# read_files aggregates every table per case_id except those under 'depth_0'.
TABLES = {
    'depth_0': ['static_0_*', 'static_cb_0'],
    'depth_1': [
        'applprev_1_*', 
        'credit_bureau_a_1_*',
        'credit_bureau_b_1',
        'debitcard_1',
        'deposit_1',
        'other_1',
        'person_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
    ],
    'depth_2': [
        'applprev_2',
        'credit_bureau_a_2_*',
        'credit_bureau_b_2',   
        'person_2',
    ],
}
# When non-empty, get_train reads this parquet instead of rebuilding from raw tables.
PROCESSED_DATA_PATH = ''
# Columns selected from the cached parquet; only read when PROCESSED_DATA_PATH is set.
SELECTED_COLS = []

In [5]:
def reduce_memory(df):
    """Return `df` with every column passed through polars' `shrink_dtype`."""
    shrunk_columns = pl.all().shrink_dtype()
    return df.select(shrunk_columns)


def set_dtypes(df):
    """Cast columns to a dtype chosen from the column name.

    Rules, checked in this order:
      * "target", "case_id", "WEEK_NUM", "MONTH"      -> Int32
      * "date_decision" or a name ending in "D"       -> Date
      * a name ending in "P"/"A" or containing "num_group" -> Float32
      * a name ending in "M"                          -> String
    Columns matching none of the rules keep their dtype.

    Args:
        df: polars DataFrame to convert.

    Returns:
        The DataFrame with the casts applied.
    """
    key_columns = ("target", "case_id", "WEEK_NUM", "MONTH")
    casts = []

    for name in df.columns:
        suffix = name[-1]
        if name in key_columns:
            new_dtype = pl.Int32
        elif name == "date_decision" or suffix == "D":
            new_dtype = pl.Date
        elif suffix in ("P", "A") or "num_group" in name:
            new_dtype = pl.Float32
        elif suffix == "M":
            new_dtype = pl.String
        else:
            continue
        casts.append(pl.col(name).cast(new_dtype))

    # Each cast targets a distinct column, so one call equals applying them in turn.
    return df.with_columns(casts) if casts else df


def filter_columns(df):
    """Drop uninformative columns from `df`.

    A column is dropped when it is a String column with exactly one or more than
    CATEGORIES_MAX distinct values, or when its null fraction exceeds
    MISSING_PERCENTAGE. "target", "case_id", "WEEK_NUM" and "MONTH" are never dropped.

    Args:
        df: polars DataFrame to filter.

    Returns:
        The DataFrame without the rejected columns.
    """
    protected = ("target", "case_id", "WEEK_NUM", "MONTH")

    def should_drop(name):
        series = df[name]

        # Categorical check: constant or too many categories
        if series.dtype == pl.String:
            distinct = series.n_unique()
            if distinct == 1 or distinct > CATEGORIES_MAX:
                return True

        # Sparsity check: share of nulls above the allowed fraction
        return series.is_null().mean() > MISSING_PERCENTAGE

    rejected = {
        name for name in df.columns if name not in protected and should_drop(name)
    }
    return df.drop(list(rejected))


def aggregate(df):
    """Build the aggregation expressions used to collapse a table to one row per case.

    Every column except "target", "case_id", "WEEK_NUM" and "MONTH" gets a
    `max_<col>` expression; columns whose name ends in "P", "A" or "D" also get
    `mean_<col>` and `var_<col>`.

    Args:
        df: polars DataFrame whose column names drive the expression list.

    Returns:
        A list of polars expressions suitable for `group_by(...).agg(...)`.
    """
    skipped = ("target", "case_id", "WEEK_NUM", "MONTH")
    exprs = []

    for name in df.columns:
        if name in skipped:
            continue

        column = pl.col(name)
        exprs.append(column.max().alias(f"max_{name}"))

        if name[-1] in ("P", "A", "D"):
            exprs.extend([
                column.mean().alias(f"mean_{name}"),
                column.var().alias(f"var_{name}"),
            ])

    return exprs


def handle_dates(df):
    """Express date-like columns relative to `date_decision`, then drop that column.

    Columns ending in "D" become the day difference to `date_decision` as Float32.
    A fixed list of year-valued columns becomes the difference to the decision year.

    Args:
        df: polars DataFrame that contains a `date_decision` column.

    Returns:
        The transformed DataFrame without `date_decision`.

    Raises:
        AssertionError: if `date_decision` is not a column of `df`.
    """
    assert "date_decision" in df.columns, "date_decision not in df"

    # These columns are represented in years (ex: 2020, 2021, ...)
    year_columns = [
        "dpdmaxdateyear_596T",
        "dpdmaxdateyear_742T",
        "dpdmaxdateyear_896T",
        "overdueamountmaxdateyear_2T",
        "overdueamountmaxdateyear_432T",
        "overdueamountmaxdateyear_994T",
        "pmts_year_1139T",
        "pmts_year_507T",
    ]
    decision = pl.col("date_decision")

    for name in df.columns:
        current = pl.col(name)
        if name[-1] == "D":
            # date - date gives a duration; convert it to whole days
            df = df.with_columns(current - decision).with_columns(
                current.dt.total_days().cast(pl.Float32)
            )
        elif name in year_columns:
            df = df.with_columns(current - decision.dt.year().cast(pl.Float32))

    return df.drop("date_decision")


def read_files(split="train"):
    """Load the base table for `split` and left-join every table listed in TABLES.

    For each pattern in TABLES the matching parquet files are read, typed with
    `set_dtypes`, made relative to the decision date with `handle_dates` and, for
    every depth other than "depth_0", aggregated to one row per `case_id` using
    the expressions from `aggregate`. Files of one pattern are stacked with a
    diagonal concat, de-duplicated on `case_id` and joined onto the base frame.
    For the "train" split `filter_columns` is applied after every join.

    A pattern that matches no file (or yields a frame without `case_id`) is
    reported and skipped instead of raising on the `unique`/`join` calls.

    Args:
        split: Sub-directory and file prefix to read, e.g. "train" or "test".

    Returns:
        A polars DataFrame with one row per base-table row, without the
        `date_decision` and `MONTH` columns and with shrunken dtypes.
    """
    df = pl.read_parquet(DATA_PATH + f"{split}/{split}_base.parquet").pipe(set_dtypes)
    df = df.with_columns([
        pl.col("date_decision").dt.month().alias("month_decision"),
        pl.col("date_decision").dt.weekday().alias("weekday_decision"),
    ])
    # Lookup joined to every table so handle_dates can compute relative dates.
    date_decision = df.select(pl.col(["case_id", "date_decision"]))

    for key, item in TABLES.items():
        print(f"#\tHandling {key}")

        for i in item:
            print(f"##\tProcessing {i}")
            sub_df = pl.DataFrame()

            # glob returns files in arbitrary order; sorting keeps the concat
            # (and therefore the resulting column order) reproducible.
            for file in sorted(glob.glob(DATA_PATH + f"{split}/{split}_{i}.parquet")):
                dummy = (
                    pl.read_parquet(file)
                    .pipe(set_dtypes)
                    .join(date_decision, how="left", on="case_id")
                    .pipe(handle_dates)
                )

                # depth_0 tables are joined as-is; deeper tables are collapsed per case
                if key != "depth_0" and not dummy.is_empty():
                    dummy = dummy.group_by("case_id").agg(aggregate(dummy))

                sub_df = (
                    dummy
                    if sub_df.is_empty()
                    else pl.concat([sub_df, dummy], how="diagonal_relaxed")
                )

            # No file matched the pattern: there is nothing to join on.
            if "case_id" not in sub_df.columns:
                print(f"##\tNo data found for {i}, skipping")
                continue

            df = df.join(
                sub_df.unique(subset=["case_id"]),
                how="left",
                on="case_id",
                suffix=f"_{i}",
            )
            if split == "train":
                df = df.pipe(filter_columns)

    return df.drop(["date_decision", "MONTH"]).pipe(reduce_memory)

## Get Train Set

In [6]:
def get_train():
    """Build (or load) the training frame and split its features by kind.

    When PROCESSED_DATA_PATH is empty the raw tables are read through
    `read_files`, filtered once more and written to "./data.parquet";
    otherwise the cached parquet is loaded and restricted to SELECTED_COLS.
    String and Boolean columns are treated as categorical, everything else
    (apart from "target", "WEEK_NUM" and "case_id") as numerical.

    Returns:
        A tuple `(df, cat_cols, num_cols)`: the pandas DataFrame with the
        categorical columns cast to "category", and the two column-name lists.
    """
    started = time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    if PROCESSED_DATA_PATH != "":
        print("Processed data found, use these instead")
        df = pl.read_parquet(PROCESSED_DATA_PATH).select(SELECTED_COLS)
    else:
        print("Reading files")
        df = read_files().pipe(filter_columns)
        df.write_parquet("./data.parquet")
    get_memory()
    get_execution_time(started)

    print(f"Unique dtypes: {set(df.dtypes)}")
    non_features = ["target", "WEEK_NUM", "case_id"]
    categorical_dtypes = (pl.String, pl.Boolean)
    cat_cols, num_cols = [], []
    for col, dtype in zip(df.columns, df.dtypes):
        if col in non_features:
            continue
        bucket = cat_cols if dtype in categorical_dtypes else num_cols
        bucket.append(col)

    started = time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Converting to pandas")
    df = df.to_pandas()
    df[cat_cols] = df[cat_cols].astype("category")

    n_rows, n_cols = len(df.index), len(df.columns)
    print(f"{n_rows} rows and {n_cols} columns")
    print(f"{len(cat_cols)} categorical and {len(num_cols)} numerical columns")
    get_memory()
    get_execution_time(started)

    return df.copy(), cat_cols, num_cols

In [7]:
%%time
# Build the training frame plus the categorical / numerical feature-name lists.
print('Getting train set')
df, cat_cols, num_cols = get_train()
# Last rows only: a quick check of the frame's shape and contents.
df.tail()

Getting train set
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Reading files
#	Handling depth_0
##	Processing static_0_*
##	Processing static_cb_0
#	Handling depth_1
##	Processing applprev_1_*
##	Processing credit_bureau_a_1_*
##	Processing credit_bureau_b_1
##	Processing debitcard_1
##	Processing deposit_1
##	Processing other_1
##	Processing person_1
##	Processing tax_registry_a_1
##	Processing tax_registry_b_1
##	Processing tax_registry_c_1
#	Handling depth_2
##	Processing applprev_2
##	Processing credit_bureau_a_2_*
##	Processing credit_bureau_b_2
##	Processing person_2
Available memory left: 25.97gb
Total time = 253.30s
Unique dtypes: {Int32, String, Float32, Int8, Boolean}
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converting to pandas
1526659 rows and 503 columns
79 categorical and 421 numerical columns
Available memory left: 26.99gb
Total time = 21.30s
CPU times: user 7min 27s, sys: 2min 9s, total: 9min 36s
Wall time: 4min 35s


Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,var_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,max_num_group1_person_2,max_num_group2_person_2
1526654,2703450,91,0,10,1,0.0,176561.359375,3675.400146,0.0,0.0,...,223616.078125,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526655,2703451,91,0,10,1,0.0,301276.46875,7088.600098,6191.600098,0.0,...,0.0,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526656,2703452,91,0,10,1,0.0,14232.400391,7788.800293,0.0,0.0,...,0.0,1.0,0.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526657,2703453,91,0,10,1,0.0,197371.578125,1195.400024,2827.199951,0.0,...,292734.6875,1.0,0.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526658,2703454,91,0,10,1,0.0,82949.601562,4533.800293,2986.800049,0.0,...,179989.234375,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0


## Feature Selection

In [8]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    Args:
        x: First categorical variable (anything `pd.crosstab` accepts).
        y: Second categorical variable, aligned with `x`.

    Returns:
        The bias-corrected Cramér's V. Degenerate inputs (empty table, a single
        observation, or a variable with only one observed category) return 0.0
        instead of NaN, since no association can be measured.
    """
    # https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix
    cm = pd.crosstab(x, y)
    n = cm.sum().sum()
    # chi2_contingency rejects empty tables and (n - 1) is a divisor below.
    if cm.size == 0 or n <= 1:
        return 0.0

    chi2 = chi2_contingency(cm)[0]
    phi2 = chi2 / n
    r, k = cm.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # With a single row or column the corrected dimension is 1, making this 0.
    denominator = min((kcorr - 1), (rcorr - 1))
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

In [9]:
%%time
# https://www.kaggle.com/code/harrychan123/lgb-cat-ensemble-stacking
print("\nFinding numerical columns with high correlation")
# Bucket the numerical columns by their number of nulls: correlations are only
# compared between columns that share the same null count.
nans_df = df[num_cols].isna()
nans_groups = {}
for col in num_cols:
    cur_group = nans_df[col].sum()
    # setdefault replaces the former bare `except:`, which would also have
    # swallowed unrelated errors (e.g. KeyboardInterrupt).
    nans_groups.setdefault(cur_group, []).append(col)
del nans_df
gc.collect()


def reduce_group(grps, df):
    """Pick one representative column per group: the one with most distinct values.

    Args:
        grps: Iterable of non-empty lists of column names.
        df: pandas DataFrame holding those columns.

    Returns:
        A list with one column name per group. Ties, and groups whose columns
        all have zero distinct values, resolve to the earliest column.
    """
    selected = []
    for members in grps:
        champion = members[0]
        distinct = [df[name].nunique() for name in members]
        top = max(distinct)
        if top > 0:
            # list.index returns the first position holding the maximum
            champion = members[distinct.index(top)]
        selected.append(champion)
    return selected


def group_columns_by_correlation(matrix, threshold=0.9):
    """Greedily partition the columns of `matrix` into correlated groups.

    The first unassigned column becomes an anchor; every later unassigned column
    whose correlation with the anchor is at least `threshold` joins its group.
    Only the signed correlation is compared, so strongly negative pairs are not
    grouped, and NaN correlations never match.

    Args:
        matrix: pandas DataFrame of numerical columns.
        threshold: Minimum correlation with the anchor to join its group.

    Returns:
        A list of groups, each a list of column names starting with its anchor.
    """
    correlations = matrix.corr()
    pending = list(matrix.columns)
    groups = []

    while pending:
        anchor, others = pending[0], pending[1:]
        partners = [
            name for name in others if correlations.loc[anchor, name] >= threshold
        ]
        groups.append([anchor] + partners)
        pending = [name for name in others if name not in partners]

    return groups


# Keep one representative per correlated group inside every null-count bucket;
# columns that are alone in their bucket are kept unchanged.
uses = []
for same_null_cols in nans_groups.values():
    if len(same_null_cols) <= 1:
        uses.extend(same_null_cols)
        continue
    grps = group_columns_by_correlation(df[same_null_cols])
    uses.extend(reduce_group(grps, df))

kept = set(uses)
to_remove = {col for col in num_cols if col not in kept}
print(f"{len(to_remove)} columns are to be dropped")
df = df.drop(columns=list(to_remove))
num_cols = [item for item in num_cols if item not in to_remove]


Finding numerical columns with high correlation
65 columns are to be dropped
CPU times: user 21.9 s, sys: 1.65 s, total: 23.6 s
Wall time: 23.2 s


In [10]:
%%time
# For every pair of categorical columns whose Cramer's V exceeds 0.9, drop the
# second column of the pair. Columns already marked for removal are skipped.
print("\nFinding categorical columns with high association")
dummy = df[cat_cols].astype("str").astype("category")

to_remove_cat = set()
for i, col1 in enumerate(cat_cols):
    for col2 in cat_cols[i + 1:]:
        if col1 == col2 or {col1, col2} & to_remove_cat:
            continue

        corr = cramers_v(dummy[col1], dummy[col2])
        if corr > 0.9:
            print(f"{col1} & {col2} = {corr}")
            to_remove_cat.add(col2)

print(f"{len(to_remove_cat)} columns are to be dropped")
df = df.drop(columns=list(to_remove_cat))
cat_cols = [item for item in cat_cols if item not in to_remove_cat]
del dummy
gc.collect()


Finding categorical columns with high association
cardtype_51L & isdebitcard_729L = 0.9981812530220908
paytype1st_925L & paytype_783L = 0.9999911233493446
max_subjectrole_182M & max_subjectroles_name_838M = 0.9966809618914648
max_contaddr_matchlist_1032L & max_contaddr_smempladdr_334L = 0.9999996724869554
max_empladdr_district_926M & max_empladdr_zipcode_114M = 0.9999993449652226
max_relationshiptoclient_415T & max_relationshiptoclient_642T = 0.9999999999999999
max_relationshiptoclient_415T & max_remitter_829L = 0.9999970523787385
max_conts_role_79M & max_empls_employer_name_740M = 0.9999986899471787
8 columns are to be dropped
CPU times: user 7min 26s, sys: 3.67 s, total: 7min 30s
Wall time: 7min 30s


0

In [11]:
# Inspect the frame again now that correlated columns have been dropped.
df.tail()

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,var_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_conts_role_79M,max_empls_economicalst_849M,max_num_group1_person_2,max_num_group2_person_2
1526654,2703450,91,0,10,1,0.0,176561.359375,3675.400146,0.0,0.0,...,0.0,69.693535,223616.078125,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526655,2703451,91,0,10,1,0.0,301276.46875,7088.600098,6191.600098,0.0,...,0.0,0.0,0.0,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526656,2703452,91,0,10,1,0.0,14232.400391,7788.800293,0.0,0.0,...,662676.9375,0.0,0.0,1.0,0.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526657,2703453,91,0,10,1,0.0,197371.578125,1195.400024,2827.199951,0.0,...,0.0,123.303925,292734.6875,1.0,0.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526658,2703454,91,0,10,1,0.0,82949.601562,4533.800293,2986.800049,0.0,...,0.0,38.611423,179989.234375,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0


In [12]:
# Log the surviving column names (this still includes case_id, WEEK_NUM and target).
print(f'\nColumns to use {len(df.columns)}: ')
print(df.columns.tolist())


Columns to use 430: 
['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dp

## Get Test Set

In [13]:
def get_test(features, dtypes):
    """Load the test split and align it with the training feature schema.

    Features absent from the test frame are added as all-missing columns
    (NaN for numeric dtypes, None otherwise) and every feature is cast to the
    dtype it had in training.

    Args:
        features: Feature column names, in training order.
        dtypes: pandas dtypes matching `features` position by position.

    Returns:
        A pandas DataFrame with "case_id" followed by `features`.
    """
    df_submission = read_files(split="test").to_pandas()
    available = df_submission.columns.to_list()
    n_rows = len(df_submission.index)

    mapping = dict(zip(features, dtypes))

    # Placeholder columns for features the test tables did not produce
    missing = {
        col: np.repeat(np.nan if dtype.kind in "iufc" else None, n_rows)
        for col, dtype in mapping.items()
        if col not in available
    }

    padded = pd.concat([df_submission, pd.DataFrame(missing)], axis=1)
    typed = padded.astype(mapping)

    return typed[["case_id"] + features].copy()

In [14]:
%%time
print('Getting test set')
# Numerical features first, then categorical ones; this order is reused for X and inference.
features = num_cols + cat_cols
# Training dtypes, so get_test can cast the test columns to the same types.
dtypes = df[features].dtypes.to_list()
df_submission = get_test(features, dtypes)
print(f'Test set has {len(df_submission.index)} rows and {len(df_submission.columns)} columns')
get_memory()
df_submission.head()

Getting test set
#	Handling depth_0
##	Processing static_0_*
##	Processing static_cb_0
#	Handling depth_1
##	Processing applprev_1_*
##	Processing credit_bureau_a_1_*
##	Processing credit_bureau_b_1
##	Processing debitcard_1
##	Processing deposit_1
##	Processing other_1
##	Processing person_1
##	Processing tax_registry_a_1
##	Processing tax_registry_b_1
##	Processing tax_registry_c_1
#	Handling depth_2
##	Processing applprev_2
##	Processing credit_bureau_a_2_*
##	Processing credit_bureau_b_2
##	Processing person_2
Test set has 10 rows and 428 columns
Available memory left: 24.99gb
CPU times: user 656 ms, sys: 497 ms, total: 1.15 s
Wall time: 1.25 s


Unnamed: 0,case_id,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,...,max_cacccardblochreas_147M,max_conts_type_509L,max_credacc_cards_status_52L,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_subjectroles_name_541M,max_conts_role_79M,max_empls_economicalst_849M
0,57543,5,5,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,0.0,...,a55475b1,PRIMARY_MOBILE,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,,
1,57549,1,1,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,0.0,...,,,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,,
2,57551,11,5,0.0,71036.398438,2844.600098,0.0,0.0,1.0,0.0,...,,,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,a55475b1,a55475b1
3,57552,11,5,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,0.0,...,,,,,,,,,a55475b1,a55475b1
4,57569,12,1,0.0,0.0,4682.600098,0.0,0.0,1.0,0.0,...,,,,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,a55475b1


# Training

In [15]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability score built from the weekly Gini coefficients.

    A Gini (2 * AUC - 1) is computed for every WEEK_NUM, a straight line is
    fitted through the weekly values, and the score is
    mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals).

    Args:
        base: pandas DataFrame with "WEEK_NUM", "target" and "score" columns.
        w_fallingrate: Weight applied to a negative slope of the fitted line.
        w_resstd: Weight applied to the standard deviation of the residuals.

    Returns:
        The stability score as a float.
    """
    # https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook
    per_week = (
        base.loc[:, ["WEEK_NUM", "target", "score"]]
        .sort_values("WEEK_NUM")
        .groupby("WEEK_NUM")[["target", "score"]]
    )
    gini_in_time = per_week.apply(
        lambda week: 2 * roc_auc_score(week["target"], week["score"]) - 1
    ).tolist()

    week_idx = np.arange(len(gini_in_time))
    slope, intercept = np.polyfit(week_idx, gini_in_time, 1)
    fitted = slope * week_idx + intercept
    res_std = np.std(gini_in_time - fitted)
    avg_gini = np.mean(gini_in_time)

    return avg_gini + w_fallingrate * min(0, slope) + w_resstd * res_std

In [16]:
# Training inputs: features, target, and the week number used as the CV group.
X = df[features].copy()
# Categorical columns are handed to CatBoost as plain strings, with "None" for missing.
X[cat_cols] = X[cat_cols].astype('string').fillna("None").astype(str)
y = df['target'].copy()
groups = df['WEEK_NUM'].copy()
case_id = df['case_id'].copy()
# `split` returns a one-shot generator; materialise it so the training cell can
# be re-run without silently iterating over an exhausted generator (zero folds).
cv = list(StratifiedGroupKFold(n_splits=5).split(X, y, groups))

del df
gc.collect()

0

In [17]:
# CatBoost hyper-parameters; the same dict is used for every CV fold.
params = {
    'eval_metric': 'AUC',  # metric reported on the eval_set in the training log
    'l2_leaf_reg': 10,
    'random_seed': SEED,
    'iterations': 3000,  # upper bound on boosting rounds
    'learning_rate': 0.03,
    'early_stopping_rounds': 50,
    'task_type': 'GPU',     
    'devices': '0:1',  # presumably GPU ids 0 and 1 -- confirm against the CatBoost docs
}

In [18]:
# One CatBoost model per CV fold; per-fold AUC and stability scores are collected
# alongside the fitted models.
cat_models, rocs, ginis = [], [], []
for i, (train_index, valid_index) in enumerate(cv):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(f'Currently on fold {i + 1}')

    t = time()
    
    # Training: fit on the train rows, monitor the validation rows as eval_set
    cat = CatBoostClassifier(**params)
    cat.fit(
        Pool(X.iloc[train_index], y.iloc[train_index], cat_features=cat_cols),
        eval_set=Pool(X.iloc[valid_index], y.iloc[valid_index], cat_features=cat_cols),
        verbose=200,
    )
    
    # Evaluation: column 1 of predict_proba is used as the score
    y_preds = cat.predict_proba(X.iloc[valid_index])[:, 1]
    # Frame in the layout gini_stability expects (WEEK_NUM / target / score)
    base = pd.DataFrame({
        'WEEK_NUM': groups.iloc[valid_index],
        'target': y.iloc[valid_index],
        'score': y_preds,
    })
    
    roc = roc_auc_score(y.iloc[valid_index], y_preds)
    gini = gini_stability(base)
    
    print(f"AUC: {roc}")
    print(f'Stability: {gini}')
    
    # Saving
    cat_models.append(cat) 
    rocs.append(roc)
    ginis.append(gini)
    
    get_memory()
    get_execution_time(t)
    
    # Only unbinds the loop name; the fitted model stays referenced by cat_models
    del cat
    gc.collect()

~~~~~~~~~~~~~~~~~~~~~~~~~~~
Currently on fold 1


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6824181	best: 0.6824181 (0)	total: 14.4s	remaining: 12h 1m 50s
200:	test: 0.8265890	best: 0.8265890 (200)	total: 1m 55s	remaining: 26m 48s
400:	test: 0.8371554	best: 0.8371554 (400)	total: 3m 34s	remaining: 23m 11s
600:	test: 0.8407125	best: 0.8407125 (600)	total: 5m 13s	remaining: 20m 49s
800:	test: 0.8426733	best: 0.8426733 (800)	total: 6m 50s	remaining: 18m 46s
1000:	test: 0.8440377	best: 0.8440377 (1000)	total: 8m 25s	remaining: 16m 50s
1200:	test: 0.8449118	best: 0.8449141 (1199)	total: 10m 1s	remaining: 15m 1s
1400:	test: 0.8456062	best: 0.8456077 (1396)	total: 11m 36s	remaining: 13m 15s
1600:	test: 0.8462840	best: 0.8462840 (1600)	total: 13m 11s	remaining: 11m 31s
1800:	test: 0.8468493	best: 0.8468493 (1800)	total: 14m 47s	remaining: 9m 50s
2000:	test: 0.8473385	best: 0.8473386 (1999)	total: 16m 23s	remaining: 8m 10s
2200:	test: 0.8478146	best: 0.8478146 (2200)	total: 17m 58s	remaining: 6m 31s
2400:	test: 0.8482247	best: 0.8482247 (2400)	total: 19m 34s	remaining: 4m 5

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6694137	best: 0.6694137 (0)	total: 515ms	remaining: 25m 43s
200:	test: 0.8275394	best: 0.8275394 (200)	total: 1m 42s	remaining: 23m 43s
400:	test: 0.8374584	best: 0.8374584 (400)	total: 3m 22s	remaining: 21m 53s
600:	test: 0.8412201	best: 0.8412231 (599)	total: 5m 1s	remaining: 20m 5s
800:	test: 0.8430964	best: 0.8430964 (800)	total: 6m 39s	remaining: 18m 16s
1000:	test: 0.8443843	best: 0.8443843 (1000)	total: 8m 16s	remaining: 16m 31s
1200:	test: 0.8452720	best: 0.8452720 (1200)	total: 9m 53s	remaining: 14m 48s
1400:	test: 0.8460715	best: 0.8460718 (1399)	total: 11m 29s	remaining: 13m 7s
1600:	test: 0.8465969	best: 0.8465971 (1599)	total: 13m 5s	remaining: 11m 26s
1800:	test: 0.8470960	best: 0.8470960 (1800)	total: 14m 42s	remaining: 9m 47s
2000:	test: 0.8474721	best: 0.8474721 (2000)	total: 16m 17s	remaining: 8m 8s
2200:	test: 0.8478433	best: 0.8478437 (2199)	total: 17m 53s	remaining: 6m 29s
2400:	test: 0.8482372	best: 0.8482372 (2400)	total: 19m 30s	remaining: 4m 52s
2600

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6357994	best: 0.6357994 (0)	total: 506ms	remaining: 25m 18s
200:	test: 0.8322414	best: 0.8322414 (200)	total: 1m 42s	remaining: 23m 41s
400:	test: 0.8420390	best: 0.8420390 (400)	total: 3m 22s	remaining: 21m 52s
600:	test: 0.8459163	best: 0.8459163 (600)	total: 5m 1s	remaining: 20m 2s
800:	test: 0.8481433	best: 0.8481433 (800)	total: 6m 39s	remaining: 18m 16s
1000:	test: 0.8494538	best: 0.8494538 (1000)	total: 8m 16s	remaining: 16m 32s
1200:	test: 0.8505006	best: 0.8505006 (1200)	total: 9m 53s	remaining: 14m 49s
1400:	test: 0.8513092	best: 0.8513092 (1400)	total: 11m 29s	remaining: 13m 7s
1600:	test: 0.8519883	best: 0.8519883 (1600)	total: 13m 5s	remaining: 11m 26s
1800:	test: 0.8525532	best: 0.8525532 (1800)	total: 14m 41s	remaining: 9m 46s
2000:	test: 0.8530353	best: 0.8530353 (2000)	total: 16m 17s	remaining: 8m 7s
2200:	test: 0.8534708	best: 0.8534708 (2200)	total: 17m 53s	remaining: 6m 29s
2400:	test: 0.8538278	best: 0.8538278 (2400)	total: 19m 29s	remaining: 4m 51s
2600

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6371231	best: 0.6371231 (0)	total: 504ms	remaining: 25m 11s
200:	test: 0.8308125	best: 0.8308125 (200)	total: 1m 42s	remaining: 23m 46s
400:	test: 0.8414921	best: 0.8414921 (400)	total: 3m 22s	remaining: 21m 50s
600:	test: 0.8448939	best: 0.8448939 (600)	total: 5m 1s	remaining: 20m 1s
800:	test: 0.8467970	best: 0.8467970 (800)	total: 6m 38s	remaining: 18m 14s
1000:	test: 0.8483033	best: 0.8483033 (1000)	total: 8m 16s	remaining: 16m 31s
1200:	test: 0.8492417	best: 0.8492417 (1200)	total: 9m 53s	remaining: 14m 48s
1400:	test: 0.8500152	best: 0.8500152 (1400)	total: 11m 29s	remaining: 13m 7s
1600:	test: 0.8505497	best: 0.8505506 (1599)	total: 13m 6s	remaining: 11m 27s
1800:	test: 0.8510689	best: 0.8510689 (1800)	total: 14m 42s	remaining: 9m 47s
2000:	test: 0.8515930	best: 0.8515930 (2000)	total: 16m 19s	remaining: 8m 9s
2200:	test: 0.8520366	best: 0.8520366 (2200)	total: 17m 56s	remaining: 6m 30s
2400:	test: 0.8524758	best: 0.8524758 (2400)	total: 19m 33s	remaining: 4m 52s
2600

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6305729	best: 0.6305729 (0)	total: 507ms	remaining: 25m 21s
200:	test: 0.8260240	best: 0.8260240 (200)	total: 1m 41s	remaining: 23m 39s
400:	test: 0.8364660	best: 0.8364660 (400)	total: 3m 22s	remaining: 21m 50s
600:	test: 0.8403848	best: 0.8403848 (600)	total: 5m 1s	remaining: 20m 2s
800:	test: 0.8424206	best: 0.8424217 (799)	total: 6m 38s	remaining: 18m 15s
1000:	test: 0.8436083	best: 0.8436083 (1000)	total: 8m 16s	remaining: 16m 30s
1200:	test: 0.8446691	best: 0.8446691 (1200)	total: 9m 53s	remaining: 14m 48s
1400:	test: 0.8454474	best: 0.8454474 (1400)	total: 11m 30s	remaining: 13m 7s
1600:	test: 0.8461038	best: 0.8461038 (1600)	total: 13m 5s	remaining: 11m 26s
1800:	test: 0.8467391	best: 0.8467391 (1800)	total: 14m 42s	remaining: 9m 47s
2000:	test: 0.8473190	best: 0.8473190 (2000)	total: 16m 18s	remaining: 8m 8s
2200:	test: 0.8478216	best: 0.8478216 (2200)	total: 17m 54s	remaining: 6m 30s
2400:	test: 0.8482572	best: 0.8482586 (2398)	total: 19m 30s	remaining: 4m 52s
2600

In [19]:
# Means over the CV folds. `ginis` holds the gini_stability score of each fold,
# so the second line is the stability metric rather than a plain Gini.
print(f'Average AUC score: {np.mean(rocs)}')
print(f'Average gini score: {np.mean(ginis)}')

Average AUC score: 0.8511675112716487
Average gini score: 0.6837276412070508


In [20]:
# Persist the list of fitted fold models to the working directory.
# NOTE: pickle files must only ever be loaded back from a trusted source.
with open('cat.pickle', 'wb') as handle:
    pickle.dump(cat_models, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Submission

In [21]:
class VotingModel:
    """Average the class probabilities of several fitted binary classifiers.

    Predictions are made in row batches so that large frames do not have to be
    scored in a single call.
    """

    # https://www.kaggle.com/code/kononenko/metric-trick-home-credit-baseline-inference
    def __init__(self, estimators, batch_size=100000):
        """Store the estimators to average and the number of rows per batch."""
        self.estimators = estimators
        self.batch_size = batch_size

    def predict_proba_in_batches(self, model, data):
        """Return an (n_rows, 2) probability array for `data` from one `model`.

        `data` must support `len()` and `.iloc` slicing; `model.predict_proba`
        is called once per batch of at most `batch_size` rows.
        """
        total = len(data)
        step = self.batch_size
        n_chunks = int(np.ceil(total / step))
        out = np.zeros((total, 2))

        for chunk_no in range(n_chunks):
            lo = chunk_no * step
            hi = min(lo + step, total)
            out[lo:hi, :] = model.predict_proba(data.iloc[lo:hi])
            gc.collect()

        return out

    def predict_proba(self, X):
        """Return the element-wise mean of every estimator's probabilities for `X`."""
        per_model = []
        for estimator in self.estimators:
            per_model.append(self.predict_proba_in_batches(estimator, X))
        return np.mean(per_model, axis=0)

In [22]:
%%time
print('Predicting on test set')
# Use a separate name for the ensemble: rebinding `cat_models` from a list to a
# VotingModel made this cell fail on a second run (the wrapper would be wrapped
# again and could no longer be iterated).
voting_model = VotingModel(cat_models)
# Same categorical encoding as the training features: strings, "None" for missing.
df_submission[cat_cols] = df_submission[cat_cols].astype('string').fillna("None").astype(str)
df_submission['score'] = voting_model.predict_proba(df_submission[features])[:, 1]
# Preview only; the full frame can be very large on the real test set.
print(df_submission[['case_id', 'score']].head(10).to_string())

Predicting on test set
   case_id     score
0    57543  0.013040
1    57549  0.043042
2    57551  0.003140
3    57552  0.026022
4    57569  0.106713
5    57630  0.011424
6    57631  0.028799
7    57632  0.005259
8    57633  0.026575
9    57634  0.014078
CPU times: user 639 ms, sys: 944 µs, total: 640 ms
Wall time: 637 ms


In [23]:
# Write the two submission columns without the row index.
submission_columns = ['case_id', 'score']
df_submission[submission_columns].to_csv('./submission.csv', index=False)

In [24]:
# total_t was taken in the configuration cell, so this covers the whole run.
print(f'Notebook completed in {(time() - total_t) / 60:.2f}min')
get_memory()

Notebook completed in 140.06min
Available memory left: 19.18gb
