In [1]:
# Install scikit-learn 1.4.2 from a wheel stored in an attached Kaggle input dataset (a local path, so no
# network access is needed); per the log below this replaces the preinstalled 1.2.2.
!pip install /kaggle/input/scikit-learn-1-4-2-cp310/scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

Processing /kaggle/input/scikit-learn-1-4-2-cp310/scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.4.2


In [2]:
import os
import gc
import glob
import pickle
import psutil
from time import time

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import roc_auc_score 
from scipy.stats import chi2_contingency

In [3]:
def get_execution_time(t):
    """Print how many seconds have passed since the timestamp ``t``.

    Args:
        t: A start timestamp previously obtained from ``time()``.
    """
    elapsed = time() - t
    print(f'Total time = {elapsed:.2f}s')


def get_memory():
    """Print the amount of available system memory, expressed in GiB."""
    available_bytes = psutil.virtual_memory().available
    available_gb = available_bytes / (1024 * 1024 * 1024)
    print(f'Available memory left: {available_gb:.2f}gb')


# Baseline reading before any data is loaded.
get_memory()

Available memory left: 30.20gb


# Data Preparation

In [4]:
# Wall-clock start of the notebook; the last cell prints the elapsed minutes from it.
total_t = time()
# Global random seed; the XGBoost `random_state` uses the same value.
SEED = 42
# filter_columns drops a column when its fraction of nulls exceeds this value.
MISSING_PERCENTAGE = 0.95
# filter_columns drops a string column with exactly 1, or more than this many, distinct values.
CATEGORIES_MAX = 200
# Root folder of the parquet files; read_files appends "<split>/<split>_<table>.parquet" to it.
DATA_PATH = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/"
# Table name patterns grouped by depth. read_files expands every pattern with glob (so a trailing
# "*" can match several chunk files) and aggregates each depth other than depth_0 per case_id.
TABLES = {
    'depth_0': ['static_0_*', 'static_cb_0'],
    'depth_1': [
        'applprev_1_*', 
        'credit_bureau_a_1_*',
        'credit_bureau_b_1',
        'debitcard_1',
        'deposit_1',
        'other_1',
        'person_1',
        'tax_registry_a_1',
        'tax_registry_b_1',
        'tax_registry_c_1',
    ],
    'depth_2': [
        'applprev_2',
        'credit_bureau_a_2_*',
        'credit_bureau_b_2',   
        'person_2',
    ],
}
# Path of an already processed parquet file; when left empty get_train rebuilds the data with read_files.
PROCESSED_DATA_PATH = ''
# Columns selected from the processed parquet file when PROCESSED_DATA_PATH is set.
SELECTED_COLS = []

In [5]:
def reduce_memory(df):
    """Return ``df`` with every column passed through polars' ``shrink_dtype``."""
    shrunk_columns = pl.all().shrink_dtype()
    return df.select(shrunk_columns)


def set_dtypes(df):
    """Cast columns to the dtype implied by their name.

    Rules, checked in this order:
      * ``target`` / ``case_id`` / ``WEEK_NUM`` / ``MONTH``  -> Int32
      * ``date_decision`` or a name ending in ``D``          -> Date
      * a name ending in ``P`` or ``A``, or containing
        ``num_group``                                        -> Float32
      * a name ending in ``M``                               -> String

    Columns matching none of the rules keep their dtype.

    Args:
        df: polars DataFrame.

    Returns:
        A DataFrame with the casts applied.
    """

    def _dtype_for(name):
        # Mirrors the rule order above; None means "leave the column alone".
        if name in ("target", "case_id", "WEEK_NUM", "MONTH"):
            return pl.Int32
        tail = name[-1]
        if name == "date_decision" or tail == "D":
            return pl.Date
        if tail in ("P", "A") or "num_group" in name:
            return pl.Float32
        if tail == "M":
            return pl.String
        return None

    casts = []
    for name in df.columns:
        wanted = _dtype_for(name)
        if wanted is not None:
            casts.append(pl.col(name).cast(wanted))

    # Every cast touches a different column, so they can be applied in one pass.
    return df.with_columns(casts)


def filter_columns(df):
    """Drop columns that carry too little information.

    A column is removed when it is
      * a String column with exactly one, or more than ``CATEGORIES_MAX``,
        distinct values, or
      * any column whose fraction of nulls is above ``MISSING_PERCENTAGE``.

    ``target``, ``case_id``, ``WEEK_NUM`` and ``MONTH`` are never removed.

    Args:
        df: polars DataFrame.

    Returns:
        The DataFrame without the rejected columns.
    """
    protected = ("target", "case_id", "WEEK_NUM", "MONTH")
    rejected = []

    for name in df.columns:
        if name in protected:
            continue

        series = df[name]

        # Categorical check: constant or very high cardinality strings.
        if series.dtype == pl.String:
            cardinality = series.n_unique()
            if cardinality == 1 or cardinality > CATEGORIES_MAX:
                rejected.append(name)
                continue

        # Missing-value check.
        if series.is_null().mean() > MISSING_PERCENTAGE:
            rejected.append(name)

    return df.drop(rejected)


def aggregate(df):
    """Build the polars aggregation expressions for a per-``case_id`` group_by.

    Every column except ``target`` / ``case_id`` / ``WEEK_NUM`` / ``MONTH``
    gets a ``max_<col>`` expression; columns whose name ends in ``P``, ``A``
    or ``D`` additionally get ``mean_<col>`` and ``var_<col>``.

    Args:
        df: polars DataFrame whose column names drive the expression list.

    Returns:
        A list of polars expressions, in column order.
    """
    key_columns = ("target", "case_id", "WEEK_NUM", "MONTH")
    exprs = []

    for name in df.columns:
        if name in key_columns:
            continue

        column = pl.col(name)
        exprs.append(column.max().alias(f"max_{name}"))

        if name[-1] in ("P", "A", "D"):
            exprs.extend(
                [
                    column.mean().alias(f"mean_{name}"),
                    column.var().alias(f"var_{name}"),
                ]
            )

    return exprs


def handle_dates(df):
    """Express date-like columns relative to ``date_decision`` and drop it.

    * Columns whose name ends in ``D`` become the number of days between the
      column and ``date_decision``, stored as Float32.
    * The listed ``...year...T`` columns hold calendar years (ex: 2020, 2021, ...)
      and become the difference to the year of ``date_decision``.

    Args:
        df: polars DataFrame that contains a ``date_decision`` column.

    Returns:
        The transformed DataFrame without ``date_decision``.
    """
    assert "date_decision" in df.columns, "date_decision not in df"

    year_columns = (
        "dpdmaxdateyear_596T",
        "dpdmaxdateyear_742T",
        "dpdmaxdateyear_896T",
        "overdueamountmaxdateyear_2T",
        "overdueamountmaxdateyear_432T",
        "overdueamountmaxdateyear_994T",
        "pmts_year_1139T",
        "pmts_year_507T",
    )
    decision = pl.col("date_decision")

    rewritten = []
    for name in df.columns:
        if name[-1] == "D":
            days_offset = (pl.col(name) - decision).dt.total_days()
            rewritten.append(days_offset.cast(pl.Float32))
        elif name in year_columns:
            rewritten.append(pl.col(name) - decision.dt.year().cast(pl.Float32))

    # Each expression only reads its own column and date_decision, so one pass is enough.
    return df.with_columns(rewritten).drop("date_decision")


def read_files(split="train"):
    """Load the base table of a split and left-join every table listed in ``TABLES``.

    For each table pattern the matching parquet files are read, typed with
    ``set_dtypes``, made relative to the decision date with ``handle_dates``
    and, for every depth other than ``depth_0``, collapsed to one row per
    ``case_id`` with ``aggregate``. The chunks of one table are concatenated
    and joined onto the base frame. For the ``train`` split ``filter_columns``
    is applied after every join to keep the frame small.

    Args:
        split: Sub-folder / file prefix to read (``"train"`` or ``"test"``).

    Returns:
        A polars DataFrame with one row per base ``case_id``, without the
        ``date_decision`` and ``MONTH`` columns, with dtypes shrunk by
        ``reduce_memory``.
    """
    df = pl.read_parquet(DATA_PATH + f"{split}/{split}_base.parquet").pipe(set_dtypes)
    df = df.with_columns([
        pl.col("date_decision").dt.month().alias("month_decision"),
        pl.col("date_decision").dt.weekday().alias("weekday_decision"),
    ])
    # Kept separately because handle_dates needs the decision date in every sub-table.
    date_decision = df.select(pl.col(["case_id", "date_decision"]))

    for key, item in TABLES.items():
        print(f"#\tHandling {key}")

        for i in item:
            print(f"##\tProcessing {i}")

            # glob returns matches in arbitrary order; sorting keeps the
            # concatenation order reproducible between runs.
            files = sorted(glob.glob(DATA_PATH + f"{split}/{split}_{i}.parquet"))
            if not files:
                # Without this guard the join below fails on a frame that has no
                # case_id column. Columns missing from the test split are
                # re-created by get_test, so skipping is safe there.
                print(f"##\tNo files found for {i}, skipping")
                continue

            sub_df = pl.DataFrame()

            for file in files:
                dummy = (
                    pl.read_parquet(file)
                    .pipe(set_dtypes)
                    .join(date_decision, how="left", on="case_id")
                    .pipe(handle_dates)
                )

                # depth_0 tables are joined as they are; deeper tables are
                # aggregated to one row per case_id.
                if key != "depth_0" and not dummy.is_empty():
                    dummy = dummy.group_by("case_id").agg(aggregate(dummy))

                sub_df = (
                    dummy
                    if sub_df.is_empty()
                    else pl.concat([sub_df, dummy], how="diagonal_relaxed")
                )

            # unique() guarantees at most one row per case_id so the left join
            # cannot multiply base rows.
            df = df.join(
                sub_df.unique(subset=["case_id"]),
                how="left",
                on="case_id",
                suffix=f"_{i}",
            )
            if split == "train":
                df = df.pipe(filter_columns)

    return df.drop(["date_decision", "MONTH"]).pipe(reduce_memory)

## Get Train Set

In [6]:
def get_train():
    """Build (or load) the training frame and split its features by kind.

    When ``PROCESSED_DATA_PATH`` is empty the data is rebuilt with
    ``read_files`` + ``filter_columns`` and written to ``./data.parquet``;
    otherwise ``SELECTED_COLS`` are read from the processed parquet file.
    String and Boolean columns are treated as categorical and converted to the
    pandas ``category`` dtype; every other feature is treated as numerical.

    Returns:
        Tuple ``(df, cat_cols, num_cols)``: a pandas DataFrame copy plus the
        lists of categorical and numerical feature names (``target``,
        ``WEEK_NUM`` and ``case_id`` are in neither list).
    """
    t = time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    if PROCESSED_DATA_PATH != "":
        print("Processed data found, use these instead")
        df = pl.read_parquet(PROCESSED_DATA_PATH).select(SELECTED_COLS)
    else:
        print("Reading files")
        df = read_files().pipe(filter_columns)
        df.write_parquet("./data.parquet")
    get_memory()
    get_execution_time(t)

    print(f"Unique dtypes: {set(df.dtypes)}")
    non_features = ["target", "WEEK_NUM", "case_id"]
    categorical_dtypes = (pl.String, pl.Boolean)
    feature_schema = [
        (name, dtype)
        for name, dtype in zip(df.columns, df.dtypes)
        if name not in non_features
    ]
    cat_cols = [name for name, dtype in feature_schema if dtype in categorical_dtypes]
    num_cols = [name for name, dtype in feature_schema if dtype not in categorical_dtypes]

    t = time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("Converting to pandas")
    df = df.to_pandas()
    df[cat_cols] = df[cat_cols].astype("category")

    n_rows, n_columns = len(df.index), len(df.columns)
    print(f"{n_rows} rows and {n_columns} columns")
    print(f"{len(cat_cols)} categorical and {len(num_cols)} numerical columns")
    get_memory()
    get_execution_time(t)

    return df.copy(), cat_cols, num_cols

In [7]:
%%time
print('Getting train set')
# Build (or load) the training frame together with its categorical / numerical feature lists.
df, cat_cols, num_cols = get_train()
# Last expression of the cell: displays the final rows as a quick sanity check.
df.tail()

Getting train set
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Reading files
#	Handling depth_0
##	Processing static_0_*
##	Processing static_cb_0
#	Handling depth_1
##	Processing applprev_1_*
##	Processing credit_bureau_a_1_*
##	Processing credit_bureau_b_1
##	Processing debitcard_1
##	Processing deposit_1
##	Processing other_1
##	Processing person_1
##	Processing tax_registry_a_1
##	Processing tax_registry_b_1
##	Processing tax_registry_c_1
#	Handling depth_2
##	Processing applprev_2
##	Processing credit_bureau_a_2_*
##	Processing credit_bureau_b_2
##	Processing person_2
Available memory left: 26.00gb
Total time = 244.08s
Unique dtypes: {String, Float32, Int8, Boolean, Int32}
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Converting to pandas
1526659 rows and 503 columns
79 categorical and 421 numerical columns
Available memory left: 27.01gb
Total time = 21.15s
CPU times: user 7min 24s, sys: 2min 9s, total: 9min 34s
Wall time: 4min 26s


Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,var_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,max_num_group1_person_2,max_num_group2_person_2
1526654,2703450,91,0,10,1,0.0,176561.359375,3675.400146,0.0,0.0,...,223616.078125,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526655,2703451,91,0,10,1,0.0,301276.46875,7088.600098,6191.600098,0.0,...,0.0,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526656,2703452,91,0,10,1,0.0,14232.400391,7788.800293,0.0,0.0,...,0.0,1.0,0.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526657,2703453,91,0,10,1,0.0,197371.578125,1195.400024,2827.199951,0.0,...,292734.6875,1.0,0.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0
1526658,2703454,91,0,10,1,0.0,82949.601562,4533.800293,2986.800049,0.0,...,179989.234375,1.0,1.0,ab3c25cf,ab3c25cf,a55475b1,a55475b1,a55475b1,0.0,0.0


## Feature Selection

In [8]:
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    Implementation follows
    https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix

    Args:
        x: First categorical series.
        y: Second categorical series, aligned with ``x``.

    Returns:
        The corrected Cramér's V. Degenerate inputs (fewer than two
        observations, a contingency table with a single row or column, or a
        zero corrected denominator) return ``0.0`` instead of dividing by zero
        and yielding NaN/inf, so callers comparing against a threshold treat
        them as "no measurable association".
    """
    cm = pd.crosstab(x, y)
    n = cm.sum().sum()
    r, k = cm.shape

    # A constant (or empty) variable cannot be associated with anything, and
    # the bias correction below divides by (n - 1).
    if n <= 1 or min(r, k) < 2:
        return 0.0

    chi2 = chi2_contingency(cm)[0]
    phi2 = chi2 / n
    # Bias correction: clamp at 0 because the corrected phi^2 can go negative.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((kcorr - 1), (rcorr - 1))
    # Reaches 0 when one variable has as many categories as there are rows.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2corr / denominator)

In [9]:
%%time
# https://www.kaggle.com/code/harrychan123/lgb-cat-ensemble-stacking
print("\nFinding numerical columns with high correlation")
# Bucket the numerical columns by their number of missing values: only columns
# that share the same null count are later compared for correlation.
nans_df = df[num_cols].isna()
nans_groups = {}
for col in num_cols:
    cur_group = nans_df[col].sum()
    # setdefault instead of try/bare-except: a bare except would also hide
    # unrelated errors (for example a KeyboardInterrupt).
    nans_groups.setdefault(cur_group, []).append(col)
# The boolean mask is as large as the numeric part of df; release it early.
del nans_df
gc.collect()


def reduce_group(grps, df):
    """Pick one representative column per group.

    For every group the column with the most distinct values (pandas
    ``nunique``) is kept; on a tie the column listed first in the group wins.

    Args:
        grps: Iterable of lists of column names.
        df: pandas DataFrame containing those columns.

    Returns:
        A list with the chosen column name for each group, in group order.
    """

    def distinct_count(name):
        return df[name].nunique()

    # max() returns the first maximal element, which gives the tie-breaking rule.
    return [max(members, key=distinct_count) for members in grps]


def group_columns_by_correlation(matrix, threshold=0.9):
    """Greedily group the columns of ``matrix`` by pairwise correlation.

    Columns are visited in order. The first unassigned column opens a group
    and pulls in every later unassigned column whose correlation with it
    (``DataFrame.corr``) is at least ``threshold``. Only the signed value is
    compared, so strongly negative correlations do not join a group.

    Args:
        matrix: pandas DataFrame of numeric columns.
        threshold: Minimum correlation with the group's first column.

    Returns:
        A list of groups, each a list of column names starting with the
        column that opened the group.
    """
    correlations = matrix.corr()
    pending = list(matrix.columns)
    groups = []

    while pending:
        anchor, candidates = pending[0], pending[1:]
        partners = [
            name for name in candidates
            if correlations.loc[anchor, name] >= threshold
        ]
        groups.append([anchor] + partners)
        # Whatever did not join this group is considered again on the next round.
        pending = [name for name in candidates if name not in partners]

    return groups


# Collect the numerical columns to keep: within every bucket of columns that
# share a null count, keep one representative per correlated group.
uses = []
for v in nans_groups.values():
    if len(v) <= 1:
        # A bucket with a single column has nothing to be compared with.
        uses.extend(v)
        continue
    grps = group_columns_by_correlation(df[v])
    uses.extend(reduce_group(grps, df))

to_remove = {col for col in num_cols if col not in uses}
print(f"{len(to_remove)} columns are to be dropped")
df = df.drop(columns=list(to_remove))
num_cols = [item for item in num_cols if item not in to_remove]


Finding numerical columns with high correlation
65 columns are to be dropped
CPU times: user 21.7 s, sys: 1.53 s, total: 23.2 s
Wall time: 22.9 s


In [10]:
%%time
# Remove categorical columns with high association (Cramer's V)
print("\nFinding categorical columns with high association")
# Going through "str" first turns missing values into a regular category.
dummy = df[cat_cols].astype("str").astype("category")

to_remove_cat = set()
for i, col1 in enumerate(cat_cols):
    # A column that is already marked for removal is not used as a reference.
    if col1 in to_remove_cat:
        continue

    for col2 in cat_cols[i + 1:]:
        if col2 == col1 or col2 in to_remove_cat:
            continue

        corr = cramers_v(dummy[col1], dummy[col2])
        if corr > 0.9:
            print(f"{col1} & {col2} = {corr}")
            # Of an associated pair, the column appearing later is dropped.
            to_remove_cat.add(col2)

print(f"{len(to_remove_cat)} columns are to be dropped")
df = df.drop(columns=list(to_remove_cat))
cat_cols = [item for item in cat_cols if item not in to_remove_cat]
del dummy
gc.collect()


Finding categorical columns with high association
cardtype_51L & isdebitcard_729L = 0.9981812530220908
paytype1st_925L & paytype_783L = 0.9999911233493446
max_subjectrole_182M & max_subjectroles_name_838M = 0.9966809618914648
max_contaddr_matchlist_1032L & max_contaddr_smempladdr_334L = 0.9999996724869554
max_empladdr_district_926M & max_empladdr_zipcode_114M = 0.9999993449652226
max_relationshiptoclient_415T & max_relationshiptoclient_642T = 0.9999999999999999
max_relationshiptoclient_415T & max_remitter_829L = 0.9999970523787385
max_conts_role_79M & max_empls_employer_name_740M = 0.9999986899471787
8 columns are to be dropped
CPU times: user 7min 10s, sys: 3.56 s, total: 7min 13s
Wall time: 7min 13s


0

In [11]:
# Preview the frame after the numerical and categorical pruning steps.
df.tail()

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,...,var_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_conts_role_79M,max_empls_economicalst_849M,max_num_group1_person_2,max_num_group2_person_2
1526654,2703450,91,0,10,1,0.0,176561.359375,3675.400146,0.0,0.0,...,0.0,69.693535,223616.078125,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526655,2703451,91,0,10,1,0.0,301276.46875,7088.600098,6191.600098,0.0,...,0.0,0.0,0.0,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526656,2703452,91,0,10,1,0.0,14232.400391,7788.800293,0.0,0.0,...,662676.9375,0.0,0.0,1.0,0.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526657,2703453,91,0,10,1,0.0,197371.578125,1195.400024,2827.199951,0.0,...,0.0,123.303925,292734.6875,1.0,0.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0
1526658,2703454,91,0,10,1,0.0,82949.601562,4533.800293,2986.800049,0.0,...,0.0,38.611423,179989.234375,1.0,1.0,ab3c25cf,a55475b1,a55475b1,0.0,0.0


In [12]:
# List every remaining column; this count still includes case_id, WEEK_NUM and target.
print(f'\nColumns to use {len(df.columns)}: ')
print(df.columns.tolist())


Columns to use 430: 
['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credtype_322L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dp

## Get Test Set

In [13]:
def get_test(features, dtypes):
    """Load the test split and align it with the training feature set.

    Features that are absent from the test frame are created as all-missing
    columns (NaN when the training dtype kind is one of ``i``/``u``/``f``/``c``,
    ``None`` otherwise). All features are then cast to their training dtype.

    Args:
        features: Feature names used for training.
        dtypes: Training dtypes, in the same order as ``features``.

    Returns:
        A pandas DataFrame copy with ``case_id`` followed by ``features``.
    """
    df_submission = read_files(split="test").to_pandas()
    initial_cols = df_submission.columns.to_list()
    n_rows = len(df_submission.index)

    # Target dtype of every feature, present or not.
    mapping = dict(zip(features, dtypes))

    # All-missing placeholder columns for the features the test split lacks.
    missing = {
        col: np.repeat(np.nan if dtype.kind in "iufc" else None, n_rows)
        for col, dtype in zip(features, dtypes)
        if col not in initial_cols
    }

    df_submission = pd.concat([df_submission, pd.DataFrame(missing)], axis=1)
    df_submission = df_submission.astype(mapping)

    return df_submission[["case_id"] + features].copy()

In [14]:
%%time
print('Getting test set')
# Feature order used from here on: numerical columns first, then categorical ones.
features = num_cols + cat_cols
# Training dtypes (including the category dtypes) that get_test casts the test columns to.
dtypes = df[features].dtypes.to_list()
df_submission = get_test(features, dtypes)
print(f'Test set has {len(df_submission.index)} rows and {len(df_submission.columns)} columns')
get_memory()
# Last expression of the cell: displays the first rows of the aligned test frame.
df_submission.head()

Getting test set
#	Handling depth_0
##	Processing static_0_*
##	Processing static_cb_0
#	Handling depth_1
##	Processing applprev_1_*
##	Processing credit_bureau_a_1_*
##	Processing credit_bureau_b_1
##	Processing debitcard_1
##	Processing deposit_1
##	Processing other_1
##	Processing person_1
##	Processing tax_registry_a_1
##	Processing tax_registry_b_1
##	Processing tax_registry_c_1
#	Handling depth_2
##	Processing applprev_2
##	Processing credit_bureau_a_2_*
##	Processing credit_bureau_b_2
##	Processing person_2
Test set has 10 rows and 428 columns
Available memory left: 24.91gb
CPU times: user 661 ms, sys: 502 ms, total: 1.16 s
Wall time: 1.21 s


Unnamed: 0,case_id,month_decision,weekday_decision,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,...,max_cacccardblochreas_147M,max_conts_type_509L,max_credacc_cards_status_52L,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_subjectroles_name_541M,max_conts_role_79M,max_empls_economicalst_849M
0,57543,5,5,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,0.0,...,a55475b1,PRIMARY_MOBILE,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,,
1,57549,1,1,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,0.0,...,,,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,,
2,57551,11,5,0.0,71036.398438,2844.600098,0.0,0.0,1.0,0.0,...,,,,a55475b1,9a0c095e,c7a5ad39,c7a5ad39,ab3c25cf,a55475b1,a55475b1
3,57552,11,5,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,0.0,...,,,,,,,,,a55475b1,a55475b1
4,57569,12,1,0.0,0.0,4682.600098,0.0,0.0,1.0,0.0,...,,,,a55475b1,a55475b1,c7a5ad39,a55475b1,ab3c25cf,a55475b1,a55475b1


# Training

In [15]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability score built from the weekly gini of the predictions.

    Source: https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook

    The gini (``2 * AUC - 1``) is computed per ``WEEK_NUM``, a straight line
    is fitted through the weekly values, and the score is
    ``mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)``.

    Args:
        base: pandas DataFrame with ``WEEK_NUM``, ``target`` and ``score`` columns.
        w_fallingrate: Weight applied to a negative slope of the fitted line.
        w_resstd: Weight applied to the standard deviation of the residuals.

    Returns:
        The stability score as a float.
    """

    def weekly_gini(week):
        return 2 * roc_auc_score(week["target"], week["score"]) - 1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    gini_in_time = (
        ordered.groupby("WEEK_NUM")[["target", "score"]].apply(weekly_gini).tolist()
    )

    # Linear trend of the gini over the (re-indexed) weeks.
    positions = np.arange(len(gini_in_time))
    slope, intercept = np.polyfit(positions, gini_in_time, 1)
    trend = slope * positions + intercept

    res_std = np.std(gini_in_time - trend)
    avg_gini = np.mean(gini_in_time)
    # Only a falling trend is penalised, hence min(0, slope).
    return avg_gini + w_fallingrate * min(0, slope) + w_resstd * res_std

In [16]:
# Split the training frame into model inputs, target, grouping key and ids.
X = df[features].copy()
y = df['target'].copy()
# WEEK_NUM is the group key, so all rows of a week stay in the same fold.
groups = df['WEEK_NUM'].copy()
case_id = df['case_id'].copy()
# split() returns a one-shot generator; materialising it as a list lets the
# training cell be re-run without silently iterating over zero folds.
cv = list(StratifiedGroupKFold(n_splits=5).split(X, y, groups))

# The copies above are all that is needed from here on; free the full frame.
del df
gc.collect()

0

In [17]:
# XGBoost settings shared by every fold model.
# NOTE(review): presumably the result of a hyper-parameter search that is not part of this notebook — confirm.
params_best = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 27,
    'learning_rate': 0.01858408585675685,
    # Training stops when the eval metric has not improved for this many rounds.
    'early_stopping_rounds': 50,
    'n_estimators': 2000,
    'subsample': 0.7498064113666455,
    'colsample_bytree': 0.5975118290673396,
    'colsample_bylevel': 0.5214259936218979,
    'colsample_bynode': 0.7765315135500115,
    'reg_alpha': 0.1,
    'reg_lambda': 10,
    'max_leaves': 128,
    'max_bin': 109,
    # Requires a GPU runtime.
    'device': 'cuda',
    'tree_method': 'hist',
    # Use the notebook-wide seed (same value, 42) so it is controlled from the config cell.
    'random_state': SEED,
    # The categorical features are pandas "category" columns.
    'enable_categorical': True,
    'n_jobs': -1
}

In [18]:
# Train one XGBoost model per CV fold and keep the fitted model together with
# its validation AUC and gini-stability score.
models, rocs, ginis = [], [], []

for i, (train_index, valid_index) in enumerate(cv):
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print(f"Currently on fold {i + 1}")

    t = time()

    # Training
    # The validation fold is passed as eval_set, so the `early_stopping_rounds`
    # and `eval_metric` from params_best are evaluated on it; progress is
    # printed every 200 rounds.
    model = xgb.XGBClassifier(**params_best)
    model.fit(
        X.iloc[train_index], 
        y.iloc[train_index],
        eval_set=[(X.iloc[valid_index], y.iloc[valid_index])],
        verbose=200,
    )

    # Evaluation
    # Column 1 of predict_proba is the probability of the positive class.
    y_preds = model.predict_proba(X.iloc[valid_index])[:, 1]
    # gini_stability needs WEEK_NUM, target and score side by side.
    base = pd.DataFrame({
        "WEEK_NUM": groups.iloc[valid_index],
        "target": y.iloc[valid_index],
        "score": y_preds,
    })

    roc = roc_auc_score(y.iloc[valid_index], y_preds)
    gini = gini_stability(base)

    print(f"AUC: {roc}")
    print(f"Stability: {gini}")

    # Saving
    models.append(model)
    rocs.append(roc)
    ginis.append(gini)

    get_memory()
    get_execution_time(t)

    # Only removes the loop-local name; the fitted model stays alive in `models`.
    del model
    gc.collect()

~~~~~~~~~~~~~~~~~~~~~~~~~~~
Currently on fold 1
[0]	validation_0-auc:0.74768
[200]	validation_0-auc:0.83178
[400]	validation_0-auc:0.84657
[600]	validation_0-auc:0.85147
[800]	validation_0-auc:0.85389
[1000]	validation_0-auc:0.85530
[1200]	validation_0-auc:0.85633
[1400]	validation_0-auc:0.85711
[1600]	validation_0-auc:0.85771
[1800]	validation_0-auc:0.85825
[1999]	validation_0-auc:0.85858


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




AUC: 0.8585769287654874
Stability: 0.7016373650732164
Available memory left: 21.95gb
Total time = 325.90s
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Currently on fold 2
[0]	validation_0-auc:0.74230
[200]	validation_0-auc:0.83341
[400]	validation_0-auc:0.84727
[600]	validation_0-auc:0.85192
[800]	validation_0-auc:0.85422
[1000]	validation_0-auc:0.85560
[1200]	validation_0-auc:0.85662
[1400]	validation_0-auc:0.85737
[1600]	validation_0-auc:0.85789
[1800]	validation_0-auc:0.85827
[1999]	validation_0-auc:0.85867
AUC: 0.8586674207271402
Stability: 0.6953129875528088
Available memory left: 21.91gb
Total time = 328.09s
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Currently on fold 3
[0]	validation_0-auc:0.74798
[200]	validation_0-auc:0.83706
[400]	validation_0-auc:0.85198
[600]	validation_0-auc:0.85699
[800]	validation_0-auc:0.85952
[1000]	validation_0-auc:0.86111
[1200]	validation_0-auc:0.86221
[1400]	validation_0-auc:0.86293
[1600]	validation_0-auc:0.86355
[1800]	validation_0-auc:0.86408
[1999]	validation_0-auc:0.86437
AU

In [19]:
# Mean of the per-fold validation metrics collected in the training loop.
print(f'Average AUC score: {np.mean(rocs)}')
# `ginis` holds the gini_stability value of each fold, so this is the mean stability score.
print(f'Average gini score: {np.mean(ginis)}')

Average AUC score: 0.8608688922021702
Average gini score: 0.7038153830810625


In [20]:
# Persist the list of fitted fold models so they can be reloaded without retraining.
# Only unpickle this file from a trusted source: loading a pickle can execute arbitrary code.
with open('xgb.pickle', 'wb') as handle:
    pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Submission

In [21]:
class VotingModel:
    """Average the ``predict_proba`` output of several fitted estimators.

    Source: https://www.kaggle.com/code/kononenko/metric-trick-home-credit-baseline-inference

    Each estimator predicts in row batches of ``batch_size``. The result
    buffer has two columns, so two-class estimators are assumed.
    """

    def __init__(self, estimators, batch_size=100000):
        """Store the estimators and the number of rows predicted per batch."""
        self.batch_size = batch_size
        self.estimators = estimators

    def predict_proba_in_batches(self, model, data):
        """Return the ``(len(data), 2)`` probabilities of ``model``, computed batch by batch."""
        total = len(data)
        step = self.batch_size
        n_batches = int(np.ceil(total / step))
        bounds = [(k * step, min((k + 1) * step, total)) for k in range(n_batches)]

        probabilities = np.zeros((total, 2))
        for lo, hi in bounds:
            probabilities[lo:hi, :] = model.predict_proba(data.iloc[lo:hi])
            # Free the temporaries of this batch before starting the next one.
            gc.collect()

        return probabilities

    def predict_proba(self, X):
        """Return the element-wise mean of every estimator's probabilities for ``X``."""
        per_model = []
        for estimator in self.estimators:
            per_model.append(self.predict_proba_in_batches(estimator, X))
        return np.mean(per_model, axis=0)

In [22]:
%%time
print('Predicting on test set')
# Use a separate name for the ensemble: rebinding `models` would make this cell
# fail on a second run (it would wrap a VotingModel instead of the list of
# fold models) and would change what a re-run of the pickle cell saves.
voting_model = VotingModel(models)
# Column 1 of predict_proba is the probability of the positive class.
df_submission['score'] = voting_model.predict_proba(df_submission[features])[:, 1]
print(df_submission[['case_id', 'score']].to_string())

Predicting on test set
   case_id     score
0    57543  0.005175
1    57549  0.046377
2    57551  0.002636
3    57552  0.016990
4    57569  0.117013
5    57630  0.010003
6    57631  0.047491
7    57632  0.015112
8    57633  0.053612
9    57634  0.037673
CPU times: user 1.82 s, sys: 11.1 ms, total: 1.83 s
Wall time: 1.61 s


In [23]:
# Save only case_id and score to submission.csv.
df_submission[['case_id', 'score']].to_csv('./submission.csv', index=None)

In [24]:
# total_t was taken in the configuration cell, so this covers everything from there on.
print(f'Notebook completed in {(time() - total_t) / 60:.2f}min')
get_memory()

Notebook completed in 39.70min
Available memory left: 22.18gb
