In [1]:
from catboost import CatBoostClassifier
import gc; gc.enable()
from sklearn import *
import pandas as pd
import numpy as np
import numba, os
from sklearn.model_selection import StratifiedKFold
import datetime

In [2]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric(y_true: np.array, y_pred: np.array) -> float:
    """Compute the AMEX default-prediction competition metric.

    Rows are ranked by descending prediction. Each negative row (label 0)
    carries weight 20 and each positive row (label 1) carries weight 1.
    The score is the mean of two components:

    * ``d`` - the share of all positives found among the top-ranked rows
      whose cumulative normalized weight does not exceed 4%;
    * ``g`` - the weighted Gini coefficient divided by ``gini_max``.

    Args:
        y_true: 1-D array-like of binary labels (0 or 1). Lists and pandas
            Series are accepted; they are converted to ndarrays first.
        y_pred: 1-D array-like of scores with the same length as
            ``y_true``; a higher score means "more likely positive".

    Returns:
        ``0.5 * (g + d)``.

    Note:
        Both components divide by the number of positives, so the result
        is not meaningful when ``y_true`` contains no positive label.
    """
    # Coerce to ndarrays so that every indexing step below is positional,
    # regardless of the container type (or pandas index) the caller used.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # labels reordered by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0  # 1 for positives, 20 for negatives
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Wrap ``amex_metric`` in lightgbm's custom-metric calling convention.

    Returns a 3-tuple: the metric name ``'amex'``, the value of
    ``amex_metric(y_true, y_pred)``, and ``True`` as the third element
    (lightgbm's "higher is better" flag).
    """
    score = amex_metric(y_true, y_pred)
    higher_is_better = True
    return 'amex', score, higher_is_better

In [3]:
%%time

# Build one row per customer for each split: the mean of every feature over
# that customer's rows. 'test' is processed first, then 'train'.
for i in ['test', 'train']:
    df = pd.read_parquet(f'../input/amex-data-integer-dtypes-parquet-format/{i}.parquet')

    # Take the customer key out of the frame; as an ordered Categorical it is
    # used as the external grouping key below and becomes the result's index.
    unique_people = pd.Categorical(df.pop('customer_ID'), ordered=True)

    # Every remaining column except 'S_2' is treated as a feature.
    fitures = df.columns.drop('S_2')

    # Select the feature columns *before* aggregating so that 'S_2' never
    # reaches mean(): this avoids averaging a column that is thrown away
    # and, if 'S_2' is non-numeric, the TypeError raised by pandas >= 2.0
    # (which no longer drops such columns silently).
    df_avg = (df
              .groupby(unique_people)[list(fitures)]
              .mean()
              .rename(columns={f: f"{f}_avg" for f in fitures})
             )

    # The statement-level frame is no longer needed; release it right away.
    del df
    gc.collect()

    if i == 'train':
        train = df_avg
    else:
        test = df_avg
    print(f"{i} shape: {df_avg.shape}")

target = pd.read_csv('../input/amex-default-prediction/train_labels.csv').target.values
# Labels are paired with the rows of `train` purely by position.
# NOTE(review): this assumes train_labels.csv lists customers in the same
# order as the grouped `train` index - confirm. The check below only
# catches a length mismatch, not a reordering.
assert target.shape[0] == train.shape[0], (
    f"label count {target.shape[0]} does not match train rows {train.shape[0]}"
)
print(f"target shape: {target.shape}")

test shape: (924621, 188)
train shape: (458913, 188)
target shape: (458913,)
CPU times: user 1min 8s, sys: 1min 14s, total: 2min 23s
Wall time: 2min 3s


In [4]:
from lightgbm import LGBMClassifier, log_evaluation

In [5]:
import warnings

In [None]:
%%time


# Hyperparameters shared by the model built for every fold.
N_estimators = 20500
Boosting_type = 'dart'  # alternative: 'gbdt'
Min_child_samples = 10
Learning_rate = 0.005
Reg_lambda = 2
Num_leaves = 1023


def my_booster(random_state=1, n_estimators=N_estimators):
    """Create an unfitted LGBMClassifier from the module-level hyperparameters.

    Args:
        random_state: seed forwarded to the classifier.
        n_estimators: number of boosting rounds; defaults to ``N_estimators``
            as it was when this function was defined.

    Returns:
        A new, unfitted ``LGBMClassifier``.
    """
    params = {
        'boosting_type': Boosting_type,
        'n_estimators': n_estimators,
        'learning_rate': Learning_rate,
        'reg_lambda': Reg_lambda,
        'min_child_samples': Min_child_samples,
        'num_leaves': Num_leaves,
        # NOTE(review): LightGBM reportedly does not support early stopping
        # in 'dart' mode - confirm whether this setting has any effect here.
        'early_stopping_rounds': 1500,
        'random_state': random_state,
    }
    return LGBMClassifier(**params)


# 5-fold stratified cross-validation: fit one model per fold, score it on the
# held-out part with amex_metric, and collect its predictions for the test set.
y_pred_list = []  # one array of test-set predictions per fold
score_list = []   # one validation score per fold

features = [f for f in train.columns if f != 'customer_ID' and f != 'target']
kf = StratifiedKFold(n_splits=5)
for fold, (idx_tr, idx_va) in enumerate(kf.split(train, target)):

    # Drop the references left over from the previous fold before new slices
    # are allocated.
    X_tr, X_va, y_tr, y_va, model = None, None, None, None, None
    start_time = datetime.datetime.now()

    X_tr = train.iloc[idx_tr][features]
    X_va = train.iloc[idx_va][features]
    y_tr = target[idx_tr]
    y_va = target[idx_va]

    model = my_booster()
    with warnings.catch_warnings():
        # UserWarnings raised while fitting are silenced for this call only.
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, y_tr,
                  eval_set=[(X_va, y_va)],
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(100)])
    X_tr, y_tr = None, None  # free the large training chunks
    y_va_pred = model.predict_proba(X_va)[:, 1]
    score = amex_metric(y_va, y_va_pred)

    # Elapsed wall time as "[D day(s), ]H:MM:SS". Cutting the timedelta string
    # at the decimal point keeps the hours (a fixed [-12:-7] slice dropped
    # them) and also works when the microsecond part is absent.
    elapsed = str(datetime.datetime.now() - start_time).split('.')[0]
    # model.n_estimators is the configured number of boosting rounds.
    print(f"Fold {fold+1} | {elapsed} |"
          f" {model.n_estimators:5} trees |"
          f"                Score = {score:.5f}")

    score_list.append(score)
    # Kept from the original cell; this frame is not used again in this cell.
    y_va_pred = pd.DataFrame(y_va_pred)

    # Predict on the test set. raw_score=True is passed here, unlike the
    # validation call above, which takes the class-1 probability column.
    y_pred_fold = model.predict_proba(test[features], raw_score=True)
    y_pred_list.append(y_pred_fold)
    # Kept from the original cell; this frame is not used again in this cell.
    y_pred_fold = pd.DataFrame(y_pred_fold)

[100]	valid_0's binary_logloss: 0.534417	valid_0's amex: 0.72191
[200]	valid_0's binary_logloss: 0.50338	valid_0's amex: 0.726693
[300]	valid_0's binary_logloss: 0.468334	valid_0's amex: 0.729303
[400]	valid_0's binary_logloss: 0.443261	valid_0's amex: 0.73086
[500]	valid_0's binary_logloss: 0.409119	valid_0's amex: 0.734486
[600]	valid_0's binary_logloss: 0.38161	valid_0's amex: 0.737092
[700]	valid_0's binary_logloss: 0.364566	valid_0's amex: 0.738402
[800]	valid_0's binary_logloss: 0.358026	valid_0's amex: 0.739271
[900]	valid_0's binary_logloss: 0.344048	valid_0's amex: 0.739708
[1000]	valid_0's binary_logloss: 0.328983	valid_0's amex: 0.740961
[1100]	valid_0's binary_logloss: 0.316642	valid_0's amex: 0.743248
[1200]	valid_0's binary_logloss: 0.308801	valid_0's amex: 0.744863
[1300]	valid_0's binary_logloss: 0.297425	valid_0's amex: 0.747509
[1400]	valid_0's binary_logloss: 0.289729	valid_0's amex: 0.748615
