In [1]:
import gc
import datetime
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from lightgbm import early_stopping

warnings.filterwarnings('ignore')

In [2]:
# Number of cross-validation splits handed to StratifiedKFold in the training cell.
NUM_FOLDS = 5

In [3]:
%%time

# Load the training data from the local parquet file.
df_train = pd.read_parquet('./data/train.parquet')

Wall time: 1.41 s


In [4]:
%%time 

# Parse S_2 as a datetime and derive `days`: the number of days between a row's
# S_2 and the earliest S_2 of the same customer, plus one.
df_train['S_2'] = pd.to_datetime(df_train['S_2'])
first_seen = df_train.groupby('customer_ID')['S_2'].transform('min')
df_train['days'] = (df_train['S_2'] - first_seen).dt.days.astype('int16') + 1
del first_seen

# Downcast float32 columns to float16 to shrink the frame before aggregating.
for col in df_train.columns[df_train.dtypes == 'float32']:
    df_train[col] = df_train[col].astype('float16')

# Every column except the customer key and the date is a feature; the listed
# ones are treated as categorical, everything else as numeric.
all_cols = df_train.columns.drop(['customer_ID', 'S_2']).tolist()
cat_features = [
    "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63",
    "D_64", "D_66", "D_68"
]
num_features = [name for name in all_cols if name not in cat_features]

# Collapse the rows of each customer into one row of summary statistics.
# NOTE(review): 'last' takes the final row per customer in the frame's current
# order, so it presumably relies on rows being sorted by S_2 - TODO confirm.
df_num_agg = (
    df_train
    .groupby('customer_ID')[num_features]
    .agg(['mean', 'std', 'min', 'max', 'last', 'median'])
)
# Flatten the (feature, statistic) column MultiIndex into "<feature>_<statistic>".
df_num_agg.columns = df_num_agg.columns.map('_'.join)

df_cat_agg = (
    df_train
    .groupby('customer_ID')[cat_features]
    .agg(['count', 'last', 'nunique'])
)
df_cat_agg.columns = df_cat_agg.columns.map('_'.join)

# Replace the row-level frame with the per-customer table (index: customer_ID).
df_train = pd.concat([df_num_agg, df_cat_agg], axis=1)
del df_num_agg, df_cat_agg
gc.collect()

Wall time: 1min 29s


0

In [5]:
# Display the aggregated feature table built above (indexed by customer_ID).
df_train

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,P_2_median,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.933594,0.024194,0.868652,0.960449,0.934570,0.938477,0.230769,0.832050,0,3,...,1,13,0,1,13,-1,1,13,6,1
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.899902,0.022097,0.861328,0.929199,0.880371,0.904785,7.153846,6.743468,0,19,...,1,13,0,1,13,-1,1,13,6,1
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.878418,0.028837,0.797852,0.904297,0.880859,0.884766,0.000000,0.000000,0,0,...,1,13,2,1,13,-1,1,13,6,1
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.599121,0.020082,0.567383,0.623535,0.621582,0.598145,1.538462,3.017046,0,9,...,1,13,0,1,13,-1,1,13,3,3
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.891602,0.042316,0.805176,0.940430,0.872070,0.879395,0.000000,0.000000,0,0,...,1,13,0,1,13,1,1,13,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff41c8a52833b56430603969b9ca48d208e7c192c6a4081a6acc28cf4f8af7,0.848633,0.041953,0.730469,0.895508,0.844238,0.859863,3.846154,6.656402,0,20,...,1,13,3,1,13,1,1,13,6,1
ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd3e5b57cfcbee30286,0.859375,0.012459,0.831055,0.868164,0.831055,0.864746,2.076923,4.192484,0,16,...,1,13,2,1,13,-1,1,13,6,1
ffff9984b999fccb2b6127635ed0736dda94e544e67e026eee4d20f680639ff6,0.786621,0.014292,0.756836,0.802734,0.800293,0.793457,7.384615,6.212064,0,18,...,1,13,3,1,13,-1,1,13,5,1
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,0.804688,0.037462,0.753906,0.856934,0.753906,0.791504,0.923077,2.752621,0,10,...,1,13,3,2,13,-1,1,13,3,2


In [6]:
%%time

# Load the label file and print its column summary (names, non-null counts, dtypes).
df_train_labels = pd.read_csv('./data/train_labels.csv')
df_train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB
Wall time: 284 ms


In [7]:
# Store the target as int8 instead of the int64 that read_csv produced.
df_train_labels['target'] = df_train_labels['target'].astype('int8')
print(df_train_labels.shape)
# Last expression of the cell: show the first rows of the label table.
df_train_labels.head()

(458913, 2)


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [8]:
%%time

# Attach the target to each customer's aggregated features. The left join keeps
# every row of the feature table; `customer_ID` is matched between the feature
# table's index level and the label table's column.
df_train = pd.merge(df_train, df_train_labels, how='left', on='customer_ID')
print(df_train.shape)
print(df_train.head())

# The label frame is not needed once its target column has been merged in.
del df_train_labels
gc.collect()

(458913, 1103)
                                         customer_ID  P_2_mean   P_2_std  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  0.933594  0.024194   
1  00000fd6641609c6ece5454664794f0340ad84dddce9a2...  0.899902  0.022097   
2  00001b22f846c82c51f6e3958ccd81970162bae8b007e8...  0.878418  0.028837   
3  000041bdba6ecadd89a52d11886e8eaaec9325906c9723...  0.599121  0.020082   
4  00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...  0.891602  0.042316   

    P_2_min   P_2_max  P_2_last  P_2_median  D_39_mean  D_39_std  D_39_min  \
0  0.868652  0.960449  0.934570    0.938477   0.230769  0.832050         0   
1  0.861328  0.929199  0.880371    0.904785   7.153846  6.743468         0   
2  0.797852  0.904297  0.880859    0.884766   0.000000  0.000000         0   
3  0.567383  0.623535  0.621582    0.598145   1.538462  3.017046         0   
4  0.805176  0.940430  0.872070    0.879395   0.000000  0.000000         0   

   ...  D_64_count  D_64_last  D_64_nunique  D_66_count  D_

0

In [9]:
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793/notebook
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the evaluation metric used for model selection in this notebook.

    The score is the average of two components. Both weight rows whose label is
    0 by 20 and all other rows by 1:

    * the weighted Gini of the prediction ordering divided by the weighted Gini
      of the ordering by true label (the best achievable ordering);
    * the share of all positive labels found among the highest-predicted rows
      whose cumulative weight stays within 4% of the total weight.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1).
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more likely 1.

    Returns
    -------
    tuple
        ``(score, None)``. The second slot is kept only so that callers which
        index the result with ``[0]`` keep working. It previously returned the
        interpreter's ``_`` variable, which is undefined outside an interactive
        session.

    Notes
    -----
    If ``y_true`` contains no positive label, the divisions are 0/0 and the
    score is NaN.
    """
    # Column 0 holds the labels, column 1 the predictions.
    labels = np.transpose(np.array([y_true, y_pred]))

    # Top-4% capture: rank by prediction (descending) and keep the leading rows
    # whose cumulative weight does not exceed 4% of the total weight.
    ranked = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(ranked[:, 0] == 0, 20, 1)
    cut_vals = ranked[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(ranked[:, 0])

    # gini[1]: rows ordered by prediction; gini[0]: rows ordered by true label.
    gini = [0, 0]
    for i in [1, 0]:
        ordered = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ordered[:, 0] * weight)
        cum_pos_found = np.cumsum(ordered[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)

    score = 0.5 * (gini[1] / gini[0] + top_four)
    return score, None

In [10]:
# Model inputs: every column of the merged table except the label and the key.
FEATURES = df_train.columns.drop(["target", "customer_ID"])
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68'
]

# Positions (within FEATURES) of the "<categorical>_last" columns; the training
# cell passes them to LightGBM as `categorical_feature`.
# Names are matched exactly so an unrelated column that merely shares a prefix
# is never picked up, and only the column index is iterated so the feature
# matrix is not copied.
last_value_cols = {f"{name}_last" for name in categorical_cols}
cat_col = [idx for idx, col in enumerate(FEATURES) if col in last_value_cols]
cat_col

[1069, 1072, 1075, 1078, 1081, 1084, 1087, 1090, 1093, 1096, 1099]

In [11]:
%%time
feature_importances = []  # feature importances of each fold's model
scores = []               # validation score of each fold
models = []               # fitted model of each fold
pred_val = []             # out-of-fold predictions, concatenated in fold order
yval = []                 # labels matching pred_val, in the same order

# The hyper-parameters are the same for every fold, so they are built once.
params = {
    "num_iterations": 10000,
    'learning_rate': 0.05,
}

# Cross-validation splitter.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)

# Only the labels drive a stratified split, so a zero placeholder stands in for
# X instead of copying the whole feature matrix.
for fold, (train_idx, val_idx) in enumerate(skf.split(np.zeros(len(df_train)), df_train["target"])):

    print('FOLD:', fold)

    # Split the data for this fold.
    X_train = df_train.loc[train_idx, FEATURES].values
    y_train = df_train.loc[train_idx, 'target'].values
    X_val = df_train.loc[val_idx, FEATURES].values
    y_val = df_train.loc[val_idx, 'target'].values

    print("y_train t=0 count:", len(y_train[y_train == 0]))
    print("y_train t=1 count:", len(y_train[y_train == 1]))
    print("y_val t=0 count:", len(y_val[y_val == 0]))
    print("y_val t=1 count:", len(y_val[y_val == 1]))

    # Fit LightGBM. Progress is logged every 100 rounds through the
    # log_evaluation callback, because the `verbose` argument of fit() was
    # removed in LightGBM 4.0. Training stops once the validation metric has
    # not improved for 100 rounds.
    model = lgbm.LGBMClassifier(**params).fit(
        X_train, y_train,
        eval_set=[(X_val, y_val), (X_train, y_train)],
        callbacks=[early_stopping(100), lgbm.log_evaluation(100)],
        categorical_feature=cat_col
    )

    # Keep the fold's importances, model, and out-of-fold predictions.
    feature_importances.append(model.feature_importances_)
    models.append(model)

    fold_pred = model.predict_proba(X_val)[:, 1]
    fold_score = amex_metric_mod(y_val, fold_pred)[0]
    scores.append(fold_score)
    print('fold score:', fold_score)

    pred_val = np.append(pred_val, fold_pred)
    yval = np.append(yval, y_val)

    del X_train, y_train, X_val, y_val, model, fold_pred
    gc.collect()


# Overall score computed on the out-of-fold predictions of all folds together.
score = amex_metric_mod(yval, pred_val)[0]
print('score:', score)
with open('score_lightgbm.txt', 'w') as f:
    f.write(str(score))

FOLD: 0
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.224299	valid_0's binary_logloss: 0.232012
[200]	training's binary_logloss: 0.209567	valid_0's binary_logloss: 0.222987
[300]	training's binary_logloss: 0.201134	valid_0's binary_logloss: 0.220798
[400]	training's binary_logloss: 0.194768	valid_0's binary_logloss: 0.220424
[500]	training's binary_logloss: 0.189096	valid_0's binary_logloss: 0.220257
[600]	training's binary_logloss: 0.183843	valid_0's binary_logloss: 0.220013
[700]	training's binary_logloss: 0.1788	valid_0's binary_logloss: 0.219946
Early stopping, best iteration is:
[689]	training's binary_logloss: 0.179338	valid_0's binary_logloss: 0.2199
FOLD: 1
y_train t=0 count: 272068
y_train t=1 count: 95062
y_val t=0 count: 68017
y_val t=1 count: 23766
Training until validation scores don't improve for 100 rounds
[100]	training's bi

In [12]:
# Drop the training table and the cross-validation leftovers before the test
# data is loaded, then ask the garbage collector to reclaim the memory.
del df_train, train_idx, val_idx, yval, pred_val
gc.collect()

20

In [13]:
# df_feat_imp = pd.DataFrame(index=FEATURES)
# df_feat_imp["imp0"] = feature_importances[0]
# df_feat_imp["imp1"] = feature_importances[1]
# df_feat_imp["imp2"] = feature_importances[2]
# df_feat_imp["imp3"] = feature_importances[3]
# df_feat_imp["imp4"] = feature_importances[4]
# df_feat_imp["mean_imp"] = df_feat_imp.mean(axis=1).values

# df_feat_imp = df_feat_imp.sort_values(by="mean_imp",ascending=False)

# df_feat_imp.to_csv("feat_imp.csv")

# fig, ax = plt.subplots(figsize=(20,5))
# sns.barplot(x=df_feat_imp.index,y=df_feat_imp["mean_imp"])
# plt.xticks([])
# print(df_feat_imp)

In [15]:
df_test = pd.read_parquet('./data/test.parquet')

# Downcast float32 columns to float16 to shrink the frame before aggregating.
print("convert float32 columns to float16")
for col in df_test.columns[df_test.dtypes == "float32"]:
    df_test[col] = df_test[col].astype("float16")

# Parse S_2 as a datetime and derive `days`: the number of days between a row's
# S_2 and the earliest S_2 of the same customer, plus one.
print("date and time")
df_test["S_2"] = pd.to_datetime(df_test["S_2"])
first_seen = df_test.groupby("customer_ID")["S_2"].transform("min")
df_test["days"] = (df_test["S_2"] - first_seen).dt.days.astype("int16") + 1
del first_seen

# Every column except the customer key and the date is a feature; the listed
# ones are treated as categorical, everything else as numeric.
all_cols = df_test.columns.drop(['customer_ID', 'S_2']).tolist()
cat_features = [
    "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63",
    "D_64", "D_66", "D_68"
]
num_features = [name for name in all_cols if name not in cat_features]

# Collapse the rows of each customer into one row of summary statistics, using
# the same statistics as the training cell.
# NOTE(review): 'last' takes the final row per customer in the frame's current
# order, so it presumably relies on rows being sorted by S_2 - TODO confirm.
df_num_agg = (
    df_test
    .groupby('customer_ID')[num_features]
    .agg(['mean', 'std', 'min', 'max', 'last', 'median'])
)
# Flatten the (feature, statistic) column MultiIndex into "<feature>_<statistic>".
df_num_agg.columns = df_num_agg.columns.map('_'.join)

df_cat_agg = (
    df_test
    .groupby('customer_ID')[cat_features]
    .agg(['count', 'last', 'nunique'])
)
df_cat_agg.columns = df_cat_agg.columns.map('_'.join)

# Replace the row-level frame with the per-customer table (index: customer_ID).
df_test = pd.concat([df_num_agg, df_cat_agg], axis=1)
del df_num_agg, df_cat_agg
gc.collect()

convert float32 columns to float16
date and time


0

In [18]:
print("prediction")

# The models were fitted on positional arrays in FEATURES order, so the test
# table must expose exactly the same columns in exactly the same order.
assert list(df_test.columns) == list(FEATURES), \
    "test columns do not match the training feature order"

# Average the positive-class probability over all trained fold models. The
# model count comes from `models` itself rather than a hard-coded 5.
pred = np.zeros(len(df_test))
for fold, model in enumerate(models):
    print('FOLD:', fold)
    pred += model.predict_proba(df_test)[:, 1]

pred = pred / len(models)

prediction
FOLD: 0
FOLD: 1
FOLD: 2
FOLD: 3
FOLD: 4


In [19]:
subm = pd.read_csv('./data/sample_submission.csv')

# Align predictions with the submission rows by customer id rather than by
# position: `pred` follows the row order of df_test (indexed by customer_ID),
# which is not guaranteed to equal the row order of the sample file.
# NOTE(review): assumes sample_submission.csv has a `customer_ID` column, like
# train_labels.csv - confirm against the file.
pred_by_customer = pd.Series(pred, index=df_test.index)
subm["prediction"] = subm["customer_ID"].map(pred_by_customer)

# Every submission row must have received a prediction.
assert subm["prediction"].notna().all(), \
    "some customers in sample_submission.csv have no prediction"

subm.to_csv("submission_lightgbm.csv", index=False)