In [None]:
import gc
import pickle
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold

In [None]:
# Columns removed before training: the customer identifier, the statement
# date, and a fixed list of feature columns this notebook does not use.
drop_cols = ["customer_ID", "S_2", "D_87", "D_88", "D_108", "D_110", "D_111", "B_39", "D_73", "B_42", "D_134", "D_135", "D_136", "D_137", "D_138", "R_9"]
df_train = pd.read_parquet("../input/amex-parquet/train_data.parquet")
df_train.drop(columns=drop_cols, inplace=True)

# Undersample the target == 0 rows to a fixed, seeded draw and keep every
# target == 1 row.
# NOTE(review): 1377869 is presumably the number of target == 1 rows, which
# would make the result class-balanced; confirm against the data.
rows_neg = df_train[df_train["target"] == 0].sample(n=1377869, random_state=0)
rows_pos = df_train[df_train["target"] == 1]
# Release the full table before building the combined frame to limit peak memory.
del df_train; gc.collect()
# Negatives first, then positives, renumbered 0..n-1.
df_train = pd.concat([rows_neg, rows_pos], ignore_index=True)
del rows_neg, rows_pos; gc.collect()

# Re-code D_63 / D_64 as integers ranked by frequency (0 = most frequent
# value in this frame), then store them as pandas categoricals.
for col in ("D_63", "D_64"):
    d = {val: i for i, val in enumerate(df_train[col].value_counts().index)}
    df_train[col] = df_train[col].replace(d).astype("category")

# Split off the label; what remains in the frame is the feature matrix.
train_y = df_train.pop("target")
train_x = df_train
# Only the extra name is removed here; train_x still refers to the frame.
del df_train; gc.collect()

In [None]:
params = {
    'boosting_type': 'gbdt',  # default = 'gbdt'
    'num_leaves': 63,         # default = 31,
    'learning_rate': 0.1,     # default = 0.1
    'feature_fraction': 0.8,  # default = 1.0
    'bagging_freq': 1,        # default = 0
    'bagging_fraction': 0.8,  # default = 1.0
    'n_estimators': 10000,    # upper bound; early stopping below picks the actual count
    'random_state': 0,        # default = None
}

# Thanks!
# https://www.kaggle.com/code/munumbutt/simple-lgbm-starter
# cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
cat_cols = ["D_63", "D_64"]


def _load_balanced_train():
    """Rebuild the undersampled training data from the parquet file.

    Mirrors the data-loading cell step for step (same dropped columns, same
    seeded sample, same concatenation order, same frequency-rank encoding of
    D_63 / D_64), so positional fold indices computed once remain valid for
    every reload.

    Returns:
        (features, labels): the feature DataFrame with a 0..n-1 index and the
        matching "target" Series.
    """
    unused_cols = ["customer_ID", "S_2", "D_87", "D_88", "D_108", "D_110", "D_111", "B_39", "D_73", "B_42", "D_134", "D_135", "D_136", "D_137", "D_138", "R_9"]
    frame = pd.read_parquet("../input/amex-parquet/train_data.parquet")
    frame.drop(unused_cols, axis=1, inplace=True)

    negatives = frame.query("target == 0").sample(n=1377869, random_state=0)
    positives = frame.query("target == 1")
    # Free the full table before concatenating to limit peak memory.
    del frame; gc.collect()
    balanced = pd.concat([negatives, positives], ignore_index=True)
    del negatives, positives; gc.collect()

    for col in ["D_63", "D_64"]:
        rank_of = {val: i for i, val in enumerate(balanced[col].value_counts().keys())}
        balanced[col] = balanced[col].replace(rank_of).astype("category")

    labels = balanced.pop("target")
    return balanced, labels


# The training frame is "all target == 0 rows followed by all target == 1
# rows", so contiguous (unshuffled) folds would give validation sets that
# contain a single class. Shuffle with a fixed seed instead.
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Split on a small placeholder with the same number of rows: the split call
# is lazy and would otherwise be handed the big frame itself, which can keep
# it referenced after the `del train_x` below.
row_placeholder = np.zeros(len(train_x), dtype=np.int8)

for fold, (trn_idx, val_idx) in enumerate(cv.split(row_placeholder), start=1):
    if fold != 1:
        # train_x / train_y were deleted at the end of the previous fold to
        # save memory; rebuild them (deterministically) from disk.
        train_x, train_y = _load_balanced_train()

    # Positional selection for both features and labels.
    trn_x, trn_y = train_x.iloc[trn_idx, :], train_y.iloc[trn_idx]
    val_x, val_y = train_x.iloc[val_idx, :], train_y.iloc[val_idx]
    del train_x, train_y; gc.collect()

    clf = lgb.LGBMClassifier(**params)
    clf.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        # Stop after 50 rounds without improvement on the validation fold;
        # log the metric every 200 rounds.
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(200)],
        categorical_feature=cat_cols
    )
    del trn_x, trn_y, val_x, val_y; gc.collect()

    # Persist one model per fold; the context manager closes the file handle.
    with open(f"model.lgb.{fold}.pkl", 'wb') as model_file:
        pickle.dump(clf, model_file)


## Inference

In [None]:
# Keep only the last row per customer (in file order), index by customer_ID,
# sort by that index and drop the statement date.
df_test = pd.read_feather("../input/amexfeather/test_data.ftr").groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index().drop(['S_2'], axis='columns')

# Same feature columns removed as in training (customer_ID and S_2 are
# already handled above).
drop_cols = ["D_87", "D_88", "D_108", "D_110", "D_111", "B_39", "D_73", "B_42", "D_134", "D_135", "D_136", "D_137", "D_138", "R_9"]
df_test.drop(drop_cols, axis=1, inplace=True)

# Re-code D_63 / D_64 as integers ranked by frequency in the *test* frame.
# NOTE(review): the training cell derives its mapping from training-set
# frequencies; if the frequency order differs between the two frames the
# same raw value gets a different code here. Consider persisting the training
# mapping and reusing it.
for col in ["D_63", "D_64"]:
    d = {val: i for i, val in enumerate(df_test[col].value_counts().keys())}
    df_test[col] = df_test[col].replace(d).astype("category")

# Load the per-fold models.
# NOTE(security): pickle.load executes arbitrary code from the file; only
# load model files from a dataset you trust.
clfs = []
for fold in range(1, 6):
    with open(f"../input/amex-1st-lgb/model.lgb.{fold}.pkl", "rb") as model_file:
        clfs.append(pickle.load(model_file))

# Materialise the feature matrix once instead of once per model.
test_matrix = df_test.values

# Average the positive-class probability over all loaded models.
preds_y = np.zeros(len(df_test))
for clf in clfs:
    preds_y += clf.predict_proba(test_matrix)[:, 1] / len(clfs)

In [None]:
# Build the submission: take the sample file as the template, fill its
# "prediction" column with the averaged probabilities (assigned by position),
# and write it out gzip-compressed.
# NOTE(review): this relies on the sample submission rows being in the same
# order as df_test (sorted by customer_ID) — confirm.
submission_template = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")
df_sub = submission_template.assign(prediction=preds_y)
df_sub.to_csv("submission.csv.gz", index=False)