### 라이브러리

In [1]:
import gc
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from lightgbm import early_stopping

import warnings
warnings.simplefilter('ignore')

In [2]:
# Number of splits passed to StratifiedKFold when the LightGBM models are trained.
NUM_FOLDS = 5

### train 데이터 불러오기

In [3]:
%%time

# Load the training rows from the parquet file into df_train.
df_train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')

### 데이터 전처리
* S_2: datetime 으로 변환
* days 특성 생성
* float32 인 열들 float16으로 변환

In [4]:
%%time 

# Parse the S_2 column as datetimes so that date arithmetic is possible.
df_train['S_2'] = pd.to_datetime(df_train['S_2'])

# days: 1-based count of days between a row's S_2 and the earliest S_2 of
# the same customer_ID.
first_statement = df_train.groupby(['customer_ID'])['S_2'].transform('min')
days_elapsed = (df_train['S_2'] - first_statement).dt.days.astype('int16')
df_train['days'] = days_elapsed + 1
del first_statement, days_elapsed

# Downcast every float32 column to float16, one column at a time.
float32_columns = df_train.columns[df_train.dtypes == 'float32']
for name in float32_columns:
    df_train[name] = df_train[name].astype('float16')

In [5]:
# Force a garbage-collection pass after the dtype conversions above.
gc.collect()

In [6]:
# Print the frame shape, then the number of distinct customers and the
# row count of every customer_ID.
rows_per_customer = df_train['customer_ID'].value_counts()
print(df_train.shape, rows_per_customer.shape, rows_per_customer, sep='\n')
print()

# The counts are only needed for this printout.
del rows_per_customer
gc.collect()

In [7]:
# Preview the last row of every customer_ID group; the result is displayed only, not stored.
df_train.groupby(['customer_ID']).tail(1)

### train 데이터
* customer_ID당 마지막 데이터포인트를 사용

In [8]:
# Keep only the last row (in current row order) of each customer_ID group
# and make customer_ID the index of the reduced frame.
last_rows = df_train.groupby(['customer_ID']).tail(1)
df_train = last_rows.set_index('customer_ID')
del last_rows

In [9]:
# Display df_train for a visual check.
df_train

### train labels 데이터 불러오기

In [10]:
%%time

# Read the label table from CSV and print its column / dtype / memory summary.
df_train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
df_train_labels.info()

* target -> int8 타입으로 변환

In [11]:
# Store the target column as int8; every other column keeps its dtype.
df_train_labels = df_train_labels.astype({'target': 'int8'})
print(df_train_labels.shape)
df_train_labels.head()

### train 데이터와 labels 데이터 병합
* customer_ID를 기준

In [12]:
%%time

# Left-join the labels onto the per-customer rows, matching on customer_ID,
# so every training row is kept.
df_train = pd.merge(df_train, df_train_labels, how='left', on='customer_ID')
print(df_train.shape)
print(df_train.head())

# The standalone label table is no longer needed once it has been merged.
del df_train_labels
gc.collect()

### 평가지표

In [13]:
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793/notebook
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the competition metric: mean of normalized Gini and top-4% capture.

    Negative samples (label == 0) are weighted 20, positives 1, in both parts.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted scores, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(score, (normalized_gini, top_four))`` where
        ``score = 0.5 * (normalized_gini + top_four)``.
        The first element is the value callers read via ``[0]``.
        The second element used to be the undefined name ``_`` and is now
        the pair of metric components.
    """
    # Column 0 holds the labels, column 1 the predictions.
    labels = np.transpose(np.array([y_true, y_pred]))

    # Top-4% capture: share of all positives found among the highest-scored
    # rows that together hold 4% of the total weight.
    by_pred = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(by_pred[:, 0] == 0, 20, 1)
    cut_vals = by_pred[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:, 0]) / np.sum(by_pred[:, 0])

    # Weighted Gini, computed twice: ordered by the prediction (i == 1) and
    # ordered by the true label (i == 0, the best achievable ordering).
    gini = [0, 0]
    for i in [1, 0]:
        ordered = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ordered[:, 0] * weight)
        cum_pos_found = np.cumsum(ordered[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)

    normalized_gini = gini[1] / gini[0]
    score = 0.5 * (normalized_gini + top_four)
    return score, (normalized_gini, top_four)

### 범주형 특성 선택

In [14]:
# Model inputs: every column except the label, the customer key and the raw date.
FEATURES = df_train.columns.drop(["target", "customer_ID", "S_2"])

# Columns that are treated as categorical by the model.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68'
]

# Positions of the categorical columns inside FEATURES. Positions are used,
# not names, because the model is later fitted on plain numpy arrays.
cat_col = [
    position
    for position, name in enumerate(FEATURES)
    if name in categorical_cols
]
cat_col

### LightGBM 학습
* 5개의 폴드

In [15]:
%%time
feature_importances = []  # feature importance array of every fold
scores = []               # per-fold scores (not populated in this cell)
models = []               # fitted model of every fold
fold_preds = []           # validation probabilities, one array per fold
fold_targets = []         # validation labels, one array per fold

# The hyper-parameters are the same for every fold, so build them once.
params = {
    "num_iterations": 10000,
    'learning_rate': 0.05,
}

X_all = df_train[FEATURES]
y_all = df_train["target"]

# Cross-validation splitter
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)

# Split the data fold by fold
for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y_all)):

    print('FOLD:', fold)

    # skf.split yields positional row numbers, so select with .iloc; this
    # does not depend on what the frame's index labels happen to be.
    X_train = X_all.iloc[train_idx].values
    y_train = y_all.iloc[train_idx].values
    X_val = X_all.iloc[val_idx].values
    y_val = y_all.iloc[val_idx].values

    print("y_train t=0 count:", int((y_train == 0).sum()))
    print("y_train t=1 count:", int((y_train == 1).sum()))
    print("y_val t=0 count:", int((y_val == 0).sum()))
    print("y_val t=1 count:", int((y_val == 1).sum()))

    # LightGBM model. Evaluation logging goes through the log_evaluation
    # callback: the `verbose` argument of fit() is deprecated in
    # LightGBM 3.3 and no longer accepted in 4.x.
    model = lgbm.LGBMClassifier(**params).fit(
        X_train, y_train,
        eval_set=[(X_val, y_val), (X_train, y_train)],
        callbacks=[early_stopping(100), lgbm.log_evaluation(100)],
        categorical_feature=cat_col,
    )

    # Keep the fold's importances, model and validation predictions.
    feature_importances.append(model.feature_importances_)
    models.append(model)
    fold_preds.append(model.predict_proba(X_val)[:, 1])
    fold_targets.append(y_val)

    del X_train, y_train, X_val, y_val, model
    gc.collect()

del X_all, y_all

# Stack the per-fold validation results in fold order for the overall score.
pred_val = np.concatenate(fold_preds)
yval = np.concatenate(fold_targets)
del fold_preds, fold_targets

score = amex_metric_mod(yval, pred_val)[0]
print('score:', score)
with open('score_lightgbm.txt', 'w') as f:
    f.write(str(score))

In [16]:
# Release the training frame, the last fold's index arrays and the stacked
# validation labels/predictions, then collect garbage.
del df_train, train_idx, val_idx, yval, pred_val
gc.collect()

### 특성 중요도

In [20]:
# One column per fold (imp0, imp1, ...) plus the row-wise mean. Iterating
# over feature_importances keeps this correct for any number of folds.
df_feat_imp = pd.DataFrame(index=FEATURES)
for fold_no, fold_importance in enumerate(feature_importances):
    df_feat_imp[f"imp{fold_no}"] = fold_importance
df_feat_imp["mean_imp"] = df_feat_imp.mean(axis=1).values

# Most important features first.
df_feat_imp = df_feat_imp.sort_values(by="mean_imp", ascending=False)

df_feat_imp.to_csv("feat_imp.csv")

# Bar chart of the mean importance, drawn on the axes created here; the
# x tick labels are hidden.
fig, ax = plt.subplots(figsize=(20, 5))
sns.barplot(x=df_feat_imp.index, y=df_feat_imp["mean_imp"], ax=ax)
ax.set_xticks([])

print(df_feat_imp)

### test 데이터 불러오기
* float32 인 열 float16으로 변환
* S_2 -> datetime으로 변환
* days 특성 생성
* customer_ID당 마지막 데이터포인트 사용

In [21]:
# Load the test rows from the parquet file.
df_test = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')

print("convert float32 columns to float16")
float32_columns = df_test.columns[df_test.dtypes == "float32"]
for name in float32_columns:
    df_test[name] = df_test[name].astype("float16")

print("date and time")
df_test["S_2"] = pd.to_datetime(df_test["S_2"])
# days: 1-based count of days between a row's S_2 and the earliest S_2 of
# the same customer_ID.
first_statement = df_test.groupby(["customer_ID"])["S_2"].transform("min")
df_test["days"] = (df_test["S_2"] - first_statement).dt.days.astype("int16") + 1
del first_statement

print("grouping")
# Keep the last row of each customer_ID group and index the result by customer_ID.
last_rows = df_test.groupby(["customer_ID"]).tail(1)
df_test = last_rows.set_index('customer_ID')
del last_rows

### test 데이터 예측

In [22]:
print("prediction")

if len(models) == 0:
    raise ValueError("no trained models available for prediction")

# Build the feature matrix once (it is identical for every fold) and select
# the columns through FEATURES so their order matches the training data.
X_test = df_test[FEATURES]

# Average the positive-class probability over all fold models.
pred = np.zeros(len(X_test))
for fold, fold_model in enumerate(models):
    print('FOLD:', fold)
    pred += fold_model.predict_proba(X_test)[:, 1]

pred = pred / len(models)
del X_test

In [23]:
subm = pd.read_csv('../input/amex-default-prediction/sample_submission.csv')

# Align predictions with the submission rows by customer_ID, not by
# position, so a different row order cannot silently mis-assign scores.
# NOTE(review): assumes sample_submission.csv has a customer_ID column — confirm.
pred_by_customer = pd.Series(pred, index=df_test.index)
subm["prediction"] = subm["customer_ID"].map(pred_by_customer)

missing = int(subm["prediction"].isna().sum())
if missing:
    raise ValueError(f"{missing} submission rows have no matching prediction")

# The same table is written under both file names.
subm.to_csv("submission_lightgbm.csv", index=False)
subm.to_csv("submission.csv", index=False)