In [1]:
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

### 데이터 불러오기

In [2]:
# Read the pre-aggregated train and test tables (train first, then test).
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../input/amex-agg-parquet/train_agg.parquet',
        '../input/amex-agg-parquet/test_agg.parquet',
    )
)

# Cell output: (rows, columns) of each table.
train.shape, test.shape

### 평가 지표

In [3]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows with ``target == 0`` get weight 20 and all other rows weight 1 in
    both components.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column. It is joined to ``y_true``
            column-wise with ``pd.concat``, so the two frames are matched on
            their index labels, not on row position.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the weighted Gini of ``y_pred``
        divided by the weighted Gini of a perfect ranking, and ``d`` is the
        share of all ``target == 1`` rows found before the cumulative weight
        (in descending prediction order) exceeds 4% of the total weight.
    """
    def _ranked_with_weights(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Shared prep for both components: join, rank by score (best first), attach weights.
        ranked = (pd.concat([labels, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        # Vectorised equivalent of the per-row "20 if target == 0 else 1" rule.
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        within_cutoff = df['weight'].cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        weight = df['weight']
        # Cumulative weight share: the curve a ranking with no signal would follow.
        random = (weight / weight.sum()).cumsum()
        weighted_pos = df['target'] * weight
        # Cumulative share of weighted positives found so far.
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Ranking by the labels themselves gives the best achievable Gini.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

### 범주형 변수 라벨 인코딩
* 범주형 변수 (`*_last`)
* train과 test를 합쳐서 LabelEncoder()를 fit 해준 후에 변환

In [4]:
features = test.columns.to_list()

# Names of the categorical columns: each base name with a '_last' suffix.
categorical_features = [
    f'{cf}_last'
    for cf in (
        'B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
        'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    )
]

# One fitted encoder per categorical column, kept in the same order as
# `categorical_features` so they can be paired up again later.
encoders = []
for categorical_feature in categorical_features:
    # Fit on train and test values together so both splits share one code mapping.
    combined_values = pd.concat(
        [train[categorical_feature], test[categorical_feature]], axis=0
    )
    le = LabelEncoder().fit(combined_values)
    train[categorical_feature] = le.transform(train[categorical_feature])
    encoders.append(le)

# The test frame is dropped here to release memory.
del test
gc.collect()

In [5]:
# Labels stay a one-column DataFrame ('target'); every other column is a model input.
y_train = train['target'].to_frame()
X_train = train.drop(columns=['target'])

### Catboost 학습
* StratifiedKFold
    * 5개의 폴드 사용.

In [6]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=22)
# Out-of-fold probabilities: each training row is filled in once, by the fold
# model that had that row in its validation split.
y_oof = np.zeros(X_train.shape[0])
models = []

for idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f'******* Fold {idx} *******')
    X_tr, X_val = (
        X_train.iloc[train_idx].reset_index(drop=True),
        X_train.iloc[val_idx].reset_index(drop=True)
    )
    y_tr, y_val = (
        y_train.iloc[train_idx].reset_index(drop=True),
        y_train.iloc[val_idx].reset_index(drop=True)
    )

    clf = CatBoostClassifier(iterations=5000, random_state=22, task_type='GPU')
    # NOTE(review): the validation fold is also passed as eval_set. If CatBoost
    # picks its final iteration from eval_set, the out-of-fold score below is
    # optimistic -- confirm against the CatBoost fit() defaults.
    clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], cat_features=categorical_features, verbose=100)

    # Positive-class probability for the held-out rows of this fold.
    y_oof[val_idx] = clf.predict_proba(X_val)[:, 1]
    models.append(clf)

# Wrap the out-of-fold scores in the shape amex_metric expects: a 'prediction'
# column on the same index as y_train.
y_pred = pd.DataFrame({'prediction': y_oof}, index=y_train.index)
val_score = amex_metric(y_train, y_pred)
print(f'Amex metric: {val_score}')

with open('score_catboost.txt', 'w') as f:
    f.write(str(val_score))

In [7]:
# Flag rows whose out-of-fold score is at or above the 96th percentile (the top ~4%).
top_4pct_threshold = np.percentile(y_oof, 96)
y_oof_binary = (y_oof >= top_4pct_threshold).astype(int)

In [8]:
# Mean of the 0/1 flags, i.e. the fraction of rows flagged as 1; shown as the cell output.
y_oof_binary.mean()

In [9]:
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heatmap on the current matplotlib figure.

    Args:
        cm: 2-D array of counts, indexed ``[true label, predicted label]``.
        classes: Tick labels, used for both axes.
        normalize: If True, divide each row by its sum so cells show the
            fraction of each true class, printed with two decimals. If False
            (default), the values in ``cm`` are shown as given.
        title: Figure title.
        cmap: Matplotlib colormap for the heatmap.

    Returns:
        None. The plot is drawn on the current figure; the caller is
        responsible for ``plt.figure()`` / ``plt.show()``.
    """
    if normalize:
        # Row-normalise; an all-zero row is divided by 1 so it stays zero instead of NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = cm.astype(float) / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_label = f'{cm[i, j]:.2f}' if normalize else cm[i, j]
        plt.text(j, i, cell_label, horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compare the true labels with the thresholded out-of-fold flags and plot the result.
class_names = [0, 1]
cm = confusion_matrix(y_train, y_oof_binary)

plt.figure()
plot_confusion_matrix(
    cm,
    classes=class_names,
    title='Confusion matrix at 4%',
)
plt.show()

In [10]:
# Release everything tied to the training set before the test set is reloaded.
del train, X_train, y_train, y_pred, y_oof_binary
# The last fold's slices from the CV loop are still bound at module level and
# hold copies of the training rows (reset_index returns new frames), so drop them too.
del X_tr, X_val, y_tr, y_val
gc.collect()

### 테스트 데이터 예측
* 범주형 변수 라벨 인코딩
* 5개 폴드에서 학습된 모델 각각으로 예측한 뒤 평균

In [11]:
# Load the aggregated test table again (the earlier copy was deleted after fitting the encoders).
test_parquet_path = '../input/amex-agg-parquet/test_agg.parquet'
test = pd.read_parquet(test_parquet_path)

# Cell output: (rows, columns).
test.shape

In [12]:
# Apply the encoders fitted earlier; `encoders` and `categorical_features`
# were built in the same order, so zip pairs each encoder with its column.
for categorical_feature, le in zip(categorical_features, encoders):
    encoded_column = le.transform(test[categorical_feature])
    test[categorical_feature] = encoded_column

In [13]:
# Average the positive-class probability over the fold models:
# each model adds an equal 1/N_FOLDS share to the running total.
y_test = np.zeros(len(test))
for model in models:
    preds_test = model.predict_proba(test)[:, 1]
    y_test += preds_test / N_FOLDS

In [14]:
# Attach the averaged scores to the test frame, then write the same
# index + 'prediction' column to both output files.
test['prediction'] = y_test
submission = test['prediction']
for submission_path in ('submission_catboost.csv', 'submission.csv'):
    submission.to_csv(submission_path, index=True)