In [1]:
import gc

import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the per-customer labels and the statement-level training rows.
train_labels = pd.read_csv('./data/train_labels.csv')
train = pd.read_parquet('./data/train.parquet')

# Cell output: (rows, columns) of the statements frame, then of the labels frame.
(train.shape, train_labels.shape)

((5531451, 190), (458913, 2))

In [3]:
# Attach the label columns to every statement row sharing the same customer_ID.
train = pd.merge(train, train_labels, on='customer_ID', how='inner')
print(train.shape)

# The labels are now part of `train`; release the standalone frame.
del train_labels
gc.collect()

(5531451, 191)


0

In [5]:
# Model inputs are every column except the customer key, the S_2 column and the label.
# Dropping on the column Index (rather than on the frame) yields the same names
# without materialising a copy of the whole statement-level frame.
features = train.columns.drop(['customer_ID', 'S_2', 'target']).to_list()
# Columns treated as categorical; everything else in `features` is numeric.
categorical_features = [
    'B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126'
]
num_features = [col for col in features if col not in categorical_features]

In [6]:
# Collapse the statement-level rows into one feature row per customer.
# Numeric columns -> mean/std/min/max/last; categorical columns -> count/last/nunique.
# NOTE(review): 'last' depends on the row order within each customer — presumably
# rows arrive sorted by statement date (S_2); confirm against the data source.
train_num_agg = train.groupby('customer_ID')[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
# Flatten the (column, statistic) MultiIndex into names such as 'P_2_mean'.
train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
train_cat_agg = train.groupby('customer_ID')[categorical_features].agg(['count', 'last', 'nunique'])
train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
# One label per customer, taken from the customer's final row and indexed by customer_ID
# so that it aligns with the aggregates in the concat below.
train_target = (train.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()['target'])
train = pd.concat([train_num_agg, train_cat_agg, train_target], axis=1)

# Persist the customer_ID index with the features: writing with index=False would
# discard the only key that ties each cached row back to a customer.
train.to_parquet('./data/train_agg.parquet', engine='pyarrow', compression='gzip', index=True)
del train_num_agg, train_cat_agg, train_target
gc.collect()

0

In [7]:
# Load the statement-level test rows.
test = pd.read_parquet('./data/test.parquet')
# Cell output: (rows, columns).
test.shape

(11363762, 190)

In [8]:
# Build the same per-customer aggregates for the test set as for the training set
# (numeric -> mean/std/min/max/last, categorical -> count/last/nunique).
test_num_agg = test.groupby('customer_ID')[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
# Flatten the (column, statistic) MultiIndex into names such as 'P_2_mean'.
test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
test_cat_agg = test.groupby('customer_ID')[categorical_features].agg(['count', 'last', 'nunique'])
test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
test = pd.concat([test_num_agg, test_cat_agg], axis=1)

# Persist the customer_ID index with the features: without it the cached file cannot
# be used to produce a submission, because predictions could not be matched to customers.
test.to_parquet('./data/test_agg.parquet', engine='pyarrow', compression='gzip', index=True)
del test_num_agg, test_cat_agg
gc.collect()

0

In [9]:
# Preview the first rows of the aggregated per-customer training frame.
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_117_count,D_117_last,D_117_nunique,D_120_count,D_120_last,D_120_nunique,D_126_count,D_126_last,D_126_nunique,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.933824,0.024194,0.86858,0.960384,0.934745,0.230769,0.83205,0,3,0,...,13,5,1,13,0,1,13,2,1,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.89982,0.022119,0.861109,0.929122,0.880519,7.153846,6.743468,0,19,6,...,13,0,1,13,0,2,13,2,1,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.878454,0.028911,0.79767,0.904482,0.880875,0.0,0.0,0,0,0,...,13,0,1,13,0,1,13,2,1,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.598969,0.020107,0.567442,0.623392,0.621776,1.538462,3.017046,0,9,0,...,13,7,2,13,0,1,13,2,1,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.891679,0.042325,0.805045,0.940382,0.8719,0.0,0.0,0,0,0,...,13,5,1,13,0,1,13,2,1,0


In [10]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of two rank-based components.

    The components are the normalized weighted Gini coefficient and the share
    of positives captured within the top 4% of total weight. Rows with
    ``target == 0`` carry weight 20, all other rows weight 1.

    Args:
        y_true: frame with a ``target`` column.
        y_pred: frame with a ``prediction`` column; it is joined to ``y_true``
            column-wise, so both frames must share the same index.

    Returns:
        ``0.5 * (normalized_gini + top_4_percent_capture_rate)``.
    """

    def _ranked(labels: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, attach row weights.
        frame = pd.concat([labels, scores], axis='columns')
        frame = frame.sort_values('prediction', ascending=False)
        frame['weight'] = np.where(frame['target'] == 0, 20, 1)
        return frame

    def _capture_rate(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        frame = _ranked(labels, scores)
        cutoff = int(0.04 * frame['weight'].sum())
        # Rows whose cumulative weight still fits inside the 4% budget.
        in_top = frame['weight'].cumsum() <= cutoff
        is_positive = frame['target'] == 1
        return (in_top & is_positive).sum() / is_positive.sum()

    def _gini(labels: pd.DataFrame, scores: pd.DataFrame) -> float:
        frame = _ranked(labels, scores)
        weighted_pos = frame['target'] * frame['weight']
        random_curve = (frame['weight'] / frame['weight'].sum()).cumsum()
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * frame['weight']).sum()

    # Normalise by the Gini obtained when the labels themselves are the scores.
    ideal_scores = y_true.rename(columns={'target': 'prediction'})
    gini_ratio = _gini(y_true, y_pred) / _gini(y_true, ideal_scores)
    captured = _capture_rate(y_true, y_pred)

    return 0.5 * (gini_ratio + captured)

In [9]:
features = test.columns.to_list()
categorical_features = [
    'B_30', 'B_38', 'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126'
]
# Only the '<col>_last' aggregate of each categorical column is encoded here
# and later passed to the model as a categorical feature.
categorical_features = [f'{cf}_last' for cf in categorical_features]
le = LabelEncoder()
for categorical_feature in categorical_features:
    # Fit on the values of both frames: an encoder fitted on train alone raises
    # on any category that appears only in test. When test holds no unseen
    # category the resulting codes are the same as fitting on train alone.
    le.fit(pd.concat([train[categorical_feature], test[categorical_feature]], ignore_index=True))
    train[categorical_feature] = le.transform(train[categorical_feature])
    test[categorical_feature] = le.transform(test[categorical_feature])

In [11]:
# Separate the model inputs from the label; the label stays a one-column frame.
X_train = train.drop(columns='target')
y_train = train['target'].to_frame()

In [12]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=22)
# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows.
y_oof = np.zeros(X_train.shape[0])
y_test = np.zeros(test.shape[0])

# Score the test set on exactly the training columns, in training order. `test`
# is used as-is (no copy) when it already matches; otherwise it is re-selected so
# that an extra or reordered column cannot silently shift the model inputs.
if list(test.columns) == list(X_train.columns):
    X_test = test
else:
    X_test = test[X_train.columns]

for idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f'******* Fold {idx} *******')
    X_tr, X_val = (
        X_train.iloc[train_idx].reset_index(drop=True),
        X_train.iloc[val_idx].reset_index(drop=True)
    )
    y_tr, y_val = (
        y_train.iloc[train_idx].reset_index(drop=True),
        y_train.iloc[val_idx].reset_index(drop=True)
    )

    # NOTE(review): the validation fold is also handed to CatBoost as eval_set; if
    # it drives best-iteration selection the OOF score may be optimistic — confirm.
    clf = CatBoostClassifier(iterations=5000, random_state=22)
    clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], cat_features=categorical_features, verbose=100)

    # Each training row is a validation row in exactly one fold, so its slot
    # is written once.
    preds = clf.predict_proba(X_val)[:, 1]
    y_oof[val_idx] = preds

    # Average the per-fold test predictions.
    preds_test = clf.predict_proba(X_test)[:, 1]
    y_test += preds_test / N_FOLDS

# Score the out-of-fold predictions, aligned to the labels by index.
y_pred = pd.DataFrame({'prediction': y_oof}, index=y_train.index)
val_score = amex_metric(y_train, y_pred)
print(f'Amex metric: {val_score}')

******* Fold 0 *******
Learning rate set to 0.067666
0:	learn: 0.6120772	test: 0.6122906	best: 0.6122906 (0)	total: 403ms	remaining: 33m 34s
100:	learn: 0.2272071	test: 0.2309770	best: 0.2309770 (100)	total: 23.3s	remaining: 18m 51s
200:	learn: 0.2201874	test: 0.2256355	best: 0.2256355 (200)	total: 45.2s	remaining: 17m 58s
300:	learn: 0.2155222	test: 0.2235171	best: 0.2235171 (300)	total: 1m 6s	remaining: 17m 16s
400:	learn: 0.2119751	test: 0.2223201	best: 0.2223201 (400)	total: 1m 27s	remaining: 16m 47s
500:	learn: 0.2090988	test: 0.2216021	best: 0.2216021 (500)	total: 1m 49s	remaining: 16m 21s
600:	learn: 0.2062903	test: 0.2210748	best: 0.2210746 (599)	total: 2m 11s	remaining: 15m 59s
700:	learn: 0.2037052	test: 0.2207770	best: 0.2207734 (698)	total: 2m 32s	remaining: 15m 32s
800:	learn: 0.2012328	test: 0.2205294	best: 0.2205294 (800)	total: 2m 53s	remaining: 15m 9s
900:	learn: 0.1989019	test: 0.2203005	best: 0.2203005 (900)	total: 3m 14s	remaining: 14m 46s
1000:	learn: 0.1965729	tes

KeyboardInterrupt: 

In [None]:
# Mark out-of-fold scores at or above the 96th percentile (the top 4%) as 1, the rest as 0.
y_oof_binary = np.where(y_oof >= np.percentile(y_oof, 96), 1, 0)

In [None]:
# Share of rows flagged positive by the threshold above.
np.mean(y_oof_binary)

In [None]:
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: square 2-D array of counts, rows = true class, columns = predicted class.
        classes: tick labels, one per class, in the same order as the rows of ``cm``.
        normalize: when True, each row is divided by its total so cells show the
            fraction of that true class (printed with two decimals); when False
            the values of ``cm`` are shown unchanged.
        title: figure title.
        cmap: matplotlib colormap used for the cells.
    """
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no samples keeps an all-zero row instead of producing 0/0.
        row_totals[row_totals == 0] = 1
        cm = cm / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum value get white text, the others black.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        label = f'{cm[i, j]:.2f}' if normalize else cm[i, j]
        plt.text(j, i, label, horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compare the true labels with the thresholded out-of-fold predictions.
class_names = [0, 1]
cm = confusion_matrix(y_train, y_oof_binary)

plt.figure()
plot_confusion_matrix(cm, classes=class_names, title='Confusion matrix at 4%')
plt.show()

In [None]:
# Write the fold-averaged test predictions keyed by the index of `test`.
# The predictions go into a standalone Series rather than a new column of `test`,
# so `test` keeps holding only model features and the training cell can be re-run.
# The CSV produced (index column plus a 'prediction' column) is unchanged.
submission = pd.Series(y_test, index=test.index, name='prediction')
submission.to_csv(f'submission_cat_{val_score}.csv', index=True)