In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import gc; gc.enable()

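## Load pickle file as dataset

*Note: the code cells that follow read the raw competition CSV files in chunks with `pd.read_csv`; the compressed pickle dataset described below is not loaded by them.*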


> Compressed data in Pickle Format for American Express - Default Prediction Competition

> float64 categories converted to float16

> int64 categories converted to int8

> object categories converted to category

> num cols agg stats -> 'mean', 'std', 'min', 'max', 'last'

> cat cols agg stats -> 'count', 'last', 'nunique'

In [None]:
# Previously generated submission; it is merged with this notebook's predictions
# in the blending cell further down.
sub1 = pd.read_csv('../input/d/datasets/bhavikardeshna/avg-weights/submission.csv')

# The statement files are opened as chunked readers (nothing is loaded yet);
# the chunks are consumed in the next cell.
traini = pd.read_csv(
    '/kaggle/input/amex-default-prediction/train_data.csv',
    parse_dates=['S_2'],
    chunksize=900_000,
    iterator=True,
)
testi = pd.read_csv(
    '/kaggle/input/amex-default-prediction/test_data.csv',
    parse_dates=['S_2'],
    chunksize=500_000,
    iterator=True,
)

# One row per customer_ID with its target (merged on customer_ID in the next cell).
labels = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')

In [None]:
%%time
def _latest_statement_per_customer(chunks):
    """Reduce an iterable of statement chunks to a single row per customer.

    Each chunk is appended to the running frame, the frame is sorted by ``S_2``
    and only the last row of every ``customer_ID`` is kept. The running frame
    therefore never holds more than one row per customer seen so far plus one
    chunk.

    Parameters
    ----------
    chunks : iterable of pd.DataFrame
        Frames containing at least the ``customer_ID`` and ``S_2`` columns.

    Returns
    -------
    pd.DataFrame
        For every ``customer_ID`` the row that sorts last by ``S_2``.

    Raises
    ------
    ValueError
        If ``chunks`` yields no frames at all.
    """
    latest = None
    for chunk in chunks:
        latest = chunk if latest is None else pd.concat([latest, chunk])
        del chunk
        # Two separate rebinding steps so the previous frame can be released
        # before the next intermediate is built.
        latest = latest.sort_values(by=['S_2'], ignore_index=True)
        latest = latest.drop_duplicates(subset=['customer_ID'], keep='last')
        gc.collect()
    if latest is None:
        raise ValueError('no chunks were read; cannot build a per-customer frame')
    return latest


# --- train: last statement per customer, joined with its label ---------------
train = _latest_statement_per_customer(traini)
train = pd.merge(train, labels, how='inner', on=['customer_ID'])
del labels; gc.collect()

# Feature columns used by the model cells below (everything except ids/date/label).
col = [c for c in train if c not in ['customer_ID', 'target','S_2']]

# Missing values are replaced with 0 before the frame is cached on disk.
train.fillna(0).to_csv('train.csv', index=False)
del train; del traini; gc.collect()

# --- test: same reduction, no labels ------------------------------------------
test = _latest_statement_per_customer(testi)
test.fillna(0).to_csv('test.csv', index=False)
del test; del testi; gc.collect()

## AMEX Metrics
(From the competition discussion forum)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows whose ``target`` equals 0 are given weight 20, all other rows weight 1.
    The result is ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the
    predictions divided by the weighted Gini of a perfect ranking, and ``D`` is
    the share of ``target == 1`` rows found within the highest-ranked rows that
    make up 4% of the total weight.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column. The two frames are concatenated
        column-wise, so they must share the same index.

    Returns
    -------
    float
        The combined metric.
    """
    def _ranked_with_weights(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> pd.DataFrame:
        # Shared preparation: join truth and prediction, rank by prediction
        # (highest first) and attach the per-row weight in one vectorised step.
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        within_cutoff = df['weight'].cumsum() <= four_pct_cutoff
        is_positive = df['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = _ranked_with_weights(y_true, y_pred)
        weighted_pos = df['target'] * df['weight']
        random = (df['weight'] / df['weight'].sum()).cumsum()
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * df['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Using the labels themselves as the "prediction" gives the best
        # achievable Gini, which serves as the normaliser.
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

## Training CatBoostClassifier 

In [None]:
# Reload the per-customer training frame cached earlier as train.csv.
train = pd.read_csv('train.csv')

# Columns handed to CatBoost as categorical features; they are cast to strings
# before being passed as `cat_features`.
cat_features = [
    'B_30', 'B_31', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
for feature in cat_features:
    train[feature] = train[feature].astype(str)

In [None]:
# 80/20 hold-out split: (x1, y1) is used for fitting, (x2, y2) for validation.
# `col` is the feature list built in the preprocessing cell.
x1, x2, y1, y2 = train_test_split(
    train[col],
    train['target'],
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit CatBoost on the training split, monitoring the hold-out split; training
# stops early once the eval metric has not improved for 20 rounds.
# NOTE(review): train.csv was written after fillna(0), so nan_mode presumably
# has nothing left to act on for numeric columns — confirm.
clf = CatBoostClassifier(iterations=10000, random_state=42, nan_mode='Min')
clf.fit(x1, y1, eval_set=[(x2, y2)], cat_features=cat_features,  verbose=50, early_stopping_rounds=20)

# Probability of the second class for every hold-out row.
preds = clf.predict_proba(x2)[:, 1]

# Score the hold-out predictions with the competition metric. Both frames are
# rebuilt on a fresh 0..n-1 index because amex_metric joins them by index and
# y2 still carries the shuffled index from train_test_split.
val_score = amex_metric(
    pd.DataFrame({'target': y2.to_numpy()}),
    pd.DataFrame({'prediction': preds}),
)
print(f'Validation AMEX metric: {val_score:.5f}')

In [None]:
# Run a garbage-collection pass; the return value is bound to `_` so the cell
# does not display it.
_ = gc.collect()

In [None]:
import os

In [None]:
# Reload the per-customer test frame cached earlier and give the categorical
# columns the same string cast that was applied to the training frame.
test = pd.read_csv('test.csv')
for feature in cat_features:
    test[feature] = test[feature].astype(str)

# Second-class probability from the fitted model, stored next to the ids.
positive_proba = clf.predict_proba(test[col])[:, 1]
test['prediction'] = positive_proba
sub2 = test[['customer_ID', 'prediction']]

# The full test frame and its on-disk cache are no longer needed.
del test
gc.collect()
os.remove('test.csv')

In [None]:
# Rename this notebook's prediction column so it does not clash with the
# `prediction` column of the external submission when the two are merged.
sub2.columns = ['customer_ID', 'prediction2']

# Keep only customers present in both submissions.
blend = sub1.merge(sub2, how='inner', on='customer_ID')

# Weighted average: 95.5% external submission, 4.5% this notebook's model.
blend['prediction'] = 0.955 * blend['prediction'] + 0.045 * blend['prediction2']

blend[['customer_ID', 'prediction']].to_csv('submission.csv', index=False)

In [None]:
# The blend and submission.csv are produced by the previous cell. Repeating the
# same rename/merge/write here only redid identical work, so this cell reads the
# written file back and checks that it is a well-formed submission.
written = pd.read_csv('submission.csv')

assert list(written.columns) == ['customer_ID', 'prediction'], 'unexpected submission columns'
assert len(written) == len(blend), 'submission.csv row count differs from the blended frame'
assert written['customer_ID'].is_unique, 'duplicate customer_ID rows in submission.csv'
assert written['prediction'].notna().all(), 'submission.csv contains missing predictions'

# The inner merge silently drops customers that are missing from either input;
# surface that instead of shipping a short submission unnoticed.
n_dropped = len(sub1) - len(blend)
if n_dropped:
    print(f'Warning: {n_dropped} rows of sub1 have no match in sub2 and are absent from the blend')

written.head()