In [0]:
# Left disabled; presumably meant to be uncommented on a runtime where
# catboost is not preinstalled (it is imported further down as `ctb`).
#!pip install catboost

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Apr 15 03:19:10 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import pandas as pd
import numpy as np
from hyperopt import hp
import gc
import xgboost as xgb
import catboost as ctb
from sklearn.model_selection import train_test_split
# Lift pandas' display limits so DataFrame output is never truncated.
# NOTE(review): with max_rows=None, displaying a large frame prints every row.
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [4]:
%%time
data = pd.read_csv('/content/drive/My Drive/Data/features.csv')
data['Date'] = pd.to_datetime(data['Date'].apply(lambda x: x.split()[0]))

CPU times: user 4.68 s, sys: 383 ms, total: 5.06 s
Wall time: 5.32 s


In [5]:
%%time
for key in data.columns:
    if 'Days' in key:
        data[key] = data[key].apply(round)

CPU times: user 405 ms, sys: 41.9 ms, total: 447 ms
Wall time: 448 ms


In [0]:
# Feature columns kept for modelling; combined with 'Date' and 'Fraud' when
# `data` is subset below. The names must match the CSV headers exactly,
# including the embedded spaces ('Merch zip', 'Merch state') and slashes.
sel = ['Amount_Cardnum_sum_0d',
 'Cardnum_count_0d',
 'Amount_Cardnum_sum_1d',
 'Amount_Merchnum_mean_0d',
 'Amount_Merchnum_max_0d',
 'Amount_Merchnum_sum_0d',
 'Amount_Merchnum_max_1d',
 'Amount_Cardnum_Merchnum_max_0d',
 'Amount_Cardnum_Merch zip_sum_0d',
 'Amount_Cardnum_Merch zip_max_3d',
 'Amount_Cardnum_Merch zip_sum_3d',
 'Cardnum_Merch zip_count_3d',
 'Amount_Cardnum_Merch zip_sum_7d',
 'Cardnum_Merch zip_count_7d',
 'Amount_Cardnum_Merch zip_max_14d',
 'Amount_Cardnum_Merch zip_sum_14d',
 'Cardnum_Merch zip_count_14d',
 'Amount_Cardnum_Merch zip_max_30d',
 'Amount_Cardnum_Merch zip_sum_30d',
 'Amount_Cardnum_Merch state_max_1d',
 'Amount_Cardnum_Merch state_max_3d',
 'Amount_Cardnum_Merch state_sum_3d',
 'Amount_Cardnum_Merch state_max_7d',
 'count_Cardnum_0d/mean_mean_Cardnum_14d',
 'count_Cardnum_0d/mean_mean_Cardnum_30d',
 'count_Cardnum_0d/mean_count_Cardnum_30d',
 'count_Cardnum_1d/mean_mean_Cardnum_30d',
 'count_Cardnum_1d/mean_count_Cardnum_30d',
 'count_Merchnum_0d/mean_count_Merchnum_7d',
 'count_Merchnum_0d/mean_count_Merchnum_14d']

In [0]:
# Narrow the frame to the selected features plus the split key and the label.
# NOTE(review): this rebinds `data`, so the dropped columns are only
# recoverable by re-reading the CSV.
data = data.loc[:, sel + ['Date', 'Fraud']]

In [0]:
# Drop everything on or before 2010-01-14, then split at 2010-10-31:
# rows up to and including that date go to `train`, later rows to `test`.
data = data.loc[data['Date'].gt(pd.to_datetime('2010-01-14'))]
train = data.loc[data['Date'].le(pd.to_datetime('2010-10-31'))].copy()
test = data.loc[data['Date'].gt(pd.to_datetime('2010-10-31'))].copy()

In [0]:
# Hold out a random 20% of the pre-cutoff rows as the validation split.
# NOTE(review): `train` is rebound here, so re-running this cell alone
# shrinks it again; re-run from the date split when repeating.
train, val = train_test_split(
    train,
    test_size=0.2,
    random_state=0,
)

In [0]:
# Model inputs: every training column except the split key and the label.
X = train.drop(['Date', 'Fraud'], axis=1)
# Target: the fraud label column.
y = train.loc[:, 'Fraud']

In [0]:
def calculate_fdr(y_true, y_pred, top_frac=0.03):
    """Return the percentage of all positives found among the top-scored rows.

    Rows are ranked by `y_pred` in descending order; the first
    `int(len(y_true) * top_frac)` rows are inspected and the share of all
    positive labels that falls inside that slice is returned on a 0-100 scale.

    Args:
        y_true: 1-D array-like of 0/1 labels (list, ndarray or pandas Series).
        y_pred: 1-D array-like of scores aligned positionally with `y_true`.
        top_frac: Fraction of the population to inspect. Defaults to 0.03.

    Returns:
        The detection rate as a percentage, or NaN when `y_true` contains no
        positives (the rate is undefined in that case).

    Raises:
        ValueError: If `y_true` and `y_pred` have different lengths.
    """
    # Convert to plain arrays so indexing below is positional. A pandas Series
    # with a non-default index would otherwise be looked up by label.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred)
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            'y_true and y_pred must have the same length, got %d and %d'
            % (labels.shape[0], scores.shape[0])
        )

    tot = labels.sum()
    if tot == 0:
        # No positives at all: avoid a silent 0/0 and make the result explicit.
        return float('nan')

    cutoff = int(len(labels) * top_frac)
    ranked = labels[scores.argsort()[::-1]]
    pos = ranked[:cutoff].sum()
    return pos / tot * 100

In [0]:
def xgbrf_score(params):
    """Hyperopt objective for the XGBoost random-forest classifier.

    Fits a model with `params` on the notebook-level `X` / `y`, scores the
    notebook-level `val` frame, and reports the negated FDR so that hyperopt's
    minimiser maximises the detection rate.

    Args:
        params: Keyword arguments forwarded to `xgb.XGBRFClassifier`.

    Returns:
        dict with 'loss' (negative validation FDR) and 'status' (STATUS_OK).
    """
    candidate = xgb.XGBRFClassifier(**params)
    candidate.fit(X, y)

    val_scores = candidate.predict_proba(val.drop(columns = ['Date', 'Fraud']))[:,1]
    val_fdr = calculate_fdr(val['Fraud'].values, val_scores)

    # Release the model and its predictions before the next trial starts.
    del candidate, val_scores
    gc.collect()

    result = {'loss': -1 * val_fdr, 'status': STATUS_OK}
    return result

In [0]:
def xgbrf_optimize(evals, trials, optimizer=tpe.suggest, random_state=0):
    """Search XGBoost random-forest hyperparameters with hyperopt.

    Args:
        evals: Maximum number of configurations `fmin` evaluates.
        trials: hyperopt `Trials` object that records every evaluation.
        optimizer: Suggestion algorithm passed to `fmin` as `algo`.
        random_state: Value written into every candidate's `seed` parameter.
            It seeds the model only; the search itself is not seeded here.

    Returns:
        dict: The best configuration expressed as real parameter values
        (including the fixed `tree_method` and `seed` entries), ready to be
        passed to `xgb.XGBRFClassifier(**params)`.
    """
    # Local import: `space_eval` is not among the notebook's top-level imports.
    from hyperopt import space_eval

    space = {
        'n_estimators': hp.choice('n_estimators', np.arange(200, 600, dtype=int)),
        'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
        'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
        'alpha' :  hp.quniform('alpha', 0, 10, 1),
        'lambda': hp.quniform('lambda', 1, 2, 0.1),
        'tree_method': 'gpu_hist',
        'seed': random_state
    }
    # Honour the caller's `optimizer` instead of always using tpe.suggest.
    best = fmin(xgbrf_score, space, algo=optimizer, max_evals=evals, trials=trials)
    # `fmin` reports hp.choice parameters as the *index* of the chosen option
    # (e.g. 0 stands for n_estimators=200). Map them back to real values so
    # the final model is built with the configuration that was scored.
    return space_eval(space, best)

In [0]:
%%time
trials = Trials()
n= 1000
xgbrf_best_param = xgbrf_optimize(evals = n,
                      optimizer=tpe.suggest,
                      trials = trials)

 88%|████████▊ | 877/1000 [49:46<05:05,  2.48s/it, best loss: -83.16326530612244]

In [0]:
# Pin the non-searched settings and adjust the depth on the tuned parameters.
# NOTE(review): max(..., 10) raises any smaller tuned depth to 10 - confirm a
# floor (rather than a cap) is intended.
# NOTE(review): hyperopt's `fmin` reports hp.choice parameters as option
# indices unless they are mapped back through `space_eval`; verify what
# `xgbrf_optimize` returns before trusting `n_estimators` / `max_depth` here.
xgbrf_best_param.update(
    tree_method='gpu_hist',
    max_depth=max(xgbrf_best_param['max_depth'], 10),
    seed=0,
)

In [0]:
# Display the final XGBoost RF parameter dictionary.
xgbrf_best_param

In [0]:
# Build the final (still unfitted) classifier from the chosen parameters.
xgbrf_model = xgb.XGBRFClassifier(**xgbrf_best_param)

In [0]:
%%time
# Fit on the training features/labels only; `val` and `test` are not used.
xgbrf_model.fit(X, y)

In [0]:
# FDR on the rows the model was fitted on.
predictions = xgbrf_model.predict_proba(
    train.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
calculate_fdr(train['Fraud'].values, predictions)

In [0]:
# FDR on the validation split (the split the parameter search was scored on).
predictions = xgbrf_model.predict_proba(
    val.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
calculate_fdr(val['Fraud'].values, predictions)

In [0]:
# FDR on the later-dated test rows; kept in `xgbrf_fdr` for later cells.
predictions = xgbrf_model.predict_proba(
    test.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
xgbrf_fdr = calculate_fdr(test['Fraud'].values, predictions)
print(xgbrf_fdr)

In [0]:
def ctb_score(params):
    """Hyperopt objective for the CatBoost classifier.

    Fits a model with `params` on the notebook-level `X` / `y`, scores the
    notebook-level `val` frame, and reports the negated FDR so that hyperopt's
    minimiser maximises the detection rate.

    Args:
        params: Keyword arguments forwarded to `ctb.CatBoostClassifier`.

    Returns:
        dict with 'loss' (negative validation FDR) and 'status' (STATUS_OK).
    """
    candidate = ctb.CatBoostClassifier(**params)
    candidate.fit(X, y)

    val_scores = candidate.predict_proba(val.drop(columns = ['Date', 'Fraud']))[:,1]
    val_fdr = calculate_fdr(val['Fraud'].values, val_scores)

    # Release the model and its predictions before the next trial starts.
    del candidate, val_scores
    gc.collect()

    result = {'loss': -1 * val_fdr, 'status': STATUS_OK}
    return result

In [0]:
def ctb_optimize(evals, trials, optimizer=tpe.suggest, random_state=0):
    """Search CatBoost hyperparameters with hyperopt.

    Args:
        evals: Maximum number of configurations `fmin` evaluates.
        trials: hyperopt `Trials` object that records every evaluation.
        optimizer: Suggestion algorithm passed to `fmin` as `algo`.
        random_state: Value written into every candidate's `random_seed`.
            It seeds the model only; the search itself is not seeded here.

    Returns:
        dict: The best configuration expressed as real parameter values
        (including the fixed `task_type`, `verbose` and `random_seed`
        entries), ready for `ctb.CatBoostClassifier(**params)`.
    """
    # Local import: `space_eval` is not among the notebook's top-level imports.
    from hyperopt import space_eval

    space = {
        'depth': hp.choice('depth', np.arange(3, 12, dtype=int)),
        'max_ctr_complexity': hp.choice('max_ctr_complexity', np.arange(1, 16, dtype=int)),
        'border_count': hp.choice('border_count', [64, 128, 255]),
        'learning_rate': hp.loguniform('learning_rate', -5, 0),
        'one_hot_max_size': hp.choice('one_hot_max_size', np.arange(0, 25, dtype=int)),
        'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
        'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
        'iterations': hp.choice('iterations', np.arange(200, 600, dtype=int)),
        'task_type': 'GPU',
        'verbose': 0,
        'random_seed': random_state,
    }
    # Honour the caller's `optimizer` instead of always using tpe.suggest.
    best = fmin(ctb_score, space, algo=optimizer, max_evals=evals, trials=trials)
    # `fmin` reports hp.choice parameters as the *index* of the chosen option
    # (e.g. 0 stands for depth=3, border_count=64, iterations=200). Map them
    # back to real values so the final model matches the scored configuration.
    return space_eval(space, best)

In [0]:
%%time
trials = Trials()
n= 1000
ctb_best_param = ctb_optimize(evals = n,
                      optimizer=tpe.suggest,
                      trials = trials)

In [0]:
# Pin the non-searched settings and adjust the depth on the tuned parameters.
# NOTE(review): max(..., 10) raises any smaller tuned depth to 10 - confirm a
# floor (rather than a cap) is intended.
# NOTE(review): hyperopt's `fmin` reports hp.choice parameters as option
# indices unless they are mapped back through `space_eval`; verify what
# `ctb_optimize` returns before trusting `depth` / `iterations` here.
ctb_best_param.update(
    task_type='GPU',
    depth=max(ctb_best_param['depth'], 10),
    verbose=0,
    random_seed=0,
)

In [0]:
# Display the final CatBoost parameter dictionary.
ctb_best_param

In [0]:
# Build the final (still unfitted) classifier from the chosen parameters.
ctb_model = ctb.CatBoostClassifier(**ctb_best_param)

In [0]:
%%time
# Fit on the training features/labels only; `val` and `test` are not used.
ctb_model.fit(X, y)

In [0]:
# FDR on the rows the model was fitted on.
predictions = ctb_model.predict_proba(
    train.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
calculate_fdr(train['Fraud'].values, predictions)

In [0]:
# FDR on the validation split (the split the parameter search was scored on).
predictions = ctb_model.predict_proba(
    val.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
calculate_fdr(val['Fraud'].values, predictions)

In [0]:
# FDR on the later-dated test rows; kept in `ctb_fdr` for later cells.
predictions = ctb_model.predict_proba(
    test.drop(['Date', 'Fraud'], axis=1)
)[:, 1]
ctb_fdr = calculate_fdr(test['Fraud'].values, predictions)
print(ctb_fdr)

# Ensemble model

In [0]:
# Blend weights are each model's FDR on the validation split. Weighting by
# test-set FDR would let the held-out data shape the ensemble it later scores.
val_features = val.drop(columns = ['Date', 'Fraud'])
val_labels = val['Fraud'].values
ctb_weight = calculate_fdr(val_labels, ctb_model.predict_proba(val_features)[:,1])
xgbrf_weight = calculate_fdr(val_labels, xgbrf_model.predict_proba(val_features)[:,1])

# Weighted average of the two models' fraud probabilities on the training rows.
train_features = train.drop(columns = ['Date', 'Fraud'])
predictions = (
    ctb_model.predict_proba(train_features)[:,1] * ctb_weight
    + xgbrf_model.predict_proba(train_features)[:,1] * xgbrf_weight
) / (ctb_weight + xgbrf_weight)
calculate_fdr(train['Fraud'].values, predictions)

In [0]:
# Blend weights are each model's FDR on the validation split. Weighting by
# test-set FDR would let the held-out data shape the ensemble it later scores.
val_features = val.drop(columns = ['Date', 'Fraud'])
val_labels = val['Fraud'].values
ctb_val_scores = ctb_model.predict_proba(val_features)[:,1]
xgbrf_val_scores = xgbrf_model.predict_proba(val_features)[:,1]
ctb_weight = calculate_fdr(val_labels, ctb_val_scores)
xgbrf_weight = calculate_fdr(val_labels, xgbrf_val_scores)

# Weighted average of the two models' fraud probabilities on the validation rows.
predictions = (
    ctb_val_scores * ctb_weight + xgbrf_val_scores * xgbrf_weight
) / (ctb_weight + xgbrf_weight)
calculate_fdr(val_labels, predictions)

In [0]:
# Blend weights are each model's FDR on the validation split. Weighting by
# test-set FDR would leak the held-out labels into the ensemble being scored
# on that same test set.
val_features = val.drop(columns = ['Date', 'Fraud'])
val_labels = val['Fraud'].values
ctb_weight = calculate_fdr(val_labels, ctb_model.predict_proba(val_features)[:,1])
xgbrf_weight = calculate_fdr(val_labels, xgbrf_model.predict_proba(val_features)[:,1])

# Weighted average of the two models' fraud probabilities on the test rows.
test_features = test.drop(columns = ['Date', 'Fraud'])
predictions = (
    ctb_model.predict_proba(test_features)[:,1] * ctb_weight
    + xgbrf_model.predict_proba(test_features)[:,1] * xgbrf_weight
) / (ctb_weight + xgbrf_weight)
calculate_fdr(test['Fraud'].values, predictions)