In [0]:
#!pip install -U imbalanced-learn

In [0]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Sun Apr 19 07:23:57 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from hyperopt import hp
import gc
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from imblearn.combine import SMOTETomek
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from collections import Counter
from sklearn.feature_selection import SelectFromModel
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
from hyperopt import fmin, tpe, STATUS_OK, STATUS_FAIL, Trials

In [0]:
%%time
data = pd.read_csv('/content/drive/My Drive/Data/features.csv')
data['Date'] = pd.to_datetime(data['Date'].apply(lambda x: x.split()[0]))

CPU times: user 6.07 s, sys: 543 ms, total: 6.62 s
Wall time: 6.99 s


In [0]:
%%time
for key in data.columns:
    if 'Days' in key:
        data[key] = data[key].apply(round)

CPU times: user 504 ms, sys: 59 ms, total: 563 ms
Wall time: 567 ms


In [0]:
sel = ['Amount_Cardnum_Merch zip_sum_3d',
 'Amount_Cardnum_sum_1d',
 'Amount_Cardnum_max_14d',
 'Amount_Cardnum_max_7d',
 'Amount_Merchnum_max_7d',
 'Amount_Merchnum_sum_1d',
 'Amount_Cardnum_Merch zip_max_30d',
 'Amount_Cardnum_Merch zip_max_14d',
 'Amount_Cardnum_sum_0d',
 'Amount_Cardnum_Merch zip_sum_0d',
 'Amount_Cardnum_mean_14d',
 'Amount_Merchnum_sum_0d',
 'Amount_Cardnum_Merch state_sum_0d',
 'Amount_Merchnum_sum_3d',
 'Amount_Cardnum_Merch state_sum_7d',
 'Amount_Cardnum_Merchnum_sum_1d',
 'Amount_Merchnum_mean_1d',
 'Amount_Merchnum_max_1d',
 'Amount_Cardnum_Merch zip_sum_14d',
 'Amount_Cardnum_Merchnum_sum_14d',
 'Amount_Cardnum_Merch state_sum_3d',
 'Amount_Cardnum_Merchnum_sum_7d',
 'Amount_Merchnum_sum_7d',
 'Amount_Cardnum_Merch zip_sum_7d',
 'Amount_Cardnum_Merch zip_mean_14d',
 'Amount_Cardnum_Merch state_sum_14d',
 'Amount_Cardnum_Merchnum_max_14d',
 'Amount_Cardnum_Merchnum_mean_14d',
 'Amount_Merchnum_max_3d',
 'Amount_Cardnum_Merch state_max_3d']

In [0]:
data = data[sel + ['Date', 'Fraud']]

In [0]:
data = data[data['Date'] > pd.to_datetime('2010-01-14')]
train = data[data['Date'] <= pd.to_datetime('2010-10-31')].copy()
test = data[data['Date'] > pd.to_datetime('2010-10-31')].copy()

In [0]:
train, val = train_test_split(train, test_size = 0.2, random_state = 0)

In [0]:
X = train.drop(columns = ['Date', 'Fraud'])
y = train['Fraud']

In [0]:
Counter(y)

Counter({0: 63833, 1: 672})

In [0]:
def calculate_fdr(y_true, y_pred):
    tot = y_true.sum()
    pos = y_true[y_pred.argsort()[::-1]][:int(len(y_true) * 0.03)].sum()
    return pos / tot *100

In [0]:
fdr_scorer = make_scorer(calculate_fdr, needs_proba = True)

In [0]:
def rf_score(params):
    selector = SelectFromModel(estimator=xgb.XGBRFClassifier(tree_method = 'gpu_hist'), max_features = params['max_features']).fit(X, y)
    del params['max_features']
    X_best = selector.transform(X)
    resampler = SMOTETomek(sampling_strategy = params['sampling_strategy'])
    X_best, y_best = resampler.fit_resample(X_best, y)
    del params['sampling_strategy']
    fdr = np.mean(cross_validate(xgb.XGBRFClassifier(**params), X_best, y_best, scoring = fdr_scorer, cv = TimeSeriesSplit(n_splits = 5))['test_score'])
    return {'loss': -1 * fdr, 'status': STATUS_OK}

In [0]:
def rf_optimize(evals, trials, optimizer=tpe.suggest, random_state=0):
    space = {
        'sampling_strategy': hp.quniform('sampling_strategy', 0.015, 0.05, 0.01),
        'max_features': hp.choice('max_features', np.arange(1, 30, dtype=int)),
        'n_estimators': hp.choice('n_estimators', np.arange(200, 600, dtype=int)),
        'scale_pos_weight': hp.quniform('scale_pos_weight', 0, 1, 0.01),
        'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 100, 1),
        'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 10, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'alpha' :  hp.quniform('alpha', 0, 10, 1),
        'lambda': hp.quniform('lambda', 1, 2, 0.1),
        'tree_method': 'gpu_hist',
        'seed': random_state
    }
    best = fmin(rf_score, space, algo=tpe.suggest, max_evals=evals, trials = trials, rstate =  np.random.RandomState(0))
    return best

In [0]:
%%time
trials = Trials()
n= 1000
rf_best_param = rf_optimize(evals = n,
                      optimizer=tpe.suggest,
                      trials = trials)

 43%|████▎     | 428/1000 [41:51<1:12:47,  7.63s/it, best loss: -75.29092949468702]

In [0]:
rf_best_param['tree_method'] = 'gpu_hist'
rf_best_param['seed'] = 0

In [0]:
rf_best_param

In [0]:
%%time
selector = SelectFromModel(estimator=xgb.XGBClassifier(tree_method = 'gpu_hist'), max_features = rf_best_param['max_features']+1)
X_best = selector.fit_transform(X, y)

In [0]:
del rf_best_param['max_features']

In [0]:
resampler = SMOTETomek(sampling_strategy = rf_best_param['sampling_strategy'])
X_best, y_best = resampler.fit_resample(X_best, y)

In [0]:
del rf_best_param['sampling_strategy']

In [0]:
%%time
model = xgb.XGBRFClassifier(**rf_best_param)
model.fit(X_best, y_best)

In [0]:
calculate_fdr(train['Fraud'].values, model.predict_proba(selector.transform(train.drop(columns = ['Date', 'Fraud'])))[:,1])

In [0]:
train['Fraud_score'] = model.predict_proba(selector.transform(train.drop(columns = ['Date', 'Fraud'])))[:,1]
train[['Fraud', 'Fraud_score']].sort_values(by = 'Fraud_score', ascending = False).to_csv('/content/drive/My Drive/Data/random_forest_train_fraud_scores.csv', index = False)

In [0]:
calculate_fdr(val['Fraud'].values, model.predict_proba(selector.transform(val.drop(columns = ['Date', 'Fraud'])))[:,1])

In [0]:
val['Fraud_score'] = model.predict_proba(selector.transform(val.drop(columns = ['Date', 'Fraud'])))[:,1]
val[['Fraud', 'Fraud_score']].sort_values(by = 'Fraud_score', ascending = False).to_csv('/content/drive/My Drive/Data/random_forest_test_fraud_scores.csv', index = False)

In [0]:
calculate_fdr(test['Fraud'].values, model.predict_proba(selector.transform(test.drop(columns = ['Date', 'Fraud'])))[:,1])

In [0]:
test['Fraud_score'] = model.predict_proba(selector.transform(test.drop(columns = ['Date', 'Fraud'])))[:,1]
test[['Fraud', 'Fraud_score']].sort_values(by = 'Fraud_score', ascending = False).to_csv('/content/drive/My Drive/Data/random_forest_oot_fraud_scores.csv', index = False)