In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Fri Apr 17 03:30:59 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from sklearn.feature_selection import RFECV
from sklearn.metrics import make_scorer
import xgboost as xgb

In [3]:
%%time
df = pd.read_csv('/content/drive/My Drive/Data/features.csv')
df['Date'] = pd.to_datetime(df['Date'].apply(lambda x: x.split()[0]))

CPU times: user 4.88 s, sys: 439 ms, total: 5.32 s
Wall time: 5.6 s


In [4]:
%%time
for key in df.columns:
    if 'Days' in key:
        df[key] = df[key].apply(round)

df = df[(df['Date'] >= pd.to_datetime('2010-01-15')) & (df['Date'] <= pd.to_datetime('2010-10-31'))]
df = df.drop(columns = ['Recnum', 'Cardnum', 'Merchnum', 'Merch description', 'Merch state', 'Merch zip', 'Transtype',
                            'Cardnum_Merchnum', 'Cardnum_Merch zip', 'Cardnum_Merch state', 'Date'])

CPU times: user 550 ms, sys: 45.1 ms, total: 596 ms
Wall time: 596 ms


In [0]:
fraud = df[df['Fraud'] == 1]
not_fraud = df[df['Fraud'] == 0]

def ks_score(column):
    return ks_2samp(fraud[column], not_fraud[column])[0]

def calculate_fdr(y_true, y_pred, flag = -1):
    tot = y_true.sum()
    pos = y_true[y_pred.argsort()[::flag]][:int(len(y_true) * 0.03)].sum()
    return pos / tot *100

In [6]:
%%time
scores = pd.DataFrame({'feature': df.columns})
scores['KS_score'] = scores['feature'].apply(ks_score)
scores['FDR_score'] = scores['feature'].apply(lambda x: max(calculate_fdr(df['Fraud'].values, df[x].values, -1), calculate_fdr(df['Fraud'].values, df[x].values, 1)))
scores['KS_rank'] = scores['KS_score'].rank(ascending = False)
scores['FDR_rank'] = scores['FDR_score'].rank(ascending = False)
scores['average_rank'] = (scores['FDR_rank'] + scores['KS_rank'])/2
scores = scores.sort_values(by = 'average_rank', ascending = True).head(81)

CPU times: user 7.17 s, sys: 26.2 ms, total: 7.19 s
Wall time: 7.2 s


In [7]:
scores.head()

Unnamed: 0,feature,KS_score,FDR_score,KS_rank,FDR_rank,average_rank
1,Fraud,1.0,100.0,1.0,1.0,1.0
191,Amount_Cardnum_Merch zip_sum_3d,0.681104,64.400922,3.0,2.0,2.5
200,Amount_Cardnum_Merch zip_sum_7d,0.683414,63.59447,2.0,5.5,3.75
136,Amount_Cardnum_Merchnum_sum_3d,0.677825,63.824885,5.0,3.0,4.0
145,Amount_Cardnum_Merchnum_sum_7d,0.680373,63.59447,4.0,5.5,4.75


In [0]:
scores.drop(labels=1, axis=0, inplace = True)

In [0]:
sel = scores['feature'].to_list()

In [0]:
X = df[sel]
y = df['Fraud']

In [0]:
fdr_scorer = make_scorer(calculate_fdr, needs_proba = True)

In [12]:
%%time
rf_rfe = RFECV(xgb.XGBRFClassifier(tree_method = 'gpu_hist'), min_features_to_select = 1, step = 1, verbose = 0, scoring = fdr_scorer, cv = 5)
rf_rfe.fit(X, y)

CPU times: user 1min 51s, sys: 40.7 s, total: 2min 31s
Wall time: 2min 31s


In [13]:
rf_rfe.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
%%time
gbm_rfe = RFECV(xgb.XGBClassifier(tree_method = 'gpu_hist'), min_features_to_select = 1, step = 1, verbose = 0, scoring = fdr_scorer, cv = 5)
gbm_rfe.fit(X, y)

CPU times: user 1min 59s, sys: 46.5 s, total: 2min 46s
Wall time: 2min 45s


In [15]:
gbm_rfe.ranking_

array([ 1,  1, 48,  1,  1,  1,  1, 20, 53,  1,  1,  8,  2,  1, 10,  1, 45,
        6,  1,  1,  3, 40, 56, 14, 54, 23, 13, 32, 51,  1,  1, 35, 22, 50,
       36,  7, 31,  1,  1, 25, 19, 55, 49,  1,  1, 39,  1, 41,  5, 34,  1,
       33, 38, 44, 12, 30, 43, 16, 37, 26,  1, 42, 21, 11,  1, 15, 46,  1,
       24, 47, 27, 17, 18, 29,  4, 52,  1,  9,  1, 28])

In [0]:
scores['random_forest_rank'] = rf_rfe.ranking_
scores['boosting_tree_rank'] = gbm_rfe.ranking_
scores['model_rank'] = (scores['random_forest_rank'] + scores['boosting_tree_rank'])/2
scores = scores.sort_values(by = 'model_rank', ascending = True).head(30)

In [17]:
scores['feature'].to_list()

['Amount_Cardnum_Merch zip_sum_3d',
 'Amount_Cardnum_sum_1d',
 'Amount_Cardnum_max_14d',
 'Amount_Cardnum_max_7d',
 'Amount_Merchnum_max_7d',
 'Amount_Merchnum_sum_1d',
 'Amount_Cardnum_Merch zip_max_30d',
 'Amount_Cardnum_Merch zip_max_14d',
 'Amount_Cardnum_sum_0d',
 'Amount_Cardnum_Merch zip_sum_0d',
 'Amount_Cardnum_mean_14d',
 'Amount_Merchnum_sum_0d',
 'Amount_Cardnum_Merch state_sum_0d',
 'Amount_Merchnum_sum_3d',
 'Amount_Cardnum_Merch state_sum_7d',
 'Amount_Cardnum_Merchnum_sum_1d',
 'Amount_Merchnum_mean_1d',
 'Amount_Merchnum_max_1d',
 'Amount_Cardnum_Merch zip_sum_14d',
 'Amount_Cardnum_Merchnum_sum_14d',
 'Amount_Cardnum_Merch state_sum_3d',
 'Amount_Cardnum_Merchnum_sum_7d',
 'Amount_Merchnum_sum_7d',
 'Amount_Cardnum_Merch zip_sum_7d',
 'Amount_Cardnum_Merch zip_mean_14d',
 'Amount_Cardnum_Merch state_sum_14d',
 'Amount_Cardnum_Merchnum_max_14d',
 'Amount_Cardnum_Merchnum_mean_14d',
 'Amount_Merchnum_max_3d',
 'Amount_Cardnum_Merch state_max_3d']