In [2]:
import pickle
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [3]:
# Notebook-wide settings: the folder the input CSV is read from, and the single
# seed reused by every sampling / splitting step further down.
data_path, random_state = '../data/', 142857

In [4]:
# Read the credit-card CSV from the configured data folder, then preview the
# first five rows as the cell's displayed result.
original_df = pd.read_csv(filepath_or_buffer = data_path + 'standardized_credit_card.csv')
original_df.head(n = 5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,0.083386,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,0.244964,0
1,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,-0.15335,...,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,-0.342475,0
2,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,0.1907,...,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,1.160686,0
3,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,-0.050468,...,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,0.140534,0
4,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,0.691625,...,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,-0.073403,0


In [7]:
# Separate the rows by label (Class == 1 is treated as fraud), keep a seeded
# random half of the fraud rows, and stack them under all the non-fraud rows.
fraud_filter = original_df['Class'].eq(1)
fraud        = original_df.loc[fraud_filter]
non_fraud    = original_df.loc[~fraud_filter]
n_fraud      = len(fraud)
fraud_subset = fraud.sample(n = n_fraud // 2, random_state = random_state)
df           = pd.concat([non_fraud, fraud_subset], axis = 0)

In [4]:
def generate_dataset(df, non_anom_proportion, seed = None):
    """Build an undersampled dataset: every fraud row plus a random subset of non-fraud rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Class' column; rows with Class == 1 are treated as fraud
        and rows with Class == 0 as non-fraud.
    non_anom_proportion : float
        Number of non-fraud rows to keep per fraud row. The product with the
        fraud count is rounded to the nearest integer.
    seed : int, optional
        Seed for the non-fraud sampling. When omitted, the notebook-level
        `random_state` is used, so existing two-argument calls behave as before.

    Returns
    -------
    small_X : pandas.DataFrame
        All columns except 'Class'; fraud rows first, sampled non-fraud rows after.
    small_y : numpy.ndarray
        Labels aligned with `small_X`: ones for the fraud rows, then zeros.

    Raises
    ------
    ValueError
        Propagated from `DataFrame.sample` when more non-fraud rows are
        requested than are available (sampling is without replacement).
    """
    if seed is None:
        # Fall back to the notebook-wide seed; only looked up when no explicit
        # seed is given, so the function can also be used without that global.
        seed = random_state

    X                  = df.drop(['Class'], axis = 1)
    y                  = df['Class']
    fraud              = X[y == 1]
    non_fraud          = X[y == 0]
    fraud_number       = fraud.shape[0]
    undersampling_size = int(round(fraud_number * non_anom_proportion))
    sampled_non_fraud  = non_fraud.sample(undersampling_size, random_state = seed)

    # The label vector is built by position, so it must mirror the concat order
    # below: fraud block first, sampled non-fraud block second.
    small_X            = pd.concat([fraud, sampled_non_fraud])
    small_y            = np.array([1] * fraud_number + [0] * undersampling_size)

    return small_X, small_y

In [5]:
def try_various_undersampling(df, start_value, end_value, steps, create_model, test_size = 0.15):
    """Fit a fresh model at several undersampling ratios and collect confusion matrices.

    For each ratio, an undersampled dataset is drawn with `generate_dataset`,
    split into train / test parts, and a new model from `create_model` is
    fitted on the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Class' column holding 0 / 1 labels.
    start_value, end_value : float
        First and last non-fraud-per-fraud ratios to try (both inclusive).
    steps : int
        Number of evenly spaced ratios between `start_value` and `end_value`.
    create_model : callable
        Zero-argument factory returning an unfitted estimator exposing
        `fit(X, y)` and `predict(X)`.
    test_size : float, optional
        Passed to `train_test_split` as the held-out share of each
        undersampled dataset.

    Returns
    -------
    tuple
        `(proportions, train_matrices, test_matrices, full_matrices)`: the
        array of ratios tried and three lists with one confusion matrix per
        ratio, computed on the train split, the test split and the whole of
        `df` respectively.
    """
    # Pin the label set so every matrix is 2x2 even when a split (or the
    # model's predictions) happens to contain a single class; otherwise the
    # matrix shape would depend on the data and the lists would be ragged.
    class_labels              = [0, 1]
    train_confusion_matrices  = []
    test_confusion_matrices   = []
    full_confusion_matrices   = []
    undersampling_proportions = np.linspace(start_value, end_value, steps)
    X_full = df.drop(['Class'], axis = 1)
    y_full = df['Class']

    for prop in undersampling_proportions:
        X, y                             = generate_dataset(df, prop)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = random_state)
        model                            = create_model()
        model.fit(X_train, y_train)

        # NOTE: the "full" evaluation runs on all of `df`, which contains the
        # rows the model was just trained on; it is not a held-out score.
        evaluations = (
            (train_confusion_matrices, X_train, y_train),
            (test_confusion_matrices,  X_test,  y_test),
            (full_confusion_matrices,  X_full,  y_full),
        )
        for matrices, features, target in evaluations:
            matrices.append(confusion_matrix(target, model.predict(features), labels = class_labels))

    return undersampling_proportions, train_confusion_matrices, test_confusion_matrices, full_confusion_matrices

In [6]:
def create_model():
    """Return an unfitted 20-tree random forest seeded with the notebook-wide `random_state`."""
    # Without a seed the forest is rebuilt differently on every run, so the
    # confusion matrices below could not be reproduced after a kernel restart.
    return RandomForestClassifier(n_estimators = 20, n_jobs = -1, random_state = random_state)

# Sweep 25 evenly spaced ratios, from 1 to 100 non-fraud rows per fraud row.
proportions, train_conf_mats, test_conf_mats, full_conf_mats = try_various_undersampling(df, 1, 100, 25, create_model)

In [7]:
# Report, for each undersampling ratio, the confusion matrices measured on the
# train split, the test split and the full dataset.
for i in range(len(proportions)):
    prop = proportions[i]
    print(
        '########## %.2fx more of non fraud ###############' % prop,
        'Training set confusion matrix',
        train_conf_mats[i],
        'Test set confusion matrix',
        test_conf_mats[i],
        'Full imbalanced dataset confusion matrix',
        full_conf_mats[i],
        '##################################################\n',
        sep = '\n',
    )

########## 1.00x more of non fraud ###############
Training set confusion matrix
[[416   0]
 [  0 420]]
Test set confusion matrix
[[76  0]
 [14 58]]
Full imbalanced dataset confusion matrix
[[278065   6250]
 [    14    478]]
##################################################

########## 5.12x more of non fraud ###############
Training set confusion matrix
[[2129    0]
 [   1  431]]
Test set confusion matrix
[[393   0]
 [  6  54]]
Full imbalanced dataset confusion matrix
[[283587    728]
 [     7    485]]
##################################################

########## 9.25x more of non fraud ###############
Training set confusion matrix
[[3864    0]
 [   4  418]]
Test set confusion matrix
[[686   1]
 [  9  61]]
Full imbalanced dataset confusion matrix
[[283993    322]
 [    13    479]]
##################################################

########## 13.38x more of non fraud ###############
Training set confusion matrix
[[5580    0]
 [   6  425]]
Test set confusion matrix
[[998   2]
 [  5  