In [1]:
import pickle
import keras

import pandas as pd
import numpy  as np

from sklearn.preprocessing   import StandardScaler
from keras.models            import Sequential
from keras.layers            import Input
from keras.layers            import Dense
from keras.layers            import BatchNormalization
from keras.layers            import Activation
from keras.layers            import Dropout
from keras.utils.np_utils    import to_categorical
from sklearn.model_selection import train_test_split
from tqdm                    import tqdm

Using TensorFlow backend.


In [2]:
data_path = '../data/'

In [None]:
non_std_df = pd.read_csv(data_path + 'creditcard.csv')
non_std_df.describe()

In [None]:
features    = non_std_df.drop(['Class'], axis = 1)
target      = non_std_df['Class']
scaler      = StandardScaler()
features[:] = scaler.fit_transform(features)
features.describe()

In [None]:
(df['Class'] == 0).sum(), (df['Class'] == 1).sum()

In [None]:
scaler_file = open(data_path + 'scaler.pkl', 'wb')
pickle.dump(scaler, scaler_file)
scaler_file.close()

In [None]:
df = pd.concat([features, target], axis = 1)
df.to_csv(data_path + 'standardized_credit_card.csv')

In [3]:
scaler_file = open(data_path + 'scaler.pkl', 'rb')
scaler      = pickle.load(scaler_file)
df          = pd.read_csv(data_path + 'standardized_credit_card.csv')
scaler_file.close()

In [4]:
def generate_training_batch(big_class, small_class, final_proportion):
    batch_size              = int(round(big_class.shape[0] / (1 - final_proportion)))
    print('batch size', batch_size)
    small_class_sample_size = int(round(batch_size * final_proportion))  
    print('small_class_sample_size', small_class_sample_size)
    oversampled_small       = small_class.sample(small_class_sample_size, replace = True)
    
    return pd.concat([big_class, oversampled_small])

In [5]:
def df_to_numpy_arrays(df, test_prop):
    # regular transaction df
    reg_df   = df[df['Class'] == 0]
    reg_feat = reg_df.drop(['Class'], axis = 1)
    reg_targ = reg_df['Class']

    # anomalies transaction df
    anom_df   = df[df['Class'] == 1]
    anom_feat = anom_df.drop(['Class'], axis = 1)
    anom_targ = anom_df['Class']

    # splitting
    reg_feat_train, reg_feat_test, reg_targ_train, reg_targ_test     = train_test_split(reg_feat, reg_targ, test_size = test_prop)
    anom_feat_train, anom_feat_test, anom_targ_train, anom_targ_test = train_test_split(anom_feat, anom_targ, test_size = test_prop)
    
    train_feat   = pd.concat([reg_feat_train, anom_feat_train]).as_matrix()
    train_target = to_categorical(pd.concat([reg_targ_train, anom_targ_train]).as_matrix())
    test_feat    = pd.concat([reg_feat_test, anom_feat_test]).as_matrix()
    test_target  = to_categorical(pd.concat([reg_targ_test, anom_targ_test]).as_matrix())

    return train_feat, train_target, test_feat, test_target

In [22]:
def train_model(model, df, step_number, epoch_start, epoch_end, fit_params, validation_prop):
    small_class_prop = df[df['Class'] == 1].shape[0] / df.shape[0]
    steps            = np.linspace(0.5, small_class_prop, step_number)
    epochs           = np.linspace(epoch_start, epoch_end, step_number)
    print(epochs)
    big_class        = df[df['Class'] == 0]
    small_class      = df[df['Class'] == 1]
    
    for step, epoch in tqdm(zip(steps, epochs)):
        training_batch                   = generate_training_batch(big_class, small_class, step)
        X_train, y_train, X_test, y_test = df_to_numpy_arrays(training_batch, validation_prop)
        fit_params['x']                  = X_train
        fit_params['y']                  = y_train
        fit_params['epochs']             = int(round(epoch))
        fit_params['validation_data']    = (X_test, y_test)
        print('Oversampling small class to %.2f%% of the training data, %d epochs' % (step, fit_params['epochs']))
        model.fit(**fit_params)

In [26]:
model = Sequential([
    Dense(256, input_shape = (31,)),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.5),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.50),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.6),
    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.7),
    Dense(2, activation = 'softmax')
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')

In [27]:
fit_params = {
    'batch_size': 2048,
    'verbose': 2
}

In [None]:
train_model(model, df, 2, 5, 5, fit_params, 0.15)


0it [00:00, ?it/s]

[ 5.  5.]
Oversampling small class to 0.50% of the training data, 5 epochs
batch size 568630
small_class_sample_size 284315


[A

Train on 483334 samples, validate on 85296 samples
Epoch 1/5





3s - loss: 0.7590 - val_loss: 0.6911
Epoch 2/5
3s - loss: 0.6853 - val_loss: 0.6861
Epoch 3/5
2s - loss: 0.6771 - val_loss: 0.7346
Epoch 4/5
3s - loss: 0.6010 - val_loss: 1.9012
Epoch 5/5


In [11]:
X           = df.drop(['Class'], axis = 1).as_matrix()
y           = df['Class'].as_matrix()
predictions = model.predict_classes(X, batch_size = 2048, verbose = 0)
good_preds  = (predictions == y).sum()
total_preds = X.shape[0]
print('accuracy: %.2f' % (good_preds / total_preds))

accuracy: 0.52


In [12]:
from collections import Counter

In [13]:
Counter(predictions)

Counter({0: 147267, 1: 137540})