# Credit Card Fraud Detection using a Deep Neural Network

For more information on the dataset look at the other file, if you havent already.

## Load Data

In [11]:
#Import relevant dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing as pp

#Loading data into a numpy array
raw_data = np.loadtxt('creditcard.csv', delimiter = ',')

#Excludes the targets
unscaled_inputs_all = raw_data[:,:-1]

#Last column stores the targets
targets_all = raw_data[:,-1]

## Balance the Dataset

In [12]:
#Targets are binary (1 or 0) so getting the sum of all target values is equivalent to how many 1s are in the dataset
num_one_targets = int(np.sum(targets_all))

#Our zero target counter
num_zero_targets = 0

#Once we have an equal amount of 0s and 1s we need to remove all the other data points and we will store the indices to be removed in a list
indices_to_remove = []

#Iterating through our dataset to load our indices_to_remove list
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0.:
        num_zero_targets += 1.
        if num_zero_targets > num_one_targets:
            indices_to_remove.append(i)  

#Storing the new balanced inputs and targets by removing all the unnecessary datapoints
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)

## Standardize Inputs

In [13]:
#Standardizes the inputs for more accuracy
scaled_inputs = pp.scale(unscaled_inputs_equal_priors)

## Shuffle the Data

In [14]:
#np.arange([start], stop) returns evenly spaced values in a given interval
shuffled_indices = np.arange(scaled_inputs.shape[0])
#np.random.shuffle(X) is a method that shuffles numbers in a given sequence
np.random.shuffle(shuffled_indices)

#Storing the new shuffled inputs and targets
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Split Data into Train, Test Subsets

In [15]:
#Count of all data points
samples_count = shuffled_inputs.shape[0]

#Storing the count of data points in each subset
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = int(0.1*samples_count)

train_inputs = shuffled_inputs[: train_samples_count]
train_targets = shuffled_targets[: train_samples_count]

validation_inputs = shuffled_inputs[train_samples_count : train_samples_count + validation_samples_count]
validation_targets = shuffled_targets[train_samples_count : train_samples_count + validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count + validation_samples_count:]
test_targets = shuffled_targets[train_samples_count + validation_samples_count:]

#Shows how balanced the datasets are
print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets)/test_samples_count)

401.0 787 0.5095298602287166
49.0 98 0.5
42.0 98 0.42857142857142855


In [16]:
np.savez('audiobooks_data_train', inputs = train_inputs, targets = train_targets)
np.savez('audiobooks_data_validation', inputs = validation_inputs, targets = validation_targets)
np.savez('audiobooks_data_test', inputs = test_inputs, targets = test_targets)

## Loading the .npz Files

In [17]:
#Loading the data from the npz files
#We need to load the inputs and targets into their dedicated arrays
npz = np.load('audiobooks_data_train.npz')

train_inputs = npz['inputs']
train_targets = npz['targets']

npz = np.load('audiobooks_data_validation.npz')

valid_inputs = npz['inputs']
valid_targets = npz['targets']

npz = np.load('audiobooks_data_test.npz')

test_inputs = npz['inputs']
test_targets = npz['targets']

## Model

In [18]:
input_layer_size = 30
hidden_layer_size = 100
output_layer_size = 2

#Outlining the model similar to MNIST
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(output_layer_size, activation = 'softmax')
])

#Choose the optimizer and loss function
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

BATCH_SIZE = 5
NUM_OF_EPOCHS = 10

early_stopping = tf.keras.callbacks.EarlyStopping(patience = 2)

model.fit(train_inputs,
         train_targets,
         batch_size = BATCH_SIZE,
         epochs = NUM_OF_EPOCHS, 
         callbacks = [early_stopping],
         validation_data = (valid_inputs,
                           valid_targets),
         verbose = 2)

Epoch 1/10
158/158 - 0s - loss: 0.1558 - accuracy: 0.9352 - val_loss: 0.2050 - val_accuracy: 0.9694
Epoch 2/10
158/158 - 0s - loss: 0.0158 - accuracy: 0.9949 - val_loss: 0.2473 - val_accuracy: 0.9694
Epoch 3/10
158/158 - 0s - loss: 0.0038 - accuracy: 1.0000 - val_loss: 0.3017 - val_accuracy: 0.9694


<tensorflow.python.keras.callbacks.History at 0x1a44e2ce90>

In [19]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [20]:
print('Test Loss: {0:.2f}. Test Accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))

Test Loss: 0.09. Test Accuracy: 98.99%


We got lucky and got a test accuracy higher than the validation accuracy.

## Additional Comments

Due to the small size of the dataset it is probably better to use N-Fold cross validation to validate the training data instead of making a seperate validation subset.