# Credit Card Fraud Detection using a Deep Neural Network

For more information on the dataset look at the other file, if you havent already.

## Load Data

In [1]:
#Import relevant dependencies
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import preprocessing as pp

#Loading data into a numpy array
raw_data = np.loadtxt('creditcard.csv', delimiter = ',')

#Excludes the targets
unscaled_inputs_all = raw_data[:,:-1]

#Last column stores the targets
targets_all = raw_data[:,-1]

## Balance the Dataset

In [2]:
#Targets are binary (1 or 0) so getting the sum of all target values is equivalent to how many 1s are in the dataset
num_one_targets = int(np.sum(targets_all))

#Our zero target counter
num_zero_targets = 0

#Once we have an equal amount of 0s and 1s we need to remove all the other data points and we will store the indices to be removed in a list
indices_to_remove = []

#Iterating through our dataset to load our indices_to_remove list
for i in range(targets_all.shape[0]):
    if targets_all[i] == 0.:
        num_zero_targets += 1.
        if num_zero_targets > num_one_targets:
            indices_to_remove.append(i)  

#Storing the new balanced inputs and targets by removing all the unnecessary datapoints
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)

## Standardize Inputs

In [3]:
#Standardizes the inputs for more accuracy
scaled_inputs = pp.scale(unscaled_inputs_equal_priors)

## Shuffle the Data

In [4]:
#np.arange([start], stop) returns evenly spaced values in a given interval
shuffled_indices = np.arange(scaled_inputs.shape[0])
#np.random.shuffle(X) is a method that shuffles numbers in a given sequence
np.random.shuffle(shuffled_indices)

#Storing the new shuffled inputs and targets
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Split Data into Train, Test Subsets

In [5]:
#Count of all data points
samples_count = shuffled_inputs.shape[0]

#Storing the count of data points in each subset
train_samples_count = int(0.8*samples_count)
test_samples_count = int(0.2*samples_count)

train_inputs = shuffled_inputs[: train_samples_count]
train_targets = shuffled_targets[: train_samples_count]


test_inputs = shuffled_inputs[train_samples_count:]
test_targets = shuffled_targets[train_samples_count:]

#Shows how balanced the datasets are
print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets)/test_samples_count)

386.0 787 0.49047013977128334
106.0 196 0.5408163265306123


In [6]:
np.savez('audiobooks_data_train', inputs = train_inputs, targets = train_targets)
np.savez('audiobooks_data_test', inputs = test_inputs, targets = test_targets)

## Loading the .npz Files

In [7]:
#Loading the data from the npz files
#We need to load the inputs and targets into their dedicated arrays
npz = np.load('audiobooks_data_train.npz')

train_inputs = npz['inputs']
train_targets = npz['targets']

npz = np.load('audiobooks_data_validation.npz')

valid_inputs = npz['inputs']
valid_targets = npz['targets']

npz = np.load('audiobooks_data_test.npz')

test_inputs = npz['inputs']
test_targets = npz['targets']

## Model

In [28]:
input_layer_size = 30
hidden_layer_size = 100
output_layer_size = 2

#Outlining the model similar to MNIST
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation = 'relu'),
    tf.keras.layers.Dense(output_layer_size, activation = 'softmax')
])

#Choose the optimizer and loss function
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

BATCH_SIZE = 5
NUM_OF_EPOCHS = 10

early_stopping = tf.keras.callbacks.EarlyStopping(patience = 2)

model.fit(train_inputs,
         train_targets,
         batch_size = BATCH_SIZE,
         epochs = NUM_OF_EPOCHS, 
         callbacks = [early_stopping],
         validation_split = 0.1,
         verbose = 2)

Epoch 1/10


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

142/142 - 0s - loss: 0.1475 - accuracy: 0.9350 - val_loss: 0.0787 - val_accuracy: 0.9620
Epoch 2/10
142/142 - 0s - loss: 0.0482 - accuracy: 0.9873 - val_loss: 0.1159 - val_accuracy: 0.9620
Epoch 3/10
142/142 - 0s - loss: 0.0190 - accuracy: 0.9972 - val_loss: 0.0140 - val_accuracy: 1.0000
Epoch 4/10
142/142 - 0s - loss: 0.0107 - accuracy: 0.9986 - val_loss: 0.0127 - val_accuracy: 1.0000
Epoch 5/10
142/142 - 0s - loss: 0.0042 - accuracy: 0.9986 - val_loss: 0.0110 - val_accuracy: 1.0000
Epoch 6/10
142/142 - 0s - loss: 7.5018e-04 - accuracy: 1.0000 - val_loss: 0.0057 - val_accuracy: 1.0000
Epoch 7/10
142/142 - 0s - loss: 3.7697e-04 - accuracy: 1.0000 - val_loss: 0.0106 - val_accuracy

<tensorflow.python.keras.callbacks.History at 0x1a37b52390>

In [29]:
model.evaluate(test_inputs, test_targets)



[0.06408549100160599, 0.989847719669342]

Test Accuracy is 98.98% with the balanced dataset.