# Credit Card fraud detection

This model is build using Neural Network (This is the first project on NN. Here I use very basic concepts on NN)

In [1]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
# necessary modules

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [3]:
# Preset to avoid randomisation

np.random.seed(1)

In [4]:
dataset = pd.read_csv('../data/creditcard.csv')

# Sanity check
print ('Columns : {}\nShape: {}'.format(list(dataset.columns), dataset.shape))

Columns : ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
Shape: (284807, 31)


In [5]:
# dropping the time columns since we dont need it
dataset = dataset.drop(labels=['Time'], axis=1)

# Let's see how the dataset looks now
dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# Let's figure out the dataset imbalance
c1 = len(dataset.loc[dataset['Class']==1])
c0 = len(dataset)-c1
print ('# Class 0 vs # Class 1: {} vs {}'.format(c0,c1))

# Class 0 vs # Class 1: 284315 vs 492


So to make the dataset balanced we must multiply the Fraudulent class 577 times

In [9]:
def balance_dataset(df):
    
    class1 = df.loc[df['Class']==1]
    
    for i in range(577):
        df = pd.concat([df, class1])
        
    return df

def train_test_split(df):
    '''This function takes in the data-frame and splits the data into test and train'''
    
    data_class1 = df.loc[df.Class == 1] 
    data_class0 = df.loc[df.Class == 0]
    
    l1 = int(data_class1.shape[0]*.8)
    l0 = int(data_class0.shape[0]*.8)
    
    train = pd.concat([data_class1[:l1], data_class0[:l0]]); train = train.sample(frac=1)
    test = pd.concat([data_class1[l1:], data_class0[l0:]]); test = test.sample(frac=1)
    
    #y_train = train.Class; y_test = test.Class
    #X_train = train.drop(labels=['Class'],axis=1)
    #X_test = test.drop(labels=['Class'],axis=1)
    
    return (train, test)

In [10]:
# Let's prepare the data

X_train, X_test = train_test_split(dataset)

X_train = balance_dataset(X_train)
X_test = balance_dataset(X_test)

y_train = X_train['Class']; X_train = X_train.drop(labels=['Class'],axis=1)
y_test = X_test['Class']; X_test = X_test.drop(labels=['Class'],axis=1)

# Let's check the sizes
print ("SIZES")
print ('Training data: {}\tTraining label: {}\nTest data: {}\tTest label: {}'\
      .format(X_train.shape[0], y_train.shape[0], X_test.shape[0], y_test.shape[0]))

SIZES
Training data: 454606	Training label: 454606
Test data: 114085	Test label: 114085


In [11]:
# Lets normalize the data

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

y_train = np.array(y_train, dtype=np.float32)
y_test = np.array(y_test, dtype=np.float32)

# NN models
---

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

## Baseline model

In [21]:
def baseline_model():
    '''This returns a baseline model without any dropouts or batch normalization'''
   
    # Parameters
    hidden_unit1 = 18
    hidden_unit2 = round(hidden_unit1*1.5)
    hidden_unit3 = round(hidden_unit2*1.5)
    ia = 'relu'; fa = 'sigmoid'; ini = 'he_normal'
    optimizer = Adam(lr=3e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-15, decay=0.0, amsgrad=False)
    loss = 'binary_crossentropy'
    metrics = ['acc']
    
    #Defining layers
    model = Sequential()
    model.add(Dense(hidden_unit1, input_dim=29, activation=ia, kernel_initializer=ini))
    model.add(Dense(hidden_unit2, activation=ia, kernel_initializer=ini))
    model.add(Dense(hidden_unit3, activation=ia, kernel_initializer=ini))
    model.add(Dense(1, activation=fa,kernel_initializer=ini))
    
    #Compiling 
    model.compile(loss=loss, optimizer=optimizer,metrics=metrics)
    return model

In [23]:
model = baseline_model()

model.fit(X_train, y_train, epochs=10, batch_size=2048, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a4275617f0>

In [24]:
model.fit(X_train, y_train, epochs=2000, batch_size=2048, verbose=0)

<keras.callbacks.History at 0x1a43c4aca90>

In [27]:
train_performance = model.evaluate(X_train,y_train,batch_size=2048)
test_performance = model.evaluate(X_test, y_test, batch_size=2048)

print ('Accuracy: \nOn train data: {}%\tOn test data: {}%'\
       .format(round(train_performance[1]*100,2),round(test_performance[1]*100,2)))

Accuracy: 
On train data: 100.0%	On test data: 85.57%


In [35]:
y_hat = model.predict_classes(X_test)
y_hat = y_hat.reshape(y_hat.shape[0]*y_hat.shape[1])

In [39]:
precision_recall_fscore_support(np.array(y_test, dtype=np.int32), y_hat)

(array([0.77757926, 0.99312715]),
 array([0.99500554, 0.71717172]),
 array([0.87295759, 0.83288683]),
 array([56863, 57222], dtype=int64))