# Fraud detection by using Deep Learning

**Importing some libraries**

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support

**Install tensorflow if it is not installed yet**

In [4]:
#!pip install tensorflow

In [5]:
import tensorflow as tf

In [6]:
# Version of tensorflow installed in this environment
print(tf.__version__)

1.14.0


**Reading data**

In [7]:
df = pd.read_csv('dataset/kaggle_data_fraud.csv')

In [8]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1,0


**Dropping some columns that are not useful for the model**

In [9]:
df.drop(columns=['nameOrig','nameDest','isFlaggedFraud'], inplace=True)

In [10]:
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
1,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
2,1,TRANSFER,2806.0,2806.0,0.0,0.0,0.0,1
3,1,CASH_OUT,2806.0,2806.0,0.0,26202.0,0.0,1
4,1,TRANSFER,20128.0,20128.0,0.0,0.0,0.0,1


Splitting the features and targets

In [11]:
[m,n] = df.shape
X_data = df.iloc[0:m,0:6] # Features
y_data = df[['isFraud']] # Target

Convert column 'type' to dummy variables

In [12]:
dummy_type = pd.get_dummies(X_data['type'])
dummy_type.head()

Unnamed: 0,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0,0,0,0,1
1,0,1,0,0,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,0,1


Drop column 'type' and concatenate the dummy variables 

In [13]:
X_data.drop(columns=['type'], inplace=True)
X_data = pd.concat([X_data, dummy_type], axis=1)
X_data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,181.0,181.0,0.0,0.0,0,0,0,0,1
1,1,181.0,181.0,0.0,21182.0,0,1,0,0,0
2,1,2806.0,2806.0,0.0,0.0,0,0,0,0,1
3,1,2806.0,2806.0,0.0,26202.0,0,1,0,0,0
4,1,20128.0,20128.0,0.0,0.0,0,0,0,0,1


Convert Dataframe to numpy array

In [14]:
X_data = X_data.values
y_data = y_data.values

In [15]:
# Splitting the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=1)

**Adding an early stop for training process when the training accuracy reaches 90%**

In [16]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('acc')>0.997):
      print("\nReached 99.7% accuracy so cancelling training!")
      self.model.stop_training = True

In [17]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

W0730 14:37:06.813119 4540216768 deprecation.py:506] From /Users/rathapech/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [18]:
callbacks = myCallback()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [19]:
history = model.fit(X_train, y_train, epochs=20, callbacks=[callbacks], validation_data=(X_test, y_test))

test_loss = model.evaluate(X_test, y_test)

W0730 14:37:08.482247 4540216768 deprecation.py:323] From /Users/rathapech/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 838860 samples, validate on 209715 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Reached 99.7% accuracy so cancelling training!


In [20]:
acc = history.history[ 'acc']
val_acc  = history.history['val_acc']
loss = history.history[ 'loss']
val_loss = history.history['val_loss']
#print('All history: ', history, '\n')
print('Accuracy: ', acc, )
print('Validation accuracy: ', val_acc, '\n')
print('Loss: ', loss)
print('Validation loss: ', val_loss, '\n')
print('Model evaluation: ', test_loss)

Accuracy:  [0.9950969, 0.99634385, 0.99640226, 0.99703765]
Validation accuracy:  [0.99283314, 0.9993372, 0.9926758, 0.999032] 

Loss:  [92.64649308002876, 51.01338383751578, 56.785196423858345, 40.44309856061371]
Validation loss:  [29.3651450697085, 10.938027850404621, 26.97938463778059, 5.252836834428519] 

Model evaluation:  [5.252836919848112, 0.999032]


In [34]:
y_pred_DL = model.predict(X_test)
scores = precision_recall_fscore_support(y_test, y_pred_DL.astype(int), average='binary')
scores

(0.5635179153094463, 0.7178423236514523, 0.6313868613138687, None)

# Comparing with some methods in Scikit Learn

In [35]:
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.naive_bayes import GaussianNB


In [36]:
#clf_svm = svm.SVC(kernel='rbf', C=50000, gamma=0.9)
#clf_svm.fit(X_train, y_train)

clf_gnb = GaussianNB()
clf_gnb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
y_pred = clf_gnb.predict(X_test)

In [38]:
acc = accuracy_score(y_test, y_pred)
acc

0.9863004553799204

In [39]:
scores = precision_recall_fscore_support(y_test, y_pred, average='binary')

In [40]:
scores

(0.02284263959390863, 0.26141078838174275, 0.04201400466822274, None)