# Fraud detection by using Deep Learning

**Importing some libraries**

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns

**Install tensorflow if it is not installed yet**

In [2]:
#!pip install tensorflow

In [3]:
import tensorflow as tf

In [4]:
# Version of tensorflow installed in this environment
print(tf.__version__)

1.14.0


**Reading data**

In [5]:
df = pd.read_csv('dataset/kaggle_data_fraud.csv')

In [6]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
3,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
4,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1,0


**Dropping some columns that are not useful for the model**

In [7]:
df.drop(columns=['nameOrig','nameDest','isFlaggedFraud'], inplace=True)

In [8]:
df.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1
1,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1
2,1,TRANSFER,2806.0,2806.0,0.0,0.0,0.0,1
3,1,CASH_OUT,2806.0,2806.0,0.0,26202.0,0.0,1
4,1,TRANSFER,20128.0,20128.0,0.0,0.0,0.0,1


Splitting the features and targets

In [9]:
[m,n] = df.shape
X_data = df.iloc[0:m,0:6] # Features
y_data = df[['isFraud']] # Target

Convert column 'type' to dummy variables

In [10]:
dummy_type = pd.get_dummies(X_data['type'])
dummy_type.head()

Unnamed: 0,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0,0,0,0,1
1,0,1,0,0,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,0,1


Drop column 'type' and concatenate the dummy variables 

In [11]:
X_data.drop(columns=['type'], inplace=True)
X_data = pd.concat([X_data, dummy_type], axis=1)
X_data.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,181.0,181.0,0.0,0.0,0,0,0,0,1
1,1,181.0,181.0,0.0,21182.0,0,1,0,0,0
2,1,2806.0,2806.0,0.0,0.0,0,0,0,0,1
3,1,2806.0,2806.0,0.0,26202.0,0,1,0,0,0
4,1,20128.0,20128.0,0.0,0.0,0,0,0,0,1


Convert Dataframe to numpy array

In [12]:
X_data = X_data.values
y_data = y_data.values

In [13]:
# Splitting the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.20, random_state=1)

**Adding an early stop for training process when the training accuracy reaches 90%**

In [14]:
class myCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs={}):
    if(logs.get('acc')>0.997):
      print("\nReached 99.7% accuracy so cancelling training!")
      self.model.stop_training = True

In [15]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

W0730 14:20:40.859506 4617811392 deprecation.py:506] From /Users/rathapech/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [16]:
callbacks = myCallback()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
history = model.fit(X_train, y_train, epochs=20, callbacks=[callbacks], validation_data=(X_test, y_test))

test_loss = model.evaluate(X_test, y_test)

W0730 14:20:43.392871 4617811392 deprecation.py:323] From /Users/rathapech/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 838860 samples, validate on 209715 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Reached 99.7% accuracy so cancelling training!


In [18]:
acc = history.history[ 'acc']
val_acc  = history.history['val_acc']
loss = history.history[ 'loss']
val_loss = history.history['val_loss']
#print('All history: ', history, '\n')
print('Accuracy: ', acc, )
print('Validation accuracy: ', val_acc, '\n')
print('Loss: ', loss)
print('Validation loss: ', val_loss, '\n')
print('Model evaluation: ', test_loss)

Accuracy:  [0.99485016, 0.996413, 0.99693036, 0.9971473]
Validation accuracy:  [0.99810696, 0.99865055, 0.99931335, 0.9993372] 

Loss:  [84.90579780619666, 49.55409529590007, 22.362802190365667, 28.45663192830981]
Validation loss:  [39.94500534500506, 4.849741345238582, 2.5045008150950596, 3.134285909429294] 

Model evaluation:  [3.134285972157805, 0.9993372]


# Comparing with some methods in Scikit Learn

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn import svm

In [None]:
clf_svm = svm.SVC(kernel='rbf', C=50000, gamma=0.9)
clf_svm.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred = clf.predict(X_test)

In [None]:
acc = accuracy_score(y_test, y_pred)
acc

In [None]:
y_train