# Isolation Forest Model for Credit Card Fraud Detection

## Importing Libraries and Dataset


In [1]:
#importing libraries
import pandas as pd        #for dataframe data structure
import numpy as np         # for numpy arrays and scientific computations

import matplotlib.pyplot as plt      #for data visualization
%matplotlib inline

from sklearn.ensemble import IsolationForest       #for Isolation Forest Model
from sklearn import metrics                            #for evaluation metrics
                                                       
import seaborn as sns                                  #for visualization
from sklearn.preprocessing import StandardScaler       #for Data Preprocessing
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

In [2]:
from imblearn.over_sampling import SMOTE               #for SMOTE Oversampling

Using TensorFlow backend.


## Data Overview

In [3]:
#importing dataset
df=pd.read_csv("creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
data=df.sample(frac=1,random_state=1)
print(data.shape)

(284807, 31)


In [5]:
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

outlier_frac = len(fraud) / float(len(valid))
print(outlier_frac)

print("fraud cases: {}".format(len(fraud)))
print("valid cases: {}".format(len(valid)))

0.0017304750013189597
fraud cases: 492
valid cases: 284315


In [6]:
#Creating predictor and target variables
X=data.copy()
X.drop(['Class'],axis=1,inplace=True)
y=data['Class']

##  Data Splitting

In [7]:
#Splitting dataset into Trainset and Testset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=4)
print("Number transactions X_train dataset: ", X_train.shape) 
print("Number transactions y_train dataset: ", y_train.shape) 
print("Number transactions X_test dataset: ", X_test.shape) 
print("Number transactions y_test dataset: ", y_test.shape) 

Number transactions X_train dataset:  (227845, 30)
Number transactions y_train dataset:  (227845,)
Number transactions X_test dataset:  (56962, 30)
Number transactions y_test dataset:  (56962,)


## Data Modeling

## Selecting the best Sampling technique

## Before SMOTE OverSampling

In [8]:
# define a random state
state = 1;

In [9]:
isf=IsolationForest(max_samples=len(X_train), contamination = outlier_frac, random_state = state, behaviour = "new")
isf

IsolationForest(behaviour='new', bootstrap=False,
                contamination=0.0017304750013189597, max_features=1.0,
                max_samples=227845, n_estimators=100, n_jobs=None,
                random_state=1, verbose=0, warm_start=False)

In [10]:
isf.fit(X_train)



IsolationForest(behaviour='new', bootstrap=False,
                contamination=0.0017304750013189597, max_features=1.0,
                max_samples=227845, n_estimators=100, n_jobs=None,
                random_state=1, verbose=0, warm_start=False)

In [11]:
score_pred = isf.decision_function(X_test)
y_pred = isf.predict(X_test)

In [12]:
score_pred[0:10]

array([0.19082215, 0.19030468, 0.19023666, 0.15133028, 0.1680755 ,
       0.1786962 , 0.19176756, 0.18135336, 0.18857889, 0.18366131])

In [13]:
y_pred[0:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [14]:
pd.DataFrame(y_pred)[0].value_counts()

 1    56859
-1      103
Name: 0, dtype: int64

In [15]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

In [16]:
E1=(y_pred != y_test).sum()
T1=metrics.accuracy_score(y_test,y_pred)
L1=metrics.log_loss(y_test,y_pred)
F1=metrics.f1_score(y_test,y_pred,average='weighted')
C1=metrics.classification_report(y_test,y_pred)

print("Number of Errors : ",E1)
print("Test Accuracy score : ",T1)
print("Log Loss : ",L1)
print("F1- Score : ",F1)
print("Classification Report : ",C1)

Number of Errors :  115
Test Accuracy score :  0.9979811102138267
Log Loss :  0.06973081109622496
F1- Score :  0.9979960014891391
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.43      0.44      0.43       100

    accuracy                           1.00     56962
   macro avg       0.71      0.72      0.72     56962
weighted avg       1.00      1.00      1.00     56962



In [17]:
#saving accuracy results
one=pd.DataFrame([E1,T1,L1,F1,1.00,0.44])

## After SMOTE OverSampling

The dataset creditcasrd.csv that we are dealing with is imbalanced i.e. very less cases of one type and huge of other. These creates a bias in the modeling classifier and leads to mislearning. One approach to addresse imbalanced datasets is to oversample the minority class. The simplest approach involves duplicating examples in the minority class, although these examples don’t add any new information to the model. Instead, new examples can be synthesized from the existing examples. This is a type of data augmentation for the minority class and is referred to as the Synthetic Minority Oversampling Technique, or SMOTE for short.

In [18]:
print("Before OverSampling - counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling - counts of label '0': {} \n".format(sum(y_train == 0))) 

Before OverSampling - counts of label '1': 392
Before OverSampling - counts of label '0': 227453 



In [19]:
sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_sample(X_train, y_train) 
  
print('After OverSampling - the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling - the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling - counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling - counts of label '0': {}".format(sum(y_train_res == 0))) 

After OverSampling - the shape of train_X: (454906, 30)
After OverSampling - the shape of train_y: (454906,) 

After OverSampling - counts of label '1': 227453
After OverSampling - counts of label '0': 227453


In [20]:
isf1=IsolationForest(max_samples=len(X_train_res), contamination = outlier_frac, random_state = state, behaviour = "new")
isf1

IsolationForest(behaviour='new', bootstrap=False,
                contamination=0.0017304750013189597, max_features=1.0,
                max_samples=454906, n_estimators=100, n_jobs=None,
                random_state=1, verbose=0, warm_start=False)

In [21]:
isf1.fit(X_train_res)
score_pred = isf1.decision_function(X_test)
y_pred = isf1.predict(X_test)
pd.DataFrame(y_pred)[0].value_counts()



 1    56909
-1       53
Name: 0, dtype: int64

In [22]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

In [23]:
E2=(y_pred != y_test).sum()
T2=metrics.accuracy_score(y_test,y_pred)
L2=metrics.log_loss(y_test,y_pred)
F2=metrics.f1_score(y_test,y_pred,average='weighted')
C2=metrics.classification_report(y_test,y_pred)

print("Number of Errors : ",E2)
print("Test Accuracy score : ",T2)
print("Log Loss : ",L2)
print("F1- Score : ",F2)
print("Classification Report : ",C2)



Number of Errors :  147
Test Accuracy score :  0.9974193321863699
Log Loss :  0.08913381043368275
F1- Score :  0.9970234881544296
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.06      0.03      0.04       100

    accuracy                           1.00     56962
   macro avg       0.53      0.51      0.52     56962
weighted avg       1.00      1.00      1.00     56962

Number of Errors :  147
Test Accuracy score :  0.9974193321863699
Log Loss :  0.08913381043368275
F1- Score :  0.9970234881544296
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.06      0.03      0.04       100

    accuracy                           1.00     56962
   macro avg       0.53      0.51      0.52     56962
weighted avg       1.00      1.00      1.00     56962



In [29]:
#saving accuracy results
two=pd.DataFrame([E2,T2,L2,F2,1.00,0.03])

In this case OutSampling doesn't prove to be effecting. Therefore the best model will undergo no sampling.

## Fitting best Model

In [26]:
isf_best=IsolationForest(max_samples=len(X_train), contamination = outlier_frac, random_state = state, behaviour = "new")
isf_best.fit(X_train)
score_pred = isf_best.decision_function(X_test)
y_pred = isf_best.predict(X_test)
pd.DataFrame(y_pred)[0].value_counts()
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

E3=(y_pred != y_test).sum()
T3=metrics.accuracy_score(y_test,y_pred)
L3=metrics.log_loss(y_test,y_pred)
F3=metrics.f1_score(y_test,y_pred,average='weighted')
C3=metrics.classification_report(y_test,y_pred)

print("Number of Errors : ",E3)
print("Test Accuracy score : ",T3)
print("Log Loss : ",L3)
print("F1- Score : ",F3)
print("Classification Report : ",C3)




Number of Errors :  115
Test Accuracy score :  0.9979811102138267
Log Loss :  0.06973081109622496
F1- Score :  0.9979960014891391
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56862
           1       0.43      0.44      0.43       100

    accuracy                           1.00     56962
   macro avg       0.71      0.72      0.72     56962
weighted avg       1.00      1.00      1.00     56962



In [27]:
#saving accuracy results
three=pd.DataFrame([E3,T3,L3,F3,1.00,0.44])

## Final Report

In [31]:
print("===============================    Isolation Forest Analysis Report   ===============================")
res=pd.concat([one,two,three],axis=1)
rows=pd.DataFrame(['No.of Error','Accuracy Score','Log Loss','F1-Score','Class0 Recall','Class1 Recall'])
res=pd.concat([res,rows],axis=1)
res.columns=['No Sampling','SMOTE Oversampling','BEST MODEL','Criteria']
res.set_index("Criteria", inplace = True) 
res



Unnamed: 0_level_0,No Sampling,SMOTE Oversampling,BEST MODEL
Criteria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No.of Error,147.0,147.0,115.0
Accuracy Score,0.997419,0.997419,0.997981
Log Loss,0.089134,0.089134,0.069731
F1-Score,0.997023,0.997023,0.997996
Class0 Recall,1.0,1.0,1.0
Class1 Recall,0.03,0.03,0.44
