# Naive Baiye for Credit Card Fraud Detection

## Importing Libraries and Dataset

In [30]:
#importing libraries
import pandas as pd        #for dataframe data structure
import numpy as np         # for numpy arrays and scientific computations

import matplotlib.pyplot as plt      #for data visualization
%matplotlib inline

from sklearn.naive_bayes import BernoulliNB            #for Nive Baiye's Model
from sklearn import metrics                            #for evaluation metrics
                                                       
import seaborn as sns                                  #for visualization
from sklearn.preprocessing import StandardScaler       #for Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV


In [31]:
from imblearn.over_sampling import ADASYN  

## Data Overview

In [32]:
#importing dataset
df=pd.read_csv("creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [33]:
data=df.sample(frac=1,random_state=1)
print(data.shape)

(284807, 31)


In [34]:
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

outlier_frac = len(fraud) / float(len(valid))
print(outlier_frac)

print("fraud cases: {}".format(len(fraud)))
print("valid cases: {}".format(len(valid)))

0.0017304750013189597
fraud cases: 492
valid cases: 284315


In [35]:
#Creating predictor and target variables
X=data.copy()
X.drop(['Class'],axis=1,inplace=True)
y=data['Class']

## Data Splitting

In [36]:
#Splitting dataset into Trainset and Testset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print("Number transactions X_train dataset: ", X_train.shape) 
print("Number transactions y_train dataset: ", y_train.shape) 
print("Number transactions X_test dataset: ", X_test.shape) 
print("Number transactions y_test dataset: ", y_test.shape) 

Number transactions X_train dataset:  (227845, 30)
Number transactions y_train dataset:  (227845,)
Number transactions X_test dataset:  (56962, 30)
Number transactions y_test dataset:  (56962,)


## Data Modeling

## No Sampling Technique

In [37]:
bnb = BernoulliNB()
bnb

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [38]:
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [39]:
yhat=bnb.predict(X_test)
yhat[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [40]:
pred_score=bnb.predict_proba(X_test)
pred_score[0:10]

array([[9.99999980e-01, 2.00893517e-08],
       [9.99999699e-01, 3.00822756e-07],
       [9.99996228e-01, 3.77249834e-06],
       [9.99994032e-01, 5.96842482e-06],
       [9.99999733e-01, 2.67197757e-07],
       [9.99999929e-01, 7.11455754e-08],
       [9.99975681e-01, 2.43194671e-05],
       [9.99999914e-01, 8.58830565e-08],
       [9.99999960e-01, 3.99027098e-08],
       [9.99999945e-01, 5.49254267e-08]])

In [41]:
scores = cross_val_score(bnb, X_train, y_train, cv=10)

In [42]:
scores

array([0.99903445, 0.99929778, 0.9992539 , 0.9988589 , 0.99872723,
       0.9990783 , 0.99903441, 0.99929775, 0.99903441, 0.99925386])

## Data Evaluation

In [43]:
#Average Score
avg_score=np.mean(scores)
print("Average score is :",avg_score)

Average score is : 0.9990870998408425


In [44]:
E1=(yhat != y_test).sum()
T1=metrics.accuracy_score(y_test,yhat)
F1=metrics.f1_score(y_test,yhat,average='weighted')
C1=metrics.classification_report(y_test,yhat)
CM1 = metrics.confusion_matrix(y_test, yhat)
CV1= np.round(scores.mean(), 3) * 100

print("Number of Errors : ",E1)
print("Test Accuracy score : ",T1)
print("F1- Score : ",F1)
print ("Cross Validation Mean Score: ",CV1)
print("Classification Report : ",C1)
print()
print("Confusion Matrix:" "\n",CM1 ) 
print()

Number of Errors :  35
Test Accuracy score :  0.999385555282469
F1- Score :  0.9993726966033968
Cross Validation Mean Score:  99.9
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56875
           1       0.82      0.76      0.79        87

    accuracy                           1.00     56962
   macro avg       0.91      0.88      0.90     56962
weighted avg       1.00      1.00      1.00     56962


Confusion Matrix:
 [[56861    14]
 [   21    66]]



In [45]:
#saving accuracy results
one=pd.DataFrame([E1,T1,F1,CV1,1.00,0.76])

The recall for the Class 1 is 0.63 which is comparitively less. This is due the imbalanced data as very low fraud cases are avalibale in the dataset. To solve this problem , we us the ADASYN Oversampling technique and see the score.

## ADASYN Oversampling

In [46]:
print("Before OverSampling - counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling - counts of label '0': {} \n".format(sum(y_train == 0))) 

Before OverSampling - counts of label '1': 405
Before OverSampling - counts of label '0': 227440 



In [47]:
# apply the ADASYN over-sampling
ada = ADASYN(random_state=42)
X_train_res, y_train_res = ada.fit_sample(X_train, y_train)

print('After OverSampling - the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling - the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling - counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling - counts of label '0': {}".format(sum(y_train_res == 0))) 

After OverSampling - the shape of train_X: (454908, 30)
After OverSampling - the shape of train_y: (454908,) 

After OverSampling - counts of label '1': 227468
After OverSampling - counts of label '0': 227440


In [48]:
bnb1= BernoulliNB()
bnb1

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [49]:
bnb1.fit(X_train_res, y_train_res)
yhat=bnb1.predict(X_test)
yhat[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [50]:
E2=(yhat != y_test).sum()
T2=metrics.accuracy_score(y_test,yhat)
F2=metrics.f1_score(y_test,yhat,average='weighted')
C2=metrics.classification_report(y_test,yhat)
CM2 = metrics.confusion_matrix(y_test, yhat)
CV2= np.round(scores.mean(), 3) * 100

print("Number of Errors : ",E2)
print("Test Accuracy score : ",T2)
print("F1- Score : ",F2)
print ("Cross Validation Mean Score: ",CV2)
print("Classification Report : ",C2)
print()
print("Confusion Matrix:" "\n", CM2)
print()

Number of Errors :  235
Test Accuracy score :  0.9958744426108633
F1- Score :  0.9969914042463212
Cross Validation Mean Score:  99.9
Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00     56875
           1       0.25      0.84      0.38        87

    accuracy                           1.00     56962
   macro avg       0.62      0.92      0.69     56962
weighted avg       1.00      1.00      1.00     56962


Confusion Matrix:
 [[56654   221]
 [   14    73]]



In [51]:
#saving accuracy results
two=pd.DataFrame([E2,T2,F2,CV2,1.00,0.84])

## Final Report

In [52]:
print("=====================  Naive Baiyes Analysis Report   ======================")
res=pd.concat([one,two,two],axis=1)
rows=pd.DataFrame(['No.of Error','Accuracy Score','F1-Score','Cross Validation mean score','Class0 Recall','Class1 Recall'])
res=pd.concat([res,rows],axis=1)
res.columns=['No Sampling','ADASYN Oversampling','BEST MODEL','Criteria']
res.set_index("Criteria", inplace = True) 
res



Unnamed: 0_level_0,No Sampling,ADASYN Oversampling,BEST MODEL
Criteria,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No.of Error,35.0,235.0,235.0
Accuracy Score,0.999386,0.995874,0.995874
F1-Score,0.999373,0.996991,0.996991
Cross Validation mean score,99.9,99.9,99.9
Class0 Recall,1.0,1.0,1.0
Class1 Recall,0.76,0.84,0.84
