
# PyCaret 2 Credit Card Fraud Detection Example
## Great to understand imbalanced learn in Pycaret 

This notebook is created using PyCaret 2.0. Last updated : 10-08-2020

Credit Card Fraud Detection dataset from Kaggle https://www.kaggle.com/mlg-ulb/creditcardfraud <br>
The dataset contains 284,807 Transaction records 


In [1]:
# Mount Google Drive 
# Skip this step if using on local hardware 
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [1]:
#!pip install pycaret==2.0
# Importing six and using it instead of 'sklearn.externals.six' from imbalanced-learn
# Use this method when using fix_imbalance=True
import six
import sys
sys.modules['sklearn.externals.six'] = six

from pycaret.classification import *
from pycaret.utils import version
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [23]:
# check version
from pycaret.utils import version
version()

2.0


In [2]:
# Change path as per your file structure
# Remove root_path if using local hardware 
root_path = 'gdrive/My Drive/Colab Notebooks/'
data = pd.read_csv('gdrive/My Drive/Colab Notebooks/CreditCard/creditcard.csv')
data.shape

(284807, 31)

In [3]:
data.pop('Time')
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Splitting the data into training and testing with 30% of data as test set.
train_data, test_data = train_test_split(data, test_size=0.3)
test_labels = test_data.pop('Class') 
print(train_data.shape)
print(test_data.shape)
print(test_labels.shape)

(199364, 30)
(85443, 29)
(85443,)


In [5]:
# The data is very imbalanced with 199008 Non-Fraudulent transactions and 
# 356 Fraudulent transactions
unique, counts = np.unique(train_data['Class'], return_counts=True)
np.asarray((unique, counts)).T

array([[     0, 199008],
       [     1,    356]])

In [6]:
# As the dataset is huge, PyCaret will ask to if we wish to sample the data
# I used 50% data.
# Fix Imbalance = True will use SMOTE as default
fraud_detection = setup(data = train_data, target = 'Class', 
                   normalize = True,
                   transformation = True, transformation_method = 'yeo-johnson', 
                   fix_imbalance = True)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,5747
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(199364, 30)"
4,Missing Values,False
5,Numeric Features,29
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [10]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors.KNeighborsClassifier,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model.SGDClassifier,True
rbfsvm,SVM - Radial Kernel,sklearn.svm.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process.GPC,False
mlp,MLP Classifier,sklearn.neural_network.MLPClassifier,False
ridge,Ridge Classifier,sklearn.linear_model.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble.RandomForestClassifier,True


In [13]:
compare_models(whitelist=['svm', 'ada' ,'mlp', 'dt', 'rf', 'et'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Extra Trees Classifier,0.9995,0.9617,0.8231,0.9156,0.8648,0.8646,0.8669,7.0905
1,Random Forest Classifier,0.9994,0.9422,0.775,0.8646,0.814,0.8137,0.8166,4.9238
2,MLP Classifier,0.9993,0.9538,0.7981,0.8252,0.8084,0.8081,0.8097,31.5859
3,Decision Tree Classifier,0.9979,0.871,0.7436,0.4545,0.5637,0.5627,0.5801,8.1198
4,Ada Boost Classifier,0.992,0.9566,0.8635,0.1663,0.2786,0.2764,0.3765,33.4172
5,SVM - Linear Kernel,0.979,0.0,0.9115,0.0752,0.1385,0.1357,0.2571,0.3953


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=5747, verbose=0,
                     warm_start=False)

In [14]:
# Creating models for the best estimators 
rf = create_model('rf')
mlp = create_model('mlp')
xt = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9993,0.9517,0.8333,0.7692,0.8,0.7996,0.8003
1,0.9996,0.9061,0.75,1.0,0.8571,0.8569,0.8658
2,0.9996,0.9985,0.8462,0.9167,0.88,0.8798,0.8805
3,0.9999,1.0,0.9231,1.0,0.96,0.9599,0.9607
4,0.9996,0.9557,0.8462,0.9167,0.88,0.8798,0.8805
5,0.9996,0.9547,0.7692,1.0,0.8696,0.8694,0.8769
6,0.9996,0.9116,0.8462,0.9167,0.88,0.8798,0.8805
7,0.9993,0.939,0.75,0.8182,0.7826,0.7823,0.783
8,0.9996,0.9999,0.8333,0.9091,0.8696,0.8694,0.8702
9,0.9996,0.9999,0.8333,0.9091,0.8696,0.8694,0.8702


In [15]:
# Tuning models for the best estimators 
rf = tune_model(rf)
mlp = tune_model(mlp)
xt = tune_model(xt)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9993,0.9866,0.8333,0.7692,0.8,0.7996,0.8003
1,0.9994,0.9458,0.75,0.9,0.8182,0.8179,0.8213
2,0.9991,0.9981,0.8462,0.7333,0.7857,0.7853,0.7873
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9994,0.9883,0.8462,0.8462,0.8462,0.8459,0.8459
5,0.9996,0.9808,0.7692,1.0,0.8696,0.8694,0.8769
6,0.9993,0.9704,0.8462,0.7857,0.8148,0.8145,0.815
7,0.9986,0.9334,0.75,0.5625,0.6429,0.6422,0.6488
8,0.9996,0.9998,0.8333,0.9091,0.8696,0.8694,0.8702
9,0.9997,0.9999,0.9167,0.9167,0.9167,0.9165,0.9165


In [17]:
# Blending models
blender = blend_models(estimator_list = [rf, mlp, xt])

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9993,0.0,0.8333,0.7692,0.8,0.7996,0.8003
1,0.9996,0.0,0.8333,0.9091,0.8696,0.8694,0.8702
2,0.9991,0.0,0.8462,0.7333,0.7857,0.7853,0.7873
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,0.9994,0.0,0.8462,0.8462,0.8462,0.8459,0.8459
5,0.9996,0.0,0.7692,1.0,0.8696,0.8694,0.8769
6,0.9993,0.0,0.8462,0.7857,0.8148,0.8145,0.815
7,0.9987,0.0,0.75,0.6,0.6667,0.666,0.6702
8,0.9991,0.0,0.8333,0.7143,0.7692,0.7688,0.7711
9,0.9997,0.0,0.9167,0.9167,0.9167,0.9165,0.9165


In [18]:
# Finaliszing model for predictions
model = finalize_model(blender)
predictions = predict_model(model, data = test_data)

In [24]:
from sklearn.metrics import f1_score
f1 = f1_score(test_labels, predictions['Label'], average='macro')
print("F1 Score= ",f1)

F1 Score=  0.9261864021629076
