# Fraud prevention in Payment Transactions with Decision Tree Classifier

Author: Renata Guimarães (https://github.com/renatadata)
Based on the original work by "Ciência dos Dados", with adaptations.

Sources:
https://www.trulioo.com
https://www.cienciadosdados.com
https://edisciplinas.usp.br/pluginfile.php/4136542/mod_resource/content/1/decision_trees.pdf

## Payment fraud management is critical to prevent losses, protect the organization, and ensure operations are smooth, secure and scalable.

#### Given the ever-changing nature of fraud techniques and technology and the growing sophistication of e-commerce, robust solutions are needed to handle payment fraud properly.
In this example we will build a predictive model to detect possibly fraudulent transactions, based on data from the payment gateway service of a large Brazilian startup.
Here we will use decision trees, which are classification methods able to extract simple rules about the characteristics of the data; these rules are inferred from the input data set.
Several algorithms for inducing decision trees are available in the literature.
We'll start with importing the libraries and data. The data will be available at the link:

In [16]:
# Imports
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

#### Getting the data

In [2]:
# Read the transaction records from the CSV file (relative path, so it is
# resolved against the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="fraud.csv")
# Bare last expression: the notebook renders a preview of the frame.
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


#### Cleaning the data

In [3]:
# Check the percentage of missing data, then drop the NA rows.
# The percentage is measured BEFORE dropna(): computed afterwards it is
# always 0% for every column and says nothing about the data as loaded.
missing_pct = round(data.isnull().mean() * 100, 2)

# Keep only the rows that have a value in every column.
data = data.dropna()

# Percentage of missing values per column, as found in the loaded data.
missing_pct

step              0.0
type              0.0
amount            0.0
nameOrig          0.0
oldbalanceOrg     0.0
newbalanceOrig    0.0
nameDest          0.0
oldbalanceDest    0.0
newbalanceDest    0.0
isFraud           0.0
isFlaggedFraud    0.0
dtype: float64

#### Exploring the data

In [4]:
# Descriptive statistics, transposed so that each column of the data
# becomes one row of the summary table.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
step,6362620.0,243.3972,142.332,1.0,156.0,239.0,335.0,743.0
amount,6362620.0,179861.9,603858.2,0.0,13389.57,74871.94,208721.5,92445520.0
oldbalanceOrg,6362620.0,833883.1,2888243.0,0.0,0.0,14208.0,107315.2,59585040.0
newbalanceOrig,6362620.0,855113.7,2924049.0,0.0,0.0,0.0,144258.4,49585040.0
oldbalanceDest,6362620.0,1100702.0,3399180.0,0.0,0.0,132705.665,943036.7,356015900.0
newbalanceDest,6362620.0,1224996.0,3674129.0,0.0,0.0,214661.44,1111909.0,356179300.0
isFraud,6362620.0,0.00129082,0.0359048,0.0,0.0,0.0,0.0,1.0
isFlaggedFraud,6362620.0,2.514687e-06,0.001585775,0.0,0.0,0.0,0.0,1.0


In [5]:
# Checking correlation with the target (isFraud).
# Only numeric columns are kept before calling corr(): the frame also holds
# text columns (type, nameOrig, nameDest), and on pandas >= 2.0 corr() no
# longer drops them silently but raises instead.
correlation = data.select_dtypes(include="number").corr()
# Correlation of every numeric column with isFraud, strongest positive first.
print(correlation["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


In [6]:
# Checking the most frequent payment type.
# The counts are stored as `type_counts` instead of `type`, so the built-in
# type() is not shadowed for every cell that runs afterwards.
type_counts = data["type"].value_counts()
type_counts

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [7]:
# Transforming the categorical `type` variable into binary indicator
# variables using dummy encoding (one column per payment type).
# The dtype is pinned to uint8 so the indicators are 0/1 integers on every
# pandas version; pandas >= 2.0 would otherwise return booleans, which turns
# the mixed float/indicator feature matrix built later into an object array.
dummy = pd.get_dummies(data["type"], dtype="uint8")
dummy

Unnamed: 0,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
6362615,0,1,0,0,0
6362616,0,0,0,0,1
6362617,0,1,0,0,0
6362618,0,0,0,0,1


In [8]:
# Append the indicator columns to the right of the original columns
# (column-wise concatenation, rows aligned on the index).
df = pd.concat([data, dummy], axis="columns")
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0


In [9]:
# Share of rows whose target (isFraud) equals 1, in percent, rounded to
# two decimal places.
fraud_pct = round(df["isFraud"].eq(1).mean() * 100, 2)
print("Fraud %: ", fraud_pct)

Fraud %:  0.13


In [10]:
# Show the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0


In [11]:
# Separating the explanatory variables (x) from the target variable (y).
# The column order here defines the feature order the model is trained on.
feature_columns = [
    "amount", "oldbalanceOrg", "newbalanceOrig",
    "CASH_IN", "CASH_OUT", "DEBIT", "PAYMENT", "TRANSFER",
]
x = np.array(df[feature_columns])
# Select the target as a Series (single brackets) so y is 1-D with shape (n,).
# The previous df[["isFraud"]] produced an (n, 1) column vector, which makes
# scikit-learn emit a DataConversionWarning when fitting.
y = np.array(df["isFraud"])


### Model

In [12]:
# Split the data into training and test sets and train the model.
# 10% of the rows are held out for testing; the seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
# Seed the tree as well: scikit-learn permutes the features at each split, so
# without random_state two runs on the same split can produce different trees
# and different metrics.
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)

In [13]:
# Predict a class label for every row of the held-out test set.
y_pred = model.predict(X=xtest)

In [14]:
# Creating a report with the model evaluation.
# classification_report, accuracy_score and confusion_matrix come from the
# import cell at the top of the notebook; they are not re-imported here.

print("--------------------------REPORT--------------------------------------------")
# Percentage of test-set rows that the model predicts as fraud (class 1)
print("Percentage of Forecasts: ",round((y_pred==1).mean()*100,2))
print("----------------------------------------------------------------------------")

# NOTE: this score is the mean accuracy on the TRAINING data, so it shows how
# well the tree fits what it has already seen. The held-out accuracy is
# printed further down under "Accuracy:".
print("Score:")
print(model.score(xtrain, ytrain))
print("----------------------------------------------------------------------------")
# Print the per-class metrics
print("Metrics:")
# Precision: of the rows flagged as positive, how many were correct?
# Recall: of the rows that should have been flagged as positive, how many were?
# F1-score: harmonic mean of precision and recall
print(classification_report(ytest, y_pred))
print("----------------------------------------------------------------------------")
# Accuracy on the test set: compares the true labels (ytest) with the
# decision tree's predictions
acc = accuracy_score(ytest, y_pred)
print("Accuracy:", acc)

# Confusion matrix (rows: true class, columns: predicted class)
print("----------------------------------------------------------------------------")
print("Confusion Matrix:")
# The result gets its own name so it does not overwrite the imported
# confusion_matrix function for the rest of the session.
conf_matrix = confusion_matrix(ytest, y_pred)
print(conf_matrix)
print("----------------------------------------------------------------------------")




--------------------------REPORT--------------------------------------------
Percentage of Forecasts:  0.13
----------------------------------------------------------------------------
Score:
0.9999998253689343
----------------------------------------------------------------------------
Metrics:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    635445
           1       0.91      0.88      0.90       817

    accuracy                           1.00    636262
   macro avg       0.95      0.94      0.95    636262
weighted avg       1.00      1.00      1.00    636262

----------------------------------------------------------------------------
Accuracy: 0.9997343861491021
----------------------------------------------------------------------------
Confusion Matrix:
[[635371     74]
 [    95    722]]
----------------------------------------------------------------------------


## Prediction Input:

In [15]:
#INPUT FEATURES (same column order as the matrix the model was trained on):
#amount, oldbalanceOrg, newbalanceOrig, CASH_IN, CASH_OUT, DEBIT, PAYMENT, TRANSFER

# One row per transaction to score; more rows can be added to the outer list.
features = np.array([[9000.60, 9000.60, 0.0, 0, 0, 0, 0, 1]])
result = model.predict(features)
# predict() returns one class label per input row. Looping over the labels
# avoids `if result==1`, which only works for a single row and raises
# "truth value of an array is ambiguous" once several rows are passed.
for label in result:
    if label == 1:
        print("High Probability of Fraud")
    else:
        print("Low Probability of Fraud")


High Probability of Fraud
