In [27]:
#importing required libraries
import pandas as pd
import numpy as np
import plotly.express as px

Column reference for the dataset:

- step: represents a unit of time where 1 step equals 1 hour
- type: type of online transaction
- amount: the amount of the transaction
- nameOrig: customer starting the transaction
- oldbalanceOrg: balance before the transaction
- newbalanceOrig: balance after the transaction
- nameDest: recipient of the transaction
- oldbalanceDest: initial balance of recipient before the transaction
- newbalanceDest: the new balance of recipient after the transaction
- isFraud: whether the transaction is fraudulent (1 = fraud, 0 = not fraud)

In [28]:
# Location of the transactions CSV. This is a machine-specific absolute path;
# adjust it when running the notebook on another machine.
DATA_PATH = "C:\\Users\\HAI\\Downloads\\pay.csv"

# Load the whole file into memory and preview the first rows.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Studying and preparing the data

In [29]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(6362620, 11)

In [30]:
# Count the rows that are exact duplicates of an earlier row.
data.duplicated().sum()

0

In [31]:
# Structural summary of the frame: index range, column names, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [32]:
# Checking for null values: number of missing entries in each column.

data.isnull().sum()  # every column reports 0, i.e. the data has no null values

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


# Types of transactions present in the dataset

In [34]:
# Count how many transactions fall under each transaction type.
type_transaction = data["type"].value_counts()
transaction = type_transaction.index   # category labels -> slice names
quantity = type_transaction.values     # matching counts -> slice sizes

# Donut chart of the transaction-type mix.
# Only the aggregated labels/counts are handed to plotly: `values` and `names`
# are stand-alone arrays rather than column names, so passing the full
# multi-million-row `data` frame as the first argument served no purpose.
fig = px.pie(
    values=quantity,
    names=transaction,
    hole=0.4,
    title="Distribution of Transaction Type",
)
fig.show()

# Checking the correlation between the features and the isFraud column

In [35]:
# Pairwise correlations between the numeric columns.
# The frame still holds text columns at this point (type, nameOrig, nameDest).
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping
# them, so restrict the computation to numeric columns explicitly.
corr = data.select_dtypes(include="number").corr()

# Rank every numeric column by its correlation with the fraud indicator.
corr["isFraud"].sort_values(ascending=False)

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

# Transforming the categorical features into numerical values

In [36]:
# Integer code assigned to each transaction type.
TYPE_CODES = {"CASH_OUT": 1,
              "PAYMENT": 2,
              "CASH_IN": 3,
              "TRANSFER": 4,
              "DEBIT": 5}
# Human-readable class labels for the 0/1 fraud indicator.
FRAUD_LABELS = {0: "No Fraud", 1: "Fraud"}

# `Series.map` turns every value that is missing from the mapping into NaN, so
# running this cell a second time on already-converted columns would wipe them
# out. Convert each column only while it is still in its raw form.
if not pd.api.types.is_numeric_dtype(data["type"]):
    data["type"] = data["type"].map(TYPE_CODES)
if pd.api.types.is_numeric_dtype(data["isFraud"]):
    data["isFraud"] = data["isFraud"].map(FRAUD_LABELS)

# Fail loudly if any value had no entry in the mappings above.
assert data["type"].notna().all(), "unmapped transaction type found"
assert data["isFraud"].notna().all(), "unmapped isFraud value found"
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0


# Online Payments Fraud Detection Model

In [37]:
# Classification model

In [38]:
# Assemble the model inputs (x) and the target labels (y)

from sklearn.model_selection import train_test_split

# Predictor columns, in the order they are stacked into the feature matrix.
feature_columns = ["type", "amount", "oldbalanceOrg", "newbalanceOrig"]

x = data[feature_columns].to_numpy(copy=True)   # 2-D feature matrix
y = data["isFraud"].to_numpy(copy=True)         # 1-D array of class labels
y


array(['No Fraud', 'No Fraud', 'Fraud', ..., 'Fraud', 'Fraud', 'Fraud'],
      dtype=object)

In [39]:
# Train a decision-tree classifier on the prepared features and evaluate it
# on a held-out test set.

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Hold out a test set (train_test_split's default size); the fixed seed keeps
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree can differ from run to run even on the same data.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
pred = model.predict(x_test)

# Accuracy as a percentage with two decimals (the original '%2f' only set a
# minimum field width, not the precision).
print('accuracy_score: %.2f' % (accuracy_score(y_test, pred) * 100))
print('....................Classification_report......................')
print(classification_report(y_test, pred))
print('......................confusion_matrix........................')

# Label the matrix with the real class names instead of 0/1. The classes are
# strings in sorted order, so a bare "0" would actually mean 'Fraud', which is
# the opposite of the original isFraud coding. Rows are the actual classes and
# columns are the predicted classes.
class_labels = model.classes_
df = pd.DataFrame(
    confusion_matrix(y_test, pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
).rename_axis(index="actual", columns="predicted")
df



accuracay_score:99.970138
....................Classification_report......................
              precision    recall  f1-score   support

       Fraud       0.89      0.87      0.88      2045
    No Fraud       1.00      1.00      1.00   1588610

    accuracy                           1.00   1590655
   macro avg       0.95      0.94      0.94   1590655
weighted avg       1.00      1.00      1.00   1590655

......................confusion_matrix........................


Unnamed: 0,0,1
0,1788,257
1,218,1588392


In [40]:
# Classify a single hand-written transaction with the trained model.

# Values follow the training column order:
# [type, amount, oldbalanceOrg, newbalanceOrig]
sample_transaction = [1, 10000, 5000, 100]

# predict() expects a 2-D array, so wrap the single row in an outer list.
features = np.array([sample_transaction])
prediction = model.predict(features)
print(prediction)

['No Fraud']
