In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
# Location of the payment-fraud CSV.
# NOTE(review): absolute path inside one user's Windows profile; it must be edited to run anywhere else.
DATA_PATH = r'C:\Users\Shraddha\srk_datasets\payment_fraud.csv'
data = pd.read_csv(DATA_PATH)

In [4]:
# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
0,29,1,4.745402,paypal,28.204861,0
1,725,1,4.742303,storecredit,0.0,0
2,845,1,4.921318,creditcard,0.0,0
3,503,1,4.886641,creditcard,0.0,0
4,2000,1,5.040929,creditcard,0.0,0


In [5]:
# (rows, columns) of the loaded frame.
data.shape

(39221, 6)

In [6]:
# Column dtypes and non-null counts; paymentMethod is the only non-numeric (object) column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39221 entries, 0 to 39220
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   accountAgeDays        39221 non-null  int64  
 1   numItems              39221 non-null  int64  
 2   localTime             39221 non-null  float64
 3   paymentMethod         39221 non-null  object 
 4   paymentMethodAgeDays  39221 non-null  float64
 5   label                 39221 non-null  int64  
dtypes: float64(2), int64(3), object(1)
memory usage: 1.8+ MB


In [7]:
# Summary statistics of the numeric columns.
# `label` only takes the values 0 and 1, so its mean (~0.014) is the share of rows labelled 1:
# the two classes are heavily imbalanced.
data.describe()

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label
count,39221.0,39221.0,39221.0,39221.0,39221.0
mean,857.563984,1.084751,4.748232,122.641326,0.014278
std,804.788212,0.566899,0.38936,283.569177,0.118636
min,1.0,1.0,0.421214,0.0,0.0
25%,72.0,1.0,4.742303,0.0,0.0
50%,603.0,1.0,4.886641,0.0125,0.0
75%,1804.0,1.0,4.962055,87.510417,0.0
max,2000.0,29.0,5.040929,1999.580556,1.0


In [8]:
# Number of distinct values in each column.
data.nunique()

accountAgeDays           1999
numItems                   14
localTime                  25
paymentMethod               3
paymentMethodAgeDays    17094
label                       2
dtype: int64

In [9]:
# Count the missing values in each column.
data.isna().sum()

accountAgeDays          0
numItems                0
localTime               0
paymentMethod           0
paymentMethodAgeDays    0
label                   0
dtype: int64

In [10]:
# Distinct payment methods, in order of first appearance in the data.
p = pd.unique(data['paymentMethod'])
p

array(['paypal', 'storecredit', 'creditcard'], dtype=object)

In [11]:
# One-hot encode the non-numeric columns. drop_first=True omits the first level of each
# encoded column (for paymentMethod that is 'creditcard'), which becomes the implicit baseline.
data = data.pipe(pd.get_dummies, drop_first=True)

In [12]:
# Confirm the encoding: paymentMethod has been replaced by indicator columns.
data.head(n=5)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_paypal,paymentMethod_storecredit
0,29,1,4.745402,28.204861,0,1,0
1,725,1,4.742303,0.0,0,0,1
2,845,1,4.921318,0.0,0,0,0
3,503,1,4.886641,0.0,0,0,0
4,2000,1,5.040929,0.0,0,0,0


In [13]:
# Look at the last five rows as well.
data.tail(n=5)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_paypal,paymentMethod_storecredit
39216,986,1,4.836982,0.0,0,0,0
39217,1647,1,4.876771,377.930556,0,0,0
39218,1591,1,4.742303,0.0,0,0,0
39219,237,1,4.921318,236.082639,0,0,0
39220,272,1,5.040929,0.000694,0,1,0


In [14]:
# Pairwise Pearson correlations between all columns, including the target `label`.
data.corr(method='pearson')

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_paypal,paymentMethod_storecredit
accountAgeDays,1.0,-0.037562,-0.0126,0.330909,-0.128098,-0.002842,-0.004757
numItems,-0.037562,1.0,-0.042563,0.031683,0.038116,-0.004804,0.000164
localTime,-0.0126,-0.042563,1.0,-0.033076,-0.059505,-0.001442,-0.000499
paymentMethodAgeDays,0.330909,0.031683,-0.033076,1.0,-0.052047,-0.001032,-0.002758
label,-0.128098,0.038116,-0.059505,-0.052047,1.0,-0.001935,-0.006313
paymentMethod_paypal,-0.002842,-0.004804,-0.001442,-0.001032,-0.001935,1.0,-0.126305
paymentMethod_storecredit,-0.004757,0.000164,-0.000499,-0.002758,-0.006313,-0.126305,1.0


In [15]:
# Hold out 30% of the rows for testing; every column except `label` is a feature.
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified even though only ~1.4% of labels are 1;
# consider stratify=data['label'] (this would change the reported numbers).
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='label'),
    data['label'],
    test_size=0.30,
    random_state=17,
)

Because this is a binary classification problem (each transaction is labelled either 0 = legitimate or 1 = fraudulent), I will use Logistic Regression, a simple and widely used baseline algorithm for binary classification. Let us now train the fraud detection model with logistic regression and examine the accuracy score it obtains on the held-out test set.

In [16]:
# Build a logistic-regression classifier with default settings, then fit it on the training split.
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)

In [17]:
# Predict labels for the held-out test set and print the fraction predicted correctly.
y_pred = clf.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): pass the ground truth first, in the same
# order the confusion_matrix call in this notebook uses. The value itself is unaffected
# because accuracy is symmetric in its two arguments.
print(accuracy_score(y_test, y_pred))

1.0


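So, when was the last time you got 100 percent accuracy? Using logistic regression, every transaction in the held-out test set was classified correctly. Keep in mind that only about 1.4% of the transactions are fraudulent, so a model that always predicts "not fraud" would already reach roughly 98.5% accuracy; accuracy alone is therefore a weak measure here, and a perfect score on a single split deserves a closer look before it is trusted.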

Let us now look at the model's performance in more detail. To do so, I will use a confusion matrix, which tabulates the true classes (rows) against the predicted classes (columns). It can be computed in a single line of code:

In [18]:
# Rows are the true classes and columns the predicted classes (label 0 first, then label 1).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[11596     0]
 [    0   171]]


So, out of the 11,767 transactions in the test set, all 171 fraudulent ones are correctly identified as fraud and the remaining 11,596 are correctly identified as authentic; there are no false positives and no false negatives.

Thank You!