In [5]:
# Import the required  library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data importing and undestand the dataset

1) step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

2) type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

3) amount - amount of the transaction in local currency.

4) nameOrig - customer who started the transaction

5) oldbalanceOrg - initial balance before the transaction

6) newbalanceOrig - new balance after the transaction

7) nameDest - customer who is the recipient of the transaction

8) oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

9) newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

10) isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

11) isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

In [6]:
df = pd.read_csv('Fraud.csv')

In [7]:
df.shape

(3870890, 11)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3870890 entries, 0 to 3870889
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         float64
 10  isFlaggedFraud  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 324.9+ MB


In [9]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [10]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [11]:
df['type'].value_counts()

CASH_OUT    1376328
PAYMENT     1303552
CASH_IN      847895
TRANSFER     319357
DEBIT         23758
Name: type, dtype: int64

# EDA

In [12]:
# sns.distplot(df['isFraud'])

In [13]:
# sns.pairplot(df)

# Check and Handle Missing values

In [14]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

In [15]:
df.drop(columns = ['step','nameOrig','nameDest','isFlaggedFraud'],axis=1,inplace=True)

In [16]:
df.dropna(inplace=True)

In [17]:
cat_col = df.select_dtypes(exclude = ['int','float'])
cat_col

Unnamed: 0,type
0,PAYMENT
1,PAYMENT
2,TRANSFER
3,CASH_OUT
4,PAYMENT
...,...
3870884,CASH_OUT
3870885,CASH_IN
3870886,CASH_IN
3870887,PAYMENT


In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

In [19]:
df

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,3,9839.64,170136.00,160296.36,0.00,0.00,0.0
1,3,1864.28,21249.00,19384.72,0.00,0.00,0.0
2,4,181.00,181.00,0.00,0.00,0.00,1.0
3,1,181.00,181.00,0.00,21182.00,0.00,1.0
4,3,11668.14,41554.00,29885.86,0.00,0.00,0.0
...,...,...,...,...,...,...,...
3870884,1,50363.29,174495.03,124131.74,425940.72,476304.01,0.0
3870885,0,68905.07,11931.00,80836.07,1128437.03,1059531.96,0.0
3870886,0,235214.08,40380.00,275594.08,3711255.54,3476041.46,0.0
3870887,3,21915.43,275594.08,253678.65,0.00,0.00,0.0


In [20]:
X = df.iloc[:,0:6].values
y = df.iloc[:,[6]].values

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 122)

In [22]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_scaled = mms.fit_transform(X_train)
X_test_scaled = mms.fit_transform(X_test)

# Model Building(Logistic + Random forest)

In [23]:
from sklearn.linear_model import LogisticRegression
model_lr = LogisticRegression() 
model_lr.fit(X_train_scaled,y_train)
y_preds = model_lr.predict(X_test_scaled)

  y = column_or_1d(y, warn=True)


In [24]:
from sklearn.metrics import confusion_matrix,classification_report

In [25]:
confusion_matrix(y_test,y_preds)

array([[773517,      9],
       [   614,     38]])

In [26]:
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    773526
         1.0       0.81      0.06      0.11       652

    accuracy                           1.00    774178
   macro avg       0.90      0.53      0.55    774178
weighted avg       1.00      1.00      1.00    774178



In [35]:
from sklearn.ensemble import RandomForestClassifier
 
model_RFR = RandomForestClassifier()
model_RFR.fit(X_train_scaled,y_train)
y_pred = model_RFR.predict(X_test_scaled)

  model_RFR.fit(X_train_scaled,y_train)


In [36]:
confusion_matrix(y_test,y_pred)

array([[773522,      4],
       [   256,    396]])

In [37]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    773526
         1.0       0.99      0.61      0.75       652

    accuracy                           1.00    774178
   macro avg       0.99      0.80      0.88    774178
weighted avg       1.00      1.00      1.00    774178

