In [1]:
# import important liberaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [5]:
# load data
df=pd.read_csv('data/Fraud.csv')

In [6]:
df.tail(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [8]:
#the dictionary represents definitions of each variable
with open('data/DataDictionary.txt') as text_file:
    dictionary = text_file.read()
print(dictionary)

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to anot

# Data cleaning
1. Handling missing values,
2. Handling outliers
3. Handling multi-collinearity

In [13]:
# data dimension
print('Number of Rows: {}'.format(df.shape[0]))
print('Number of Columns: {}'.format(df.shape[1]))

Number of Rows: 6362620
Number of Columns: 11


In [14]:
# data type and description
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


**Handling missing values**

In [16]:
# check missing value
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

The dependent variable here are 'isFraud' and 'isFlaggedFraud'.

The result shows there is no missing value.

**Handling outliers**

We know that an outlier in statistics  is an observation point that is distant from other observations where features or independent variable are used to detect it. So for our case we look numerical variables.I prefer to use multivarate analysis to detect it.

In [17]:
# inter quaartile range calculation
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

step              1.790000e+02
amount            1.953319e+05
oldbalanceOrg     1.073152e+05
newbalanceOrig    1.442584e+05
oldbalanceDest    9.430367e+05
newbalanceDest    1.111909e+06
isFraud           0.000000e+00
isFlaggedFraud    0.000000e+00
dtype: float64


In [18]:
# The data point where we have False that means these values are valid whereas True indicates presence of an outlier.
print(df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))

         amount  isFlaggedFraud  isFraud  nameDest  nameOrig  newbalanceDest  \
0         False           False    False     False     False           False   
1         False           False    False     False     False           False   
2         False           False    False     False     False           False   
3         False           False    False     False     False           False   
4         False           False    False     False     False           False   
5         False           False    False     False     False           False   
6         False           False    False     False     False           False   
7         False           False    False     False     False           False   
8         False           False    False     False     False           False   
9         False           False    False     False     False           False   
10        False           False    False     False     False           False   
11        False           False    False

TypeError: cannot compare a dtyped [bool] array with a scalar of type [NoneType]

In [19]:
df_out = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df_out.shape

(4319406, 11)