In [1]:
# ==================
#
# IMPORTS
#
# ==================
import pandas as pd
import numpy as np

In [2]:
# ==================
#
# RAW DATA 
# Reference: E. A. Lopez-Rojas , A. Elmir, and S. Axelsson. 
# "PaySim: A financial mobile money simulator for fraud detection". 
# In: The 28th European Modeling and Simulation Symposium-EMSS, 
# Larnaca, Cyprus. 2016
#
# ==================
raw_df = pd.read_csv('./data/paysim1/PS_20174392719_1491204439457_log.csv')

In [3]:
# ==================
#
# INITIAL CHECKS
#
# ==================
df = raw_df.copy()

In [4]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
# ==================
#
# CLEAN DATA
#
# ==================

In [7]:
# ==================
# DROP COLUMNS
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [8]:
# =============
# RENAME FEATURES
df = df.rename(columns={'step':'hour', 'nameOrig':'name_orig', 
                        'oldbalanceOrg':'init_bal_orig', 
                        'newbalanceOrig':'new_bal_orig', 
                        'nameDest':'name_dest',
                        'oldbalanceDest':'init_bal_dest',
                        'newbalanceDest':'new_bal_dest',
                        'isFraud':'is_fraud','isFlaggedFraud':'is_flagged'})

In [9]:
# ============
# CREATE CUSTOMER AND MERCHANT FEATURES
df['type_origin'] = df['name_origin'].map(lambda name: name[0])

KeyError: 'name_origin'

In [None]:
df['type_origin'].unique()

In [None]:
# Drop type_origin
df = df.drop(columns=['type_origin'])

In [None]:
df['type_destination'] = df['name_destination'].map(lambda name: name[0])

In [None]:
df['type_destination'].unique()

In [None]:
df.head()

In [None]:
# ---- CONVERT type_destination TO DUMMIES ----
df = pd.get_dummies(data=df, columns=['type_destination'])

In [None]:
df = df.drop(columns=['type_destination_M'])

In [None]:
df = df.rename(columns={'type_destination_C':'is_customer_destination'})

In [None]:
# ---- REARRANGE FEATURES ----
df = df[['hour','type','amount','']]

In [None]:
# ==================
# NULL/BOGUS VALUES

# ---- NULL VALUES ----
df.isnull().sum().sum()

In [None]:
# ---- EMPTY STRING VALUES ----
df[df == ''].sum().sum()

In [None]:
df[df==' '].sum().sum()

In [None]:
# ---- ZEROS ----
type(df)

In [None]:
df[df==0.0].sum()

In [None]:
df.head()