In [57]:
# ==================
#
# IMPORTS
#
# ==================
import pandas as pd
import numpy as np
import dill
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report

In [2]:
# ==================
#
# RAW DATA 
# Reference: E. A. Lopez-Rojas , A. Elmir, and S. Axelsson. 
# "PaySim: A financial mobile money simulator for fraud detection". 
# In: The 28th European Modeling and Simulation Symposium-EMSS, 
# Larnaca, Cyprus. 2016
#
# ==================
'''
raw_df = pd.read_csv('./data/paysim1/PS_20174392719_1491204439457_log.csv')
'''

"\nraw_df = pd.read_csv('./data/paysim1/PS_20174392719_1491204439457_log.csv')\n"

In [3]:
'''
dill.dump(obj=raw_df, file=open('./data/raw_df.pkl', mode='wb'))
'''

"\ndill.dump(obj=raw_df, file=open('./data/raw_df.pkl', mode='wb'))\n"

In [4]:
raw_df = dill.load(file=open('./data/raw_df.pkl', mode='rb'))

In [5]:
# ==================
#
# INITIAL CHECKS
#
# ==================
df = raw_df.copy()

In [6]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [8]:
# ==================
#
# CLEAN DATA
#
# ==================

In [9]:
# =============
# RENAME FEATURES
df = df.rename(columns={'step':'hour', 'nameOrig':'name_orig', 
                        'oldbalanceOrg':'init_bal_orig', 
                        'newbalanceOrig':'new_bal_orig', 
                        'nameDest':'name_dest',
                        'oldbalanceDest':'init_bal_dest',
                        'newbalanceDest':'new_bal_dest',
                        'isFraud':'is_fraud','isFlaggedFraud':'is_flagged'})

In [10]:
# ============
# CREATE CUSTOMER AND MERCHANT FEATURES
df['type_orig'] = df['name_orig'].map(lambda name: name[0])

In [11]:
df['type_orig'].unique()

array(['C'], dtype=object)

In [12]:
# Drop type_origin
df = df.drop(columns=['type_orig'])

In [13]:
df['type_dest'] = df['name_dest'].map(lambda name: name[0])

In [14]:
df['type_dest'].unique()

array(['M', 'C'], dtype=object)

In [15]:
# ---- CONVERT type_destination TO DUMMIES ----
df = pd.get_dummies(data=df, columns=['type_dest'])

In [16]:
df = df.drop(columns=['type_dest_M'])

In [17]:
df = df.rename(columns={'type_dest_C':'is_cust_dest'})

In [18]:
# ---- REARRANGE FEATURES ----
df = df[['hour', 'type', 'amount', 'name_orig', 'init_bal_orig', 'new_bal_orig',
         'name_dest', 'is_cust_dest', 'init_bal_dest', 'new_bal_dest', 
         'is_flagged', 'is_fraud']]

In [19]:
df.head()

Unnamed: 0,hour,type,amount,name_orig,init_bal_orig,new_bal_orig,name_dest,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,1,0.0,0.0,0,1
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,1,21182.0,0.0,0,1
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0,0.0,0.0,0,0


In [20]:
# ---- TYPE DUMMIES ----
df = pd.get_dummies(data = df, columns=['type'], drop_first=True)

In [21]:
df.head()

Unnamed: 0,hour,amount,name_orig,init_bal_orig,new_bal_orig,name_dest,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0,0.0,0.0,0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0,0.0,0.0,0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,1,0.0,0.0,0,1,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,1,21182.0,0.0,0,1,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0,0.0,0.0,0,0,0,0,1,0


In [22]:
 df.columns

Index(['hour', 'amount', 'name_orig', 'init_bal_orig', 'new_bal_orig',
       'name_dest', 'is_cust_dest', 'init_bal_dest', 'new_bal_dest',
       'is_flagged', 'is_fraud', 'type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT',
       'type_TRANSFER'],
      dtype='object')

In [23]:
df = df[['type_CASH_OUT', 'type_DEBIT', 'type_PAYMENT', 'type_TRANSFER', 
        'hour', 'amount', 'name_orig', 'init_bal_orig', 'new_bal_orig',
        'name_dest', 'is_cust_dest', 'init_bal_dest', 'new_bal_dest',
        'is_flagged', 'is_fraud']]

In [24]:
df = df.rename(columns={'type_CASH_OUT':'is_type_cash_out',
                  'type_DEBIT':'is_type_debit',
                  'type_PAYMENT':'is_type_payment',
                  'type_TRANSFER':'is_type_transfer'})

In [25]:
df.head()

Unnamed: 0,is_type_cash_out,is_type_debit,is_type_payment,is_type_transfer,hour,amount,name_orig,init_bal_orig,new_bal_orig,name_dest,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud
0,0,0,1,0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0,0.0,0.0,0,0
1,0,0,1,0,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0,0.0,0.0,0,0
2,0,0,0,1,1,181.0,C1305486145,181.0,0.0,C553264065,1,0.0,0.0,0,1
3,1,0,0,0,1,181.0,C840083671,181.0,0.0,C38997010,1,21182.0,0.0,0,1
4,0,0,1,0,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0,0.0,0.0,0,0


In [26]:
# ==================
# NULL/BOGUS VALUES

# ---- NULL VALUES ----
df.isnull().sum().sum()

0

In [27]:
# ---- EMPTY STRING VALUES ----
(df=='').sum().sum()

0

In [28]:
(df==' ').sum().sum()

0

In [29]:
# ---- ZEROS ----
(df==0).sum()/len(df)*100

is_type_cash_out    64.833669
is_type_debit       99.348822
is_type_payment     66.185392
is_type_transfer    91.624378
hour                 0.000000
amount               0.000251
name_orig            0.000000
init_bal_orig       33.043762
new_bal_orig        56.730812
name_dest            0.000000
is_cust_dest        33.814608
init_bal_dest       42.504314
new_bal_dest        38.340071
is_flagged          99.999749
is_fraud            99.870918
dtype: float64

Notes about 0s:
* A surprisingly large 57% of the transactionss have an originator new balance of 0.
* A whopping 99.87% of the data is labeled is_fraud=0 - meaning only 0.13% of the data is actually fraud.
* From the dataset description we know that the merchant data is unknown; it is all 0s.

In [30]:
df[df['is_cust_dest']==0][['init_bal_dest', 'new_bal_dest']].sum()

init_bal_dest    0.0
new_bal_dest     0.0
dtype: float64

In [31]:
df['name_orig'].nunique() / len(df)

0.9985362947967975

In [32]:
df['name_dest'].nunique() / len(df)

0.4278680795018404

In [33]:
df.head()

Unnamed: 0,is_type_cash_out,is_type_debit,is_type_payment,is_type_transfer,hour,amount,name_orig,init_bal_orig,new_bal_orig,name_dest,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud
0,0,0,1,0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0,0.0,0.0,0,0
1,0,0,1,0,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0,0.0,0.0,0,0
2,0,0,0,1,1,181.0,C1305486145,181.0,0.0,C553264065,1,0.0,0.0,0,1
3,1,0,0,0,1,181.0,C840083671,181.0,0.0,C38997010,1,21182.0,0.0,0,1
4,0,0,1,0,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0,0.0,0.0,0,0


In [34]:
df.shape

(6362620, 15)

In [35]:
# =========================
# EDA

In [36]:
df.describe()

Unnamed: 0,is_type_cash_out,is_type_debit,is_type_payment,is_type_transfer,hour,amount,init_bal_orig,new_bal_orig,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,0.3516633,0.006511783,0.3381461,0.08375622,243.3972,179861.9,833883.1,855113.7,0.6618539,1100702.0,1224996.0,2.514687e-06,0.00129082
std,0.4774895,0.08043246,0.4730786,0.2770219,142.332,603858.2,2888243.0,2924049.0,0.4730786,3399180.0,3674129.0,0.001585775,0.0359048
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,239.0,74871.94,14208.0,0.0,1.0,132705.7,214661.4,0.0,0.0
75%,1.0,0.0,1.0,0.0,335.0,208721.5,107315.2,144258.4,1.0,943036.7,1111909.0,0.0,0.0
max,1.0,1.0,1.0,1.0,743.0,92445520.0,59585040.0,49585040.0,1.0,356015900.0,356179300.0,1.0,1.0


Notes:
* 0.13% are fraud - highly imbalanced

In [37]:
df['is_fraud'].value_counts(normalize=True)*100

0    99.870918
1     0.129082
Name: is_fraud, dtype: float64

In [38]:
# ========================
# TRAIN/TEST SPLIT
df.head(1)

Unnamed: 0,is_type_cash_out,is_type_debit,is_type_payment,is_type_transfer,hour,amount,name_orig,init_bal_orig,new_bal_orig,name_dest,is_cust_dest,init_bal_dest,new_bal_dest,is_flagged,is_fraud
0,0,0,1,0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0,0.0,0.0,0,0


In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['is_fraud','name_orig','name_dest']), df['is_fraud'], test_size=0.20, random_state=101)

In [40]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5090096, 12)
(1272524, 12)
(5090096,)
(1272524,)


In [41]:
print(df['is_fraud'].value_counts(normalize=True)[1])
print(y_train.value_counts(normalize=True)[1])
print(y_test.value_counts(normalize=True)[1])

0.001290820448180152
0.0012938852233828203
0.0012785613473694799


In [42]:
# =======================
# MODELS

In [43]:
# ---- BASELINE ----
# Logistic regression

# Scale
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [44]:
# Train model
fit0 = LogisticRegression(random_state=101)
fit0.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=101, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
# Predict on training set
pred0 = fit0.predict(X_test)

In [70]:
cm0 = confusion_matrix(y_test,pred0)
acc0 = accuracy_score(y_test, pred0)
cr0 = classification_report(y_test, pred0)
print('\n',cm0,'\n')
print('----------------------------------------------------')
print('\n', 'Accuracy: ', round(acc0, 4), '\n')
print('----------------------------------------------------')
print('\n', cr0)


 [[1270856      41]
 [    937     690]] 

----------------------------------------------------

 Accuracy:  0.9992 

----------------------------------------------------

              precision    recall  f1-score   support

          0       1.00      1.00      1.00   1270897
          1       0.94      0.42      0.59      1627

avg / total       1.00      1.00      1.00   1272524

