In [2]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [3]:
# This notebook will help to illustrate some of the transformations we studied during session 2
# We will start reading the dataset. It is quite big (6 million rows) so it will take 10 to 15 seconds to load
# Please be patient!
d = pd.read_csv('PS_20174392719_1491204439457_log.csv')

In [4]:
# Let's have a look to the contents
d

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [5]:
# And see how the main statistic for the numeric columns
d.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [6]:
# We are dropping here some columns. I will drop isFlaggedFraud because it does not belong to the problem
# Why do you think I dropped 'nameOrig' and 'nameDest' ?
d = d.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])
d

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,PAYMENT,9839.64,170136.00,160296.36,0.00,0.00,0
1,1,PAYMENT,1864.28,21249.00,19384.72,0.00,0.00,0
2,1,TRANSFER,181.00,181.00,0.00,0.00,0.00,1
3,1,CASH_OUT,181.00,181.00,0.00,21182.00,0.00,1
4,1,PAYMENT,11668.14,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,339682.13,0.00,0.00,339682.13,1
6362616,743,TRANSFER,6311409.28,6311409.28,0.00,0.00,0.00,1
6362617,743,CASH_OUT,6311409.28,6311409.28,0.00,68488.84,6379898.11,1
6362618,743,TRANSFER,850002.52,850002.52,0.00,0.00,0.00,1


In [7]:
# Check if there are null values
d.isna().sum()

step              0
type              0
amount            0
oldbalanceOrg     0
newbalanceOrig    0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [8]:
# I will encode the column 'type' assigning and arbitrary numerical value to each transaction type
# First, check the different values of column type
d['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [9]:
# Now use tranform to apply the function encode_type
def encode_type(x):
    if x == 'PAYMENT':
        return 1
    if x == 'TRANSFER':
        return 2
    if x == 'CASH_OUT':
        return 3
    if x == 'DEBIT':
        return 4
    if x == 'CASH_IN':
        return 5
    
d['type'] = d['type'].transform(encode_type)
d

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,1,1,9839.64,170136.00,160296.36,0.00,0.00,0
1,1,1,1864.28,21249.00,19384.72,0.00,0.00,0
2,1,2,181.00,181.00,0.00,0.00,0.00,1
3,1,3,181.00,181.00,0.00,21182.00,0.00,1
4,1,1,11668.14,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...,...
6362615,743,3,339682.13,339682.13,0.00,0.00,339682.13,1
6362616,743,2,6311409.28,6311409.28,0.00,0.00,0.00,1
6362617,743,3,6311409.28,6311409.28,0.00,68488.84,6379898.11,1
6362618,743,2,850002.52,850002.52,0.00,0.00,0.00,1


In [10]:
# At this point you should try one hot encoding column 'type' and
# check if the model quality improves.
d = pd.get_dummies(d, prefix = ['one_hot'], columns = ['type'])
d

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,one_hot_1,one_hot_2,one_hot_3,one_hot_4,one_hot_5
0,1,9839.64,170136.00,160296.36,0.00,0.00,0,1,0,0,0,0
1,1,1864.28,21249.00,19384.72,0.00,0.00,0,1,0,0,0,0
2,1,181.00,181.00,0.00,0.00,0.00,1,0,1,0,0,0
3,1,181.00,181.00,0.00,21182.00,0.00,1,0,0,1,0,0
4,1,11668.14,41554.00,29885.86,0.00,0.00,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682.13,339682.13,0.00,0.00,339682.13,1,0,0,1,0,0
6362616,743,6311409.28,6311409.28,0.00,0.00,0.00,1,0,1,0,0,0
6362617,743,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,0,0,1,0,0
6362618,743,850002.52,850002.52,0.00,0.00,0.00,1,0,1,0,0,0


In [11]:
# Insert here the code to standardize the dataest values and check
# if the model performance improves
#for column in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest']:
#    d[column] = (d[column] - d[column].mean()) / d[column].std()
#d

In [12]:
# Check how balanced is the dataset
is_fraud = d[d.isFraud == 1]
is_not_fraud = d[d.isFraud == 0]

print(len(is_fraud))
print(len(is_not_fraud))

8213
6354407


In [13]:
# The dataset is very unbalanced. 
# With this code we will undersample the dataset and get balanced classes

# index property contains the indexes of the not_fraud dataset
is_not_fraud_indexes = is_not_fraud.index

# get as many fraud indexes (randomly) as non fraud occurrences
np.random.seed(42)
random_non_fraud_indexes = np.random.choice(is_not_fraud_indexes, len(is_fraud))

# Keep those entries in not fraud
is_not_fraud = d.loc[random_non_fraud_indexes]

# We will form the balanced dataset concatenating fraud and non_fraud
bln = is_fraud.append(is_not_fraud)

# Check the result
print(len(bln[bln.isFraud == 1]))
print(len(bln[bln.isFraud == 0]))

8213
8213


In [14]:
# To see how the model performs with the unbalanced dataset 
# uncomment the next line
#bln = d

In [15]:
# The regular learning process
X = bln.copy()
Y = bln['isFraud']
X = bln.drop(['isFraud'], axis=1)

trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, random_state=42)

#trainX = X 
#trainY = Y

#testX = d.drop(columns=['isFraud'])
#testY = d['isFraud']

clf = linear_model.LogisticRegression()
clf.fit(trainX, trainY)
predY = clf.predict(testX)

# We measure the quality of our model using two scores:
# - Accuracy: number of correct predictions divided by the number of samples
# - AUC: area under the curve. This will be explained in session 4
# The best measure for classification problems is AUC. If you try with the unbalanced dataset
# you will get an almost perfect accuracy but a rather low AUC. Can you explain how is this possible?
# (This will be explained in detail in session 4 as well)
print(accuracy_score(testY, predY))
print(roc_auc_score(testY, predY))

0.9074863055386488
0.9075161896308567
