In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Fraud.csv')

In [3]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [4]:
print(df.dtypes)

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object


In [5]:
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

There is no NULL or Missing variables in the dataset.

In [6]:
# We change the data type in 'type' feature
df['type']=df['type'].map({'PAYMENT':1 ,'TRANSFER':2, 'CASH_OUT':3, 'DEBIT':4, 'CASH_IN':5})

In [7]:
df[df.isFraud == 0].sample(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
3982227,298,3,119762.84,C202199616,5254.0,0.0,C378616921,472555.1,592317.94,0,0
5292597,373,5,129748.36,C757549266,49731.0,179479.36,C2099946567,2198212.73,2068464.37,0,0
3533997,259,3,314669.47,C1918876100,0.0,0.0,C1177441283,1714787.71,2029457.18,0,0
1508210,144,3,166981.19,C1659106528,0.0,0.0,C681419196,4687336.49,4854317.68,0,0
4990779,352,1,6407.99,C237884703,6029.0,0.0,M1485879479,0.0,0.0,0,0
1867392,164,3,97459.71,C1242593732,3197.0,0.0,C79588333,3632495.9,3729955.61,0,0
4812843,346,1,5530.26,C1610625537,0.0,0.0,M831575735,0.0,0.0,0,0
5083278,355,1,25608.77,C1650288567,0.0,0.0,M1461955572,0.0,0.0,0,0
6111087,525,3,31267.07,C975825222,0.0,0.0,C744944969,3876206.35,3907473.42,0,0
6096759,522,3,22813.47,C1369801721,39845.0,17031.53,C2061917972,44415.84,67229.31,0,0


In [8]:
print(f"Number of isFlaggedFraud = {df.isFlaggedFraud.sum()}")

Number of isFlaggedFraud = 16


In [9]:
df = df.drop(['nameOrig','nameDest','isFlaggedFraud'],axis=1)

In [10]:
# Now let's check collinearity and multicollinearity between features
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [11]:
#create DataFrame to hold VIF values
vif_df = pd.DataFrame()
vif_df['variable'] = df.columns 

In [12]:
#calculate VIF for each predictor variable 
vif_df['VIF'] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]

In [13]:
print(vif_df)

         variable         VIF
0            step    2.468588
1            type    3.278076
2          amount    4.181768
3   oldbalanceOrg  590.618237
4  newbalanceOrig  599.584249
5  oldbalanceDest   74.276004
6  newbalanceDest   85.767257
7         isFraud    1.195561


In [14]:
# Calculating correlation
df.corr()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
step,1.0,-0.001992,0.022373,-0.010058,-0.010299,0.027665,0.025888,0.031578
type,-0.001992,1.0,0.050693,0.4157,0.431006,0.165383,0.135362,-0.004463
amount,0.022373,0.050693,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688
oldbalanceOrg,-0.010058,0.4157,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154
newbalanceOrig,-0.010299,0.431006,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148
oldbalanceDest,0.027665,0.165383,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885
newbalanceDest,0.025888,0.135362,0.459304,0.042029,0.041837,0.976569,1.0,0.000535
isFraud,0.031578,-0.004463,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0


In [15]:
# Now let's see types of Fraud
payment_fraud = df.loc[(df.isFraud == 1) & (df.type == 1)]
transfer_fraud = df.loc[(df.isFraud == 1) & (df.type == 2)]
cashout_fraud = df.loc[(df.isFraud == 1) & (df.type == 3)]
debit_fraud = df.loc[(df.isFraud == 1) & (df.type == 4)]
cashin_fraud = df.loc[(df.isFraud == 1) & (df.type == 5)]

In [16]:
print('number of PAYMENT are fraud :{}'.format(len(payment_fraud)))
print('number of TRANSFER are fraud :{}'.format(len(transfer_fraud)))
print('number of CASH OUT are fraud :{}'.format(len(cashout_fraud)))
print('number of DEBIT are fraud :{}'.format(len(debit_fraud)))
print('number of CASH IN are fraud :{}'.format(len(cashin_fraud)))

number of PAYMENT are fraud :0
number of TRANSFER are fraud :4097
number of CASH OUT are fraud :4116
number of DEBIT are fraud :0
number of CASH IN are fraud :0


All frauds are of type 'TRANSFER' and 'CASH OUT'.
So we can choose only these 2 types of payments and we will remove Variables 'nameorig' , 'namedest', and 'isFlaggedFraud'.

In [17]:
# We will consider only 'TRANSFER' and 'CASH OUT' type cases only.
df = df.loc[(df.type == 2) | (df.type == 3)]

In [18]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [19]:
X['type']= X['type'].map({2:1,3:2})

**Replacing missing values -**

- There are transactions where 'newbalanceorig' and 'oldbalanceorig' is zero while amount is non zero so we replace them with np.nan.
- There are 'newblancedest' and 'oldbalancedest' values are equal to zero but we can notimpute them because they shows the fraud transaction occured when amount is non zero instead of imputing them by 0 = -1

In [20]:
X.loc[(X.oldbalanceDest == 0) & (X.newbalanceDest == 0) & (X.amount != 0) ,['oldbalanceDest','newbalanceDest'] ] = -1 
X.loc[(X.oldbalanceOrg == 0) & (X.newbalanceOrig == 0) & (X.amount != 0) ,['oldbalanceOrg','newbalanceOrig'] ] = 0 

Now we can create two features of error at origin and destination account, where non zero amount is transfer but oldbalance and new balance have zero value

In [21]:
X['errorbalanceOrig'] = X.newbalanceOrig + X.amount - X.oldbalanceOrg
X['errorbalanceDest'] = X.oldbalanceDest + X.amount - X.newbalanceDest

In [22]:
X

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,errorbalanceOrig,errorbalanceDest
2,1,1,181.00,181.00,0.0,-1.00,-1.00,0.00,1.810000e+02
3,1,2,181.00,181.00,0.0,21182.00,0.00,0.00,2.136300e+04
15,1,2,229133.94,15325.00,0.0,5083.00,51513.44,213808.94,1.827035e+05
19,1,1,215310.30,705.00,0.0,22425.00,0.00,214605.30,2.377353e+05
24,1,1,311685.89,10835.00,0.0,6267.00,2719172.89,300850.89,-2.401220e+06
...,...,...,...,...,...,...,...,...,...
6362615,743,2,339682.13,339682.13,0.0,0.00,339682.13,0.00,0.000000e+00
6362616,743,1,6311409.28,6311409.28,0.0,-1.00,-1.00,0.00,6.311409e+06
6362617,743,2,6311409.28,6311409.28,0.0,68488.84,6379898.11,0.00,1.000000e-02
6362618,743,1,850002.52,850002.52,0.0,-1.00,-1.00,0.00,8.500025e+05


Let's check if the data is imbalances or not.

In [23]:
pos = df.isFraud.sum()
neg = df.size - pos
print(f"Dateframe has {pos} positive outputs and {neg} negative outputs")

Dateframe has 8213 positive outputs and 22155059 negative outputs


So the data is highly imbalanced.

Now we will split data in train and test dataset in 80:20 ratio.

In [24]:
# Split the dataset in traning set and test set
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.8, random_state = 0)

**Selection of metric:** Since the data is highly skewed, I use the area under the precision-recall curve (AUPRC) rather than the conventional area under the receiver operating characteristic (AUROC). AUPRC on a dataset with 99% negative 1% positive examples, and it will “focus” on how the model handles the 1% positive examples. If the model handles the positive examples well, AUPRC will be high. If the model does poorly on the positive examples, AUPRC will be low.

In [27]:
!pip install xgboost --quiet

In [38]:
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [31]:
weights = (y == 0).sum() / (1.0 * (y == 1).sum())                  #xgb uses the scale_pos_weight to account for highly skewed datset
model = XGBClassifier(max_depth = 3, scale_pos_weight = weights,
                n_jobs = 4)
probabilities = model.fit(X_train,y_train).predict_proba(X_test)
print('AUPRC = {}'.format(average_precision_score(y_test, probabilities[:, 1])))

AUPRC = 0.9974047663613061


In [34]:
y_preds = model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [39]:
cf = confusion_matrix(y_test,y_preds)
cf

array([[2209730,      35],
       [     23,    6540]], dtype=int64)

In [37]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   2209765
           1       0.99      1.00      1.00      6563

    accuracy                           1.00   2216328
   macro avg       1.00      1.00      1.00   2216328
weighted avg       1.00      1.00      1.00   2216328



**AUPRC score is 0.99 means our model is performing well on positive (Fraud transaction) class.**

**Q. What kind of prevention should be adopted while company update its infrastructure?**

- Company can prevent fraudalnt transaction by focusing more on payment method type - 'Transfer' & 'Cash_out'
- Look Out for Patterns in Fraud and Theft

**Q. Assuming these actions have been implemented, how would you determine if they work?**

- we can retrain & maintain model after certain intervals so our model perform best under various fraudlant transaction.