In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("Fraud.csv")

In [3]:
df.shape

(6362620, 11)

#### Data Dictionary

In [4]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

In [5]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


Since step is a related to time(mappped to hrs in 30 days) this is not an important feature.

In [6]:
df.drop(['step'] , axis = 1 , inplace=True)

In [7]:
df.iloc[:,[-1,-2]].value_counts()

isFlaggedFraud  isFraud
0               0          6354407
                1             8197
1               1               16
Name: count, dtype: int64

Since 'isFraud' is always 1 when 'isFlaggedFraud' is 1 so we can drop 'isFlaggedFraud' this column

In [8]:
df.drop(['isFlaggedFraud'] , axis = 1 , inplace=True)

In [9]:
df.iloc[:,[0,-1]].value_counts()

type      isFraud
CASH_OUT  0          2233384
PAYMENT   0          2151495
CASH_IN   0          1399284
TRANSFER  0           528812
DEBIT     0            41432
CASH_OUT  1             4116
TRANSFER  1             4097
Name: count, dtype: int64

There is only a fraud when type is CASH_OUT and TRANSFER. so keeping them and removing others.

In [10]:
df = df[(df['type'] == 'CASH_OUT') | (df['type'] == 'TRANSFER')]

In [11]:
df['oldbalanceDest'].value_counts()

oldbalanceDest
0.00           389320
10000000.00       602
20000000.00       219
30000000.00        86
40000000.00        31
                ...  
3279806.07          1
992711.24           1
162068.53           1
496468.82           1
6510099.11          1
Name: count, Length: 2360852, dtype: int64

In [12]:
df['oldbalanceOrg'].value_counts()

oldbalanceOrg
0.00         1308582
154.00           434
124.00           427
109.00           425
186.00           425
              ...   
87167.00           1
232638.92          1
139622.87          1
340828.14          1
250420.00          1
Name: count, Length: 435702, dtype: int64

Since these columns contains almost all null values so we can drop this columns


In [13]:
df.drop(['oldbalanceDest' , 'oldbalanceOrg'] , axis = 1,inplace=True)

In [14]:
df['newbalanceOrig'].value_counts()

newbalanceOrig
0.00           2496656
2305.53              3
14403.77             3
19585040.37          3
174.94               3
                ...   
381520.95            1
1197.02              1
873440.65            1
608602.71            1
311268.62            1
Name: count, Length: 271977, dtype: int64

In [15]:
df['newbalanceDest'].value_counts()

newbalanceDest
0.00           16599
10000000.00       53
16532032.16       22
19169204.93       21
4743010.67        18
               ...  
1991821.27         1
202048.56          1
7226985.57         1
4189252.93         1
7360101.63         1
Name: count, Length: 2562844, dtype: int64

Since 'newbalanceOrig' column contains almost all null values so we can drop this columns

In [16]:
df.drop(['newbalanceOrig'] , axis = 1,inplace=True)

In [17]:
df.head()

Unnamed: 0,type,amount,nameOrig,nameDest,newbalanceDest,isFraud
2,TRANSFER,181.0,C1305486145,C553264065,0.0,1
3,CASH_OUT,181.0,C840083671,C38997010,0.0,1
15,CASH_OUT,229133.94,C905080434,C476402209,51513.44,0
19,TRANSFER,215310.3,C1670993182,C1100439041,0.0,0
24,TRANSFER,311685.89,C1984094095,C932583850,2719172.89,0


In [18]:
df['nameOrig'] = df['nameOrig'].apply(lambda x : x[0])

In [19]:
df['nameDest'] = df['nameDest'].apply(lambda x : x[0])

In [20]:
df.head()

Unnamed: 0,type,amount,nameOrig,nameDest,newbalanceDest,isFraud
2,TRANSFER,181.0,C,C,0.0,1
3,CASH_OUT,181.0,C,C,0.0,1
15,CASH_OUT,229133.94,C,C,51513.44,0
19,TRANSFER,215310.3,C,C,0.0,0
24,TRANSFER,311685.89,C,C,2719172.89,0


In [21]:
df['nameDest'].value_counts()

nameDest
C    2770409
Name: count, dtype: int64

In [22]:
df['nameOrig'].value_counts()

nameOrig
C    2770409
Name: count, dtype: int64

Since all are of type C. We can remove these columns

In [23]:
df.drop(['nameDest' , 'nameOrig'] , axis = 1,inplace=True)

In [24]:
df.head()

Unnamed: 0,type,amount,newbalanceDest,isFraud
2,TRANSFER,181.0,0.0,1
3,CASH_OUT,181.0,0.0,1
15,CASH_OUT,229133.94,51513.44,0
19,TRANSFER,215310.3,0.0,0
24,TRANSFER,311685.89,2719172.89,0


In [30]:
df.reset_index(drop=True,inplace=True)

In [31]:
df.shape

(2770409, 4)

Encodding

In [34]:
df.head()

Unnamed: 0,type,amount,newbalanceDest,isFraud
0,TRANSFER,181.0,0.0,1
1,CASH_OUT,181.0,0.0,1
2,CASH_OUT,229133.94,51513.44,0
3,TRANSFER,215310.3,0.0,0
4,TRANSFER,311685.89,2719172.89,0


TRANSFER = 1 and CASH_OUT = 0

In [39]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['type_encoded'] = label_encoder.fit_transform(df['type'])
df.drop(['type'] , axis=1,inplace=True)


In [42]:
df = df[['amount', 'newbalanceDest', 'type_encoded', 'isFraud']]

In [43]:
df.head()

Unnamed: 0,amount,newbalanceDest,type_encoded,isFraud
0,181.0,0.0,1,1
1,181.0,0.0,0,1
2,229133.94,51513.44,0,0
3,215310.3,0.0,1,0
4,311685.89,2719172.89,1,0


In [44]:
df.to_csv('cleaned_data.csv')

In [46]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [48]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
from sklearn.linear_model import LogisticRegression

# Model
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predictions
y_pred = lr.predict(X_test)

# Evaluation
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression
Accuracy: 0.9971845322533488
[[552411     25]
 [  1535    111]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    552436
           1       0.82      0.07      0.12      1646

    accuracy                           1.00    554082
   macro avg       0.91      0.53      0.56    554082
weighted avg       1.00      1.00      1.00    554082



In [50]:
from sklearn.tree import DecisionTreeClassifier

# Model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Predictions
y_pred = dt.predict(X_test)

# Evaluation
print("Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Decision Tree Classifier
Accuracy: 0.997381254038211
[[551713    723]
 [   728    918]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    552436
           1       0.56      0.56      0.56      1646

    accuracy                           1.00    554082
   macro avg       0.78      0.78      0.78    554082
weighted avg       1.00      1.00      1.00    554082



In [52]:
from sklearn.neighbors import KNeighborsClassifier

# Model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predictions
y_pred = knn.predict(X_test)

# Evaluation
print("K-Nearest Neighbors")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


K-Nearest Neighbors
Accuracy: 0.9983576438144535
[[552163    273]
 [   637   1009]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    552436
           1       0.79      0.61      0.69      1646

    accuracy                           1.00    554082
   macro avg       0.89      0.81      0.84    554082
weighted avg       1.00      1.00      1.00    554082

