In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
#import re

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import collections


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from pathlib import Path
import time

sns.set()

**Dataset**

- [2-synthetic-fraud-detection](./data/synthetic-fraud-detection.csv): synthetic financial transactions with a binary fraud label (`isFraud`).

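The full file contains 6,362,620 transactions, of which 8,213 (about 0.13%) are labelled as fraud (`isFraud = 1`). To keep run times manageable, the experiments below use only the first 500,000 rows.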

(More: https://huggingface.co/datasets/vitaliy-sharandin/synthetic-fraud-detection)

In [2]:
# Location of the raw CSV, relative to the notebook
data_dir = Path('data')
work_with = "synthetic-fraud-detection.csv"

# Read the complete file and discard the isFlaggedFraud column right away
df = pd.read_csv(data_dir / work_with, encoding='utf-8').drop(columns=['isFlaggedFraud'])

# Class balance of the label column (0 - normal, 1 - fraud)
display(df["isFraud"].value_counts())

df.shape

isFraud
0    6354407
1       8213
Name: count, dtype: int64

(6362620, 10)

In [3]:
# Location of the raw CSV, relative to the notebook
data_dir = Path('data')
work_with = "synthetic-fraud-detection.csv"

# Read only the first 500,000 rows, then discard every row that has a missing value
df = pd.read_csv(data_dir / work_with, encoding='utf-8', nrows=500000).dropna()

df.shape


(500000, 11)

In [4]:
# Preview the first three rows of the loaded frame
df.head(3)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0


In [5]:
# data distribution
# 0 - normal, 1 - fraud
# errors='ignore' keeps this cell re-runnable: once the column has been removed,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['isFlaggedFraud'], errors='ignore')
display(df["isFraud"].value_counts())

isFraud
0    499767
1       233
Name: count, dtype: int64

In [6]:
# Identify the object-typed (non-numeric) columns and report how many
# distinct values each one holds
cat_columns = df.select_dtypes(include="object", exclude="number").columns
print(f"uniques val count:\n{df[cat_columns].nunique()}")

uniques val count:
type             5
nameOrig    499953
nameDest    214856
dtype: int64


In [7]:
# One-hot encode the transaction type: the "type" column is replaced by
# int8 indicator columns named type_<value>
df = pd.get_dummies(data=df, columns=["type"], prefix="type", dtype="int8")
df.head(3)

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,1


In [8]:
# Replace each nameOrig identifier with an integer code
from sklearn.preprocessing import LabelEncoder

# The fitted encoder is kept so the codes can be mapped back to the original ids
leNameOrig = LabelEncoder().fit(df["nameOrig"])
df["nameOrig"] = leNameOrig.transform(df["nameOrig"])

In [9]:
# Replace each nameDest identifier with an integer code
from sklearn.preprocessing import LabelEncoder

# The fitted encoder is kept so the codes can be mapped back to the original ids
leNameDest = LabelEncoder().fit(df["nameDest"])
df["nameDest"] = leNameDest.transform(df["nameDest"])

In [10]:
# Preview the first three rows after the categorical columns have been encoded
df.head(3)

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,59872,170136.0,160296.36,134004,0.0,0.0,0,0,0,0,1,0
1,1,1864.28,172096,21249.0,19384.72,139396,0.0,0.0,0,0,0,0,1,0
2,1,181.0,78797,181.0,0.0,38975,0.0,0.0,1,0,0,0,0,1


In [11]:
# Name of the label column
target = "isFraud"

# Labels first, then every remaining column as the feature matrix
y = df[target]
X = df.drop(columns=[target])


In [12]:
# Sanity check: the feature matrix and the label vector should have the same number of rows
X.shape, y.shape

((500000, 13), (500000,))

In [13]:
# # scaling the data
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X[X.columns] = scaler.fit_transform(X)

# X.shape, y.shape

In [14]:
# split the data into train and test
from sklearn.model_selection import train_test_split

# A stratified 80/20 hold-out split is kept here for reference but is disabled:
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     stratify=y,
#                                                     test_size=0.2,
#                                                     random_state=42)

# Train and test are both aliases of the full data set, so every score reported
# further down is measured on the very rows the detectors were fitted on.
X_train = X_test = X
y_train = y_test = y

# count fraud in train and test
print(f"Train: \n {y_train.value_counts()}")
print(f"Test: \n {y_test.value_counts()} ")

Train: 
 isFraud
0    499767
1       233
Name: count, dtype: int64
Test: 
 isFraud
0    499767
1       233
Name: count, dtype: int64 


In [15]:
''' Helper functions '''

def get_scores(y_true, y_pred, y_score=None, ndigits=2):
    """Compute a fixed set of binary-classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels; used for every threshold-based metric
        (accuracy, balanced accuracy, F1, precision, recall).
    y_score : array-like, optional
        Continuous scores in which larger values indicate the positive class.
        When supplied, the ranking metrics ``roc_auc`` and ``pr_auc`` are
        computed from these scores. When omitted they fall back to ``y_pred``
        (the previous behaviour), which evaluates the curve at a single
        operating point only.
    ndigits : int, default 2
        Number of decimals each metric is rounded to.

    Returns
    -------
    dict
        Rounded metric values under the keys ``accuracy``, ``balanced``,
        ``F1``, ``precision``, ``recall``, ``roc_auc`` and ``pr_auc``.
    """
    # Ranking metrics are meant for continuous scores; use them when available.
    ranking_input = y_pred if y_score is None else y_score

    raw_scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced': balanced_accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, ranking_input),
        'pr_auc': average_precision_score(y_true, ranking_input),
    }

    return {name: round(value, ndigits) for name, value in raw_scores.items()}

In [16]:
# Collects one score dict per detector; every detector cell appends to this list,
# so re-running a detector cell without re-running this one adds a duplicate row.
combined_scores = []

Isolation Forest

In [17]:
# Isolation Forest (PyOD implementation)

from pyod.models.iforest import IForest

clf_name = 'IForest'

# Time detector construction plus fitting
start_time = time.time()

# Isolation Forest builds randomised trees; a fixed seed makes the reported
# scores reproducible from one run of the notebook to the next.
clf = IForest(random_state=42)
clf.fit(X_train)

duration = round(time.time() - start_time,2)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# One result row: detector name, metric values (from the binary labels), fit time in seconds
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

Local Outlier Factor

In [18]:
# Local Outlier Factor (PyOD implementation)

from pyod.models.lof import LOF

clf_name = 'LOF'

# Time detector construction plus fitting
start_time = time.time()
clf = LOF()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Binary labels for the fitted data (stored on the model) and for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)
scores

{'clf_name': 'LOF',
 'accuracy': 0.91,
 'balanced': 0.66,
 'F1': 0.0,
 'precision': 0.0,
 'recall': 0.4,
 'roc_auc': 0.66,
 'pr_auc': 0.0,
 'duration': 31.32}

ECOD

In [19]:
# ECOD detector (PyOD implementation)

from pyod.models.ecod import ECOD

clf_name = 'ECOD'

# Time detector construction plus fitting
start_time = time.time()
clf = ECOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Binary labels for the fitted data (stored on the model) and for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)

Local Correlation Integral

In [20]:
# #  Local Correlation Integral (LOCI)

# from pyod.models.loci import LOCI
# start_time = time.time()


# clf_name = 'LOCI'
# clf = LOCI()
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

LSCP: Locally Selective Combination of Parallel Outlier Ensembles

In [21]:
# #  LSCP

# from pyod.models.lscp import LSCP
# from pyod.models.lof import LOF
# start_time = time.time()


# clf_name = 'LSCP'
# detector_list = [LOF(), LOF()]
# clf = LSCP(detector_list)
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

COPOD: Copula-Based Outlier Detection

In [22]:
# COPOD: Copula-Based Outlier Detection (PyOD implementation)

from pyod.models.copod import COPOD

clf_name = 'COPOD'

# Time detector construction plus fitting
start_time = time.time()
clf = COPOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Binary labels for the fitted data (stored on the model) and for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)

ABOD: Angle-Based Outlier Detection

In [23]:
# ABOD: Angle-Based Outlier Detection (PyOD implementation)

from pyod.models.abod import ABOD

clf_name = 'ABOD'

# Time detector construction plus fitting
start_time = time.time()
clf = ABOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Binary labels for the fitted data (stored on the model) and for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)

QMCD: Quasi-Monte Carlo Discrepancy outlier detection

In [24]:
# QMCD: Quasi-Monte Carlo Discrepancy outlier detection (PyOD implementation)

from pyod.models.qmcd import QMCD

clf_name = 'QMCD'

# Time detector construction plus fitting
start_time = time.time()
clf = QMCD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Binary labels for the fitted data (stored on the model) and for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)


Rapid distance-based outlier detection via sampling

In [25]:
# Sampling: rapid distance-based outlier detection via sampling (PyOD implementation)

from pyod.models.sampling import Sampling

clf_name = 'Rapid distance-based'

# Time detector construction plus fitting
start_time = time.time()

# The detector scores points against a randomly drawn subset of the data;
# a fixed seed makes the reported scores reproducible between runs.
clf = Sampling(random_state=42)
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# One result row: detector name, metric values, fit time in seconds
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)



## Bookkeeping

In [26]:
# Gather every detector's score dict into one table
scores_df = pd.DataFrame(combined_scores)

# Write the table to results/2-<dataset stem>.csv, creating the folder if needed
save_dir = Path('results')
save_dir.mkdir(exist_ok=True)
save_as = save_dir.joinpath(f"2-{Path(work_with).stem}.csv")
scores_df.to_csv(save_as, index=False)


In [27]:
# Render the per-detector metric table
display(scores_df)

Unnamed: 0,clf_name,accuracy,balanced,F1,precision,recall,roc_auc,pr_auc,duration
0,IForest,0.9,0.62,0.0,0.0,0.33,0.62,0.0,3.21
1,LOF,0.91,0.66,0.0,0.0,0.4,0.66,0.0,31.32
2,ECOD,0.9,0.57,0.0,0.0,0.24,0.57,0.0,1.66
3,COPOD,0.9,0.55,0.0,0.0,0.2,0.55,0.0,1.31
4,ABOD,0.9,0.58,0.0,0.0,0.27,0.58,0.0,48.25
5,QMCD,0.9,0.52,0.0,0.0,0.14,0.52,0.0,325.68
6,Rapid distance-based,0.9,0.49,0.0,0.0,0.08,0.49,0.0,0.21


``` markdown

# Methods used
1. [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf)
2. ...
```