In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
#import re

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import collections


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from pathlib import Path
import time

sns.set()

**Dataset**


- [4-CreditCardfraudTrain](./data/CreditCardfraudTrain.csv) : 
```markdown
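Credit-card transaction records, one row per transaction, with merchant, category, amount, cardholder and location fields plus a binary `is_fraud` label (0 = legitimate, 1 = fraud). The full file holds 1,048,575 rows, of which 6,006 (about 0.57%) are labelled as fraud.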

```

( more : https://huggingface.co/datasets/dazzle-nu/CIS435-CreditCardFraudDetection )

In [2]:

# Load the complete training file; the first CSV column becomes the row index.
data_dir = Path('data')
work_with = "CreditCardfraudTrain.csv"

# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and has no effect there (the FutureWarning it raised was hidden by
# the global warnings filter set in the import cell).
df = pd.read_csv(data_dir / work_with, encoding='utf-8', index_col=0, parse_dates=True)

# Drop every column that contains at least one missing value.
df = df.dropna(axis=1, how='any')

# Class balance of the full file (0 = legitimate, 1 = fraud).
print(f'{df["is_fraud"].value_counts()}')

is_fraud
0    1042569
1       6006
Name: count, dtype: int64


In [3]:
# Reload only the first 200,000 rows; all later cells work on this subset.
data_dir = Path('data')
work_with = "CreditCardfraudTrain.csv"

# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 and has no effect there.
df = pd.read_csv(
    data_dir / work_with,
    encoding='utf-8',
    index_col=0,
    parse_dates=True,
    nrows=200000,
)

# Drop every column that contains at least one missing value.
df = df.dropna(axis=1, how='any')

# (rows, columns) of the working frame.
df.shape

(200000, 22)

In [4]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,1/1/19 0:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",3/9/88,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1/1/19 0:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,6/21/78,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,1/1/19 0:00,38859500000000.0,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1/19/62,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,1/1/19 0:01,3534090000000000.0,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1/12/67,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,1/1/19 0:03,375534000000000.0,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,3/28/86,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Per-column dtypes and non-null counts.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 200000 entries, 0 to 199999
Data columns (total 22 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  200000 non-null  object 
 1   cc_num                 200000 non-null  float64
 2   merchant               200000 non-null  object 
 3   category               200000 non-null  object 
 4   amt                    200000 non-null  float64
 5   first                  200000 non-null  object 
 6   last                   200000 non-null  object 
 7   gender                 200000 non-null  object 
 8   street                 200000 non-null  object 
 9   city                   200000 non-null  object 
 10  state                  200000 non-null  object 
 11  zip                    200000 non-null  int64  
 12  lat                    200000 non-null  float64
 13  long                   200000 non-null  float64
 14  city_pop               200000 non-null  i

In [6]:
# Discard the transaction timestamp and transaction-number columns.
df = df.drop(columns=['trans_date_trans_time', "trans_num"])

In [7]:
# Class distribution of the working sample: 0 = non-fraud, 1 = fraud.
class_counts = df["is_fraud"].value_counts()
print(f'{class_counts}')

is_fraud
0    198355
1      1645
Name: count, dtype: int64


In [8]:
# Identify the string-typed (object) columns and report how many distinct
# values each one holds.
cat_columns = df.select_dtypes(include="object").columns
print(f"uniques val count:\n{df[cat_columns].nunique()}")

uniques val count:
merchant    693
category     14
first       341
last        471
gender        2
street      931
city        855
state        50
job         480
dob         917
dtype: int64


In [9]:
# One-hot encode the two low-cardinality categoricals as int8 indicator columns
# (`category` -> CAT_*, `gender` -> S_*), one column at a time.
for _column, _prefix in (("category", "CAT"), ("gender", "S")):
    df = pd.get_dummies(df, columns=[_column], prefix=[_prefix], dtype="int8")


In [10]:
# Integer-encode every remaining string (object) column with its own LabelEncoder.
from sklearn.preprocessing import LabelEncoder

cat_columns = df.select_dtypes(include="object").columns

for col in cat_columns:
    # A fresh encoder per column: the integer codes are local to that column.
    df[col] = LabelEncoder().fit_transform(df[col])

df.head(3)

Unnamed: 0,cc_num,merchant,amt,first,last,street,city,state,zip,lat,...,CAT_home,CAT_kids_pets,CAT_misc_net,CAT_misc_pos,CAT_personal_care,CAT_shopping_net,CAT_shopping_pos,CAT_travel,S_F,S_M
0,2703190000000000.0,514,4.97,155,18,541,505,26,28654,36.0788,...,0,0,1,0,0,0,0,0,1,0
1,630423000000.0,241,107.23,299,155,416,582,46,99160,48.8878,...,0,0,0,0,0,0,0,0,1,0
2,38859500000000.0,390,220.11,108,376,575,451,12,83252,42.1808,...,0,0,0,0,0,0,0,0,0,1


In [11]:
# Separate the label column from the feature matrix.
target = "is_fraud"

y = df[target]
X = df.drop(columns=[target])

X.shape, y.shape

((200000, 33), (200000,))

In [12]:
# Standardise every feature column, overwriting the values in X.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X[X.columns] = scaler.transform(X)

X.shape, y.shape

((200000, 33), (200000,))

In [13]:
# Train/test assignment.
# The stratified hold-out split is disabled, so the detectors are fitted and
# evaluated on the same rows: "Test" below is the training data itself.
from sklearn.model_selection import train_test_split
# To evaluate on held-out data instead, restore:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, stratify=y, test_size=0.2, random_state=42)

X_train = X_test = X
y_train = y_test = y

# Fraud counts in each partition.
print(f"Train: \n {y_train.value_counts()}")
print(f"Test: \n {y_test.value_counts()} ")

Train: 
 is_fraud
0    198355
1      1645
Name: count, dtype: int64
Test: 
 is_fraud
0    198355
1      1645
Name: count, dtype: int64 


In [14]:
''' Helper functions '''

def get_scores(y_true, y_pred, y_score=None):
    """Compute a dict of binary-classification metrics, each rounded to 2 decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted binary labels.
    y_score : array-like, optional
        Continuous scores in which larger values indicate the positive class.
        When given, the ranking metrics ``roc_auc`` and ``pr_auc`` are computed
        from these scores. When omitted they fall back to ``y_pred``; with hard
        0/1 labels ROC AUC then collapses to the balanced accuracy, so pass the
        detector's raw scores whenever they are available.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``balanced``, ``F1``, ``precision``, ``recall``,
        ``roc_auc``, ``pr_auc``.
    """
    # Threshold-free metrics need a ranking; use the scores when supplied.
    ranking_input = y_pred if y_score is None else y_score

    scores = {
        'accuracy': round(accuracy_score(y_true, y_pred),2),
        'balanced': round(balanced_accuracy_score(y_true, y_pred),2),
        # zero_division=0 yields the same value as the default when no positive
        # is predicted, without relying on warnings being silenced globally.
        'F1': round(f1_score(y_true, y_pred, zero_division=0),2),
        'precision': round(precision_score(y_true, y_pred, zero_division=0),2),
        'recall': round(recall_score(y_true, y_pred, zero_division=0),2),
        'roc_auc': round(roc_auc_score(y_true, ranking_input),2),
        'pr_auc': round(average_precision_score(y_true, ranking_input),2)
    }

    return scores

In [15]:
# Accumulates one metrics dict per detector; the detector cells append to it
# and it is turned into a DataFrame when the results are saved.
combined_scores = []

Isolation Forest

In [16]:
# Isolation Forest

from pyod.models.iforest import IForest

start_time = time.time()

clf_name = 'IForest'

# Isolation Forest is randomised; a fixed seed makes this row of the results
# table reproducible across kernel restarts.
# NOTE(review): all other settings are library defaults, while fraud is under
# 1% of the rows here -- confirm the default contamination is appropriate.
clf = IForest(random_state=42)
clf.fit(X_train)

duration = round(time.time() - start_time,2)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# NOTE(review): y_test_scores is computed but not used; the metrics below are
# all derived from the hard 0/1 labels in y_test_pred.
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

Local Outlier Factor

In [17]:
# Local Outlier Factor (LOF)
from pyod.models.lof import LOF

# Time detector construction plus fitting.
start_time = time.time()
clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Hard 0/1 labels for the fitted rows and for the evaluation rows.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Record the metrics together with the detector name and fit time.
scores = dict(clf_name=clf_name, **get_scores(y_test, y_test_pred), duration=duration)
combined_scores.append(scores)
# scores

ECOD

In [18]:
# ECOD
from pyod.models.ecod import ECOD

# Time detector construction plus fitting.
start_time = time.time()
clf_name = 'ECOD'
clf = ECOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Hard 0/1 labels for the fitted rows and for the evaluation rows.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Record the metrics together with the detector name and fit time.
scores = dict(clf_name=clf_name, **get_scores(y_test, y_test_pred), duration=duration)
combined_scores.append(scores)

Local Correlation Integral

In [19]:
# #  Local Correlation Integral (LOCI)

# from pyod.models.loci import LOCI
# start_time = time.time()


# clf_name = 'LOCI'
# clf = LOCI()
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

LSCP: Locally Selective Combination of Parallel Outlier Ensembles

In [20]:
# #  LSCP

# from pyod.models.lscp import LSCP
# from pyod.models.lof import LOF
# start_time = time.time()


# clf_name = 'LSCP'
# detector_list = [LOF(), LOF()]
# clf = LSCP(detector_list)
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

COPOD: Copula-Based Outlier Detection

In [21]:
# COPOD: Copula-Based Outlier Detection
from pyod.models.copod import COPOD

# Time detector construction plus fitting.
start_time = time.time()
clf_name = 'COPOD'
clf = COPOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Hard 0/1 labels for the fitted rows and for the evaluation rows.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Record the metrics together with the detector name and fit time.
scores = dict(clf_name=clf_name, **get_scores(y_test, y_test_pred), duration=duration)
combined_scores.append(scores)

ABOD: Angle-Based Outlier Detection

In [22]:
# ABOD: Angle-Based Outlier Detection
from pyod.models.abod import ABOD

# Time detector construction plus fitting.
start_time = time.time()
clf_name = 'ABOD'
clf = ABOD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Hard 0/1 labels for the fitted rows and for the evaluation rows.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Record the metrics together with the detector name and fit time.
scores = dict(clf_name=clf_name, **get_scores(y_test, y_test_pred), duration=duration)
combined_scores.append(scores)

QMCD: Quasi-Monte Carlo Discrepancy outlier detection

In [23]:
# QMCD: Quasi-Monte Carlo Discrepancy outlier detection
from pyod.models.qmcd import QMCD

# Time detector construction plus fitting.
start_time = time.time()
clf_name = 'QMCD'
clf = QMCD()
clf.fit(X_train)
duration = round(time.time() - start_time, 2)

# Hard 0/1 labels for the fitted rows and for the evaluation rows.
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# Record the metrics together with the detector name and fit time.
scores = dict(clf_name=clf_name, **get_scores(y_test, y_test_pred), duration=duration)
combined_scores.append(scores)


Rapid distance-based outlier detection via sampling

In [24]:
# Sampling - Rapid distance-based outlier detection via sampling

from pyod.models.sampling import Sampling
start_time = time.time()

clf_name = 'Rapid distance-based'
# The detector is given a fixed seed so that this row of the results table is
# reproducible across kernel restarts.
clf = Sampling(random_state=42)
clf.fit(X_train)
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


# Record the metrics together with the detector name and fit time.
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)



## Book Keeping

In [25]:
# Persist the per-detector metrics table as CSV under results/.
scores_df = pd.DataFrame(combined_scores)

save_dir = Path('results')
save_dir.mkdir(exist_ok=True)

# Output file is named after the input dataset, e.g. results/4-<dataset stem>.csv
save_as = save_dir.joinpath(f"4-{Path(work_with).stem}.csv")
scores_df.to_csv(save_as, index=False)



In [26]:
# Render the comparison table (one row per detector).
display(scores_df)

Unnamed: 0,clf_name,accuracy,balanced,F1,precision,recall,roc_auc,pr_auc,duration
0,IForest,0.9,0.62,0.05,0.03,0.34,0.62,0.01,1.65
1,LOF,0.91,0.59,0.04,0.02,0.26,0.59,0.01,26.43
2,ECOD,0.89,0.54,0.03,0.01,0.18,0.54,0.01,1.7
3,COPOD,0.9,0.55,0.03,0.02,0.2,0.55,0.01,1.48
4,ABOD,0.9,0.67,0.07,0.04,0.45,0.67,0.02,38.74
5,QMCD,0.89,0.51,0.02,0.01,0.12,0.51,0.01,160.46
6,Rapid distance-based,0.9,0.62,0.05,0.03,0.35,0.62,0.02,0.17


``` markdown

# methods used
1. [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf)
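2. Local Outlier Factor (LOF)
3. ECOD
4. COPOD: Copula-Based Outlier Detection
5. ABOD: Angle-Based Outlier Detection
6. QMCD: Quasi-Monte Carlo Discrepancy outlier detection
7. Rapid distance-based outlier detection via sampling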
```