In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from collections import defaultdict
#import re

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import collections


from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from pathlib import Path
import time

sns.set()

**Dataset**


- [5-Defaulter](./data/Default.xlsx) 
```markdown

It's a finance dataset with 10k rows and 5 columns.
The `default` column indicates whether a person is a defaulter (Yes/No).
Similarly, the `student` column indicates whether the person is a student.
The `balance` and `income` columns hold the person's balance and income.

```

( more : https://www.kaggle.com/datasets/creepycrap/finance-dataset )

In [2]:
# Location of the input workbook, relative to the notebook.
data_dir = Path('data')
work_with = "Default.xlsx"

# The first spreadsheet column is used as the row index.
df = pd.read_excel(data_dir.joinpath(work_with), index_col=0)

# Discard every column that contains at least one missing value.
df = df.dropna(axis=1, how='any')

# Cell output: (rows, columns) of the cleaned frame.
df.shape

(10000, 4)

In [3]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,default,student,balance,income
1,No,No,729.526495,44361.625074
2,No,Yes,817.180407,12106.1347
3,No,No,1073.549164,31767.138947
4,No,No,529.250605,35704.493935
5,No,No,785.655883,38463.495879


In [4]:
# Class balance of the raw target column (still "No" / "Yes" labels here).
print(df["default"].value_counts())

default
No     9667
Yes     333
Name: count, dtype: int64


```markdown
The `default` column is the target; the remaining columns are features.

A person who is a defaulter is treated as an outlier:
default - yes - 1
default - no - 0

We can also sub-sample the outliers.
```

In [5]:
# Encode the target as integers: 1 = defaulter ("Yes"), 0 = non-defaulter ("No").
target = 'default'

counts_before = df[target].value_counts()
print(f'before:{counts_before}')

# Series.map turns any label missing from the mapping into NaN, so re-running
# this cell on an already-encoded column would wipe it out.
label_to_int = {"Yes": 1, "No": 0}
df[target] = df[target].map(label_to_int)

counts_after = df[target].value_counts()
print(f'After: {counts_after}')


before:default
No     9667
Yes     333
Name: count, dtype: int64
After: default
0    9667
1     333
Name: count, dtype: int64


In [6]:
# Keep every inlier (target == 0) and down-sample the outliers (target == 1)
# to 1.8% of the inlier count, using a fixed seed for repeatability.
inlier_mask = df[target] == 0
outlier_mask = df[target] == 1

inbound_df = df[inlier_mask]
n_outliers = int(0.018*len(inbound_df))
outlier_df = df[outlier_mask].sample(n=n_outliers, random_state=42)

# Rebuild the working frame: all inliers followed by the sampled outliers.
df = pd.concat([inbound_df, outlier_df])
print(f'Sampled: {df[target].value_counts()}')

Sampled: default
0    9667
1     174
Name: count, dtype: int64


In [7]:
# Identify the object-dtype columns and report how many distinct values
# each one holds.
object_frame = df.select_dtypes(include=['object'], exclude=["number"])
cat_columns = object_frame.columns
unique_counts = df[list(cat_columns)].nunique()
print(f"uniques val count:\n{unique_counts}")

uniques val count:
student    2
dtype: int64


In [8]:
# One-hot encode `student` into int8 indicator columns prefixed with "ST"
# (yielding ST_No and ST_Yes for this data).
df = pd.get_dummies(
    df,
    columns=["student"],
    prefix=["ST"],
    dtype="int8",
)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Label-encode whatever object-dtype columns are still present after the
# one-hot step; the loop does nothing when none remain.
cat_columns = df.select_dtypes(include=['object'], exclude=["number"]).columns

for column_name in cat_columns:
    le = LabelEncoder()
    encoded = le.fit_transform(df[column_name])
    df[column_name] = encoded

# Cell output: first three rows after encoding.
df.head(n=3)

Unnamed: 0,default,balance,income,ST_No,ST_Yes
1,0,729.526495,44361.625074,1,0
2,0,817.180407,12106.1347,0,1
3,0,1073.549164,31767.138947,1,0


In [10]:
# Separate the label vector y from the feature matrix X.
target = "default"

y = df[target]
X = df.drop(columns=[target])

# Cell output: shapes of X and y.
(X.shape, y.shape)

((9841, 4), (9841,))

In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column: the scaler is fitted on the full feature
# matrix X and the scaled values are written back into X column by column.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X[X.columns] = scaled_values

# Cell output: shapes of X and y (unchanged by scaling).
(X.shape, y.shape)

((9841, 4), (9841,))

In [12]:
from sklearn.model_selection import train_test_split

# No hold-out split is made: the "train" and "test" names are bound to the
# same full X / y, so every detector is fitted and scored on identical rows.
# Disabled alternative (stratified 80/20 split):
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#                                                     stratify=y,
#                                                     test_size=0.2,
#                                                     random_state=42)
X_train = X_test = X
y_train = y_test = y

# Class counts per split (identical here, since both splits are the same data).
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print(f"Train: \n {train_counts}")
print(f"Test: \n {test_counts} ")

Train: 
 default
0    9667
1     174
Name: count, dtype: int64
Test: 
 default
0    9667
1     174
Name: count, dtype: int64 


In [13]:
''' Helper functions '''

def get_scores(y_true, y_pred, y_score=None):
    """Compute a fixed set of binary-classification metrics, rounded to 2 decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (1 is the positive / outlier class).
    y_pred : array-like
        Predicted binary labels, used for the threshold-based metrics
        (accuracy, balanced accuracy, F1, precision, recall).
    y_score : array-like, optional
        Continuous outlier scores (higher = more anomalous), e.g. the output
        of a detector's ``decision_function``. When given, the ranking
        metrics ``roc_auc`` and ``pr_auc`` are computed from these scores.
        When omitted, they fall back to ``y_pred`` as before; with hard 0/1
        labels the ROC curve has a single operating point, so ``roc_auc``
        then equals balanced accuracy and carries no extra information.

    Returns
    -------
    dict
        Keys ``accuracy``, ``balanced``, ``F1``, ``precision``, ``recall``,
        ``roc_auc`` and ``pr_auc``, in that order, each rounded to 2 decimals.
    """
    # Ranking metrics need continuous scores to be meaningful; keep the old
    # label-based behaviour only when no scores are supplied.
    ranking = y_pred if y_score is None else y_score

    raw_metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced': balanced_accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, ranking),
        'pr_auc': average_precision_score(y_true, ranking),
    }

    scores = {name: round(value, 2) for name, value in raw_metrics.items()}

    return scores

In [14]:
# Accumulator for the benchmark: each detector cell appends one result dict to it.
combined_scores = []

isolation forest

In [15]:
# Isolation Forest (PyOD): fit, predict and record the metrics

from pyod.models.iforest import IForest

start_time = time.time()

clf_name = 'IForest'

# Isolation Forest builds randomised trees; pin the seed so that re-running
# the notebook reproduces the same labels and metrics.
clf = IForest(random_state=42)
clf.fit(X_train)

# wall-clock seconds from start_time to the end of fit, rounded to 2 decimals
duration = round(time.time() - start_time,2)

# get the prediction labels and outlier scores of the training data
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# get the prediction on the test data
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# one result row: detector name, the metrics, then the fit time
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)

Local Outlier Factor

In [16]:
# Local Outlier Factor (PyOD implementation, default settings)
from pyod.models.lof import LOF

start_time = time.time()

clf_name = 'LOF'
clf = LOF()
clf.fit(X_train)

# wall-clock seconds since start_time, rounded to 2 decimals
duration = round(time.time() - start_time, 2)

# labels assigned to the fitting data, and predictions for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# one result row: detector name, then the metrics, then the fit time
scores = {"clf_name": clf_name}
scores.update(get_scores(y_test, y_test_pred))
scores["duration"] = duration
combined_scores.append(scores)

ECOD

In [17]:
# ECOD detector from PyOD, default settings
from pyod.models.ecod import ECOD

start_time = time.time()

clf_name = 'ECOD'
clf = ECOD()
clf.fit(X_train)

# time elapsed since start_time (seconds, 2 decimals)
elapsed_seconds = time.time() - start_time
duration = round(elapsed_seconds, 2)

# labels on the fitting data, then predictions on X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# assemble the result row in the order: name, metrics, duration
metric_values = get_scores(y_test, y_test_pred)
scores = dict(clf_name=clf_name, **metric_values, duration=duration)
combined_scores.append(scores)

Local Correlation Integral

In [18]:
# #  Local Correlation Integral (LOCI)

# from pyod.models.loci import LOCI
# start_time = time.time()


# clf_name = 'LOCI'
# clf = LOCI()
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

LSCP: Locally Selective Combination of Parallel Outlier Ensembles

In [19]:
# #  LSCP

# from pyod.models.lscp import LSCP
# from pyod.models.lof import LOF
# start_time = time.time()


# clf_name = 'LSCP'
# detector_list = [LOF(), LOF()]
# clf = LSCP(detector_list)
# clf.fit(X_train)
# duration = round(time.time() - start_time,2)

# # get the prediction labels
# y_train_pred = clf.labels_  
# y_test_pred = clf.predict(X_test)


# scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
# combined_scores.append(scores)

COPOD: Copula-Based Outlier Detection

In [20]:
# Copula-Based Outlier Detection (COPOD), PyOD defaults
from pyod.models.copod import COPOD

start_time = time.time()

clf_name = 'COPOD'
clf = COPOD()
clf.fit(X_train)

# seconds elapsed since start_time, kept to 2 decimals
duration = round(time.time() - start_time, 2)

# fitted labels for the training rows and predicted labels for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# result row: name first, metrics next, duration last
scores = {"clf_name": clf_name, **get_scores(y_test, y_test_pred)}
scores["duration"] = duration
combined_scores.append(scores)

ABOD: Angle-Based Outlier Detection

In [21]:
# Angle-Based Outlier Detection (ABOD), PyOD defaults
from pyod.models.abod import ABOD

start_time = time.time()

clf_name = 'ABOD'
clf = ABOD()
clf.fit(X_train)

# stop the clock right after fitting
fit_seconds = time.time() - start_time
duration = round(fit_seconds, 2)

# labels from fitting, then predictions for X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# build the result row step by step so the key order stays name, metrics, duration
scores = {"clf_name": clf_name}
for metric_name, metric_value in get_scores(y_test, y_test_pred).items():
    scores[metric_name] = metric_value
scores["duration"] = duration
combined_scores.append(scores)

QMCD: Quasi-Monte Carlo Discrepancy outlier detection

In [22]:
# Quasi-Monte Carlo Discrepancy (QMCD) detector, PyOD defaults
from pyod.models.qmcd import QMCD

start_time = time.time()

clf_name = 'QMCD'
clf = QMCD()
clf.fit(X_train)

# elapsed seconds since start_time, rounded to 2 decimals
duration = round(time.time() - start_time, 2)

# labels produced while fitting and predictions on X_test
y_train_pred = clf.labels_
y_test_pred = clf.predict(X_test)

# result row for the comparison table
test_metrics = get_scores(y_test, y_test_pred)
scores = {"clf_name": clf_name, **test_metrics, "duration": duration}
combined_scores.append(scores)


Rapid distance-based outlier detection via sampling

In [23]:
# Rapid distance-based outlier detection via sampling (PyOD `Sampling`)

from pyod.models.sampling import Sampling
start_time = time.time()

clf_name = 'Rapid distance-based'
# The detector scores points against a randomly drawn subset of the data;
# pin the seed so that re-running the notebook reproduces the same results.
clf = Sampling(random_state=42)
clf.fit(X_train)
# wall-clock seconds from start_time to the end of fit, rounded to 2 decimals
duration = round(time.time() - start_time,2)

# get the prediction labels
y_train_pred = clf.labels_  
y_test_pred = clf.predict(X_test)


# one result row: detector name, the metrics, then the fit time
scores = {"clf_name":clf_name, **get_scores(y_test, y_test_pred), "duration": duration}
combined_scores.append(scores)



## Book Keeping

In [24]:
# Persist the collected metrics as results/5-<input file stem>.csv
save_dir = Path('results')
save_dir.mkdir(exist_ok=True)

input_stem = Path(work_with).stem
save_as = save_dir.joinpath(f"5-{input_stem}.csv")

# one row per detector, written without the index column
scores_df = pd.DataFrame(combined_scores)
scores_df.to_csv(save_as, index=False)


In [25]:
# Render the per-detector metrics table built from combined_scores.
display(scores_df)

Unnamed: 0,clf_name,accuracy,balanced,F1,precision,recall,roc_auc,pr_auc,duration
0,IForest,0.9,0.72,0.16,0.09,0.53,0.72,0.06,0.28
1,LOF,0.91,0.63,0.12,0.07,0.34,0.63,0.04,0.07
2,ECOD,0.9,0.71,0.15,0.09,0.51,0.71,0.05,0.31
3,COPOD,0.9,0.76,0.18,0.11,0.61,0.76,0.07,0.01
4,ABOD,0.9,0.7,0.15,0.09,0.49,0.7,0.05,1.61
5,QMCD,0.9,0.76,0.18,0.11,0.61,0.76,0.07,0.86
6,Rapid distance-based,0.91,0.78,0.2,0.12,0.66,0.78,0.08,0.01


``` markdown

# methods used
1. [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf)
2. ...
```