In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [2]:
raw_df = pd.read_csv("anomaly/creditcard.csv")

In [3]:
raw_df.head(2)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0


In [4]:
raw_df.shape

(284807, 31)

In [5]:
# drop duplicates
raw_df.drop_duplicates(inplace=True)

# check missing vals
raw_df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
features = raw_df.drop("Class", axis=1)
target = raw_df["Class"]

In [7]:
contamination_range = [0.1, 0.2, 0.3, 0.4]

In [8]:
for contamination in contamination_range:
    isolation_forest = IsolationForest(n_estimators=200, contamination=contamination)
    isolation_forest.fit(features)
    pred = isolation_forest.predict(features)
    pred = [1 if number == -1 else 0 for number in pred]
    print(f"Contamination: {contamination}")
    print(classification_report(target, pred))
    print("-----------------------------------------")



Contamination: 0.1
              precision    recall  f1-score   support

           0       1.00      0.90      0.95    283253
           1       0.01      0.89      0.03       473

    accuracy                           0.90    283726
   macro avg       0.51      0.90      0.49    283726
weighted avg       1.00      0.90      0.95    283726

-----------------------------------------




Contamination: 0.2
              precision    recall  f1-score   support

           0       1.00      0.80      0.89    283253
           1       0.01      0.93      0.02       473

    accuracy                           0.80    283726
   macro avg       0.50      0.86      0.45    283726
weighted avg       1.00      0.80      0.89    283726

-----------------------------------------




Contamination: 0.3
              precision    recall  f1-score   support

           0       1.00      0.70      0.82    283253
           1       0.01      0.95      0.01       473

    accuracy                           0.70    283726
   macro avg       0.50      0.82      0.42    283726
weighted avg       1.00      0.70      0.82    283726

-----------------------------------------




Contamination: 0.4
              precision    recall  f1-score   support

           0       1.00      0.60      0.75    283253
           1       0.00      0.96      0.01       473

    accuracy                           0.60    283726
   macro avg       0.50      0.78      0.38    283726
weighted avg       1.00      0.60      0.75    283726

-----------------------------------------
