In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session so library noise
# does not clutter cell output.
# NOTE(review): this also hides warnings that may matter (e.g. deprecation or
# convergence warnings) — confirm the blanket filter is intended.
warnings.filterwarnings(action='ignore')

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook live under /content/drive/MyDrive. Requires the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Pin SciPy before PyCaret is installed.
# NOTE(review): presumably needed for compatibility with the PyCaret release
# used in this notebook — confirm the exact constraint.
!pip install scipy==1.4.1


In [None]:
!pip install pycaret --user


In [None]:
# Pin MarkupSafe.
# NOTE(review): presumably works around an import error between the installed
# jinja2 and newer MarkupSafe releases — confirm.
!pip install markupsafe==2.0.1


In [None]:
# NOTE(review): jinja2 is not referenced in the cells visible here; presumably
# imported to surface a jinja2/markupsafe incompatibility early — confirm.
import jinja2

# Star import: the PyCaret anomaly helpers called below (setup, models,
# create_model, assign_model, save_model, load_model) are not imported anywhere
# else in view, so they are expected to come from here.
from pycaret.anomaly import *
# If a "loader" error is raised, change the call to self.loader()


In [None]:
# Load the training and validation CSV files from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/open/train.csv')
val = pd.read_csv('/content/drive/MyDrive/open/val.csv')


In [None]:
# Names used by the PyCaret cells: `df_train` is what the detector is fitted on,
# `df_unseen` is what it is scored on. Both are aliases of the loaded frames,
# not copies.
df_train, df_unseen = train, val

# Set up PyCaret

In [None]:
# Initialise the PyCaret anomaly-detection experiment on the training frame.
#
# session_id fixes the experiment's random seed. Without it PyCaret draws a new
# seed on every run (the recorded run happened to draw 8933), so the fitted
# Isolation Forest and every score below would change between executions.
# silent=True suppresses the interactive data-type confirmation prompt.
#
# NOTE(review): the recorded summary reports all 31 columns as numeric features.
# If one of them is a row identifier it should probably be excluded via
# ignore_features — confirm against the training file.
anom = setup(data = df_train,
             session_id = 8933,
             silent = True)

Unnamed: 0,Description,Value
0,session_id,8933
1,Original Data,"(113842, 31)"
2,Missing Values,False
3,Numeric Features,31
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(113842, 31)"
9,CPU Jobs,-1


In [None]:
models()  # list the anomaly detectors PyCaret can build (ID, name, backing class)

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [None]:
# Fit an Isolation Forest ('iforest') on the data registered by setup().
# fraction is the share of rows the detector is told to treat as anomalies.
# NOTE(review): the recorded validation report shows 30 positives in 28462 rows
# (about 0.1%), so 0.05 looks high for this data — confirm the intended value.
anom_model = create_model(model = 'iforest', fraction = 0.05)


In [None]:
# Attach the fitted detector's anomaly assignments to the training data.
# NOTE(review): `results` is not used by any cell visible here.
results = assign_model(anom_model)


In [None]:
# Persist the transformation pipeline together with the fitted detector under
# the name 'iforest_model' so it can be reloaded without refitting.
save_model(model = anom_model, model_name = 'iforest_model')


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=False, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='UNSUPERVISED_DUMMY_TARGET',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='most frequent',
                                 fill_value_categorical=None,
                                 fill_value_numerical=Non...
                 ('fix_perfect', 'passthrough'),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  IFo

In [None]:
# Reload the saved pipeline; the recorded output shows the returned object is a
# scikit-learn Pipeline (preprocessing steps followed by the trained detector).
loaded_model = load_model('iforest_model')
type(loaded_model)

Transformation Pipeline and Model Successfully Loaded


sklearn.pipeline.Pipeline

In [None]:
# Hard labels for the validation rows. The recorded output contains 0s, i.e.
# the pipeline follows the PyOD convention (0 = inlier, 1 = outlier) rather
# than scikit-learn's 1 / -1.
loaded_model.predict(df_unseen)


array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Two values per validation row; in the recorded output each pair sums to 1.
# NOTE(review): presumably [P(normal), P(anomaly)] — confirm the column order.
loaded_model.predict_proba(df_unseen)


array([[0.86798056, 0.13201944],
       [0.71683123, 0.28316877],
       [0.81063597, 0.18936403],
       ...,
       [0.83765453, 0.16234547],
       [0.8958507 , 0.1041493 ],
       [0.76608839, 0.23391161]])

In [None]:
# Raw anomaly score for each validation row.
# NOTE(review): confirm the score direction (higher = more anomalous?) before
# applying a custom threshold.
loaded_model.decision_function(df_unseen)


array([-0.08873786, -0.03121408, -0.0669139 , ..., -0.07719651,
       -0.09934456, -0.04996017])

In [None]:
def get_pred_label(model_pred):
    """Convert scikit-learn outlier labels to a 0/1 encoding.

    scikit-learn's IsolationForest emits 1 for normal samples and -1 for
    anomalies (fraud). This maps them to 0 (normal) and 1 (fraud). Any other
    value is passed through unchanged.

    Do not feed this function predictions that are already 0/1 encoded:
    every 1 would be rewritten to 0.

    Parameters
    ----------
    model_pred : array-like
        Predicted labels in the 1 / -1 convention.

    Returns
    -------
    numpy.ndarray
        Labels with 1 replaced by 0 and -1 replaced by 1.
    """
    # Coerce first: comparing a plain list/tuple with `== 1` yields a single
    # False instead of an element-wise mask, which would silently skip the
    # 1 -> 0 step and return wrong labels.
    labels = np.asarray(model_pred)
    return np.where(labels == 1, 0, np.where(labels == -1, 1, labels))

In [None]:
val_x = val.drop(columns=['Class']) # Input Data
val_y = val['Class'] # Label

# The loaded PyCaret pipeline wraps a PyOD detector whose predict() already
# returns 0 (normal) / 1 (anomaly). get_pred_label expects scikit-learn's
# 1 / -1 labels, so applying it here rewrote every detected anomaly to 0.
# That is why the recorded report shows 0.00 recall for class 1.
# The predictions are therefore scored as-is.
val_pred = loaded_model.predict(val_x) # model prediction
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))


Validation F1 Score : [0.4997363518121419]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.00      0.00      0.00        30

    accuracy                           1.00     28462
   macro avg       0.50      0.50      0.50     28462
weighted avg       1.00      1.00      1.00     28462



In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/open/test.csv') # Test set
test_df.head()

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,AAAA0x1,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,1.783274,-0.994983
1,AAAA0x2,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.269825,-0.994983
2,AAAA0x5,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.670579,-0.99496
3,AAAA0x7,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,-0.237686,-0.994937
4,AAAA0xc,0.384978,0.616109,-0.8743,-0.094019,2.924584,3.317027,0.470455,0.538247,-0.558895,...,0.049924,0.238422,0.00913,0.99671,-0.767315,-0.492208,0.042472,-0.054337,-0.167819,-0.994866
