In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import random

import warnings
# NumPy 1.25 moved ComplexWarning into `numpy.exceptions`, and NumPy 2.0 removed
# the top-level `np.ComplexWarning` alias. Resolve the class from whichever
# location exists so this cell runs on both old and new NumPy releases.
try:
    from numpy.exceptions import ComplexWarning  # NumPy >= 1.25
except ImportError:
    ComplexWarning = np.ComplexWarning  # older NumPy releases

# Install a global filter that silences NumPy's ComplexWarning.
warnings.simplefilter("ignore", ComplexWarning)
from haversine import haversine
from IPython.display import HTML
import plotly.graph_objects as go
import copy 

import tqdm
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector 

from pygsp import graphs, filters, plotting, utils

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

from sklearn.neighbors import LocalOutlierFactor
from pyod.models.knn import KNN
from pyod.models.cblof import CBLOF
from sklearn import svm
from pyod.models.mcd import MCD
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.abod import ABOD
from alibi_detect.od import IForest
from pyod.models.hbos import HBOS
from pyod.models.sos import SOS
from pyod.models.so_gaal import SO_GAAL
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.lscp import LSCP
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from sklearn.svm import OneClassSVM

2023-08-07 20:44:37.488664: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  from .autonotebook import tqdm as notebook_tqdm


|Simple Linear 논문|Accuracy|Precision|Recall|F1|
|:--:|:--:|:--:|:--:|:--:|
|GODE|**0.998**|0.999|**0.999**|**0.999**|
|LOF (Breunig et al., 2000)|0.926|0.961|0.961|0.961|
|kNN (Ramaswamy et al., 2000)|0.950|**1.000**|0.947|0.973|
|CBLOF (He et al., 2003)|0.972|0.985|0.985|0.985|
|OCSVM (Schölkopf et al., 2001)|0.935|0.991|0.940|0.965|
|MCD (Hardin and Rocke, 2004)|0.998|0.999|**0.999**|**0.999**|
|Feature Bagging (Lazarevic and Kumar, 2005)|0.986|0.993|0.993|0.993|
|ABOD (Kriegel et al., 2008)|0.988|0.994|0.994|0.994|
|Isolation Forest (Liu et al., 2008)|0.868|0.999|0.862|0.925|
|HBOS (Goldstein and Dengel, 2012)|0.960|0.978|0.980|0.979|
|SOS (Janssens et al., 2012)|0.916|0.956|0.956|0.956|
|SO-GAAL (Liu et al., 2019)|0.936|0.966|0.966|0.966|
|MO-GAAL (Liu et al., 2019)|0.940|0.965|0.972|0.969|
|LSCP (Zhao et al., 2019)|0.988|0.994|0.994|0.994|

### LOF

|Parameter|Description|
|:--|:--|
|n_neighbors|Number of neighbors to use by default for k-neighbors queries. If n_neighbors is larger than the number of provided samples, all samples will be used.|
|algorithm|Algorithm used to compute the nearest neighbors. It can be one of 'ball_tree', 'kd_tree', 'brute', or 'auto'.|
|leaf_size|Leaf size passed to BallTree or KDTree. This can affect the speed of construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.|
|metric|Metric used for distance computation. Default is "minkowski," which results in the standard Euclidean distance when p = 2. Other valid metrics can be found in the documentation of scipy.spatial.distance.|
|p|Parameter for the Minkowski metric. When p = 1, it is equivalent to using Manhattan distance (l1), and Euclidean distance (l2) for p = 2. For arbitrary p, Minkowski distance (l_p) is used.|
|metric_params|Additional keyword arguments for the metric function.|
|contamination|The amount of contamination of the dataset, i.e., the proportion of outliers in the dataset. If "auto," the threshold is determined as in the original paper. If a float, the contamination should be in the range (0, 0.5].|
|novelty|By default, LocalOutlierFactor is only meant for outlier detection (novelty=False). Set novelty to True if you want to use LocalOutlierFactor for novelty detection. In this case, use predict, decision_function, and score_samples only on new unseen data, not on the training set. The results obtained this way may differ from the standard LOF results.|
|n_jobs|The number of parallel jobs to run for neighbors search. None means 1, and -1 means using all processors.|

### kNN

|Parameter|Description|
|:--|:--|
|contamination|	(float, optional, default=0.1) The amount of contamination of the data set, i.e., the proportion of outliers in the data set. Used when fitting to define the threshold on the decision function.|
|n_neighbors|	(int, optional, default=5) Number of neighbors to use by default for k-neighbors queries.|
|method|	(str, optional, default='largest') The kNN detection method to use. Supported values: {'largest', 'mean', 'median'}|
|radius|	(float, optional, default=1.0) Range of parameter space to use by default for radius_neighbors queries.|
|algorithm|	(str, optional, default='auto') Algorithm used to compute the nearest neighbors. Supported values: {'auto', 'ball_tree', 'kd_tree', 'brute'} Note: algorithm is deprecated in PyOD 0.7.4 and will not be possible in 0.7.6. It has to use BallTree for consistency.|
|leaf_size|	(int, optional, default=30) Leaf size passed to BallTree. This can affect the speed of construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.|
|metric|	(str or callable, default='minkowski') Metric used for distance computation. Valid values include various distance metrics from scikit-learn and scipy.spatial.distance.|
|p|	(int, optional, default=2) Parameter for the Minkowski metric. When p = 1, it is equivalent to using Manhattan distance (l1), and Euclidean distance (l2) for p = 2. For arbitrary p, Minkowski distance (l_p) is used.|
|metric_params|	(dict, optional, default=None) Additional keyword arguments for the metric function.|
|n_jobs|	(int, optional, default=1) The number of parallel jobs to run for neighbors search. If -1, then the number of jobs is set to the number of CPU cores. Affects only kneighbors and kneighbors_graph methods.|

In [6]:
CBLOF??

[0;31mInit signature:[0m
[0mCBLOF[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_clusters[0m[0;34m=[0m[0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcontamination[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclustering_estimator[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malpha[0m[0;34m=[0m[0;36m0.9[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbeta[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_weights[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcheck_estimator[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m        
[0;32mclass[0m [0mCBLOF[0m[0;34m([0m[0mBaseDetector[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m