# Meta-Learn Outlier Detection Benchmark

In [26]:
import time
import scipy
import numpy as np
import pandas as pd
import openml as oml

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import svm

In [27]:
import warnings
warnings.filterwarnings('ignore')
# warnings.simplefilter(action="ignore", category=FutureWarning)
# warnings.simplefilter(action="ignore", category=UserWarning)
# warnings.simplefilter(action="ignore", category=RuntimeWarning)

### Basic information of benmark datasets

In [28]:
datasets_basic = []
dataset0 = {'datasetID': 0, 'name': 'lymph', 'contamination(%)': 4.10}
datasets_basic.append(dataset0)
dataset1 = {'datasetID': 1, 'name': 'glass', 'contamination(%)': 4.21}
datasets_basic.append(dataset1)
dataset2 = {'datasetID': 2, 'name': 'wdbc', 'contamination(%)': 37.26}
datasets_basic.append(dataset2)
dataset3 = {'datasetID': 3, 'name': 'speech', 'contamination(%)': 1.65}
datasets_basic.append(dataset3)
dataset4 = {'datasetID': 4, 'name': 'satellite_image', 'contamination(%)': 31.64}
datasets_basic.append(dataset4)
dataset5 = {'datasetID': 5, 'name': 'baseball', 'contamination(%)': 9.33}
datasets_basic.append(dataset5)
dataset6 = {'datasetID': 6, 'name': 'ecoli', 'contamination(%)': 2.68}
datasets_basic.append(dataset6)
dataset7 = {'datasetID': 7, 'name': 'phoneme', 'contamination(%)': 29.35}
datasets_basic.append(dataset7)
dataset8 = {'datasetID': 8, 'name': 'click_prediction_small', 'contamination(%)': 16.84}
datasets_basic.append(dataset8)
dataset9 = {'datasetID': 9, 'name': 'musk', 'contamination(%)': 15.41}
datasets_basic.append(dataset9)
dataset10 = {'datasetID': 10, 'name': 'credit_g', 'contamination(%)': 30.00}
datasets_basic.append(dataset10)
dataset11 = {'datasetID': 11, 'name': 'diabetes', 'contamination(%)': 34.89}
datasets_basic.append(dataset11)
dataset12 = {'datasetID': 12, 'name': 'breast_w', 'contamination(%)': 34.48}
datasets_basic.append(dataset12)
dataset13 = {'datasetID': 13, 'name': 'blood_transfusion_service_center', 'contamination(%)': 23.79}
datasets_basic.append(dataset13)
dataset14 = {'datasetID': 14, 'name': 'heart', 'contamination(%)': 45.54}
datasets_basic.append(dataset14)
dataset15 = {'datasetID': 15, 'name': 'arrhythmia', 'contamination(%)': 45.79}
datasets_basic.append(dataset15)
dataset16 = {'datasetID': 16, 'name': 'spambase', 'contamination(%)': 39.40}
datasets_basic.append(dataset16)
dataset17 = {'datasetID': 17, 'name': 'kc2', 'contamination(%)': 20.49}
datasets_basic.append(dataset17)
dataset18 = {'datasetID': 18, 'name': 'ilpd', 'contamination(%)': 28.64}
datasets_basic.append(dataset18)
dataset19 = {'datasetID': 19, 'name': 'pc1', 'contamination(%)': 6.94}
datasets_basic.append(dataset19)
dataset20 = {'datasetID': 20, 'name': 'abalone', 'contamination(%)': 0.43}
datasets_basic.append(dataset20)
dataset21 = {'datasetID': 21, 'name': 'speed_dating', 'contamination(%)': 16.47}
datasets_basic.append(dataset21)
dataset22 = {'datasetID': 22, 'name': 'cardiotocography', 'contamination(%)': 12.94}
datasets_basic.append(dataset22)
dataset23 = {'datasetID': 23, 'name': 'sick', 'contamination(%)': 6.12}
datasets_basic.append(dataset23)
dataset24 = {'datasetID': 24, 'name': 'adult', 'contamination(%)': 23.93}
datasets_basic.append(dataset24)
dataset25 = {'datasetID': 25, 'name': 'jm1', 'contamination(%)': 19.35}
datasets_basic.append(dataset25)
dataset26 = {'datasetID': 26, 'name': 'scene', 'contamination(%)': 17.91}
datasets_basic.append(dataset26)
dataset27 = {'datasetID': 27, 'name': 'climate_model_simulation_crashes', 'contamination(%)': 8.52}
datasets_basic.append(dataset27)
dataset28 = {'datasetID': 28, 'name': 'quake', 'contamination(%)': 44.49}
datasets_basic.append(dataset28)
dataset29 = {'datasetID': 29, 'name': 'yeast', 'contamination(%)': 0.34}
datasets_basic.append(dataset29)
dataset30 = {'datasetID': 30, 'name': 'churn', 'contamination(%)': 14.14}
datasets_basic.append(dataset30)
dataset31 = {'datasetID': 31, 'name': 'wilt', 'contamination(%)': 5.39}
datasets_basic.append(dataset31)
# datasets_basic

In [29]:
# print basic table
df_basic = pd.DataFrame(datasets_basic)
cols_basic = ['datasetID', 'name', 'contamination(%)']
df_basic = df_basic[cols_basic]
df_basic

Unnamed: 0,datasetID,name,contamination(%)
0,0,lymph,4.1
1,1,glass,4.21
2,2,wdbc,37.26
3,3,speech,1.65
4,4,satellite_image,31.64
5,5,baseball,9.33
6,6,ecoli,2.68
7,7,phoneme,29.35
8,8,click_prediction_small,16.84
9,9,musk,15.41


### Compute Meta-Features Landmarking

The following Landmarking meta-features were calculated: (Matthias Reif et al. 2012, Abdelmessih et al. 2010)

The accuracy values of the following simple learners are used: Naive Bayes, Linear Discriminant Analysis, One-Nearest Neighbor, Decision Node, Random Node.

- **Naive Bayes Learner** is a probabilistic classifier, based on Bayes’ Theorem:
$$ p(X|Y) = rac{p(Y|X) \cdot p(X)}{p(Y)} $$

    where p(X) is the prior probability and p(X|Y) is the posterior probability. It is called naive, because it
    assumes independence of all attributes to each other.
- **Linear Discriminant Learner** is a type of discriminant analysis, which is understood as the grouping and separation of categories according to specific features. Linear discriminant is basically finding a linear combination of features that separates the classes best. The resulting separation model is a line, a plane, or a hyperplane, depending on the number of features combined. 

- **One Nearest Neighbor Learner** is a classifier based on instance-based learning. A test point is assigned to the class of the nearest point within the training set. 

- **Decision Node Learner** is a classifier based on the information gain of attributes. The information gain indicates how informative an attribute is with respect to the classification task using its entropy. The higher the variability of the attribute values, the higher its information gain. This learner selects the attribute with the highest information gain. Then, it creates a single node decision tree consisting of the chosen attribute as a split node. 

- **Randomly Chosen Node Learner** is a classifier that results also in a single decision node, based on a randomly chosen attribute. 

In [30]:
from sklearn.cluster import KMeans
from sklearn import metrics

def compute_clustering_metafeatures(X):
    kmeans_model = KMeans(n_clusters=2, random_state=1).fit(X)
    labels = kmeans_model.labels_
    silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
    calinski_harabaz_score = metrics.calinski_harabaz_score(X, labels)
    davies_bouldin_score = metrics.davies_bouldin_score(X, labels)
    return silhouette_score, calinski_harabaz_score, davies_bouldin_score

In [31]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
import time

def pipeline(X, y, estimator):
#     if scipy.sparse.issparse(X) == True: # Check if X is sparse array
#         X = X.toarray()
    start_time_pipeline = time.process_time()
    pipe = Pipeline([('Imputer', preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),
                     ('classifiers', estimator)])
    
    # to prevent sklearn cross_val_score failure due to label imabalance
#     pred = cross_val_predict(pipe, X, y, cv=5)
#     score = roc_auc_score(y, pred)
    score = np.mean(cross_val_score(pipe, X, y, cv=10, scoring='accuracy', n_jobs=-1))
    time_pipeline = time.process_time() - start_time_pipeline
    return score, time_pipeline

def compute_metafeatures(X, y):
    """
    The following landmarking features are computed:
        Naive Bayes Learner;
        Linear Discriminant Learner;
        One Nearest Neighbor Learner;
        Decision Node Learner;
        Randomly Chosen Node Learner
    """
    
    # if regression
    if len(np.unique(y)) > 100 or len(np.unique(y)) > 0.1*y.shape[0]:
        print("regression")
        print("meta features cannot be extracted as the target is not categorical")
    # if classification
    else:
#         print("classification")
        metafeatures_clf = {}
        # compute clustering performance metafeatures
        metafeatures_clf['silhouette'], metafeatures_clf['calinski_harabaz'], metafeatures_clf['davies_bouldin'] = compute_clustering_metafeatures(X)
    
        # compute landmarking metafeatures
        metafeatures_clf['naive_bayes'], metafeatures_clf['naive_bayes_time'] = pipeline(X, y, GaussianNB()) 
        metafeatures_clf['linear_discriminant_analysis'], metafeatures_clf['linear_discriminant_analysis_time'] = pipeline(X, y, LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')) 
        metafeatures_clf['one_nearest_neighbor'], metafeatures_clf['one_nearest_neighbor_time'] = pipeline(X, y, KNeighborsClassifier(n_neighbors = 1)) 
        metafeatures_clf['decision_node'], metafeatures_clf['decision_node_time'] = pipeline(X, y, DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=1, random_state=0)) 
        metafeatures_clf['random_node'], metafeatures_clf['random_node_time'] = pipeline(X, y, DecisionTreeClassifier(criterion='entropy', splitter='random', max_depth=1, random_state=0))
        metafeatures = list(metafeatures_clf.values())
    
    return metafeatures

### Evaluate outlier detection classifiers

In [32]:
from sklearn.metrics import f1_score
def compute_isolation_forest_f1(X, ground_truth, contamination=0.1):
    # isolation forest
    isof = IsolationForest(contamination=contamination)
    isof.fit(X)
    y_pred = isof.predict(X)
    isof_auroc = f1_score(ground_truth, y_pred)
    return isof_auroc

def compute_lof_f1(X, ground_truth, contamination=0.1):
    # local outlier factor
    lof = LocalOutlierFactor(n_neighbors=20, contamination=contamination)
    y_pred = lof.fit_predict(X)
    lof_auroc = f1_score(ground_truth, y_pred)
    return lof_auroc

def compute_ocsvm_f1(X, ground_truth, contamination=0.1):
    # one class svm
    ocsvm = svm.OneClassSVM(nu=0.95 * contamination + 0.05)
    ocsvm.fit(X)
    y_pred = ocsvm.predict(X)
    ocsvm_auroc = f1_score(ground_truth, y_pred)
    return ocsvm_auroc

In [33]:
def compute_info(datasets_basic=None, name=None):
    dataset = next((item for item in datasets_basic if item['name'] == name), None).copy()
    data = np.genfromtxt('benchmark_anomaly/'+ name + '.csv', delimiter=',')
#     print(data)
    # Imputing the missing values by mean
    imp = Imputer(missing_values=np.nan , strategy='mean', axis=0)
    
    X = data[:, :-1]
    X = imp.fit_transform(X)
    y = data[:,-1]
#     print(y)

    dataset['metafeatures'] = compute_metafeatures(X, y)
    
    contamination = dataset['contamination(%)']/100
#     print(contamination)
    data_anomaly = np.genfromtxt('benchmark_anomaly/'+ name + '_anomaly.csv', delimiter=',')
    X_anomaly = data_anomaly[:, :-1]
    X_anomaly = imp.fit_transform(X_anomaly)
    y_anomaly = data_anomaly[:,-1]
    ground_truth = y_anomaly
    ground_truth[y_anomaly == 1] = -1
    ground_truth[y_anomaly == 0] = 1

    dataset['isolation_forest_f1'] = compute_isolation_forest_f1(X_anomaly, ground_truth, contamination=contamination)
    dataset['lof_f1'] = compute_lof_f1(X_anomaly, ground_truth, contamination=contamination)
    dataset['ocsvm_f1'] = compute_ocsvm_f1(X_anomaly, ground_truth, contamination=contamination)
    print("\n")
    return dataset

### Benchmarking

In [34]:
datasets=[]

### 00 lymph

In [35]:
dataset = compute_info(datasets_basic, 'lymph')
print(dataset)



{'datasetID': 0, 'name': 'lymph', 'contamination(%)': 4.1, 'metafeatures': [0.26863786259667444, 61.30611403534003, 1.4806713339599, 0.7071230158730157, 0.0625, 0.8857142857142858, 0.03125, 0.7572817460317459, 0.0625, 0.6873809523809523, 0.0625, 0.6357936507936508, 0.046875], 'isolation_forest_f1': 0.9964664310954063, 'lof_f1': 0.9893992932862191, 'ocsvm_f1': 0.9264705882352942}


In [36]:
datasets.append(dataset)

### 01 glass

In [37]:
dataset = compute_info(datasets_basic, 'glass')
print(dataset)



{'datasetID': 1, 'name': 'glass', 'contamination(%)': 4.21, 'metafeatures': [0.5600270501084725, 135.14002942901286, 1.0685748630031175, 0.459828722002635, 0.125, 0.6049174979609763, 0.09375, 0.7060016312190226, 0.0625, 0.4505646527385657, 0.0625, 0.43133226676704933, 0.0], 'isolation_forest_f1': 0.9609756097560975, 'lof_f1': 0.9658536585365853, 'ocsvm_f1': 0.935}


In [38]:
datasets.append(dataset)

### 02 wdbc

In [39]:
dataset = compute_info(datasets_basic, 'wdbc')
print(dataset)



{'datasetID': 2, 'name': 'wdbc', 'contamination(%)': 37.26, 'metafeatures': [0.6972646145068585, 1300.2082198691544, 0.5044035701177434, 0.9386796733212339, 0.0625, 0.9543805634776596, 0.109375, 0.9158067582749977, 0.0625, 0.8824075274392879, 0.0625, 0.808493215798116, 0.0625], 'isolation_forest_f1': 0.8039215686274509, 'lof_f1': 0.711484593837535, 'ocsvm_f1': 0.61731843575419}


In [40]:
datasets.append(dataset)

### 03 speech

In [41]:
dataset = compute_info(datasets_basic, 'speech')
print(dataset)



{'datasetID': 3, 'name': 'speech', 'contamination(%)': 1.65, 'metafeatures': [0.012400670667631054, 47.401319205311246, 8.69013370300987, 0.9631008911881127, 0.59375, 0.9763895974447406, 0.03125, 0.9826410623242545, 0.03125, 0.9834518691544144, 0.03125, 0.9834518691544144, 0.0625], 'isolation_forest_f1': 0.984, 'lof_f1': 0.9837241379310345, 'ocsvm_f1': 0.9589816124469589}


In [42]:
datasets.append(dataset)

### 04 satellite_image

In [43]:
dataset = compute_info(datasets_basic, 'satellite_image')
print(dataset)



{'datasetID': 4, 'name': 'satellite_image', 'contamination(%)': 31.64, 'metafeatures': [0.36628000714210573, 3686.128256862358, 1.1209201105317794, 0.7878128466652179, 0.203125, 0.8296717347643972, 0.109375, 0.8828410109247423, 0.015625, 0.43574570897989107, 0.03125, 0.4282678047572393, 0.0625], 'isolation_forest_f1': 0.8092748351898159, 'lof_f1': 0.7087974539668106, 'ocsvm_f1': 0.4096881431059047}


In [44]:
datasets.append(dataset)

### 05 baseball

In [45]:
dataset = compute_info(datasets_basic, 'baseball')
print(dataset)



{'datasetID': 5, 'name': 'baseball', 'contamination(%)': 9.33, 'metafeatures': [0.5309830472185297, 2219.3638242940983, 0.662929265026992, 0.8382204418025314, 0.0625, 0.9268990960742729, 0.078125, 0.9001154250015209, 0.109375, 0.9067596437195808, 0.046875, 0.9172139379052183, 0.0625], 'isolation_forest_f1': 0.9555555555555556, 'lof_f1': 0.9234567901234567, 'ocsvm_f1': 0.92790978654853}


In [46]:
datasets.append(dataset)

### 06 ecoli

In [47]:
dataset = compute_info(datasets_basic, 'ecoli')
print(dataset)



{'datasetID': 6, 'name': 'ecoli', 'contamination(%)': 2.68, 'metafeatures': [0.4048955217168707, 235.3615336759734, 0.976230838456313, 0.7472514035325506, 0.046875, 0.8585529525902041, 0.0625, 0.8082291592358505, 0.0625, 0.632387298297927, 0.0625, 0.5538601692799361, 0.0625], 'isolation_forest_f1': 0.9847094801223242, 'lof_f1': 0.981651376146789, 'ocsvm_f1': 0.9592476489028213}


In [48]:
datasets.append(dataset)

### 07 phoneme

In [49]:
dataset = compute_info(datasets_basic, 'phoneme')
print(dataset)



{'datasetID': 7, 'name': 'phoneme', 'contamination(%)': 29.35, 'metafeatures': [0.2459127488478442, 1646.5528671540126, 1.6727604712530946, 0.7605367194395449, 0.015625, 0.7588796499200512, 0.03125, 0.9045141380736839, 0.015625, 0.7538882188921797, 0.015625, 0.7065142833772355, 0.015625], 'isolation_forest_f1': 0.7205343111576742, 'lof_f1': 0.7336301728653746, 'ocsvm_f1': 0.7040967092008059}


In [50]:
datasets.append(dataset)

### 08 click_prediction_small

In [51]:
dataset = compute_info(datasets_basic, 'click_prediction_small')
print(dataset)



{'datasetID': 8, 'name': 'click_prediction_small', 'contamination(%)': 16.84, 'metafeatures': [0.6419363587593904, 58399.56773256083, 0.6464383405526942, 0.8099027765678555, 0.390625, 0.8315560261543755, 0.109375, 0.7328026019003474, 0.0625, 0.8315810637107101, 0.078125, 0.8315810637107101, 0.078125], 'isolation_forest_f1': 0.8352498494882601, 'lof_f1': 0.8329620710415412, 'ocsvm_f1': 0.8643935675722493}


In [52]:
datasets.append(dataset)

### 09 musk

In [53]:
dataset = compute_info(datasets_basic, 'musk')
print(dataset)



{'datasetID': 9, 'name': 'musk', 'contamination(%)': 15.41, 'metafeatures': [0.29743199498855305, 2969.2237503160577, 1.3577113172661617, 0.8414119011459457, 0.40625, 0.9536296918760492, 0.0625, 0.7202186261523403, 0.03125, 0.9933232169954476, 0.03125, 0.8458628066806231, 0.015625], 'isolation_forest_f1': 0.8340799139939079, 'lof_f1': 0.844293137430568, 'ocsvm_f1': 0.7612288350400486}


In [54]:
datasets.append(dataset)

### 10 credit_g

In [55]:
dataset = compute_info(datasets_basic, 'credit_g')
print(dataset)



{'datasetID': 10, 'name': 'credit_g', 'contamination(%)': 30.0, 'metafeatures': [0.7222364097611819, 2304.593086778019, 0.49803987790561677, 0.7300000000000001, 0.03125, 0.76, 0.09375, 0.6060000000000001, 0.03125, 0.7, 0.0625, 0.7, 0.0625], 'isolation_forest_f1': 0.7257142857142859, 'lof_f1': 0.6857142857142857, 'ocsvm_f1': 0.6513409961685824}


In [56]:
datasets.append(dataset)

### 11 diabetes

In [57]:
dataset = compute_info(datasets_basic, 'diabetes')
print(dataset)



{'datasetID': 11, 'name': 'diabetes', 'contamination(%)': 34.89, 'metafeatures': [0.5687897206066522, 964.2725251674408, 0.7133822795351279, 0.7564935064935066, 0.03125, 0.765686944634313, 0.03125, 0.6796650717703349, 0.0625, 0.7149008885850991, 0.0625, 0.6328776486671224, 0.0625], 'isolation_forest_f1': 0.744, 'lof_f1': 0.662, 'ocsvm_f1': 0.6059979317476731}


In [58]:
datasets.append(dataset)

### 12 breast_w

In [59]:
dataset = compute_info(datasets_basic, 'breast_w')
print(dataset)



{'datasetID': 12, 'name': 'breast_w', 'contamination(%)': 34.48, 'metafeatures': [0.595130350704165, 1039.8866251008255, 0.7628477502434053, 0.9586501618406089, 0.0625, 0.9586909864987024, 0.046875, 0.9515268422126966, 0.0625, 0.924323622896801, 0.0625, 0.924323622896801, 0.0625], 'isolation_forest_f1': 0.9606986899563319, 'lof_f1': 0.5458515283842795, 'ocsvm_f1': 0.9046015712682379}


In [60]:
datasets.append(dataset)

### 13 blood_transfusion_service_center

In [61]:
dataset = compute_info(datasets_basic, 'blood_transfusion_service_center')
print(dataset)



{'datasetID': 13, 'name': 'blood_transfusion_service_center', 'contamination(%)': 23.79, 'metafeatures': [0.7025413950356174, 1006.8074144957931, 0.5789658115737796, 0.7473513513513513, 0.0625, 0.7687207207207207, 0.0625, 0.569963963963964, 0.03125, 0.762054054054054, 0.046875, 0.762054054054054, 0.03125], 'isolation_forest_f1': 0.7736842105263158, 'lof_f1': 0.7637314734088927, 'ocsvm_f1': 0.6755102040816328}


In [62]:
datasets.append(dataset)

### 14 heart

In [63]:
dataset = compute_info(datasets_basic, 'heart')
print(dataset)



{'datasetID': 14, 'name': 'heart', 'contamination(%)': 45.54, 'metafeatures': [0.3895016925056462, 240.76631323056972, 0.9675108708997435, 0.8178680014831293, 0.0625, 0.827982944011865, 0.046875, 0.591449758991472, 0.0625, 0.6859881349647757, 0.046875, 0.7315943641082685, 0.046875], 'isolation_forest_f1': 0.6242424242424243, 'lof_f1': 0.5636363636363636, 'ocsvm_f1': 0.5698005698005699}


In [64]:
datasets.append(dataset)

### 15 arrhythmia	

In [65]:
dataset = compute_info(datasets_basic, 'arrhythmia')
print(dataset)



{'datasetID': 15, 'name': 'arrhythmia', 'contamination(%)': 45.79, 'metafeatures': [0.12655658623145472, 52.614660515883806, 2.6530367501611636, 0.16748182967615627, 0.203125, 0.7107932335427034, 0.25, 0.5256204965059684, 0.0625, 0.5556713638959132, 0.046875, 0.5772532357179758, 0.0625], 'isolation_forest_f1': 0.6979591836734694, 'lof_f1': 0.710204081632653, 'ocsvm_f1': 0.6371681415929205}


In [66]:
datasets.append(dataset)

### 16 spambase

In [67]:
dataset = compute_info(datasets_basic, 'spambase')
print(dataset)



{'datasetID': 16, 'name': 'spambase', 'contamination(%)': 39.4, 'metafeatures': [0.8475319360835041, 4519.933422105306, 0.5855584148216163, 0.8216885343215848, 0.25, 0.8676362405756416, 0.203125, 0.787848669663722, 0.015625, 0.7730475836472476, 0.03125, 0.6059555892945166, 0.03125], 'isolation_forest_f1': 0.6761119081779053, 'lof_f1': 0.5831091984938139, 'ocsvm_f1': 0.6962745457882181}


In [68]:
datasets.append(dataset)

### 17 kc2

In [69]:
dataset = compute_info(datasets_basic, 'kc2')
print(dataset)



{'datasetID': 17, 'name': 'kc2', 'contamination(%)': 20.49, 'metafeatures': [0.9808012284453705, 1547.0821596373014, 0.44519022087226545, 0.8284612538774582, 0.03125, 0.8437035772218902, 0.0, 0.6036967472039615, 0.0625, 0.779772475027747, 0.0625, 0.8284975383477049, 0.0625], 'isolation_forest_f1': 0.891566265060241, 'lof_f1': 0.8158844765342961, 'ocsvm_f1': 0.7150684931506849}


In [70]:
datasets.append(dataset)

### 18 ilpd

In [71]:
dataset = compute_info(datasets_basic, 'ilpd')
print(dataset)



{'datasetID': 18, 'name': 'ilpd', 'contamination(%)': 28.64, 'metafeatures': [0.8572549371115099, 349.4820464781444, 0.6562431307831165, 0.5581480000410143, 0.015625, 0.7102079424159463, 0.03125, 0.6414044315933024, 0.0625, 0.7135977729244208, 0.0625, 0.7135977729244208, 0.0625], 'isolation_forest_f1': 0.6634615384615384, 'lof_f1': 0.6610576923076923, 'ocsvm_f1': 0.6326797385620916}


In [72]:
datasets.append(dataset)

### 19 pc1

In [73]:
dataset = compute_info(datasets_basic, 'pc1')
print(dataset)



{'datasetID': 19, 'name': 'pc1', 'contamination(%)': 6.94, 'metafeatures': [0.9877522554747115, 3328.677372028816, 0.23113785340253235, 0.8917911255411255, 0.109375, 0.9341346378846378, 0.03125, 0.8901114426114427, 0.109375, 0.9305873405873404, 0.0, 0.9305873405873404, 0.0625], 'isolation_forest_f1': 0.946705426356589, 'lof_f1': 0.935077519379845, 'ocsvm_f1': 0.6349206349206349}


In [74]:
datasets.append(dataset)

### 20 abalone

In [75]:
dataset = compute_info(datasets_basic, 'abalone')
print(dataset)



{'datasetID': 20, 'name': 'abalone', 'contamination(%)': 0.43, 'metafeatures': [0.5467584892393395, 6376.1190824749365, 0.6101756341028424, 0.23440718746259237, 0.171875, 0.2625402713501458, 0.234375, 0.20266392396374858, 0.09375, 0.2057422922867933, 0.078125, 0.20565524949290886, 0.015625], 'isolation_forest_f1': 0.9959124789612888, 'lof_f1': 0.9956720365472469, 'ocsvm_f1': 0.9709001233045622}


In [76]:
datasets.append(dataset)

### 21 speed_dating

In [77]:
dataset = compute_info(datasets_basic, 'speed_dating')
print(dataset)



{'datasetID': 21, 'name': 'speed_dating', 'contamination(%)': 16.47, 'metafeatures': [0.5392991912379833, 16224.455321839974, 0.6647590206856908, 0.7598620485139848, 0.15625, 0.8438764424598592, 0.046875, 0.6678343213488337, 0.03125, 0.8352828461689805, 0.03125, 0.8352828461689805, 0.046875], 'isolation_forest_f1': 0.8360960274364104, 'lof_f1': 0.8365247213489568, 'ocsvm_f1': 0.6107437267868466}


In [78]:
datasets.append(dataset)

### 22 cardiotocography

In [79]:
dataset = compute_info(datasets_basic, 'cardiotocography')
print(dataset)



{'datasetID': 22, 'name': 'cardiotocography', 'contamination(%)': 12.94, 'metafeatures': [0.6698047866103679, 7297.726811254146, 0.47351817696858717, 0.9995283018867924, 0.109375, 0.999052111410602, 0.203125, 0.3706101721418026, 0.125, 0.4530135875359765, 0.03125, 0.4530135875359765, 0.03125], 'isolation_forest_f1': 0.8984332793084819, 'lof_f1': 0.8843868179362506, 'ocsvm_f1': 0.7411104179663133}


In [80]:
datasets.append(dataset)

### 23 sick

In [81]:
dataset = compute_info(datasets_basic, 'sick')
print(dataset)



{'datasetID': 23, 'name': 'sick', 'contamination(%)': 6.12, 'metafeatures': [0.39559167198546236, 1720.344902225193, 1.025165385016438, 0.41410944619023954, 0.1875, 0.954667105253949, 0.21875, 0.908266203817109, 0.125, 0.9655382375789981, 0.03125, 0.9387603843704291, 0.03125], 'isolation_forest_f1': 0.9375882519062412, 'lof_f1': 0.9423891556057611, 'ocsvm_f1': 0.7810122394971881}


In [82]:
datasets.append(dataset)

### 24 adult

In [83]:
dataset = compute_info(datasets_basic, 'adult')
print(dataset)



{'datasetID': 24, 'name': 'adult', 'contamination(%)': 23.93, 'metafeatures': [0.5842257305055926, 69093.34966915922, 0.6104682039477688, 0.7952380584336447, 0.5, 0.8279558426270286, 0.078125, 0.740653915828182, 0.09375, 0.7607182417754828, 0.140625, 0.7607182417754828, 0.078125], 'isolation_forest_f1': 0.7576471221520945, 'lof_f1': 0.762976220915367, 'ocsvm_f1': 0.6868543769553155}


In [84]:
datasets.append(dataset)

### 25 jm1

In [85]:
dataset = compute_info(datasets_basic, 'jm1')
print(dataset)



{'datasetID': 25, 'name': 'jm1', 'contamination(%)': 19.35, 'metafeatures': [0.9957037328869507, 24559.20704734215, 0.4043211239910511, 0.8043280130459559, 0.359375, 0.8091978209443166, 0.015625, 0.7095105835242903, 0.03125, 0.8065229174782125, 0.03125, 0.8012887576985982, 0.03125], 'isolation_forest_f1': 0.8540183402631429, 'lof_f1': 0.8048071994076437, 'ocsvm_f1': 0.6761778367617783}


In [86]:
datasets.append(dataset)

### 26 scene

In [87]:
dataset = compute_info(datasets_basic, 'scene')
print(dataset)



{'datasetID': 26, 'name': 'scene', 'contamination(%)': 17.91, 'metafeatures': [0.16394326630552236, 367.9771642687542, 2.347211165974487, 0.8486876736051574, 0.28125, 0.977164420515986, 0.015625, 0.905749745664872, 0.046875, 0.8209398968942537, 0.046875, 0.8209398968942537, 0.03125], 'isolation_forest_f1': 0.7955465587044535, 'lof_f1': 0.8183198380566801, 'ocsvm_f1': 0.77466251298027}


In [88]:
datasets.append(dataset)

### 27 climate_model_simulation_crashes

In [89]:
dataset = compute_info(datasets_basic, 'climate_model_simulation_crashes')
print(dataset)



{'datasetID': 27, 'name': 'climate_model_simulation_crashes', 'contamination(%)': 8.52, 'metafeatures': [0.5410542319474954, 1118.4723802507572, 0.6449264406465076, 0.8926681913474367, 0.046875, 0.9111219109332318, 0.0625, 0.8463229782097708, 0.0625, 0.914929165872562, 0.109375, 0.914929165872562, 0.03125], 'isolation_forest_f1': 0.917004048582996, 'lof_f1': 0.9109311740890689, 'ocsvm_f1': 0.5511363636363636}


In [90]:
datasets.append(dataset)

### 28 quake

In [93]:
dataset = compute_info(datasets_basic, 'quake')
print(dataset)



{'datasetID': 28, 'name': 'quake', 'contamination(%)': 44.49, 'metafeatures': [0.5668926666306254, 1656.9576500860678, 0.7745065167917251, 0.5555470608222901, 0.046875, 0.5592167855929324, 0.046875, 0.5170234454638125, 0.0625, 0.5541794087665648, 0.0625, 0.5537164458036017, 0.0625], 'isolation_forest_f1': 0.5690653432588917, 'lof_f1': 0.5566583953680728, 'ocsvm_f1': 0.5004484304932736}


In [94]:
datasets.append(dataset)

### 29 yeast

In [95]:
dataset = compute_info(datasets_basic, 'yeast')
print(dataset)



{'datasetID': 29, 'name': 'yeast', 'contamination(%)': 0.34, 'metafeatures': [0.26404231937993805, 427.5565911760813, 1.6007035295724215, 0.1415132588916947, 0.046875, 0.5794285405806632, 0.0625, 0.48440540976416485, 0.109375, 0.40710800836168604, 0.078125, 0.3684088915543856, 0.0625], 'isolation_forest_f1': 0.9976327358809605, 'lof_f1': 0.9962800135272236, 'ocsvm_f1': 0.9739854318418314}


In [96]:
datasets.append(dataset)

### 30 churn

In [97]:
dataset = compute_info(datasets_basic, 'churn')
print(dataset)



{'datasetID': 30, 'name': 'churn', 'contamination(%)': 14.14, 'metafeatures': [0.6166327343906929, 14675.657079943683, 0.5128919907945317, 0.8728044864179456, 0.171875, 0.8594024584098336, 0.265625, 0.8051959495837984, 0.1875, 0.8570104648418594, 0.03125, 0.8586008616034464, 0.03125], 'isolation_forest_f1': 0.8739808991381317, 'lof_f1': 0.872816212438854, 'ocsvm_f1': 0.80332135332755}


In [98]:
datasets.append(dataset)

### 31 wilt

In [99]:
dataset = compute_info(datasets_basic, 'wilt')
print(dataset)



{'datasetID': 31, 'name': 'wilt', 'contamination(%)': 5.39, 'metafeatures': [0.48431902732162724, 4892.457139820039, 0.757113142258182, 0.8886251552530433, 0.03125, 0.9431650876323058, 0.03125, 0.978293365762499, 0.015625, 0.9460636383569435, 0.015625, 0.9460636383569435, 0.03125], 'isolation_forest_f1': 0.9434250764525994, 'lof_f1': 0.9477937964176496, 'ocsvm_f1': 0.7407407407407409}


In [100]:
datasets.append(dataset)

### print computed table

In [101]:
df = pd.DataFrame(datasets)
cols = ['datasetID', 'name', 'contamination(%)', 'metafeatures', 'isolation_forest_f1', 'lof_f1', 'ocsvm_f1']
df = df[cols]
df

Unnamed: 0,datasetID,name,contamination(%),metafeatures,isolation_forest_f1,lof_f1,ocsvm_f1
0,0,lymph,4.1,"[0.26863786259667444, 61.30611403534003, 1.480...",0.996466,0.989399,0.926471
1,1,glass,4.21,"[0.5600270501084725, 135.14002942901286, 1.068...",0.960976,0.965854,0.935
2,2,wdbc,37.26,"[0.6972646145068585, 1300.2082198691544, 0.504...",0.803922,0.711485,0.617318
3,3,speech,1.65,"[0.012400670667631054, 47.401319205311246, 8.6...",0.984,0.983724,0.958982
4,4,satellite_image,31.64,"[0.36628000714210573, 3686.128256862358, 1.120...",0.809275,0.708797,0.409688
5,5,baseball,9.33,"[0.5309830472185297, 2219.3638242940983, 0.662...",0.955556,0.923457,0.92791
6,6,ecoli,2.68,"[0.4048955217168707, 235.3615336759734, 0.9762...",0.984709,0.981651,0.959248
7,7,phoneme,29.35,"[0.2459127488478442, 1646.5528671540126, 1.672...",0.720534,0.73363,0.704097
8,8,click_prediction_small,16.84,"[0.6419363587593904, 58399.56773256083, 0.6464...",0.83525,0.832962,0.864394
9,9,musk,15.41,"[0.29743199498855305, 2969.2237503160577, 1.35...",0.83408,0.844293,0.761229


### Save

In [102]:
import pickle
with open("meta_computed_f1.txt", "wb") as fp: # Pickling
    pickle.dump(datasets, fp)

### Load

In [62]:
import pickle
with open("meta_computed.txt", "rb") as fp: # Unpickling
    datasets = pickle.load(fp)