# Outlier detection with isolation forests

In [2]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import IsolationForest
import warnings
import scipy.io

warnings.filterwarnings("ignore")

rng = np.random.RandomState(42)

## Data loading

In [3]:
%run ../util/load_data.py
%run ../util/data_visualization.py
%run ../util/outliers_statistics.py

data = load_data("../data")
mat = scipy.io.loadmat('../data/cover.mat')
df = pd.DataFrame(mat['X'], columns=data['numerical_attributes'])
target = mat['y']
df['target'] = target

### Outlier detection with Isolation Forests

In [12]:
kf = KFold(n_splits=10)

outliers_ids = []

count = 0
for train, test in kf.split(df):
    print(count)
    count += 1
    train_data = df.iloc[train, :]
    test_data = df.iloc[test, :]
    
    isolation_forest_classifier = IsolationForest(random_state=rng, behaviour='new')
    isolation_forest_classifier.fit(train_data)
    
    prediction_result = zip(
        test_data.index.values.tolist(), 
        isolation_forest_classifier.predict(test_data)
    )
    
    outliers_ids += [id for id, is_not_outlier in prediction_result if is_not_outlier < 0]

0
1
2
3
4
5
6
7
8
9


In [13]:
precision, recall, F1_score = outliers_statistics(df, outliers_ids)
print("Precision: "+ str(precision))
print("Recall: "+ str(recall))
print("F1-score: "+ str(F1_score))

TP: 2637
FP: 31267
FN: 110
Precision: 0.0777784332232185
Recall: 0.9599563159810702
F1-score: 0.14389784726201194
