# Outlier Detection - Anomaly Scores as Features
* `out1` -- Isolation Forest

In [1]:
# enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's .py files are picked up without a restart
%load_ext autoreload
%autoreload 2

In [2]:
# make the parent directory importable (the project packages live one level up)
import os
import sys

sys.path.append(os.path.realpath("../"))

In [3]:
# demo datasets
from datasets.demo1 import X_train, Y_train, fold_ids, X_valid, Y_valid, meta as meta_data
#meta_data

In [4]:
# transformer implementations
import importlib

typ = 'out1'

# Compare by value: `is` tests object identity, which only happens to work for
# interned string literals and raises a SyntaxWarning on Python >= 3.8.
if typ == 'out1':
    from verto.out1 import trans, meta
    # override the contamination setting of the wrapped model for this demo
    trans.set_params(**{'model__contamination': 0.15})
else:
    # load any other transformer module from the `verto` package by its id
    module = importlib.import_module("verto." + typ)
    trans = module.trans
    meta = module.meta

In [5]:
# display the metadata dict describing the selected transformer
meta

{'id': 'out1',
 'name': 'Iso. Forest',
 'description': 'Isolation Forest partitions samples randomly. The path length (or number of splits) is short for anomalies (anomal samples)',
 'keywords': ['IsolationForest', 'Isolation Forest', 'outlier detection'],
 'feature_names_prefix': 'out_iso'}

## Transform

In [6]:
%%time
# fit on the training features only (no labels are passed); the fitted
# transformer is the cell's last expression and is therefore displayed
trans.fit(X_train)

CPU times: user 144 ms, sys: 60.1 ms, total: 204 ms
Wall time: 3.76 s


IsolationForestTransformer(model=IsolationForest(behaviour='new', bootstrap=False, contamination=0.15,
        max_features=1.0, max_samples='auto', n_estimators=96, n_jobs=-1,
        random_state=42, verbose=False))

In [7]:
%%time
# compute the transformer output for the training samples
X_new = trans.transform(X_train)

CPU times: user 61.2 ms, sys: 2.34 ms, total: 63.6 ms
Wall time: 62.9 ms


In [8]:
from seasalt import create_feature_names

# build the column names from the transformer's prefix and the number of
# columns in the transformed output
feature_names = create_feature_names(
    meta['feature_names_prefix'],
    X_new.shape[1],
)
print(feature_names)

['out_iso_0']


In [9]:
import pandas as pd

# wrap the transformer output in a labelled frame for inspection
df_new = pd.DataFrame(X_new, columns=feature_names)

## Evaluate
- check if the anomaly score is a "good" predictor
- eyeball the p-values of the logistic regression coefficients

In [10]:
# preview the first rows of the generated feature column(s)
df_new.head()

Unnamed: 0,out_iso_0
0,-0.375732
1,-0.392822
2,-0.423584
3,-0.367188
4,-0.365234


In [11]:
import statsmodels.api as sm

# L1-penalised logistic regression of the target on the transformed feature(s)
# plus an intercept column (the unpenalised variant would call `.fit()` instead)
logit_model = sm.Logit(Y_train, sm.add_constant(X_new))
lr = logit_model.fit_regularized(method='l1', alpha=0.5)
print(lr.summary())

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5974161143654582
            Iterations: 22
            Function evaluations: 22
            Gradient evaluations: 22
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  398
Model:                          Logit   Df Residuals:                      396
Method:                           MLE   Df Model:                            1
Date:                Thu, 11 Apr 2019   Pseudo R-squ.:                  0.1430
Time:                        17:15:21   Log-Likelihood:                -225.55
converged:                       True   LL-Null:                       -263.17
                                        LLR p-value:                 4.131e-18
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------