In [35]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

def roc(pred_df):
    """Summarize the ROC curve of a dataframe of anomaly-score predictions.

    :param pred_df: object exposing the attributes ``outlier`` (the true
        labels, passed to ``roc_curve`` as ``y_true``) and
        ``predicted_anomaly_score`` (the scores, passed as ``y_score``),
        e.g. a pandas DataFrame with those two columns

    :return: dict with the keys:
        - 'auc': area under the ROC curve
        - 'best_thres': the score threshold maximizing the product
          TPR * TNR (true positive rate times true negative rate)
        - 'best_thres_tpr': the TPR at that threshold
        - 'best_thres_tnr': the TNR at that threshold
    """
    fpr, tpr, thresholds = roc_curve(pred_df.outlier,
                                     pred_df.predicted_anomaly_score,
                                     drop_intermediate=True)
    auc_ = auc(fpr, tpr)
    # Skip the first ROC point: thresholds[0] represents no instances being
    # predicted and is an arbitrary sentinel (max(y_score) + 1, or infinity
    # depending on the scikit-learn version), not a usable score threshold.
    tnr_, tpr_, th_ = (1 - fpr[1:]), tpr[1:], thresholds[1:]
    # np.argmax returns the first index in case of ties
    best_th = np.argmax(tnr_ * tpr_)
    return {
        'auc': auc_,
        'best_thres': th_[best_th],
        'best_thres_tpr': tpr_[best_th],
        'best_thres_tnr': tnr_[best_th]
    }

# Add the directory two levels above the current working directory to the
# module search path (presumably the project root, so that the `sod` package
# imported right below can be resolved -- confirm against the repo layout).
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from sod.core.dataset import dataset_info, allset_test

# The first three class names of the test dataset are taken, in order, as:
# inlier, outlier, and "me_unknown" (meaning of the latter is not visible
# here -- see `sod.core.dataset`).
class_inlier, class_outlier, class_me_unknown = (
    allset_test.classnames[position] for position in range(3)
)

# For each class name, the associated selector. Each selector is later called
# with a predictions DataFrame and its results are OR-ed together and used to
# filter the DataFrame rows (presumably a boolean mask).
sel_inlier, sel_outlier, sel_me_unknown = (
    allset_test.class_selector[classname]
    for classname in (class_inlier, class_outlier, class_me_unknown)
)


def _coerce_param(text):
    """Convert a parameter value parsed from a model file name.

    :param text: string, the raw parameter value
    :return: `text` as int if it can be parsed as such, otherwise as float,
        otherwise `text` itself unchanged
    """
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    return text


print()
print('Loading models and predictions')
maindir = os.path.abspath(os.path.join(os.getcwd(), '..',
                                       'evaluations/results/eval.allset_train_test.iforest.yaml/evalreports/'))
# 'models' and 'predictions' are sibling directories of 'evalreports'. Use
# dirname instead of replacing the 'evalreports' substring, which would also
# alter any other occurrence of that text in the absolute path
rootdir = os.path.dirname(maindir)
modeldir = os.path.join(rootdir, 'models')
predir = os.path.join(rootdir, 'predictions')
# Explicit checks instead of `assert`, which is skipped when running with -O
for _required_dir in (maindir, modeldir, predir):
    if not os.path.isdir(_required_dir):
        raise NotADirectoryError('Not a directory: "%s"' % _required_dir)

# Consider only model files, in sorted order, so that stray files in the
# directory are ignored and the rows of the saved table have a
# reproducible order (os.listdir returns entries in arbitrary order)
model_suffix = '.model'
modelfiles = sorted(_ for _ in os.listdir(modeldir) if _.endswith(model_suffix))
if not modelfiles:
    raise FileNotFoundError('No "%s" file found in "%s"' %
                            (model_suffix, modeldir))

data = []
last_decile = 0  # the progress is printed each time its tens digit changes
for i, clfname in enumerate(modelfiles):
    progress = int((100.0*i)/len(modelfiles))
    if progress // 10 != last_decile:
        print('%d%% done' % progress)
        last_decile = progress // 10

    # File names are expected in the form:
    # <classifier>?<param1>=<value1>&<param2>=<value2>...<model_suffix>
    stem = clfname[:-len(model_suffix)]
    classifier, qmark, flatname = stem.partition('?')
    if not qmark:
        raise ValueError('No "?" found in model file name "%s"' % clfname)

    # the predictions file has the same name of the model file, with
    # extension '.hdf'
    o_pred_df = pd.read_hdf(os.path.join(predir, stem + '.hdf'))
    # compute the ROC statistics on inliers and outliers only
    pred_df = o_pred_df[sel_inlier(o_pred_df) | sel_outlier(o_pred_df)]
    row = roc(pred_df)
    row['classifier'] = classifier
    for pnamepval in flatname.split('&'):
        # split on the first '=' only, so that values containing '=' are kept
        pname, pval = pnamepval.split('=', 1)
        row[pname] = _coerce_param(pval)
    data.append(row)

auc_df = pd.DataFrame(data=data)
print('Summary stats (auc, tpr tnr) on predictions (evaluations)')
with pd.option_context("display.max_colwidth", 80):
    print(auc_df.sort_values(by=['auc'], ascending=False).to_string(index=False))

hdfpath = os.path.join(maindir, 'evaluation.auc.all.hdf')
print('Saving to "%s"' % hdfpath)

auc_df.to_hdf(hdfpath, mode='w', format='table', key='evaluation_all_auc')


Loading models and predictions
10% done
20% done
30% done
40% done
50% done
60% done
70% done
80% done
90% done
Summary stats (auc, tpr tnr) on predictions (evaluations)
auc behaviour  best_thres  best_thres_tnr  best_thres_tpr       classifier contamination                                          features  max_samples  n_estimators
0.994533       new    0.516447        0.992397        0.971725  IsolationForest          auto                                          psd@5sec         2048           200
0.994291       new    0.520936        0.992429        0.972020  IsolationForest          auto                                          psd@5sec         2048           100
0.994114       new    0.557885        0.992898        0.971763  IsolationForest          auto                                          psd@5sec          512           200
0.994020       new    0.564893        0.993282        0.971439  IsolationForest          auto                                          psd@5sec       