In [2]:
import importlib, yaml
import os, pandas as pd, numpy as np
from joblib import dump, load
import matplotlib.pyplot as plt
from contextlib import contextmanager
from collections import defaultdict
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
import sod.core.dataset as sod_core_dataset
importlib.reload(sod_core_dataset)
import sod.core.plot as sod_plot
importlib.reload(sod_plot)
from sod.core import pdconcat
# from sod.core.dataset import open_dataset # , normalize_df
# from sod import plot

is_outlier = sod_core_dataset.is_outlier

maindir = os.path.abspath(os.path.join(os.getcwd(), '..', 'evaluations/results/cv.allset_train.iforest.yaml/'))
assert os.path.isdir(maindir)

name = 'IsolationForest?features=psd@5sec&contamination=0.1&max_samples=2048&n_estimators=100&behaviour=new'
print('Reading %s' % name)
clf = load(os.path.join(maindir, 'models', name+'.model'))

print(clf.offset_)
raise
pred_df = pd.read_hdf(os.path.join(maindir, 'predictions', name+'.hdf'))
# setting stuff:
pred_df['channel_code'] = pred_df['channel_code'].astype('category')
pred_df['location_code'] = pred_df['location_code'].astype('category')
pred_df['cha_id'] = pred_df['channel_code'].str[:2]
pred_df['cha_id'] = pred_df['cha_id'].astype('category')
pred_df['score_samples'] = -(pred_df['decision_function'] + clf.offset_)




classnames = sod_core_dataset.allset_train.classnames[1:]
classes = {_: sod_core_dataset.allset_train.class_selector[_] for _ in classnames}

# def select(df, classes):
#     sel = None
#     for k, v in classes.items():
#         if sel is None:
#             sel = v(df)
#         else:
#             sel |= v(df)
#     return df[sel]

#_pred_df = select(pred_df, classes)
_pred_df = pred_df
print('Reading source train dataframe for event times')
dfr = pd.read_hdf('/Users/riccardo/work/gfz/projects/sources/python/sod/sod/datasets/allset_train.hdf',
           columns=['Segment.db.id', 'dataset_id', 'event_time'])
dfr.rename({'Segment.db.id': 'allset_train.id'}, axis=1, inplace=True)
assert 'event_time' not in _pred_df.columns
_pred_df = _pred_df.merge(dfr, how='left', on=['allset_train.id', 'dataset_id'])
assert 'event_time' in _pred_df.columns

tmin, tmax = pd.to_datetime(_pred_df.event_time.min()), pd.to_datetime(_pred_df.event_time.max())

#print('%d segments of %d misclassified with score diff >=%f' % (len(_pred_df), len(pred_df), TH))

_fle = os.path.abspath(os.path.join('.', 'jnconfig.yaml'))
assert os.path.isfile(_fle)
with open(_fle, "r") as _:
    jnconfig = yaml.safe_load(_)

dataset_urls= {
    1: jnconfig['dbpath_new'],
    2: jnconfig['dbpath_me'],
    3: jnconfig['dbpath_chile']
}


station_codes = {}

from stream2segment.process.db import get_session
from stream2segment.io.db.models import Station, Segment, concat, Channel
from stream2segment.io.utils import loads_inv

print('Getting station networks and stations')
for dts_id, df in _pred_df.groupby('dataset_id'):
    sess = get_session(dataset_urls[dts_id])
    for (staid, stanet, stasta) in sess.query(Station.id, Station.network, Station.station):
        station_codes[(dts_id, staid)] = "%s.%s" % (stanet, stasta)
    sess.close()

        
columns = ['dataset_id', 'station_id', 'cha_id', 'location_code']
# _group = _pred_df.groupby(columns)
# print('%d distinct channels found, %d channels (no orientation)' % (len(_group), len(_group)/3))


spreds = []
for cname, csel in classes.items():
    class_df = _pred_df[csel(_pred_df)]
    for (dts_id, sta_id, cha_id, loc_code), df in class_df.groupby(columns):
        key = '%s.%s.%s?' % (station_codes[(dts_id, sta_id)], loc_code, cha_id)
        key += ' (database: %s, station id: %d)' % (dataset_urls[dts_id][dataset_urls[dts_id].rfind('/')+1:], sta_id)
        med = df.score_samples.median()
        outl_count = df.outlier.sum()
        if outl_count == len(df) and med < 0.5:
            spreds.append([key, True, cname, df.event_time, df.score_samples])
        elif outl_count == 0 and med > 0.5:
            spreds.append([key, False, cname, df.event_time, df.score_samples])
        elif outl_count != len(df) and outl_count !=0:
            print('%d: ??' % (sta_id))

@contextmanager
def plotparams(params):
    '''makes temporarily matplotlib params.
    Make sure to run this after %matplotlib inline.
    For info see https://stackoverflow.com/questions/36367986/how-to-make-inline-plots-in-jupyter-notebook-larger'''
    def_params = {k: plt.rcParams[k] for k in params}
    for k, v in params.items():
        plt.rcParams[k] = v
    try:
        yield
    finally:
        for k, v in def_params.items():
            plt.rcParams[k] = v

            
spreds.sort(key=lambda val: np.nanmedian(val[-1]), reverse=True)
assert any('wrong inv' in _[2] for _ in spreds)

print()
print()
title = '%d channels to check (one plot per channel, each blue point is one event\'s score):' % len(spreds)
print(title)
print('=' * len(title))
print()
with plotparams({'figure.figsize': (50, 10), 'font.size': 32}):
    for i, (key, outlier, cname, times, vals) in enumerate(spreds):
        median_score = vals.median()
        plt.axhline(median_score, color='r', lw=5, linestyle='dotted')
        plt.scatter([pd.to_datetime(_) for _ in times], vals, s=121)
        plt.xlim([tmin, tmax])
        comment = "Channel " + key
        comment += '. Median score %.3f (red line), label "%s"' % (median_score, cname)
        # comment += ('<0.5 (label: "%s")' if not outlier else '>0.5 (label: "%s")') % cname
        print(comment)  # printing makes it easy to search within the web browser
        # plt.title(key + '\n' + comment)
        plt.ylabel('scores (0: ok, 1:anomaly)')
        plt.xlabel('event time')
        plt.show()
#         if i > 1:
#             break


# path = os.path.join(evalreports, 'evaluations.all.hdf')

# dfr = pd.read_hdf(path)
# print('evaulation dataframe.\n\nColumns:\n' + str(dfr.columns.tolist()))
# print('\nclassname distinct values:\n' + str(np.unique(dfr['classname'])))

Reading IsolationForest?features=psd@5sec&contamination=0.1&max_samples=2048&n_estimators=100&behaviour=new
-0.48911202968027523


RuntimeError: No active exception to reraise