In [29]:
import os
import sys
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make project packages importable: os.path.abspath('') resolves to the current
# working directory, and its parent directory is appended to sys.path.
# NOTE(review): assumes the kernel was started inside the notebooks folder — confirm.
sys.path.append(os.path.dirname(os.path.abspath('')))

# Folder named "05_ValidTrainingSets" under the current working directory
# (presumably for this notebook's outputs); exist_ok=True keeps re-runs from failing.
OUTPUT_DIRECTORY = os.path.join(os.path.abspath(''), "05_ValidTrainingSets")
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

## Validation of Training Sets

Splitting each series into training and cross-validation (CV) sets can leave some series with no anomalies in the training set. For those series precision, recall and F-beta are undefined and the decision threshold cannot be fitted, so they should be excluded.

In [2]:
from data.yahoo.a1_benchmark import yahoo_a1_benchmark
# Load the Yahoo A1 benchmark frame through the project loader.
# NOTE(review): clean_series=False presumably keeps the raw, uncleaned series — confirm in the loader.
yahoo_df = yahoo_a1_benchmark(clean_series=False)

In [3]:
from data.nab.real_tweets import nab_multivariate_tweet_volume, STOCK_NAMES
# Load the NAB tweet-volume frame through the project loader.
# NOTE(review): labels="points" presumably selects point-wise anomaly labels — confirm in the loader.
tweet_df = nab_multivariate_tweet_volume(labels="points")

In [4]:
# Star import: cross_validation_split, extract_anomaly_labels_to_anomaly_column
# and get_anomaly_cols are used by later cells and are not imported anywhere else
# in the visible notebook, so they presumably come from this module.
# TODO: switch to explicit imports once every name the notebook relies on is known.
from evaluation.utils import *

In [5]:
def get_null_training_set_series(df, split_frac=0.3, train_set=True):
    """List the anomaly columns that hold no positive label in one split.

    The frame is passed through ``extract_anomaly_labels_to_anomaly_column``
    and then ``cross_validation_split`` (with ``frac_cv=split_frac``), which
    yields a training part and a CV part. Every column reported by
    ``get_anomaly_cols(df)`` is then looked up in the selected part.

    Args:
        df: Input frame understood by the helper functions named above.
        split_frac: Forwarded to ``cross_validation_split`` as ``frac_cv``.
        train_set: Inspect the training part when true, the CV part otherwise.

    Returns:
        list: Column names, in ``get_anomaly_cols(df).columns`` order, whose
        values in the selected part never equal 1.
    """
    labelled = extract_anomaly_labels_to_anomaly_column(df)
    train_part, cv_part = cross_validation_split(labelled, frac_cv=split_frac)
    inspected = train_part if train_set else cv_part
    # A column is "null" when not a single entry in the inspected part equals 1.
    return [
        name
        for name in get_anomaly_cols(df).columns
        if not (inspected[name] == 1).any()
    ]


In [71]:
# Cross-validation fraction for the Yahoo series; passed as split_frac in the cells below.
# NOTE(review): the rationale for 0.21 is not recorded in this notebook — document it.
YAHOO_SPLIT_FRAC = 0.21

In [72]:
# Yahoo anomaly columns with no label equal to 1 in the training split
# (train_set defaults to True), followed by how many there are.
null_yahoo_series = get_null_training_set_series(yahoo_df, split_frac=YAHOO_SPLIT_FRAC)
pprint(null_yahoo_series)
print(len(null_yahoo_series))

['anomaly_1',
 'anomaly_2',
 'anomaly_4',
 'anomaly_6',
 'anomaly_10',
 'anomaly_11',
 'anomaly_16',
 'anomaly_21',
 'anomaly_22',
 'anomaly_25',
 'anomaly_31',
 'anomaly_32',
 'anomaly_33',
 'anomaly_35',
 'anomaly_37',
 'anomaly_42',
 'anomaly_45',
 'anomaly_50',
 'anomaly_58',
 'anomaly_59',
 'anomaly_63',
 'anomaly_64',
 'anomaly_65',
 'anomaly_66',
 'anomaly_67']
25


In [73]:
# Yahoo anomaly columns with no label equal to 1 in the CV split
# (train_set=False), followed by how many there are.
null_yahoo_series_cv = get_null_training_set_series(yahoo_df, split_frac=YAHOO_SPLIT_FRAC, train_set=False)
pprint(null_yahoo_series_cv)
print(len(null_yahoo_series_cv))

['anomaly_5',
 'anomaly_14',
 'anomaly_18',
 'anomaly_35',
 'anomaly_36',
 'anomaly_44',
 'anomaly_48',
 'anomaly_49',
 'anomaly_54',
 'anomaly_59',
 'anomaly_62',
 'anomaly_64']
12


In [64]:
# Integer ids of the entries in null_yahoo_series, with the "anomaly_" text removed from each name.
[int(label.replace("anomaly_", "")) for label in null_yahoo_series]

[1,
 2,
 4,
 6,
 10,
 11,
 16,
 21,
 22,
 25,
 31,
 32,
 33,
 35,
 37,
 42,
 45,
 50,
 58,
 59,
 63,
 64,
 65,
 66,
 67]

In [38]:
# Cross-validation fraction for the tweet-volume series; passed as split_frac in the cells below.
TWEET_SPLIT_FRAC = 0.5

In [41]:
# Tweet anomaly columns with no label equal to 1 in the training split
# (train_set defaults to True), followed by how many there are.
null_tweets_series = get_null_training_set_series(tweet_df, split_frac=TWEET_SPLIT_FRAC)
pprint(null_tweets_series)
print(len(null_tweets_series))

[]
0


In [42]:
# Tweet anomaly columns with no label equal to 1 in the CV split
# (train_set=False), followed by how many there are.
null_tweets_series_cv = get_null_training_set_series(tweet_df, split_frac=TWEET_SPLIT_FRAC, train_set=False)
pprint(null_tweets_series_cv)
print(len(null_tweets_series_cv))

[]
0
