This notebook demonstrates how to use the `data_loader` utilities (`quovadis_tad.dataset_utils.data_utils`) to prepare the UCR Internal Bleeding dataset for analysis.

Please refer to the [instructions for accessing the datasets](https://github.com/ssarfraz/QuoVadisTAD/blob/feature/data_loaders/resources/processed_datasets/README.md).


In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from pathlib import Path
# Use the directory one level above the current working directory as the
# project root (the notebook is presumably launched from a sub-folder of the
# repository - confirm when running from elsewhere).
module_path = str(Path.cwd().parents[0])

# Register the project root on the import path exactly once, so re-running
# this cell does not add duplicate entries.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import numpy as np

from quovadis_tad.dataset_utils.data_utils import find_files_in_path, extract_ucr_internal_bleeding_dataset

## UCR Internal Bleeding Dataset

Load the dataset and prepare it for further analysis.

The preprocessing script and dataset are taken from https://github.com/imperial-qore/TranAD/blob/main/preprocess.py

In [3]:
# Configuration: the dataset name, plus the raw-input and processed-output
# folders, both located under <module_path>/resources/<stage>/<dataset_name>.
dataset_name = "UCR"
dataset_directory, output_dir = (
    os.path.join(module_path, "resources", stage_folder, dataset_name)
    for stage_folder in ("raw_datasets", "processed_datasets")
)


In [4]:
# Show the resolved raw-dataset directory so the path can be checked before reading from it.
print(dataset_directory)

C:\meiyche_github\QuoVadisTAD\resources\raw_datasets\UCR


In [5]:
# List the files in the raw-dataset directory that have the "txt" file ending.
find_files_in_path(directory=dataset_directory, file_ending="txt")

['135_UCR_Anomaly_InternalBleeding16_1200_4187_4199.txt',
 '136_UCR_Anomaly_InternalBleeding17_1600_3198_3309.txt',
 '137_UCR_Anomaly_InternalBleeding18_2300_4485_4587.txt',
 '138_UCR_Anomaly_InternalBleeding19_3000_4187_4197.txt']

In [6]:
# Convert the raw txt files found in `dataset_directory` into train / test /
# label .npy files, written to `output_dir`, for further analysis.
extract_ucr_internal_bleeding_dataset(dataset_folder=dataset_directory, output_dir=output_dir)

# Simple baseline results match the paper and notebooks

In [8]:
from quovadis_tad.eval_simple_baselines import evaluate_simple_baselines_on_all_paper_datasets

In [9]:
# Evaluate the simple baselines point-wise on each UCR Internal Bleeding trace
# (16-19) and on the "ucr_IB" entry (presumably the aggregate over all traces -
# confirm against `evaluate_simple_baselines_on_all_paper_datasets`).
df_point_wise = evaluate_simple_baselines_on_all_paper_datasets(
    root_path=module_path,
    dataset_names=["ucr_IB_16", "ucr_IB_17", "ucr_IB_18", "ucr_IB_19", "ucr_IB"],
    data_normalization="0-1",
    eval_method="point_wise",
    score_normalization="optimal",
    verbose=False,
)

[INFO:] UCR contains 4 data traces


In [10]:
# Drop the "P", "R" and "AUPRC" sub-columns (second column level) so that only
# the remaining metric is shown per dataset, rendered with three decimals.
df_point_wise.drop(columns=["P", "R", "AUPRC"], level=1).style.format(precision=3)

Unnamed: 0_level_0,UCR_IB_16,UCR_IB_17,UCR_IB_18,UCR_IB_19,UCR_IB
Unnamed: 0_level_1,F1,F1,F1,F1,F1
Sensor Range Deviation,0.004,0.085,0.038,0.004,0.033
Simple L2_norm,0.011,0.058,0.061,0.017,0.037
1-NN Distance,0.786,0.973,0.889,0.87,0.879
PCA_Error,0.75,0.974,0.99,1.0,0.928
