This notebook demonstrates how to use the `data_loader` to prepare the data for analysis.

Please refer to the [instructions for accessing the datasets](https://github.com/ssarfraz/QuoVadisTAD/blob/feature/data_loaders/resources/processed_datasets/README.md).


In [1]:
# Load IPython's autoreload extension and use mode 2, which reloads imported modules
# before each cell runs, so edits to the project code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
from pathlib import Path
# Treat the parent of the current working directory as the project root
# (assumes the notebook is launched from a direct sub-folder of the repository - TODO confirm).
repo_root = Path.cwd().parents[0]
module_path = str(repo_root)

# Register the project root on the import path only once, so the project package
# imported further down can be resolved.
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

import numpy as np

from quovadis_tad.dataset_utils.data_utils import find_files_in_path, extract_ucr_internal_bleeding_dataset

In [3]:
# Config: both dataset roots live under <module_path>/resources.
resources_directory = os.path.join(module_path, "resources")
# Raw input data, one sub-folder per dataset.
root_raw_dataset_directory = os.path.join(resources_directory, "raw_datasets")
# Destination root that is handed to the extract_* helpers as (part of) output_dir.
root_prepared_dataset_directory = os.path.join(resources_directory, "processed_datasets")

## UCR Internal Bleeding Dataset

Load the dataset and prepare it for further analysis.

The preprocessing script and dataset are taken from https://github.com/imperial-qore/TranAD/blob/main/preprocess.py

In [4]:
# Config for the UCR dataset: the raw input and the processed output each use a
# sub-folder named after the dataset.
dataset_name = "UCR"
dataset_directory, output_dir = (
    os.path.join(root_raw_dataset_directory, dataset_name),
    os.path.join(root_prepared_dataset_directory, dataset_name),
)


In [5]:
# Sanity check: show where the raw UCR files are expected.
# NOTE(review): this writes an absolute local path into the saved cell output;
# consider clearing the output before sharing the notebook.
print(dataset_directory)

C:\meiyche_github\QuoVadisTAD\resources\raw_datasets\UCR


In [6]:
# List the raw UCR files ending in "txt" found in the dataset directory
# (the returned list is displayed as the cell output).
find_files_in_path(directory=dataset_directory, file_ending="txt")

['135_UCR_Anomaly_InternalBleeding16_1200_4187_4199.txt',
 '136_UCR_Anomaly_InternalBleeding17_1600_3198_3309.txt',
 '137_UCR_Anomaly_InternalBleeding18_2300_4485_4587.txt',
 '138_UCR_Anomaly_InternalBleeding19_3000_4187_4197.txt']

In [7]:
# Convert the raw UCR .txt files into .npy arrays (train, test, labels) for further
# analysis; the results are written to output_dir.
extract_ucr_internal_bleeding_dataset(dataset_folder=dataset_directory, output_dir=output_dir)

### Check that the simple baseline results match the paper and notebooks

In [8]:
from quovadis_tad.eval_simple_baselines import evaluate_simple_baselines_on_all_paper_datasets

In [9]:
# Evaluate the simple baselines point-wise on the four UCR Internal Bleeding traces (16-19)
# and on the 'ucr_IB' entry (presumably the aggregate over all four - confirm).
ucr_dataset_names = ['ucr_IB_16', 'ucr_IB_17', 'ucr_IB_18', 'ucr_IB_19', 'ucr_IB']
df_point_wise = evaluate_simple_baselines_on_all_paper_datasets(
    root_path=module_path,
    dataset_names=ucr_dataset_names,
    eval_method='point_wise',
    data_normalization="0-1",
    score_normalization='optimal',
    verbose=False,
)

[INFO:] UCR contains 4 data traces


In [10]:
# Drop the 'P', 'R' and 'AUPRC' sub-columns (second header level) so the table shows
# F1 only, then render it with three decimal places.
hidden_metrics = ['P', 'R', 'AUPRC']
f1_table = df_point_wise.drop(columns=hidden_metrics, level=1)
f1_table.style.format(precision=3)

Unnamed: 0_level_0,UCR_IB_16,UCR_IB_17,UCR_IB_18,UCR_IB_19,UCR_IB
Unnamed: 0_level_1,F1,F1,F1,F1,F1
Sensor Range Deviation,0.004,0.085,0.038,0.004,0.033
Simple L2_norm,0.011,0.058,0.061,0.017,0.037
1-NN Distance,0.786,0.973,0.889,0.87,0.879
PCA_Error,0.75,0.974,0.99,1.0,0.928


## SMD

1. Import the raw data and save the processed arrays to the prepared-datasets folder.
2. Run the simple baselines and compare the results with the paper.

In [11]:
# Directories for the SMD dataset.
dataset_name = "SMD"
dataset_directory = os.path.join(root_raw_dataset_directory, dataset_name)
# NOTE(review): unlike the UCR section, the processed-data root itself is used as
# output_dir; presumably extract_smd_dataset creates its own sub-folder - confirm.
output_dir = root_prepared_dataset_directory

In [12]:
# Collect the names of the raw SMD files ending in "txt" for the inspection below.
filenames = find_files_in_path(directory=dataset_directory, file_ending="txt")

In [13]:
# Preview the first five file names and report how many files were found in total.
first_five = filenames[:5]
print(first_five)
print("N of files:", len(filenames))

['machine-1-1.txt', 'machine-1-2.txt', 'machine-1-3.txt', 'machine-1-4.txt', 'machine-1-5.txt']
N of files: 112


## Test and try to understand the script

In [14]:
from quovadis_tad.dataset_utils.data_utils import extract_smd_dataset

In [15]:
# Convert the raw SMD files and store the result relative to output_dir;
# the recorded run logs one line per processed file.
extract_smd_dataset(dataset_folder=dataset_directory, output_dir=output_dir)

machine-1-1 train machine-1-1.txt (28479, 38)
machine-1-1 test machine-1-1.txt (28479, 38)
machine-1-1 labels machine-1-1.txt (28479, 38)
machine-1-2 train machine-1-2.txt (23694, 38)
machine-1-2 test machine-1-2.txt (23694, 38)
machine-1-2 labels machine-1-2.txt (23694, 38)
machine-1-3 train machine-1-3.txt (23702, 38)
machine-1-3 test machine-1-3.txt (23703, 38)
machine-1-3 labels machine-1-3.txt (23703, 38)
machine-1-4 train machine-1-4.txt (23706, 38)
machine-1-4 test machine-1-4.txt (23707, 38)
machine-1-4 labels machine-1-4.txt (23707, 38)
machine-1-5 train machine-1-5.txt (23705, 38)
machine-1-5 test machine-1-5.txt (23706, 38)
machine-1-5 labels machine-1-5.txt (23706, 38)
machine-1-6 train machine-1-6.txt (23688, 38)
machine-1-6 test machine-1-6.txt (23689, 38)
machine-1-6 labels machine-1-6.txt (23689, 38)
machine-1-7 train machine-1-7.txt (23697, 38)
machine-1-7 test machine-1-7.txt (23697, 38)
machine-1-7 labels machine-1-7.txt (23697, 38)
machine-1-8 train machine-1-8.txt 

In [16]:
# Evaluate the simple baselines point-wise on the SMD dataset.
smd_dataset_names = ['smd']
df_point_wise = evaluate_simple_baselines_on_all_paper_datasets(
    root_path=module_path,
    dataset_names=smd_dataset_names,
    eval_method='point_wise',
    data_normalization="0-1",
    score_normalization='optimal',
    verbose=False,
)

[INFO:] SMD contains 28 data traces.


In [17]:
# Verified against Suppl. Table 9. The PCA_Error F1 obtained here (0.572) is higher than
# the value in the paper (0.536); this could be due to the selected PCA method.
hidden_metrics = ['P', 'R', 'AUPRC']
f1_table = df_point_wise.drop(columns=hidden_metrics, level=1)
f1_table.style.format(precision=3)

Unnamed: 0_level_0,SMD
Unnamed: 0_level_1,F1
Sensor Range Deviation,0.132
Simple L2_norm,0.404
1-NN Distance,0.463
PCA_Error,0.572


## SMAP and MSL

1. Import the raw data and save the processed arrays to the prepared-datasets folder.
2. Run the simple baselines and compare the results with the paper.

In [18]:
# Directories for the shared SMAP/MSL raw data folder.
dataset_name = "SMAP_MSL"
dataset_directory = os.path.join(root_raw_dataset_directory, dataset_name)
# NOTE(review): the processed-data root itself is used as output_dir; presumably
# extract_smap_msl_dataset creates one sub-folder per dataset - confirm.
output_dir = root_prepared_dataset_directory

In [19]:
# Collect the names of the raw SMAP/MSL files ending in "txt".
# NOTE(review): in the recorded run this returns an empty list, so the raw folder
# apparently holds no "txt" files at this level; the filter looks copied from the SMD
# section - confirm the intended file ending.
filenames = find_files_in_path(directory=dataset_directory, file_ending="txt")

In [20]:
# Preview the first five file names and report how many files were found in total.
first_five = filenames[:5]
print(first_five)
print("N of files:", len(filenames))

[]
N of files: 0


## Test and try to understand the script

In [21]:
from quovadis_tad.dataset_utils.data_utils import extract_smap_msl_dataset

#### SMAP

In [22]:
# Process the SMAP part of the shared SMAP/MSL raw folder.
extract_smap_msl_dataset(
    dataset_folder=dataset_directory,
    output_dir=output_dir,
    dataset="SMAP",
)

In [23]:
# Evaluate the simple baselines point-wise on the SMAP dataset.
# NOTE(review): the recorded run emitted warnings from the explained-variance-ratio
# computation (division by the total variance); presumably some traces have zero
# variance - confirm.
smap_dataset_names = ['smap']
df_point_wise = evaluate_simple_baselines_on_all_paper_datasets(
    root_path=module_path,
    dataset_names=smap_dataset_names,
    eval_method='point_wise',
    data_normalization="0-1",
    score_normalization='optimal',
    verbose=False,
)

[INFO:] SMAP contains 54 data traces.


  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var


In [24]:
# Verified: matches Suppl. Table 8, except that the PCA_Error F1 obtained here (0.387) is
# higher than the value in Table 8 (0.361); this could be due to the selected PCA method.
hidden_metrics = ['P', 'R', 'AUPRC']
f1_table = df_point_wise.drop(columns=hidden_metrics, level=1)
f1_table.style.format(precision=3)

Unnamed: 0_level_0,SMAP
Unnamed: 0_level_1,F1
Sensor Range Deviation,0.273
Simple L2_norm,0.351
1-NN Distance,0.352
PCA_Error,0.387


In [25]:
#### MSL

In [26]:
# Process the MSL part of the shared SMAP/MSL raw folder.
extract_smap_msl_dataset(
    dataset_folder=dataset_directory,
    output_dir=output_dir,
    dataset="MSL",
)

In [27]:
# Evaluate the simple baselines point-wise on the MSL dataset.
msl_dataset_names = ['msl']
df_point_wise = evaluate_simple_baselines_on_all_paper_datasets(
    root_path=module_path,
    dataset_names=msl_dataset_names,
    eval_method='point_wise',
    data_normalization="0-1",
    score_normalization='optimal',
    verbose=False,
)

[INFO:] MSL contains 27 data traces.


In [28]:
# Verified: matches Suppl. Table 8.
hidden_metrics = ['P', 'R', 'AUPRC']
f1_table = df_point_wise.drop(columns=hidden_metrics, level=1)
f1_table.style.format(precision=3)

Unnamed: 0_level_0,MSL
Unnamed: 0_level_1,F1
Sensor Range Deviation,0.328
Simple L2_norm,0.395
1-NN Distance,0.404
PCA_Error,0.426
