In [11]:
%load_ext autoreload
%autoreload 2

In [12]:
import sys
from pathlib import Path

module_path = str(Path.cwd().parents[0])
# in standard python
# module_path = str(Path.cwd(__file__).parents[0])
if module_path not in sys.path:
	sys.path.append(module_path)

In [13]:
import os
import numpy as np
import pandas as pd
from dataset_utils.dataset_reader import datasets

## Datasets

We have five multivariate timeseries datsets benchmarks. WADI, SWaT, SMD, MSL and SMAP.

All datsets have been prepared and are in processed_datasets folder. A datset can be loaded with the datsets utility dict which contains different dataset loading functions.

We can simply load a dataset with the respective key and passing the root folder path (i.e. the path where the processed_dataset folder lies.) 

In [53]:
datasets.keys()

dict_keys(['wadi', 'swat', 'smd', 'msl', 'smap'])

### Load a dataset

In [56]:
train, test, labels = datasets["msl"](module_path)

In [57]:
print(f"{train.shape}, {test.shape}, {labels.shape}")

(2922, 55), (4315, 55), (4315, 55)


#### Note:

Labels are test-set labels (0 or 1). For all datasets (except SwaT) they are 2D i.e have sensor level anaomly for a given timestamp.

For evaluation we only need 0-1 label for each timestamp(row in the test set data array). So any row that has a 1 is anomolous. we can just do something like below to get the 1-d labels for evaluation. While we can use the 2D labels for anaylsis on which sensor is anomolous.

In [58]:
if len(labels.shape) > 1:
	test_labels = labels.max(1)
else:
	test_labels = labels
np.unique(labs, return_counts=True)

(array([0, 1]), array([16527,   753]))

## Also note

datasets are saved without normalization. Ususally the features needs to be scaled in the range 0 -1 with MinMax norm. We will do it in our training/testing model dev.

# Sample Evalaution on a dataset using random predictions

In [11]:
from evaluation_scripts.evaluate import evaluate

In [59]:
predictions = np.random.rand(test_labels.shape[0])

In [60]:
results, df = evaluate(predictions, test_labels, pa=True, interval=10, k=50)

[INFO]: Computing scores with Point Adjust (PA) method by finding best threshold... this may take a while ...
computing conventional PA scores 
computing PA@k scores
           with_PA  with_PA@K  without_PA
       F1 0.888407   0.286174    0.190498
Precision 0.857741   0.166979    0.106202
   Recall 0.921348   1.000000    0.923596
      AUC 0.951889   0.713178    0.505192


The evaluation script is from the AAAI'22 paper [Towards a Rigorous Evaluation of Time-series Anomaly Detection](https://arxiv.org/pdf/2109.05257.pdf).

The scores confirm that the currently used Point Adjust PA method in most of the papers is highly biased (flawed). As seen, we can have the numbers with their PA@K as well as the raw to report/compare and discuss.

## Leftover --  SwaT and WADI data preps with GDN scripts

#### prepare SWaT dataset with gdn provided script

In [83]:
filename = "SWaT_Dataset_Normal_v0.xlsx"
file_path = Path(module_path, "TranAD/data/SWaT_orig", filename)

In [19]:
rfile = pd.read_excel(file_path)

In [85]:
rfile.to_csv(
	Path(module_path, "TranAD/data/SWaT_orig", "swat_train.csv"),
	index=None,
	header=False,
)

In [94]:
df = pd.read_csv(Path(module_path, "TranAD/data/SWaT_orig", "swat_train.csv"))

In [98]:
df["Normal/Attack"] = df["Normal/Attack"].map({"Normal": 0, "Attack": 1})

In [100]:
df = df.rename(columns={"Normal/Attack": "attack"})

In [102]:
df.to_csv(
	Path(module_path, "data/SWaT_orig", "swat_train.csv"), index=None, header=True
)

## 0-1 norm is done but we will save the data without it
from sklearn.preprocessing import MinMaxScaler
normalizer = MinMaxScaler(feature_range=(0, 1)).fit(train_data[:, :-1]) # scale training data to [0,1] range
train_ret = normalizer.transform(train_data[:, :-1])
test_ret = normalizer.transform(test_data[:, :-1])


np.save(Path(module_path, 'processed/SWaT', 'test.npy'), test_data[:, :-1])

In [110]:
tff = pd.read_csv(os.path.join(dataset_folder, "test.csv"))

In [111]:
tff.head()

Unnamed: 0.1,Unnamed: 0,1_AIT_001_PV,1_AIT_002_PV,1_AIT_003_PV,1_AIT_004_PV,1_AIT_005_PV,1_FIT_001_PV,1_LS_001_AL,1_LS_002_AL,1_LT_001_PV,...,3_MV_002_STATUS,3_MV_003_STATUS,3_P_001_STATUS,3_P_002_STATUS,3_P_003_STATUS,3_P_004_STATUS,LEAK_DIFF_PRESSURE,PLANT_START_STOP_LOG,TOTAL_CONS_REQUIRED_FLOW,attack
0,0,164.21,0.529486,11.9972,482.48,0.331167,0.001273,0.0,0.0,48.482,...,1.0,1.0,1.0,1.0,1.0,1.0,62.6226,1.0,0.39,0
1,1,164.213,0.559483,11.9984,482.48,0.33117,0.001104,0.0,0.0,48.4878,...,1.0,1.0,1.0,1.0,1.0,1.0,62.7149,1.0,0.39,0
2,2,164.216,0.556482,11.9982,482.452,0.33127,0.001215,0.0,0.0,48.4695,...,1.0,1.0,1.0,1.0,1.0,1.0,62.6514,1.0,0.39,0
3,3,164.21,0.553484,11.9949,482.491,0.331177,0.001204,0.0,0.0,48.38,...,1.0,1.0,1.0,1.0,1.0,1.0,62.1997,1.0,0.32,0
4,4,164.202,0.532485,12.0,482.477,0.331161,0.00121,0.0,0.0,48.2816,...,1.0,1.0,1.0,1.0,1.0,1.0,62.4,1.0,0.36,0


In [98]:
ls_1 = pd.read_csv(os.path.join(dataset_folder, "WADI_attacklabels_gdn.csv"))

In [99]:
ls_1

Unnamed: 0,Id,Date,Start Time,End Time,Affected
0,1,10/09/2017,19:25:00,19:50:16,1_MV_001
1,2,10/10/2017,10:24:10,10:34:00,1_FIT_001
2,3-4,10/10/2017,10:55:00,11:24:00,"2_LT_002, 1_AIT_001"
3,5,10/10/2017,11:30:40,11:44:50,"2_MCV_101, 2_MCV_201, 2_MCV301, 2_MCV_401, 2_M..."
4,6,10/10/2017,13:39:30,13:50:40,"2_MCV_101, 2_MCV_201"
5,7,10/10/2017,14:48:17,14:59:55,"1_AIT_002, 1_AIT_003, 1_AIT_004, 1_AIT_005, 1_..."
6,8,10/10/2017,17:40:00,17:49:40,2_MCV_007
7,9,10/10/2017,10:55:00,10:56:27,1_P_006
8,10,10/11/2017,11:17:54,11:31:20,1_MV_001
9,11,11/11/2017,11:36:31,11:47:00,2_MCV_007


# prepare WADI dataset with gdn provided script

In [14]:
dataset_folder = Path(module_path, "TranAD/data/WADI")

In [74]:
ls = pd.read_csv(os.path.join(dataset_folder, "WADI_attacklabels_gdn.csv"))
ls["Date"][7] = "11/10/2017"
ls["Date"][9] = "10/11/2017"
ls["Date"][10] = "10/11/2017"
ls["Date"][11] = "10/11/2017"
# train = pd.read_csv(os.path.join(dataset_folder, 'WADI_14days.csv'),  skiprows=1200,  nrows=2e5)
test = pd.read_csv(os.path.join(dataset_folder, "WADI_attackdata.csv"))

# train.dropna(how='all', inplace=True); test.dropna(how='all', inplace=True)
# train.fillna(0, inplace=True); test.fillna(0, inplace=True)

# train = train.fillna(train.mean())
test = test.fillna(test.mean())
# train = train.fillna(0)
test = test.fillna(0)

test["Time"] = test["Time"].astype(str)
test["Time"] = pd.to_datetime(test["Date"] + " " + test["Time"])
labels = test.copy(deep=True)
for i in test.columns.tolist()[3:]:
	labels[i] = 0
for i in ["Start Time", "End Time"]:
	ls[i] = ls[i].astype(str)
	ls[i] = pd.to_datetime(ls["Date"] + " " + ls[i])
for index, row in ls.iterrows():
	to_match = row["Affected"].split(", ")
	matched = []
	for i in test.columns.tolist()[3:]:
		for tm in to_match:
			if tm in i:
				matched.append(i)
				break
	st, et = str(row["Start Time"]), str(row["End Time"])
	labels.loc[(labels["Time"] >= st) & (labels["Time"] <= et), matched] = 1

  test = test.fillna(test.mean())


In [46]:
# test.columns.tolist()[3:]

In [75]:
ls

Unnamed: 0,Id,Date,Start Time,End Time,Affected
0,1,10/09/2017,2017-10-09 19:25:00,2017-10-09 19:50:16,1_MV_001
1,2,10/10/2017,2017-10-10 10:24:10,2017-10-10 10:34:00,1_FIT_001
2,3-4,10/10/2017,2017-10-10 10:55:00,2017-10-10 11:24:00,"2_LT_002, 1_AIT_001"
3,5,10/10/2017,2017-10-10 11:30:40,2017-10-10 11:44:50,"2_MCV_101, 2_MCV_201, 2_MCV301, 2_MCV_401, 2_M..."
4,6,10/10/2017,2017-10-10 13:39:30,2017-10-10 13:50:40,"2_MCV_101, 2_MCV_201"
5,7,10/10/2017,2017-10-10 14:48:17,2017-10-10 14:59:55,"1_AIT_002, 1_AIT_003, 1_AIT_004, 1_AIT_005, 1_..."
6,8,10/10/2017,2017-10-10 17:40:00,2017-10-10 17:49:40,2_MCV_007
7,9,11/10/2017,2017-11-10 10:55:00,2017-11-10 10:56:27,1_P_006
8,10,10/11/2017,2017-10-11 11:17:54,2017-10-11 11:31:20,1_MV_001
9,11,10/11/2017,2017-10-11 11:36:31,2017-10-11 11:47:00,2_MCV_007


In [62]:
labels.head()

Unnamed: 0,Row,Date,Time,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_002_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_003_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_004_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_005_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_FIT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_LS_001_AL,...,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_001_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_001_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_004_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\LEAK_DIFF_PRESSURE,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\PLANT_START_STOP_LOG,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\TOTAL_CONS_REQUIRED_FLOW
0,1,10/9/2017,2017-10-09 18:00:00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,10/9/2017,2017-10-09 18:00:01,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,10/9/2017,2017-10-09 18:00:02,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,10/9/2017,2017-10-09 18:00:03,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,10/9/2017,2017-10-09 18:00:04,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
labs_ = labels.iloc[:, 3:]

In [77]:
labels_ = labs_.to_numpy()

In [78]:
labels_.shape

(172801, 127)

In [79]:
anomoly_label = labels_.max(1)

In [80]:
np.unique(anomoly_label, return_counts=True)

(array([0, 1]), array([163721,   9080]))

In [81]:
test_labelled = test.copy(deep=True)

In [82]:
test_labelled["attack"] = labels_.max(1)

In [83]:
test_labelled.to_csv(
	"/cvhci/data/Saquib_GSP/TAD/TranAD/data/WADI/WADI_attackdata_labelled.csv"
)

In [84]:
tf = pd.read_csv(
	"/cvhci/data/Saquib_GSP/TAD/TranAD/data/WADI/WADI_attackdata_labelled.csv"
)

In [85]:
tf.head()

Unnamed: 0.1,Unnamed: 0,Row,Date,Time,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_002_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_003_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_004_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_005_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_FIT_001_PV,...,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_001_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_004_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\LEAK_DIFF_PRESSURE,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\PLANT_START_STOP_LOG,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\TOTAL_CONS_REQUIRED_FLOW,attack
0,0,1,10/9/2017,2017-10-09 18:00:00,164.21,0.529486,11.9972,482.48,0.331167,0.001273,...,1,1,1,1,1,1,62.6226,1,0.39,0
1,1,2,10/9/2017,2017-10-09 18:00:01,164.21,0.529486,11.9972,482.48,0.331167,0.001273,...,1,1,1,1,1,1,62.6226,1,0.39,0
2,2,3,10/9/2017,2017-10-09 18:00:02,164.21,0.529486,11.9972,482.48,0.331167,0.001273,...,1,1,1,1,1,1,62.6226,1,0.39,0
3,3,4,10/9/2017,2017-10-09 18:00:03,164.21,0.529486,11.9972,482.48,0.331167,0.001273,...,1,1,1,1,1,1,62.6226,1,0.39,0
4,4,5,10/9/2017,2017-10-09 18:00:04,164.21,0.529486,11.9972,482.48,0.331167,0.001273,...,1,1,1,1,1,1,62.6226,1,0.39,0


In [97]:
tf.iloc[:, 4:]

Unnamed: 0,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_002_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_003_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_004_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_AIT_005_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_FIT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_LS_001_AL,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_LS_002_AL,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_LT_001_PV,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\1_MV_001_STATUS,...,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_MV_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_001_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_002_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_003_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\3_P_004_STATUS,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\LEAK_DIFF_PRESSURE,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\PLANT_START_STOP_LOG,\\WIN-25J4RO10SBF\LOG_DATA\SUTD_WADI\LOG_DATA\TOTAL_CONS_REQUIRED_FLOW,attack
0,164.210,0.529486,11.9972,482.480,0.331167,0.001273,0,0,48.4820,1,...,1,1,1,1,1,1,62.6226,1,0.39,0
1,164.210,0.529486,11.9972,482.480,0.331167,0.001273,0,0,48.4820,1,...,1,1,1,1,1,1,62.6226,1,0.39,0
2,164.210,0.529486,11.9972,482.480,0.331167,0.001273,0,0,48.4820,1,...,1,1,1,1,1,1,62.6226,1,0.39,0
3,164.210,0.529486,11.9972,482.480,0.331167,0.001273,0,0,48.4820,1,...,1,1,1,1,1,1,62.6226,1,0.39,0
4,164.210,0.529486,11.9972,482.480,0.331167,0.001273,0,0,48.4820,1,...,1,1,1,1,1,1,62.6226,1,0.39,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172796,172.959,0.547483,11.9184,466.034,0.318217,0.001222,0,0,55.5587,1,...,1,1,1,1,1,1,59.3546,1,0.00,0
172797,172.959,0.547483,11.9184,466.034,0.318217,0.001222,0,0,55.5587,1,...,1,1,1,1,1,1,59.3546,1,0.00,0
172798,172.915,0.583479,11.9211,466.051,0.318317,0.001260,0,0,55.7260,1,...,1,1,1,1,1,1,58.8102,1,0.00,0
172799,172.915,0.583479,11.9211,466.051,0.318317,0.001260,0,0,55.7260,1,...,1,1,1,1,1,1,58.8102,1,0.00,0


In [89]:
tf_t = pd.read_csv(
	"/cvhci/data/Saquib_GSP/TAD/TranAD/data/WADI/WADI_14days.csv",
	skiprows=50,
	nrows=121e4,
)

In [90]:
tf_t.head

<bound method NDFrame.head of               46  9/25/2017  6:00:45.000 PM  171.159  0.613472  11.5691  \
0             47  9/25/2017  6:00:46.000 PM  171.159  0.613472  11.5691   
1             48  9/25/2017  6:00:47.000 PM  171.159  0.613472  11.5691   
2             49  9/25/2017  6:00:48.000 PM  171.157  0.619473  11.5703   
3             50  9/25/2017  6:00:49.000 PM  171.157  0.619473  11.5703   
4             51  9/25/2017  6:00:50.000 PM  171.157  0.619473  11.5703   
...          ...        ...             ...      ...       ...      ...   
1209550  1209597  10/9/2017  5:59:56.000 PM  164.211  0.511488  11.9928   
1209551  1209598  10/9/2017  5:59:57.000 PM  164.211  0.511488  11.9928   
1209552  1209599  10/9/2017  5:59:58.000 PM  164.211  0.511488  11.9928   
1209553  1209600  10/9/2017  5:59:59.000 PM  164.211  0.511488  11.9928   
1209554  1209601  10/9/2017  6:00:00.000 PM  164.210  0.529486  11.9972   

         504.746  0.318524  0.00115931  0  ...  1.28  1.29  1.30  1.3

In [95]:
tf_t.iloc[:, 3:]

Unnamed: 0,171.159,0.613472,11.5691,504.746,0.318524,0.00115931,0,0.1,48.9955,1,...,1.28,1.29,1.30,1.31,1.32,1.33,1.34,60.4271,1.35,0.68
0,171.159,0.613472,11.5691,504.746,0.318524,0.001159,0,0,48.9955,1,...,1,1,1,1,1,1,1,60.4271,1,0.68
1,171.159,0.613472,11.5691,504.746,0.318524,0.001159,0,0,48.9955,1,...,1,1,1,1,1,1,1,60.4271,1,0.68
2,171.157,0.619473,11.5703,504.779,0.318343,0.001279,0,0,49.2548,1,...,1,1,1,1,1,1,1,60.5116,1,0.68
3,171.157,0.619473,11.5703,504.779,0.318343,0.001279,0,0,49.2548,1,...,1,1,1,1,1,1,1,60.5116,1,0.68
4,171.157,0.619473,11.5703,504.779,0.318343,0.001279,0,0,49.2548,1,...,1,1,1,1,1,1,1,60.5116,1,0.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1209550,164.211,0.511488,11.9928,482.452,0.331198,0.001182,0,0,48.3848,1,...,1,1,1,1,1,1,1,62.8802,1,0.39
1209551,164.211,0.511488,11.9928,482.452,0.331198,0.001182,0,0,48.3848,1,...,1,1,1,1,1,1,1,62.8802,1,0.39
1209552,164.211,0.511488,11.9928,482.452,0.331198,0.001182,0,0,48.3848,1,...,1,1,1,1,1,1,1,62.8802,1,0.39
1209553,164.211,0.511488,11.9928,482.452,0.331198,0.001182,0,0,48.3848,1,...,1,1,1,1,1,1,1,62.8802,1,0.39


In [91]:
test = test.rename(columns=lambda x: x.strip())

In [8]:
test.columns

NameError: name 'test' is not defined

In [33]:
from TranAD.data.WADI.process_wadi import downsample

In [34]:
test = test.iloc[:, 3:]

In [49]:
d_test_x, d_test_labels = downsample(test.values, labels_.max(1), 10)

In [50]:
d_test_labels = np.array(d_test_labels)

In [51]:
d_test_x = np.array(d_test_x)

In [52]:
d_test_labels.shape

(17280,)

In [59]:
np.unique(d_test_labels, return_counts=True)

(array([0, 1]), array([16527,   753]))

In [53]:
d_test_x.shape

(17280, 127)

In [54]:
labels_.shape

(172801, 127)

In [57]:
np.save(Path(module_path, "processed_datasets/WADI_gdn", "labels.npy"), d_test_labels)

In [58]:
np.save(Path(module_path, "processed_datasets/WADI_gdn", "test.npy"), d_test_x)

In [71]:
col_list = pd.read_csv(
	"/cvhci/data/Saquib_GSP/TAD/TranAD/data/WADI/wadi_list.txt", names=["col_name"]
)

In [72]:
col_list

Unnamed: 0,col_name
0,1_AIT_001_PV
1,1_AIT_002_PV
2,1_AIT_003_PV
3,1_AIT_004_PV
4,1_AIT_005_PV
...,...
107,3_P_003_STATUS
108,3_P_004_STATUS
109,LEAK_DIFF_PRESSURE
110,PLANT_START_STOP_LOG


In [2]:
import pandas as pd

In [3]:
test_orig = pd.read_csv("/cvhci/data/TAD/wadi_gdn_csv/test.csv", sep=",", index_col=0)

In [4]:
test_orig.head()

Unnamed: 0,1_AIT_001_PV,1_AIT_002_PV,1_AIT_003_PV,1_AIT_004_PV,1_AIT_005_PV,1_FIT_001_PV,1_LS_001_AL,1_LS_002_AL,1_LT_001_PV,1_MV_001_STATUS,...,3_MV_002_STATUS,3_MV_003_STATUS,3_P_001_STATUS,3_P_002_STATUS,3_P_003_STATUS,3_P_004_STATUS,LEAK_DIFF_PRESSURE,PLANT_START_STOP_LOG,TOTAL_CONS_REQUIRED_FLOW,attack
0,164.21,0.529486,11.9972,482.48,0.331167,0.001273,0.0,0.0,48.482,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,62.6226,1.0,0.39,0
1,164.213,0.559483,11.9984,482.48,0.33117,0.001104,0.0,0.0,48.4878,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,62.7149,1.0,0.39,0
2,164.216,0.556482,11.9982,482.452,0.33127,0.001215,0.0,0.0,48.4695,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,62.6514,1.0,0.39,0
3,164.21,0.553484,11.9949,482.491,0.331177,0.001204,0.0,0.0,48.38,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,62.1997,1.0,0.32,0
4,164.202,0.532485,12.0,482.477,0.331161,0.00121,0.0,0.0,48.2816,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,62.4,1.0,0.36,0


In [7]:
len(test_orig)

17280