In [8]:
import numpy as np
import pandas as pd

def downsampling(mat, interval):
    """
    对矩阵进行降采样,根据指定的间隔,将矩阵的行数缩减为每 interval 取一行。
    :param mat: 输入的二维矩阵 (NumPy 数组或 DataFrame)
    :param interval: 采样间隔
    :return: 降采样后的矩阵
    """
    mat = mat.to_numpy() if isinstance(mat, pd.DataFrame) else mat  # 转换为 NumPy 数组
    num_row, num_col = mat.shape  # 获取矩阵的行数和列数
    res = num_row % interval  # 计算矩阵行数对采样间隔的余数
    if res != 0:  # 如果行数不能被采样间隔整除
        add_num = interval - res  # 计算需要补充的行数
        add_mat = np.zeros((add_num, num_col))  # 创建一个全零的矩阵进行补充
        mat = np.concatenate((mat, add_mat), axis=0)  # 将原矩阵与补充的全零矩阵拼接在一起
    num_row, num_col = mat.shape  # 获取补充后的矩阵的行数和列数
    mat_tmp = np.zeros((interval, num_row // interval, num_col))  # 创建一个临时矩阵用于存储降采样数据
    for i in range(interval):  # 遍历每个采样间隔
        mat_tmp[i, ...] = mat[i::interval, :]  # 每隔 interval 行采样一次,存储到临时矩阵中
    first_slice = mat_tmp[4, :, :]  # 提取降采样后的其中一片
    print(first_slice.shape)  # 输出降采样后矩阵的形状
    return first_slice  # 返回降采样后的矩阵

# 数据加载和预处理
data = pd.read_csv("WADI_attackdataLABLE.csv", header=1).iloc[:172800, 3:]  # (172804, 131)
data.index = data.index.astype(int)

# 重命名 'Attack LABLE (1:No Attack, -1:Attack)' 列为 'label'
data = data.rename(columns={'Attack LABLE (1:No Attack, -1:Attack)': 'label'})
# 删掉值为空的列
ncolumns = ['2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS']
data = data.drop(columns=ncolumns)

# 替换标签值
data.loc[data['label'] == 1, 'label'] = 0
data.loc[data['label'] == -1, 'label'] = 1

# 降采样
data_downsampled = downsampling(data, interval=10)

# 检查降采样后的数据是否存在 NaN 或 Inf
if np.isnan(data_downsampled).any() or np.isinf(data_downsampled).any():
    print("data 存在空值或无穷值")

# 转换为 DataFrame
data_downsampled = pd.DataFrame(data_downsampled, columns=data.columns)

# 查看每列缺失值数量
missing_values = data_downsampled.isnull().sum()
# 筛选出包含缺失值的列
missing_columns = missing_values[missing_values > 0]
if(len(missing_columns)>0):
    print("有缺失值的列及对应的缺失值数量：")
    print(missing_columns)

# 分割数据
test_label = data_downsampled[['label']]  # 提取最后一列
test = data_downsampled.drop(columns=['label'])  # 删除最后一列

# 保存为 CSV 文件
test_label.to_csv("./down/test_label.csv", index=False)  # 不保存索引
test.to_csv("./down/test.csv", index=False)  # 不保存索引

print("文件保存完成：./down/test.csv 和 ./down/test_label.csv")
print(f"test shape:{test.shape},test_label shape:{test_label.shape}")
# test shape:(17281, 127),test_label shape:(17281, 1)

(17280, 124)
文件保存完成：./down/test.csv 和 ./down/test_label.csv
test shape:(17280, 123),test_label shape:(17280, 1)


In [9]:
import pandas as pd
data_train = pd.read_csv('./origin_data/WADI_14days_new.csv').iloc[:, 3:]
ncolumns = ['2_LS_001_AL', '2_LS_002_AL', '2_P_001_STATUS', '2_P_002_STATUS']
data_train = data_train.drop(columns=ncolumns)

# 查看每列缺失值数量
data_train_missing = data_train.isnull().sum()
# 筛选出包含缺失值的列
missing_columns = data_train_missing[data_train_missing > 0]
if(len(missing_columns)>0):
    print("有缺失值的列及对应的缺失值数量：")
    print(missing_columns)
# data_train.to_csv("./down/train.csv", index=False)  # 784571 rows × 123 columns
# data_train
data_train_downsampled = downsampling(data_train, interval=5)
# 转换为 DataFrame
data_train_down = pd.DataFrame(data_train_downsampled, columns=data_train.columns)
data_train_down.to_csv("./down/train.csv", index=False)
data_train_down

有缺失值的列及对应的缺失值数量：
1_AIT_002_PV     12
1_AIT_004_PV      6
2B_AIT_004_PV    10
3_AIT_004_PV      6
dtype: int64
(156915, 123)


Unnamed: 0,1_AIT_001_PV,1_AIT_002_PV,1_AIT_003_PV,1_AIT_004_PV,1_AIT_005_PV,1_FIT_001_PV,1_LS_001_AL,1_LS_002_AL,1_LT_001_PV,1_MV_001_STATUS,...,3_MV_001_STATUS,3_MV_002_STATUS,3_MV_003_STATUS,3_P_001_STATUS,3_P_002_STATUS,3_P_003_STATUS,3_P_004_STATUS,LEAK_DIFF_PRESSURE,PLANT_START_STOP_LOG,TOTAL_CONS_REQUIRED_FLOW
0,171.155,0.607477,11.5725,504.673,0.318438,0.001207,0.0,0.0,47.7503,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,67.1948,1.0,0.68
1,171.151,0.613478,11.5735,504.701,0.318495,0.001260,0.0,0.0,48.5082,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,63.9867,1.0,0.68
2,171.151,0.613478,11.5735,504.701,0.318495,0.001260,0.0,0.0,48.5082,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,63.9867,1.0,0.68
3,171.163,0.619474,11.5721,504.723,0.318440,0.001201,0.0,0.0,49.6802,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,62.8957,1.0,0.68
4,171.159,0.619474,11.5734,504.729,0.318478,0.001138,0.0,0.0,50.2051,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,60.3402,1.0,0.68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156910,175.865,0.607477,11.8890,479.151,0.331445,0.001086,0.0,0.0,48.2628,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,60.5908,1.0,0.25
156911,175.874,0.577480,11.8921,479.168,0.331673,0.001135,0.0,0.0,48.2420,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,60.7123,1.0,0.25
156912,175.855,0.589478,11.8941,479.191,0.331571,0.001128,0.0,0.0,48.1129,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,60.6305,1.0,0.25
156913,175.896,0.613476,11.8913,479.224,0.331622,0.001173,0.0,0.0,48.0348,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,60.4477,1.0,0.25


In [10]:
import os
import pandas as pd
import numpy as np
from pickle import dump
output_folder = 'processed'  # 定义保存目录
os.makedirs(output_folder, exist_ok=True)  # 如果目录不存在，则创建

In [11]:
def load_and_save(filename, dataset_folder):
    """
    处理单个文件：加载 CSV 文件，将其转换为 NumPy 数组并保存为 .pkl 格式。
    
    Parameters:
        filename (str): 要加载的文件名。
        dataset_folder (str): 数据集所在的主目录。
    """
    file_path = os.path.join(dataset_folder, filename)
    print(f"Processing file: {file_path}")
    
    # 读取 CSV 文件并转换为 NumPy 数组
    try:
        # .iloc[1:,1:] 删除标题行和timestamp .iloc[1:,1:]
        df = pd.read_csv(file_path)
        # if 'timestamp_(min)' in df.columns:
        #     df = df.drop(columns=['timestamp_(min)'])
        # print(df)
        temp = df.values.astype(np.float32)
        print(f"Dataset Folder: {dataset_folder}, File: {filename}, Shape: {temp.shape}")

        # 定义输出文件夹并确保其存在
        output_folder = 'processed'
        os.makedirs(output_folder, exist_ok=True)

        # 生成输出文件名并保存为 .pkl
        base_name = os.path.splitext(filename)[0]  # 去掉扩展名
        formatted_dataset = dataset_folder.replace(os.sep, "_")  # 替换路径分隔符为 '_'
        save_name = f"WADI_{formatted_dataset}_{base_name}.pkl".replace("__", "_")  # 清理多余下划线
        # save_name = f"{formatted_dataset}_{base_name}.pkl"
        print(save_name)
        save_path = os.path.join(output_folder, save_name)
        
        with open(save_path, "wb") as file:
            dump(temp, file)
        print(f"Saved to: {save_path}")
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        
def load_data(dataset_folder):
    """
    批量处理数据集目录下的所有 CSV 文件。

    Parameters:
        dataset_folder (str): 数据集所在的主目录。
    """
    file_list = os.listdir(dataset_folder)  # 列出目录中的所有文件
    print(f"Files in dataset folder '{dataset_folder}': {file_list}")
    
    # 遍历文件，处理所有 CSV 文件
    for filename in file_list:
        if filename.endswith('.csv'):
            load_and_save(filename, dataset_folder)


In [12]:
dataset = 'down'
# dataset = ['SMAP', 'MSL']
load_data(dataset)

Files in dataset folder 'down': ['test_label.csv', 'test.csv', 'train.csv']
Processing file: down/test_label.csv
Dataset Folder: down, File: test_label.csv, Shape: (17280, 1)
WADI_down_test_label.pkl
Saved to: processed/WADI_down_test_label.pkl
Processing file: down/test.csv
Dataset Folder: down, File: test.csv, Shape: (17280, 123)
WADI_down_test.pkl
Saved to: processed/WADI_down_test.pkl
Processing file: down/train.csv
Dataset Folder: down, File: train.csv, Shape: (156915, 123)
WADI_down_train.pkl
Saved to: processed/WADI_down_train.pkl
