In [25]:
import os

import numpy as np
import pandas as pd

def downsampling(mat, interval, index):
    """
    Downsample a 2-D matrix by keeping one row out of every ``interval`` rows.

    The rows are treated as consecutive groups of ``interval`` rows and the
    row at offset ``index`` inside each group is kept. When the row count is
    not a multiple of ``interval`` the last group is incomplete; if that group
    has no row at offset ``index``, an all-zero row stands in for it. The
    result therefore always has ``ceil(num_row / interval)`` rows.

    :param mat: 2-D input matrix (NumPy array, DataFrame or nested sequence)
    :param interval: sampling interval, a positive integer
    :param index: offset of the kept row inside each group,
        ``-interval <= index < interval``; negative values count from the
        end of the group
    :return: float64 NumPy array of shape ``(ceil(num_row / interval), num_col)``
    :raises ValueError: if ``interval`` is not positive or ``mat`` is not 2-D
    :raises IndexError: if ``index`` lies outside ``[-interval, interval)``
    """
    mat = mat.to_numpy() if isinstance(mat, pd.DataFrame) else np.asarray(mat)
    num_row, num_col = mat.shape  # fails with ValueError when mat is not 2-D

    if interval <= 0:
        raise ValueError(f"interval must be a positive integer, got {interval}")
    if not -interval <= index < interval:
        raise IndexError(
            f"index {index} is out of range for interval {interval}"
        )

    # Select only the requested phase; there is no need to materialise all
    # `interval` phases just to return one of them.
    offset = index % interval  # maps a negative index onto its positive twin
    picked = mat[offset::interval, :]

    # Ceiling division: an incomplete last group still yields an output row.
    out_rows = -(-num_row // interval)
    sampled = np.zeros((out_rows, num_col))  # float64; missing row stays zero
    sampled[:picked.shape[0], :] = picked

    print(f"down index :{index}")
    print(sampled.shape)  # shape of the downsampled matrix
    return sampled

# Load the attack recording; the first CSV column is dropped by position.
swat_abnormal = pd.read_csv("./origin/SWaT_Dataset_Attack_v0.csv").iloc[:,1:]
data = swat_abnormal.rename(columns={'Normal/Attack': 'label'})

# Encode the labels as Normal -> 0, Attack -> 1. Spaces are stripped first so
# that spellings such as 'A ttack' are covered, and the column ends up numeric
# instead of an object column mixing strings and integers.
label_text = data['label'].astype(str).str.replace(' ', '', regex=False)
label_code = label_text.map({'Normal': 0, 'Attack': 1})
if label_code.isna().any():
    unknown_labels = sorted(label_text[label_code.isna()].unique())
    raise ValueError(f"unexpected label values: {unknown_labels}")
data['label'] = label_code.astype(int)

# Downsample: keep the row at offset 20 of every 60-row group.
data_downsampled = downsampling(data, interval=60,index=20)

# Report NaN or Inf values in the downsampled array.
if np.isnan(data_downsampled).any() or np.isinf(data_downsampled).any():
    print("data 存在空值或无穷值")

# Back to a DataFrame with the original column names.
data_downsampled = pd.DataFrame(data_downsampled, columns=data.columns)

# Number of missing values per column, restricted to columns that have any.
missing_values = data_downsampled.isnull().sum()
missing_columns = missing_values[missing_values > 0]
print("有缺失值的列及对应的缺失值数量：")
print(missing_columns)

# Split into the label column and the remaining feature columns.
test_label = data_downsampled[['label']]
test = data_downsampled.drop(columns=['label'])

# Save as CSV without the index. The target directory is created first so the
# cell also works on a fresh checkout.
os.makedirs("./down", exist_ok=True)
test_label.to_csv("./down/test_label.csv", index=False)
test.to_csv("./down/test.csv", index=False)

print("文件保存完成：./down/test.csv 和 ./down/test_label.csv")
print(f"test shape:{test.shape},test_label shape:{test_label.shape}")
# The recorded run printed: test shape:(7499, 51),test_label shape:(7499, 1)

down index :20
(7499, 52)
有缺失值的列及对应的缺失值数量：
Series([], dtype: int64)
文件保存完成：./down/test.csv 和 ./down/test_label.csv
test shape:(7499, 51),test_label shape:(7499, 1)


In [27]:
# Load the normal-operation recording, then drop the first and the last
# column by position (presumably timestamp and label -- TODO confirm).
swat_normal = pd.read_csv('./origin/SWaT_Dataset_Normal_v1.csv')
swat_normal = swat_normal.iloc[:, 1:-1]

# Keep the first row of every 10-row group.
data_train_downsampled = downsampling(swat_normal, interval=10, index=0)

# Wrap the array in a DataFrame again and store it without the index.
data_train_down = pd.DataFrame(
    data_train_downsampled,
    columns=swat_normal.columns,
)
data_train_down.to_csv("./down/train.csv", index=False)
data_train_down

down index :0
(49680, 51)


Unnamed: 0,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,...,FIT504,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603
0,2.470294,261.5804,2.0,2.0,1.0,244.3284,8.190080,306.1010,2.471278,2.0,...,0.000000,1.0,1.0,10.02948,0.000000,4.277749,0.000256,1.0,1.0,1.0
1,2.630753,261.7766,2.0,2.0,1.0,244.8090,8.190080,306.1010,2.470894,2.0,...,0.000000,1.0,1.0,10.02948,0.000000,4.277749,0.000256,1.0,1.0,1.0
2,2.556769,261.6589,2.0,2.0,1.0,245.4499,8.190080,305.8703,2.471663,2.0,...,0.000000,1.0,1.0,10.02948,0.000000,4.277749,0.000256,1.0,1.0,1.0
3,2.425456,261.5019,2.0,2.0,1.0,245.4499,8.193604,305.6396,2.470509,2.0,...,0.000000,1.0,1.0,10.12561,0.000000,4.277749,0.000256,1.0,1.0,1.0
4,2.619223,262.9542,2.0,2.0,1.0,245.8985,8.193604,305.7421,2.470894,2.0,...,0.000000,1.0,1.0,10.02948,0.000000,4.277749,0.000256,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49675,2.574384,522.3757,2.0,2.0,1.0,262.0161,8.391951,328.5312,2.440137,2.0,...,0.308362,2.0,1.0,251.10550,1.778105,189.759000,0.000128,1.0,1.0,1.0
49676,2.597764,523.0823,2.0,2.0,1.0,262.0161,8.389708,328.5312,2.442828,2.0,...,0.306633,2.0,1.0,251.20160,1.698010,189.999400,0.000128,1.0,1.0,1.0
49677,2.436985,522.3364,2.0,2.0,1.0,262.0161,8.388105,328.6593,2.441162,2.0,...,0.306633,2.0,1.0,251.21760,1.698010,190.127500,0.000128,1.0,1.0,1.0
49678,2.669186,521.5121,2.0,2.0,1.0,262.0161,8.388105,328.6593,2.441034,2.0,...,0.306633,2.0,1.0,251.07340,1.681991,189.983300,0.000128,1.0,1.0,1.0


In [30]:
import os
import pandas as pd
import numpy as np
from pickle import dump
output_folder = 'processed'  # directory that receives the output files
os.makedirs(output_folder, exist_ok=True)  # create the directory if it does not exist yet

def load_and_save(filename, dataset_folder, dataset_name=None, output_folder='processed'):
    """
    Convert one CSV file into a pickled float32 NumPy array.

    The file is read with ``pd.read_csv`` (first line is the header), every
    column is cast to ``float32`` and the resulting array is pickled to
    ``<output_folder>/<dataset_name>_<dataset_folder>_<stem>.pkl``, where path
    separators in ``dataset_folder`` become underscores and double
    underscores in the file name are collapsed.

    Failures are reported on stdout instead of being raised, so a batch run
    over many files keeps going.

    Parameters:
        filename (str): Name of the CSV file inside ``dataset_folder``.
        dataset_folder (str): Directory that contains the CSV file.
        dataset_name (str, optional): Prefix of the output file name. When
            omitted, the module-level variable ``dataset`` is used.
        output_folder (str): Directory for the ``.pkl`` file; created when
            missing.

    Returns:
        str or None: Path of the written file, or ``None`` when processing
        failed.
    """
    file_path = os.path.join(dataset_folder, filename)
    print(f"Processing file: {file_path}")

    try:
        # Backward compatibility: older callers pass no prefix and rely on a
        # module-level `dataset`; a missing global is reported like any
        # other processing error.
        prefix = dataset if dataset_name is None else dataset_name

        # Read the CSV and convert all values to a float32 array.
        temp = pd.read_csv(file_path).values.astype(np.float32)
        print(f"Dataset Folder: {dataset_folder}, File: {filename}, Shape: {temp.shape}")

        os.makedirs(output_folder, exist_ok=True)

        # Build the output name from prefix, folder and file stem.
        base_name = os.path.splitext(filename)[0]
        formatted_dataset = dataset_folder.replace(os.sep, "_")
        save_name = f"{prefix}_{formatted_dataset}_{base_name}.pkl".replace("__", "_")
        print(save_name)
        save_path = os.path.join(output_folder, save_name)

        with open(save_path, "wb") as file:
            dump(temp, file)
        print(f"Saved to: {save_path}")
        return save_path
    except Exception as e:
        # Deliberate best effort: report the problem and let the caller go on.
        print(f"Error processing {filename}: {e}")
        return None
        
def load_data(dataset_folder):
    """
    Run ``load_and_save`` on every CSV file located directly in a directory.

    Parameters:
        dataset_folder (str): Directory whose entries are scanned. Only names
            ending in ``.csv`` are processed; everything else is skipped.
    """
    file_list = os.listdir(dataset_folder)
    print(f"Files in dataset folder '{dataset_folder}': {file_list}")

    # Filter first, then convert the matching files in listing order.
    csv_names = [entry for entry in file_list if entry.endswith('.csv')]
    for csv_name in csv_names:
        load_and_save(csv_name, dataset_folder)


# `dataset` is read by load_and_save as the prefix of every output file name,
# so it has to be bound before the batch conversion starts.
dataset, dataset_folder = 'SWAT', 'down'
load_data(dataset_folder)


Files in dataset folder 'down': ['test_label.csv', 'test.csv', 'train.csv']
Processing file: down/test_label.csv
Dataset Folder: down, File: test_label.csv, Shape: (7499, 1)
SWAT_down_test_label.pkl
Saved to: processed/SWAT_down_test_label.pkl
Processing file: down/test.csv
Dataset Folder: down, File: test.csv, Shape: (7499, 51)
SWAT_down_test.pkl
Saved to: processed/SWAT_down_test.pkl
Processing file: down/train.csv
Dataset Folder: down, File: train.csv, Shape: (49680, 51)
SWAT_down_train.pkl
Saved to: processed/SWAT_down_train.pkl
