In [1]:
import os
import pandas as pd
import numpy as np
from pickle import dump
output_folder = 'processed'  # name of the output directory
os.makedirs(output_folder, exist_ok=True)  # create the directory if it does not exist yet

In [2]:
def load_and_save(filename, dataset_folder, output_folder='processed'):
    """
    Convert a single CSV file into a pickled float32 NumPy array.

    The CSV is read with pandas, the ``timestamp_(min)`` column is dropped
    when present, the remaining values are cast to ``np.float32`` and the
    array is pickled to ``<output_folder>/<dataset>_<basename>.pkl``, where
    ``<dataset>`` is ``dataset_folder`` with its path separators replaced by
    underscores.

    Processing is best-effort: any error is printed instead of raised so
    that a batch run can continue with the next file. The return value lets
    callers detect a failure.

    Parameters:
        filename (str): Name of the CSV file inside ``dataset_folder``.
        dataset_folder (str): Directory that contains the file.
        output_folder (str): Directory the .pkl file is written to; it is
            created when missing. Defaults to ``'processed'``.

    Returns:
        str or None: Path of the written .pkl file, or ``None`` when the
        file could not be processed.
    """
    file_path = os.path.join(dataset_folder, filename)
    print(f"Processing file: {file_path}")

    try:
        # read_csv consumes the first row as the header; only the optional
        # timestamp column has to be removed explicitly.
        df = pd.read_csv(file_path)
        if 'timestamp_(min)' in df.columns:
            df = df.drop(columns=['timestamp_(min)'])
        temp = df.values.astype(np.float32)
        print(f"Dataset Folder: {dataset_folder}, File: {filename}, Shape: {temp.shape}")

        # Make sure the destination directory exists before writing.
        os.makedirs(output_folder, exist_ok=True)

        # Build a flat file name from the dataset folder and the CSV base name.
        base_name = os.path.splitext(filename)[0]  # strip the extension
        formatted_dataset = dataset_folder.replace(os.sep, "_")
        if os.altsep:
            # On Windows '/' is also a valid separator (os.altsep); without
            # this a folder such as 'PSM/down/' would leave slashes in the
            # file name and the write would target a missing subdirectory.
            formatted_dataset = formatted_dataset.replace(os.altsep, "_")
        # A trailing separator on the folder yields '__' before the base
        # name; collapse it to a single underscore.
        save_name = f"{formatted_dataset}_{base_name}.pkl".replace("__", "_")
        print(save_name)
        save_path = os.path.join(output_folder, save_name)

        with open(save_path, "wb") as file:
            dump(temp, file)
        print(f"Saved to: {save_path}")
        return save_path
    except Exception as e:
        # Deliberately broad: report the problem and let the caller move on.
        print(f"Error processing {filename}: {e}")
        return None
        
def load_data(dataset_folder):
    """
    Run ``load_and_save`` on every CSV file directly inside a dataset folder.

    The directory listing is printed first; entries whose name does not end
    with ``.csv`` are ignored.

    Parameters:
        dataset_folder (str): Directory whose CSV files are converted.
    """
    entries = os.listdir(dataset_folder)
    print(f"Files in dataset folder '{dataset_folder}': {entries}")

    # Keep the listing order and hand each CSV entry to the converter.
    csv_names = [entry for entry in entries if entry.endswith('.csv')]
    for csv_name in csv_names:
        load_and_save(csv_name, dataset_folder)


In [3]:
import numpy as np
import pandas as pd

def downsampling(mat, interval, batch_index=None):
    """
    Down-sample a 2-D array by keeping one row out of every ``interval`` rows.

    When the number of rows is not a multiple of ``interval`` the array is
    first padded at the bottom with zero rows, so every offset yields the
    same number of output rows. The padding uses the dtype of ``mat`` so
    that integer inputs (for example labels) are not upcast to float.

    :param mat: 2-D NumPy array to down-sample.
    :param interval: Positive sampling step.
    :param batch_index: Optional 0-based row offset in ``[0, interval)`` at
        which sampling starts; ``None`` starts at row 0.
    :return: The down-sampled array, rows ``offset, offset + interval, ...``.
    :raises ValueError: If ``interval`` is not positive or ``batch_index``
        lies outside ``[0, interval)``.
    """
    if interval <= 0:
        raise ValueError(f"interval must be a positive integer, got {interval}")
    if batch_index is not None and not 0 <= batch_index < interval:
        raise ValueError(
            f"batch_index must satisfy 0 <= batch_index < interval ({interval}), got {batch_index}"
        )

    num_row, num_col = mat.shape
    res = num_row % interval

    if res != 0:
        # Pad up to the next multiple of `interval`, keeping the input dtype.
        add_mat = np.zeros((interval - res, num_col), dtype=mat.dtype)
        mat = np.concatenate((mat, add_mat), axis=0)

    start = 0 if batch_index is None else batch_index
    return mat[start::interval, :]



# Configuration for down-sampling the PSM test split.
ORIGIN_DIR = "./PSM/origin_data"
DOWN_DIR = "./PSM/down"
TEST_ROWS = 87840  # leading test rows kept; a multiple of INTERVAL, so no zero padding is added
INTERVAL = 5       # keep one row out of every INTERVAL rows
BATCH_INDEX = 3    # 0-based row offset at which sampling starts

# Load the raw CSVs (column names are kept on the DataFrames).
data_df = pd.read_csv(f"{ORIGIN_DIR}/test.csv").iloc[:TEST_ROWS, :]
data_label_df = pd.read_csv(f"{ORIGIN_DIR}/test_label.csv").iloc[:TEST_ROWS, :]
data_train_df = pd.read_csv(f"{ORIGIN_DIR}/train.csv")

# Convert to NumPy arrays without the first column
# (assumed to be an index or timestamp column).
data = data_df.iloc[:, 1:].to_numpy()
data_label = data_label_df.iloc[:, 1:].to_numpy()
data_train = data_train_df.iloc[:, 1:].to_numpy()

# Down-sample the test data and its labels with the same step and offset so
# rows stay aligned. The training data is written out unchanged apart from
# the dropped first column.
data = downsampling(data, interval=INTERVAL, batch_index=BATCH_INDEX)
data_label = downsampling(data_label, interval=INTERVAL, batch_index=BATCH_INDEX)

# Reuse the original column names (minus the first column) for the output.
data_columns = data_df.columns[1:]
data_label_columns = data_label_df.columns[1:]
data_train_columns = data_train_df.columns[1:]

# The output directory may not exist on a fresh checkout; to_csv does not
# create it, so make sure it is there before writing.
os.makedirs(DOWN_DIR, exist_ok=True)

# Convert back to DataFrames and save.
pd.DataFrame(data, columns=data_columns).to_csv(f"{DOWN_DIR}/test.csv", index=False)
pd.DataFrame(data_label, columns=data_label_columns).to_csv(f"{DOWN_DIR}/test_label.csv", index=False)
pd.DataFrame(data_train, columns=data_train_columns).to_csv(f"{DOWN_DIR}/train.csv", index=False)
print(data.shape)
print(data_label.shape)
print(data_train.shape)

(17568, 25)
(17568, 1)
(132481, 25)


In [4]:
# Pickle every CSV found in the down-sampled PSM folder.
dataset = 'PSM/down/'
# dataset = ['SMAP', 'MSL']  # NOTE(review): leftover alternative; load_data passes its argument to os.listdir, so a list would not work
load_data(dataset)

Files in dataset folder 'PSM/down/': ['test_label.csv', 'test.csv', 'train.csv']
Processing file: PSM/down/test_label.csv
Dataset Folder: PSM/down/, File: test_label.csv, Shape: (17568, 1)
PSM_down_test_label.pkl
Saved to: processed/PSM_down_test_label.pkl
Processing file: PSM/down/test.csv
Dataset Folder: PSM/down/, File: test.csv, Shape: (17568, 25)
PSM_down_test.pkl
Saved to: processed/PSM_down_test.pkl
Processing file: PSM/down/train.csv
Dataset Folder: PSM/down/, File: train.csv, Shape: (132481, 25)
PSM_down_train.pkl
Saved to: processed/PSM_down_train.pkl


In [5]:
# Pickle every CSV found directly in the PSM folder (sub-folders are listed but skipped).
dataset = 'PSM'
# dataset = ['SMAP', 'MSL']  # NOTE(review): leftover alternative; load_data passes its argument to os.listdir, so a list would not work
load_data(dataset)

Files in dataset folder 'PSM': ['test_label.csv', 'origin_data', 'test.csv', 'train.csv', 'down']
Processing file: PSM/test_label.csv
Dataset Folder: PSM, File: test_label.csv, Shape: (87841, 1)
PSM_test_label.pkl
Saved to: processed/PSM_test_label.pkl
Processing file: PSM/test.csv
Dataset Folder: PSM, File: test.csv, Shape: (87841, 25)
PSM_test.pkl
Saved to: processed/PSM_test.pkl
Processing file: PSM/train.csv
Dataset Folder: PSM, File: train.csv, Shape: (132481, 25)
PSM_train.pkl
Saved to: processed/PSM_train.pkl
