In [1]:
import pandas as pd
from statsmodels.tsa.ar_model import AutoReg
import numpy as np
import time

from statsmodels.tsa.stattools import acf, pacf

In [2]:
# Read the merged dataset from disk and print a per-column summary of it.
merged_csv_path = '../../new_data/merged_dataset.csv'
data = pd.read_csv(merged_csv_path)
data.info()

  data = pd.read_csv('../../new_data/merged_dataset.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735957 entries, 0 to 735956
Data columns (total 30 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   serial_number   735957 non-null  object 
 1   collect_time    735957 non-null  object 
 2   1_hwerr_f       735957 non-null  float64
 3   1_hwerr_e       735957 non-null  float64
 4   2_hwerr_c       735957 non-null  float64
 5   2_sel           735957 non-null  float64
 6   3_hwerr_n       735957 non-null  float64
 7   2_hwerr_s       735957 non-null  float64
 8   3_hwerr_m       735957 non-null  float64
 9   1_hwerr_st      735957 non-null  float64
 10  1_hw_mem_c      735957 non-null  float64
 11  3_hwerr_p       735957 non-null  float64
 12  2_hwerr_ce      735957 non-null  float64
 13  3_hwerr_as      735957 non-null  float64
 14  1_ke            735957 non-null  float64
 15  2_hwerr_p       735957 non-null  float64
 16  3_hwerr_kp      735957 non-null  float64
 17  1_hwerr_fl

In [3]:
# Parse the collect_time column into pandas datetime values.
collect_time_parsed = pd.to_datetime(data['collect_time'])
data['collect_time'] = collect_time_parsed

In [4]:
# Remove the failure_time column from the working frame.
data = data.drop(columns=['failure_time'])

In [5]:
# Order rows by serial_number, then by collect_time within each serial number.
sort_keys = ['serial_number', 'collect_time']
data.sort_values(by=sort_keys, inplace=True)

## 时间滑动窗口统计

In [6]:
# Accumulator for the engineered features, keyed by feature name.
features_dict = dict()

# Sliding-window lengths, expressed in hours: 1 h, 6 h and 24 h.
window_sizes = [1, 6, 24]

# Source columns to derive features from: positions 2 through 25 of the frame
# (0-based, inclusive), i.e. the half-open slice [2, 26).
first_feature_pos, end_feature_pos = 2, 26
feature_columns = data.columns[first_feature_pos:end_feature_pos]

In [7]:
# Rolling sum / mean / std / min / max per device, for every window length and
# every feature column.
#
# The grouped rolling object does not depend on the column, so it is built once
# per window instead of once per (window, column) pair.
#
# Results are re-attached to data.index by position rather than by the index
# that groupby-rolling returns.
# NOTE(review): with on='collect_time' plus a column selection, recent pandas
# versions appear to index the result by (serial_number, collect_time), so
# dropping level 0 would leave timestamps instead of the original row labels -
# confirm against the installed version. Positional alignment gives the same
# values under either behaviour, provided the rows are already ordered by
# serial_number, which the guard below checks.
assert data['serial_number'].is_monotonic_increasing, (
    "data must be sorted by serial_number and collect_time before building rolling features"
)

rolling_stats = ('sum', 'mean', 'std', 'min', 'max')
for window in window_sizes:
    # Lower-case 'h' is the hour alias; upper-case 'H' is deprecated in recent pandas.
    window_str = f"{window}h"
    df_rolled = data.groupby('serial_number').rolling(window=window_str, on='collect_time')
    for column in feature_columns:
        column_rolled = df_rolled[column]
        for stat in rolling_stats:
            stat_values = getattr(column_rolled, stat)()
            # One output value per input row is required for positional alignment.
            assert len(stat_values) == len(data)
            features_dict[f'{stat}_{column}_{window}h'] = pd.Series(
                stat_values.to_numpy(), index=data.index
            )

In [8]:
# Show how many features exist so far, together with their names.
feature_names = features_dict.keys()
len(feature_names), feature_names

(360,
 dict_keys(['sum_1_hwerr_f_1h', 'mean_1_hwerr_f_1h', 'std_1_hwerr_f_1h', 'min_1_hwerr_f_1h', 'max_1_hwerr_f_1h', 'sum_1_hwerr_e_1h', 'mean_1_hwerr_e_1h', 'std_1_hwerr_e_1h', 'min_1_hwerr_e_1h', 'max_1_hwerr_e_1h', 'sum_2_hwerr_c_1h', 'mean_2_hwerr_c_1h', 'std_2_hwerr_c_1h', 'min_2_hwerr_c_1h', 'max_2_hwerr_c_1h', 'sum_2_sel_1h', 'mean_2_sel_1h', 'std_2_sel_1h', 'min_2_sel_1h', 'max_2_sel_1h', 'sum_3_hwerr_n_1h', 'mean_3_hwerr_n_1h', 'std_3_hwerr_n_1h', 'min_3_hwerr_n_1h', 'max_3_hwerr_n_1h', 'sum_2_hwerr_s_1h', 'mean_2_hwerr_s_1h', 'std_2_hwerr_s_1h', 'min_2_hwerr_s_1h', 'max_2_hwerr_s_1h', 'sum_3_hwerr_m_1h', 'mean_3_hwerr_m_1h', 'std_3_hwerr_m_1h', 'min_3_hwerr_m_1h', 'max_3_hwerr_m_1h', 'sum_1_hwerr_st_1h', 'mean_1_hwerr_st_1h', 'std_1_hwerr_st_1h', 'min_1_hwerr_st_1h', 'max_1_hwerr_st_1h', 'sum_1_hw_mem_c_1h', 'mean_1_hw_mem_c_1h', 'std_1_hw_mem_c_1h', 'min_1_hw_mem_c_1h', 'max_1_hw_mem_c_1h', 'sum_3_hwerr_p_1h', 'mean_3_hwerr_p_1h', 'std_3_hwerr_p_1h', 'min_3_hwerr_p_1h', 'm

## 周期性特征

In [9]:
# Derive calendar components of collect_time (minute, hour, day of month),
# store each one as a column of `data`, and register the same column as a feature.
for part_name in ('minute', 'hour', 'day'):
    data[part_name] = getattr(data['collect_time'].dt, part_name)
    features_dict[part_name] = data[part_name]

In [10]:
# Number of features collected so far.
len(features_dict)

363

## 滞后特征

In [11]:
# Gap between consecutive samples of the same device, converted to hours.
seconds_per_hour = 3600
sample_gaps = data.groupby('serial_number')['collect_time'].diff()
data['time_diff'] = sample_gaps.dt.total_seconds() / seconds_per_hour

# Summary statistics of the sampling gaps.
print(data['time_diff'].describe())

count    723433.000000
mean         12.689148
std          92.194417
min           0.033333
25%           0.033333
50%           0.033333
75%           0.333333
max        3492.466667
Name: time_diff, dtype: float64


In [None]:
# Look-back horizons, in hours.
lags = [1, 6, 12, 24]

# NOTE(review): these are sums over the window that ends at each row, not
# shifted (lagged) values. The 1 h, 6 h and 24 h results repeat the
# sum_<column>_{1,6,24}h features produced by the same groupby / rolling / sum
# in the sliding-window section; only the 12 h horizon is new. Confirm that this
# is the intended definition of a lag feature.
#
# Results are re-attached to data.index by position rather than by the index
# that groupby-rolling returns.
# NOTE(review): with on='collect_time' plus a column selection, recent pandas
# versions appear to index the result by (serial_number, collect_time), so
# dropping level 0 would leave timestamps instead of row labels - confirm.
# Positional alignment requires rows ordered by serial_number (checked below).
assert data['serial_number'].is_monotonic_increasing, (
    "data must be sorted by serial_number and collect_time before building lag features"
)

for lag in lags:
    # Lower-case 'h' is the hour alias; upper-case 'H' is deprecated in recent pandas.
    window_str = f"{lag}h"
    # One grouped rolling object per horizon, shared by all feature columns.
    df_rolled = data.groupby('serial_number').rolling(window=window_str, on='collect_time')

    for column in feature_columns:
        feature_name = f'lag_sum_{column}_{lag}h'
        window_sum = df_rolled[column].sum()
        # One output value per input row is required for positional alignment.
        assert len(window_sum) == len(data)
        features_dict[feature_name] = pd.Series(window_sum.to_numpy(), index=data.index)


len(features_dict.keys())

In [None]:
# Look-back horizons, in minutes.
minute_lags = [1, 5, 15, 30]

# NOTE(review): as with the hour-based version, these are sums over the window
# ending at each row rather than shifted values - confirm this is intended.
#
# Results are re-attached to data.index by position rather than by the index
# that groupby-rolling returns.
# NOTE(review): with on='collect_time' plus a column selection, recent pandas
# versions appear to index the result by (serial_number, collect_time), so
# dropping level 0 would leave timestamps instead of row labels - confirm.
# Positional alignment requires rows ordered by serial_number (checked below).
assert data['serial_number'].is_monotonic_increasing, (
    "data must be sorted by serial_number and collect_time before building lag features"
)

for lag in minute_lags:
    window_str = f"{lag}min"
    # One grouped rolling object per horizon, shared by all feature columns.
    df_rolled = data.groupby('serial_number').rolling(window=window_str, on='collect_time')

    for column in feature_columns:
        feature_name = f'lag_sum_{column}_{lag}min'
        window_sum = df_rolled[column].sum()
        # One output value per input row is required for positional alignment.
        assert len(window_sum) == len(data)
        features_dict[feature_name] = pd.Series(window_sum.to_numpy(), index=data.index)


len(features_dict.keys())

## 变化率特征

In [None]:
# First-order difference of every feature column, computed within each device.
# Differencing per serial_number keeps the first row of a device from being
# compared with the last row of the preceding device; those first rows have no
# predecessor and are filled with 0.
per_device_diff = data.groupby('serial_number')[list(feature_columns)].diff().fillna(0)
for column in feature_columns:
    features_dict[f'diff_{column}'] = per_device_diff[column]

In [None]:
# Trailing-window lengths for the rate of change, in minutes.
change_periods = [2, 5, 15, 30, 60]

# Make sure collect_time is datetime-typed and rows are ordered by device and
# time; the positional re-attachment below depends on this ordering.
data['collect_time'] = pd.to_datetime(data['collect_time'])
data.sort_values(['serial_number', 'collect_time'], inplace=True)


def _window_rate_change(window_values):
    """Return (last - first) / first for one window, dividing by 1 when first is 0.

    `window_values` is the NumPy array of one rolling window (raw=True), ordered
    from the oldest to the newest sample.
    """
    first_value = window_values[0]
    denominator = first_value if first_value != 0 else 1
    return (window_values[-1] - first_value) / denominator


for period in change_periods:
    window_str = f"{period}min"
    df_rolled = data.groupby('serial_number').rolling(window=window_str, on='collect_time')

    for column in feature_columns:
        # A single raw (NumPy) pass computes numerator and denominator together,
        # instead of two separate Series-based apply passes.
        rate_change = df_rolled[column].apply(_window_rate_change, raw=True)
        # Re-attach by position rather than by the index groupby-rolling returns.
        # NOTE(review): with on='collect_time' that index appears to hold
        # timestamps instead of row labels in recent pandas - confirm.
        assert len(rate_change) == len(data)
        features_dict[f'rate_change_{column}_{window_str}'] = pd.Series(
            rate_change.to_numpy(), index=data.index
        )



len(features_dict.keys())

## 窗口聚合特征

In [None]:
# Total number of engineered features.
len(features_dict)

In [None]:
# List the names of all engineered features (dict exposes keys(), not key()).
features_dict.keys()

## 保存数据

In [None]:
# Assemble the engineered features, attach them to the source rows by index,
# and write the combined table to CSV.
features_df = pd.DataFrame(features_dict)

# Some features (minute / hour / day) are also stored as columns of `data`;
# merging them again would yield suffixed duplicates (minute_x / minute_y, ...),
# so only the columns that `data` does not already have are attached.
new_feature_columns = [name for name in features_df.columns if name not in data.columns]
merged = pd.merge(data, features_df[new_feature_columns], left_index=True, right_index=True)

# An index-aligned merge must yield exactly one output row per input row;
# anything else means the feature index does not match the row index of `data`.
if len(merged) != len(data):
    raise ValueError(
        f"feature merge changed the row count from {len(data)} to {len(merged)}; "
        "feature index is not aligned with data.index"
    )

data = merged
data.to_csv('../../new_data/features.csv', index=False)