In [60]:
from typing import Optional
import numpy as np
import pandas as pd
import datetime as dt
from dateutil import rrule
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from tqdm import tqdm

class InformerDataset:
    """Sliding-window view over the standardised ETTh1 series.

    The CSV at ``full_file_path_and_name`` is read once at construction,
    standardised with statistics fitted on the training rows only, and the
    requested split is kept in ``self.data`` with shape (time, channels).
    Indexing returns channel-first windows, i.e. arrays of shape
    (channels, length).
    """

    def __init__(
        self,
        forecast_horizon: int = 192,
        data_split: str = "train",
        data_stride_len: int = 1,
        task_name: str = "forecasting",
        random_seed: int = 42,
    ):
        """
        Parameters
        ----------
        forecast_horizon : int
            Length of the prediction sequence.
        data_split : str
            Split of the dataset, 'train' or 'test'.
        data_stride_len : int
            Stride length when generating consecutive
            time series windows.
        task_name : str
            The task that the dataset is used for. One of
            'forecasting', or  'imputation'.
        random_seed : int
            Random seed for reproducibility. Stored on the instance;
            nothing in this class consumes it.
        """

        self.seq_len = 512
        self.forecast_horizon = forecast_horizon
        self.full_file_path_and_name = "../../data/ETTh1.csv"
        self.data_split = data_split
        self.data_stride_len = data_stride_len
        self.task_name = task_name
        self.random_seed = random_seed

        # Read data
        self._read_data()

    def _get_borders(self):
        """Return the ``(train, test)`` row slices over the full series.

        The split sizes are 12 / 4 / 4 blocks of 30 days x 24 rows. The test
        slice starts ``seq_len`` rows before the end of the validation block
        so that the first test window has a full input history.
        """
        n_train = 12 * 30 * 24
        n_val = 4 * 30 * 24
        n_test = 4 * 30 * 24

        train_end = n_train
        val_end = n_train + n_val
        test_start = val_end - self.seq_len
        test_end = test_start + n_test + self.seq_len

        train = slice(0, train_end)
        test = slice(test_start, test_end)

        return train, test

    def _read_data(self):
        """Load the CSV, standardise it and keep the rows of ``data_split``.

        Raises
        ------
        ValueError
            If ``data_split`` is neither 'train' nor 'test'.
        """
        if self.data_split not in ("train", "test"):
            # Previously this fell through and failed later with an
            # AttributeError because self.data was never assigned.
            raise ValueError(
                f"data_split must be 'train' or 'test', got {self.data_split!r}"
            )

        self.scaler = StandardScaler()
        df = pd.read_csv(self.full_file_path_and_name)
        self.length_timeseries_original = df.shape[0]
        self.n_channels = df.shape[1] - 1  # every column except `date`

        df = df.drop(columns=["date"])
        df = df.infer_objects(copy=False).interpolate(method="cubic")

        train_rows, test_rows = self._get_borders()

        # Fit on the training rows only so no test statistics leak into the scaling.
        self.scaler.fit(df.iloc[train_rows].values)
        scaled = self.scaler.transform(df.values)

        selected_rows = train_rows if self.data_split == "train" else test_rows
        self.data = scaled[selected_rows, :]
        self.length_timeseries = self.data.shape[0]

    def __getitem__(self, index):
        """Return the window(s) for sample ``index``.

        Returns
        -------
        forecasting : (timeseries, forecast, input_mask)
        imputation  : (timeseries, input_mask)

        ``timeseries`` and ``forecast`` are (channels, length) arrays and
        ``input_mask`` is an all-ones vector of length ``seq_len``. A window
        that would run past the end of the split is shifted back so that it
        ends exactly on the last row. This assumes the split holds at least
        one full window.

        Raises
        ------
        ValueError
            If ``task_name`` is not 'forecasting' or 'imputation'.
        """
        seq_start = self.data_stride_len * index
        seq_end = seq_start + self.seq_len
        input_mask = np.ones(self.seq_len)

        if self.task_name == "forecasting":
            pred_end = seq_end + self.forecast_horizon

            if pred_end > self.length_timeseries:
                # Anchor on the end of the series and work backwards so both
                # windows keep their full length.
                pred_end = self.length_timeseries
                seq_end = pred_end - self.forecast_horizon
                seq_start = seq_end - self.seq_len

            timeseries = self.data[seq_start:seq_end, :].T
            forecast = self.data[seq_end:pred_end, :].T

            return timeseries, forecast, input_mask

        if self.task_name == "imputation":
            if seq_end > self.length_timeseries:
                # Shift the window back; the start must move with the end.
                seq_end = self.length_timeseries
                seq_start = seq_end - self.seq_len

            timeseries = self.data[seq_start:seq_end, :].T

            return timeseries, input_mask

        raise ValueError(
            f"task_name must be 'forecasting' or 'imputation', got {self.task_name!r}"
        )

    def __len__(self):
        """Number of full windows in the split (0 if it is too short).

        Raises
        ------
        ValueError
            If ``task_name`` is not 'forecasting' or 'imputation'.
        """
        if self.task_name == "imputation":
            window_len = self.seq_len
        elif self.task_name == "forecasting":
            window_len = self.seq_len + self.forecast_horizon
        else:
            raise ValueError(
                f"task_name must be 'forecasting' or 'imputation', got {self.task_name!r}"
            )

        n_windows = (self.length_timeseries - window_len) // self.data_stride_len + 1
        # len() rejects negative values, so clamp for splits shorter than one window.
        return max(0, n_windows)

# 原始数据读取
- 可以发现原始数据维度为：（17420，8），即17420个采样点、8列：1列时间戳（date）加7个channel，其中OT是我们的待预测变量

In [27]:
# Load the raw ETTh1 CSV and inspect its shape, dtypes and non-null counts.
data_file = "../../data/ETTh1.csv"
data = pd.read_csv(data_file)
print(data.shape)  # (rows, columns)
data.info()

(17420, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17420 entries, 0 to 17419
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    17420 non-null  object 
 1   HUFL    17420 non-null  float64
 2   HULL    17420 non-null  float64
 3   MUFL    17420 non-null  float64
 4   MULL    17420 non-null  float64
 5   LUFL    17420 non-null  float64
 6   LULL    17420 non-null  float64
 7   OT      17420 non-null  float64
dtypes: float64(7), object(1)
memory usage: 1.1+ MB


In [20]:
# Earliest timestamp in the raw `date` column. The column still holds strings
# here, so this is a lexicographic minimum.
# NOTE(review): this cell needs `data['date']`, so it must run before the
# preprocessing cell that drops that column.
min_date = data['date'].min()

# Latest timestamp in the raw `date` column (lexicographic maximum).
max_date = data['date'].max()

print('最小日期值:', min_date)
print('最大日期值:', max_date)
# (1) Parse the timestamp strings and keep only the calendar date
date1 = dt.datetime.strptime(max_date, '%Y-%m-%d %H:%M:%S').date()  # end date: datetime.date(2018, 6, 26) per the recorded output
date2 = dt.datetime.strptime(min_date, '%Y-%m-%d %H:%M:%S').date()  # start date: datetime.date(2016, 7, 1) per the recorded output
# (2) Span between the two dates
Days = (date1 - date2).days  # whole days; not used again in the visible cells
# Count of monthly recurrences that start at date2 and fall on or before date1.
# The start date itself is counted, so this is a number of months touched
# rather than a strict difference.
Months = rrule.rrule(rrule.MONTHLY, dtstart = date2, until = date1).count()  
print('月份差:',Months)

最小日期值: 2016-07-01 00:00:00
最大日期值: 2018-06-26 19:00:00
月份差: 24


# 数据集切分
- 我们希望输入模型的维度是（batch,channel, seq_len)
- 对于（17420，8）的序列，第一个构造的窗口应该是（512，8）（去掉date列后为（512，7）），即axis=0上索引0到511的512行。原论文只使用了前14400个采样点（8640 + 2880 + 2880）：
1. n_train = 12 * 30 * 24  取12个月，每月30天，每天24小时
2. n_val = 4 * 30 * 24     取4个月，每月30天，每天24小时
3. n_test = 4 * 30 * 24    取4个月，每月30天，每天24小时

In [23]:
# Input window length, in rows
seq_len = 512

# Split sizes: months x 30 days x 24 rows per day
n_train = 12 * 30 * 24
n_val = n_test = 4 * 30 * 24

# Row boundaries of each split
train_end = n_train
val_end = train_end + n_val
test_start = val_end - seq_len  # reach back one input window into the validation rows
test_end = val_end + n_test  # same value as test_start + n_test + seq_len

train, test = slice(0, train_end), slice(test_start, test_end)
print(train)
print(test)

slice(0, 8640, None)
slice(11008, 14400, None)


In [31]:
def get_borders(data, seq_len=512):
    """Split ``data`` positionally into its train and test row ranges.

    Parameters
    ----------
    data : pandas.DataFrame
        Full series, one row per time step.
    seq_len : int, default 512
        Input window length. The test range starts this many rows before the
        end of the validation block so its first window has a full history.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Rows ``[0, 8640)`` and ``[11520 - seq_len, 14400)``. ``iloc``
        truncates silently if ``data`` has fewer rows than that.

    Raises
    ------
    ValueError
        If ``seq_len`` is negative or longer than the train + validation
        rows, because a negative start would wrap around in ``iloc``.
    """
    # Split sizes: months x 30 days x 24 rows per day
    n_train = 12 * 30 * 24
    n_val = 4 * 30 * 24
    n_test = 4 * 30 * 24

    train_end = n_train
    val_end = n_train + n_val
    if not 0 <= seq_len <= val_end:
        raise ValueError(f"seq_len must be between 0 and {val_end}, got {seq_len}")

    test_start = val_end - seq_len
    test_end = test_start + n_test + seq_len

    train = data.iloc[:train_end]
    test = data.iloc[test_start:test_end]

    return train, test

# 数据集读取、预处理
- drop掉不需要的date列，并用三次（cubic）插值填充缺失值

In [38]:
# Remove the timestamp column and fill missing values by cubic interpolation.
# Rebinding `data` with errors="ignore" instead of dropping in place keeps
# this cell re-runnable: a second run no longer raises KeyError on `date`.
data = (
    data.drop(columns=["date"], errors="ignore")
    .infer_objects(copy=False)
    .interpolate(method="cubic")
)

In [26]:
# Re-inspect the frame after preprocessing; the recorded output shows 7 float64 columns with no nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17420 entries, 0 to 17419
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   HUFL    17420 non-null  float64
 1   HULL    17420 non-null  float64
 2   MUFL    17420 non-null  float64
 3   MULL    17420 non-null  float64
 4   LUFL    17420 non-null  float64
 5   LULL    17420 non-null  float64
 6   OT      17420 non-null  float64
dtypes: float64(7)
memory usage: 952.8 KB


- 拆分数据

In [39]:
# Cut the cleaned frame into its train and test row ranges and report their sizes.
data_splits = get_borders(data)
train_data, test_data = data_splits
print('train数据shape:',train_data.shape)
print('test数据shape:',test_data.shape)

train数据shape: (8640, 7)
test数据shape: (3392, 7)


In [41]:
# Fit the standardiser on the training rows only, then apply those
# statistics to the whole series.
scala = StandardScaler().fit(train_data.values)
df = scala.transform(data.values)

In [59]:
# Standardised training rows as an ndarray. Transforming the training frame
# directly avoids using DataFrame index labels as array positions, which is
# only correct while `data` keeps its default RangeIndex.
# NOTE(review): `train_data` changes from a DataFrame to an ndarray here, so
# the scaler-fitting cell cannot be re-run after this one without re-splitting.
train_data = scala.transform(data_splits[0].values)
length_timeseries = train_data.shape[0]

# 进行input，forecast，input_mask拆分

In [57]:
# Window-sampling parameters for the walkthrough below
forecast_horizon = 192  # number of future steps to predict
data_stride_len = 1  # step between the starts of consecutive windows
index = 0  # which window to build

In [56]:
# All-ones mask with one entry per input step
input_mask = np.ones(seq_len)

# Input window [seq_start, seq_end) for the chosen sample index
seq_start = index * data_stride_len
seq_end = seq_len + seq_start
print(input_mask.shape)

(512,)


In [61]:
# Build the forecast window that follows the input window
pred_end = seq_end + forecast_horizon
if pred_end > length_timeseries:
    # Not enough rows left: anchor on the end of the series and work
    # backwards so the forecast keeps its full `forecast_horizon` length.
    # Deriving seq_end from pred_end, not from its own old value, also keeps
    # this cell stable when it is re-run.
    pred_end = length_timeseries
    seq_end = pred_end - forecast_horizon
    seq_start = seq_end - seq_len

In [None]:
def _read_data(self):
    """Load the CSV, standardise it and keep the rows of ``self.data_split``.

    Standalone copy of ``InformerDataset._read_data`` shown for the
    walkthrough. ``self`` must provide ``full_file_path_and_name``,
    ``data_split`` and ``_get_borders()``.

    Raises
    ------
    ValueError
        If ``self.data_split`` is neither 'train' nor 'test'.
    """
    if self.data_split not in ("train", "test"):
        # Otherwise self.data is never assigned and the last line fails
        # with an AttributeError.
        raise ValueError(
            f"data_split must be 'train' or 'test', got {self.data_split!r}"
        )

    self.scaler = StandardScaler()
    df = pd.read_csv(self.full_file_path_and_name)
    self.length_timeseries_original = df.shape[0]
    self.n_channels = df.shape[1] - 1  # every column except `date`

    df = df.drop(columns=["date"])
    df = df.infer_objects(copy=False).interpolate(method="cubic")

    train_rows, test_rows = self._get_borders()

    # Fit on the training rows only, then scale the whole series.
    self.scaler.fit(df.iloc[train_rows].values)
    scaled = self.scaler.transform(df.values)

    selected_rows = train_rows if self.data_split == "train" else test_rows
    self.data = scaled[selected_rows, :]
    self.length_timeseries = self.data.shape[0]