In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import warnings
import numpy as np
warnings.filterwarnings('ignore')
import time
from models import Informer, Autoformer, Transformer, DLinear, Linear, NLinear, PatchTST
import argparse
from torch import optim
import torch.nn as nn

In [2]:
# Dimensions of the synthetic benchmark; later cells pass these to the datasets
# and use them as argparse defaults.
time_steps = 3000  # number of rows (time steps) in the synthetic series
num_features = 10  # number of columns (channels) per time step
pred_len = 60  # forecast horizon; default for --pred_len
label_len = 40  # decoder start-token length; default for --label_len

In [3]:
# Synthetic series: uniform random values in [1, 100), shape (time_steps, num_features).
# NOTE(review): this runs before any RNG seed is set (seeding happens in the
# argparse cell further down), so the values change on every kernel restart.
data_x = np.random.uniform(low=1, high=100, size=(time_steps, num_features))
# Targets alias the inputs: data_y is the very same array object, not a copy.
data_y = data_x


In [4]:
data_x.shape, data_y.shape

((3000, 10), (3000, 10))

In [5]:
# Pick the best available backend instead of assuming Apple-silicon MPS, so the
# notebook also runs on CUDA or CPU-only machines. On a machine with MPS the
# result is unchanged: torch.device('mps').
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
class DummyPretrainDataset(Dataset):
    """Dataset whose items are contiguous slabs sized for sliding-window expansion.

    Item ``idx`` is a pair of slices:

    * ``data_x[s_begin : s_begin + seq_len + bs - 1]``
    * ``data_y[r_begin : r_begin + pred_len + label_len + bs - 1]``

    Unfolding each slab along axis 0 with stride 1 (window ``seq_len`` for the
    first, ``pred_len + label_len`` for the second) yields exactly ``bs``
    windows, i.e. one mini-batch per item.

    Args:
        data_x: 2-D array-like of inputs, indexed as ``data_x[start:stop]``.
        data_y: 2-D array-like of targets, indexed the same way.
        num_features: Number of feature columns (stored, not used here).
        time_steps: Number of rows assumed to be available in the data.
        seq_len: Length of one input window.
        pred_len: Forecast horizon.
        label_len: Number of target steps that overlap the end of the input.
        bs: Number of windows each item should expand to.
    """

    def __init__(self, data_x, data_y, num_features, time_steps=15, seq_len=96, pred_len=96, label_len=48, bs=5):
        self.num_features = num_features
        self.time_steps = time_steps
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.label_len = label_len
        self.batch_size = bs
        # A slab of this many rows contains exactly `bs` stride-1 windows of seq_len rows.
        self.prebatch_len = seq_len + bs - 1
        self.data_x = data_x
        self.data_y = data_y

    def __len__(self):
        """Return the number of complete slabs (mini-batches) the series can provide."""
        # Use the instance's own pred_len; the previous code read a module-level
        # `pred_len`, which is only correct when it happens to match the
        # constructor argument (and raises NameError when it does not exist).
        num_windows = self.time_steps - self.seq_len - self.pred_len + 1
        # Clamp so that a too-short series gives an empty dataset rather than
        # a negative length, which len() rejects.
        return max(0, num_windows // self.batch_size)

    def __getitem__(self, idx):
        """Return the ``(input slab, target slab)`` pair for mini-batch ``idx``.

        Both slabs are converted with ``torch.Tensor`` and moved to the
        module-level ``device``.
        """
        s_begin = self.batch_size * idx
        s_end = s_begin + self.prebatch_len

        # NOTE(review): the trailing `- 1` makes every target window start one
        # step earlier than DummyPretrainStackDataset's `s_end - label_len`
        # convention; confirm whether that offset is intended.
        r_begin = s_begin + self.seq_len - self.label_len - 1
        r_end = r_begin + self.pred_len + self.label_len + self.batch_size - 1

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        return torch.Tensor(seq_x).to(device), torch.Tensor(seq_y).to(device)

In [7]:
dset = DummyPretrainDataset(data_x, data_y, num_features, time_steps, seq_len=500, pred_len=pred_len, label_len=label_len, bs=100)

In [8]:
dset[0][0].shape, dset[0][1].shape

(torch.Size([599, 10]), torch.Size([199, 10]))

In [9]:
len(dset)

24

In [91]:
for i in range(len(dset)) :
    print(dset[i][0].shape, dset[i][1].shape)

torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size([199, 100])
torch.Size([599, 100]) torch.Size(

In [8]:
class SlidingWindowView:
    """Collate helper that expands a pair of slabs into overlapping windows.

    Args:
        window_size: Window length used for the first (input) tensor.
        stride: Step between consecutive windows, used for both tensors.
        pred_len: Forecast horizon.
        label_len: Overlap length; the second (target) tensor is windowed
            with size ``pred_len + label_len``.
    """

    def __init__(self, window_size, stride, pred_len, label_len):
        self.window_size, self.stride = window_size, stride
        self.pred_len, self.label_len = pred_len, label_len

    def slide_collate_fn(self, batch):
        """Unfold ``batch[0]`` and ``batch[1]`` along axis 0 into windows.

        ``unfold`` appends the window axis last, so axes 1 and 2 are swapped
        to place the window axis directly after the window index.
        """
        slab_x = batch[0]
        slab_y = batch[1]
        target_size = self.pred_len + self.label_len
        windows_x = slab_x.unfold(0, self.window_size, self.stride)
        windows_y = slab_y.unfold(0, target_size, self.stride)
        return windows_x.transpose(1, 2), windows_y.transpose(1, 2)

In [9]:
# Window size 500 matches the seq_len=500 used to build `dset`; stride 1.
sw = SlidingWindowView(500, 1, pred_len, label_len)
# batch_size=None turns off automatic batching, so the collate function receives
# one dataset item (one slab pair) at a time and expands it into a mini-batch.
w_dl = DataLoader(dset, batch_size=None, collate_fn=sw.slide_collate_fn, pin_memory=False)
# start = time.time()
# for b in w_dl:
#     # b1=b
#     print(b[0].shape, b[1].shape)
#     # print(b)
# stop = time.time()
# print(stop-start)

In [12]:
len(w_dl)

24

In [302]:
data_y.shape

(300, 16)

In [6]:
class DummyPretrainStackDataset(Dataset):
    """Dataset yielding one ``(input window, target window)`` pair per start index.

    Args:
        data_x: 2-D array-like of inputs, sliced as ``data_x[start:stop]``.
        data_y: 2-D array-like of targets, sliced the same way.
        num_features: Number of feature columns (stored, not used here).
        time_steps: Number of rows assumed to be available in the data.
        seq_len: Length of the input window.
        pred_len: Forecast horizon.
        label_len: Number of target steps overlapping the end of the input.
        bs: Accepted for signature parity with the slab dataset; not stored.
    """

    def __init__(self, data_x, data_y, num_features, time_steps=15, seq_len=6, pred_len=8, label_len=8, bs=5):
        self.data_x, self.data_y = data_x, data_y
        self.num_features = num_features
        self.time_steps = time_steps
        self.seq_len, self.pred_len, self.label_len = seq_len, pred_len, label_len

    def __len__(self):
        """Return the number of start indices whose forecast horizon fits in the series."""
        input_positions = self.time_steps - self.seq_len + 1
        return input_positions - self.pred_len

    def __getitem__(self, idx):
        """Return float tensors for the input window at ``idx`` and its target window."""
        input_stop = idx + self.seq_len
        # The target starts label_len steps before the input ends and runs
        # pred_len steps past it.
        target_start = input_stop - self.label_len
        target_stop = target_start + self.pred_len + self.label_len

        input_rows = self.data_x[idx:input_stop]
        target_rows = self.data_y[target_start:target_stop]
        return torch.Tensor(input_rows), torch.Tensor(target_rows)
# b1[0].float(), b1[0][:,:,:4].float(), dec_inp, b1[1][:,:,:4].float()

In [7]:
sset = DummyPretrainStackDataset(data_x, data_y, num_features, time_steps, seq_len=500, pred_len=60, label_len=40)

In [8]:
len(sset)

2441

In [104]:
data_x.shape

(10000, 100)

In [11]:
# Default collation stacks 100 consecutive windows per batch (no shuffling);
# drop_last discards the final, smaller batch.
s_dl = DataLoader(sset, batch_size=100, shuffle=False, drop_last=True)

In [12]:
len(s_dl)

24

In [107]:
# Time one full pass over the stacked loader, printing the shape of every batch.
# NOTE(review): b1 stays None here because the assignment below is commented
# out, yet later cells index b1 - they appear to rely on an earlier execution
# of this cell; confirm before a Restart & Run All.
b1 = None
start = time.time()
for b in s_dl:
    # b1=b
    print(b[0].shape, b[1].shape)
    # print(b)
stop = time.time()
print(stop-start)

torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100, 100])
torch.Size([100, 500, 100]) torch.Size([100, 100

In [16]:
b1

[tensor([[[89.7994, 55.5096, 37.7528,  ..., 62.6079, 44.4679, 15.7314],
          [83.1737, 83.9626, 77.8187,  ..., 85.2087, 42.3049, 77.4481],
          [95.6394, 71.6739, 79.2538,  ..., 57.0945, 21.0284, 38.3425],
          ...,
          [43.1796, 84.0346, 25.3106,  ..., 53.3819,  8.5694, 67.5977],
          [60.1137, 92.2419,  2.5538,  ..., 65.1549, 55.2438, 92.0792],
          [36.3255, 45.5186, 31.7847,  ..., 68.4275, 11.2777, 17.1702]],
 
         [[83.1737, 83.9626, 77.8187,  ..., 85.2087, 42.3049, 77.4481],
          [95.6394, 71.6739, 79.2538,  ..., 57.0945, 21.0284, 38.3425],
          [17.7794,  1.4215, 12.2500,  ..., 91.0731, 43.1982, 10.1609],
          ...,
          [60.1137, 92.2419,  2.5538,  ..., 65.1549, 55.2438, 92.0792],
          [36.3255, 45.5186, 31.7847,  ..., 68.4275, 11.2777, 17.1702],
          [ 8.5127, 96.7749, 56.4547,  ..., 13.9612, 23.4854, 81.9528]],
 
         [[95.6394, 71.6739, 79.2538,  ..., 57.0945, 21.0284, 38.3425],
          [17.7794,  1.4215,

In [9]:

def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` is a trap with argparse: ``bool('False')`` is True because any
    non-empty string is truthy. This converter maps the usual spellings
    explicitly; the empty string stays False, as it was under ``bool``.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Experiment configuration. The parser is evaluated with an empty argument list
# at the end of this block, so inside the notebook every option takes its default.
parser = argparse.ArgumentParser(description='Autoformer & Transformer family for Time Series Forecasting')

# random seed
parser.add_argument('--random_seed', type=int, default=2021, help='random seed')

# # basic config
# parser.add_argument('--is_training', type=int, required=True, default=1, help='status')
# parser.add_argument('--model_id', type=str, required=True, default='test', help='model id')
# parser.add_argument('--model', type=str, required=True, default='Autoformer',
#                     help='model name, options: [Autoformer, Informer, Transformer]')

# # data loader
# parser.add_argument('--data', type=str, required=True, default='ETTm1', help='dataset type')
parser.add_argument('--root_path', type=str, default='./data/ETT/', help='root path of the data file')
parser.add_argument('--data_path', type=str, default='ETTh1.csv', help='data file')
parser.add_argument('--features', type=str, default='M',
                    help='forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate')
parser.add_argument('--target', type=str, default='OT', help='target feature in S or MS task')
parser.add_argument('--freq', type=str, default='h',
                    help='freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h')
parser.add_argument('--checkpoints', type=str, default='./checkpoints/', help='location of model checkpoints')

# forecasting task (label_len / pred_len defaults come from the notebook-level constants)
parser.add_argument('--seq_len', type=int, default=500, help='input sequence length')
parser.add_argument('--label_len', type=int, default=label_len, help='start token length')
parser.add_argument('--pred_len', type=int, default=pred_len, help='prediction sequence length')


# DLinear
#parser.add_argument('--individual', action='store_true', default=False, help='DLinear: a linear layer for each variate(channel) individually')

# PatchTST
parser.add_argument('--fc_dropout', type=float, default=0.05, help='fully connected dropout')
parser.add_argument('--head_dropout', type=float, default=0.0, help='head dropout')
parser.add_argument('--patch_len', type=int, default=16, help='patch length')
parser.add_argument('--stride', type=int, default=8, help='stride')
parser.add_argument('--padding_patch', default='end', help='None: None; end: padding on the end')
parser.add_argument('--revin', type=int, default=1, help='RevIN; True 1 False 0')
parser.add_argument('--affine', type=int, default=0, help='RevIN-affine; True 1 False 0')
parser.add_argument('--subtract_last', type=int, default=0, help='0: subtract mean; 1: subtract last')
parser.add_argument('--decomposition', type=int, default=0, help='decomposition; True 1 False 0')
parser.add_argument('--kernel_size', type=int, default=25, help='decomposition-kernel')
parser.add_argument('--individual', type=int, default=0, help='individual head; True 1 False 0')

# Formers (channel counts default to the notebook-level num_features)
parser.add_argument('--embed_type', type=int, default=0, help='0: default 1: value embedding + temporal embedding + positional embedding 2: value embedding + temporal embedding 3: value embedding + positional embedding 4: value embedding')
parser.add_argument('--enc_in', type=int, default=num_features, help='encoder input size') # DLinear with --individual, use this hyperparameter as the number of channels
parser.add_argument('--dec_in', type=int, default=num_features, help='decoder input size')
parser.add_argument('--c_out', type=int, default=num_features, help='output size')
parser.add_argument('--d_model', type=int, default=512, help='dimension of model')
parser.add_argument('--n_heads', type=int, default=8, help='num of heads')
parser.add_argument('--e_layers', type=int, default=2, help='num of encoder layers')
parser.add_argument('--d_layers', type=int, default=1, help='num of decoder layers')
parser.add_argument('--d_ff', type=int, default=2048, help='dimension of fcn')
parser.add_argument('--moving_avg', type=int, default=25, help='window size of moving average')
parser.add_argument('--factor', type=int, default=1, help='attn factor')
# store_false: passing --distil switches distilling OFF (the default is True).
parser.add_argument('--distil', action='store_false',
                    help='whether to use distilling in encoder, using this argument means not using distilling',
                    default=True)
parser.add_argument('--dropout', type=float, default=0.05, help='dropout')
parser.add_argument('--embed', type=str, default='timeF',
                    help='time features encoding, options:[timeF, fixed, learned]')
parser.add_argument('--activation', type=str, default='gelu', help='activation')
parser.add_argument('--output_attention', action='store_true', help='whether to output attention in ecoder')
parser.add_argument('--do_predict', action='store_true', help='whether to predict unseen future data')

# optimization
parser.add_argument('--num_workers', type=int, default=10, help='data loader num workers')
parser.add_argument('--itr', type=int, default=2, help='experiments times')
parser.add_argument('--train_epochs', type=int, default=100, help='train epochs')
parser.add_argument('--batch_size', type=int, default=100, help='batch size of train input data')
parser.add_argument('--patience', type=int, default=100, help='early stopping patience')
parser.add_argument('--learning_rate', type=float, default=0.0001, help='optimizer learning rate')
parser.add_argument('--des', type=str, default='test', help='exp description')
parser.add_argument('--loss', type=str, default='mse', help='loss function')
parser.add_argument('--lradj', type=str, default='type3', help='adjust learning rate')
parser.add_argument('--pct_start', type=float, default=0.3, help='pct_start')
parser.add_argument('--use_amp', action='store_true', help='use automatic mixed precision training', default=False)

# GPU
# _str2bool instead of type=bool, so that "--use_gpu False" really disables the GPU.
parser.add_argument('--use_gpu', type=_str2bool, default=True, help='use gpu')
parser.add_argument('--gpu', type=int, default=0, help='gpu')
parser.add_argument('--use_multi_gpu', action='store_true', help='use multiple gpus', default=False)
parser.add_argument('--devices', type=str, default='0,1,2,3', help='device ids of multile gpus')
parser.add_argument('--test_flop', action='store_true', default=False, help='See utils/tools for usage')


# Parse an empty argv so the notebook kernel's own command line is ignored.
args = parser.parse_args(args=[])

import random
# Seed Python, PyTorch and NumPy from the configured seed.
# NOTE(review): the synthetic data cell runs before this one, so data_x itself
# is not covered by this seed.
fix_seed = args.random_seed
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)


# CUDA-only check: without CUDA this switches use_gpu off, regardless of
# whether another backend such as MPS is available.
args.use_gpu = bool(torch.cuda.is_available() and args.use_gpu)

if args.use_gpu and args.use_multi_gpu:
    # Strip spaces from the id list. This used to be assigned to a misspelled
    # attribute (`args.dvices`), which left `args.devices` unnormalised.
    args.devices = args.devices.replace(' ', '')
    device_ids = args.devices.split(',')
    args.device_ids = [int(id_) for id_ in device_ids]
    # The first listed device becomes the primary GPU.
    args.gpu = args.device_ids[0]

print('Args in experiment:')
print(args)

Args in experiment:
Namespace(random_seed=2021, root_path='./data/ETT/', data_path='ETTh1.csv', features='M', target='OT', freq='h', checkpoints='./checkpoints/', seq_len=500, label_len=40, pred_len=60, fc_dropout=0.05, head_dropout=0.0, patch_len=16, stride=8, padding_patch='end', revin=1, affine=0, subtract_last=0, decomposition=0, kernel_size=25, individual=0, embed_type=0, enc_in=10, dec_in=10, c_out=10, d_model=512, n_heads=8, e_layers=2, d_layers=1, d_ff=2048, moving_avg=25, factor=1, distil=True, dropout=0.05, embed='timeF', activation='gelu', output_attention=False, do_predict=False, num_workers=10, itr=2, train_epochs=100, batch_size=100, patience=100, learning_rate=0.0001, des='test', loss='mse', lradj='type3', pct_start=0.3, use_amp=False, use_gpu=False, gpu=0, use_multi_gpu=False, devices='0,1,2,3', test_flop=False)


In [110]:
model = PatchTST.Model(args)

In [100]:
model

Model(
  (model): PatchTST_backbone(
    (revin_layer): RevIN()
    (padding_patch_layer): ReplicationPad1d((0, 8))
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=512, bias=True)
      (dropout): Dropout(p=0.05, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-1): 2 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=512, out_features=512, bias=True)
              (W_K): Linear(in_features=512, out_features=512, bias=True)
              (W_V): Linear(in_features=512, out_features=512, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=512, out_features=512, bias=True)
                (1): Dropout(p=0.05, inplace=False)
              )
            )
            (dropout_attn): Dropout(p=0.05, inplace=F

In [101]:
%%time
model(b1[0])

CPU times: user 21.4 s, sys: 11.8 s, total: 33.1 s
Wall time: 6.2 s


tensor([[[45.9229, 57.0467, 77.3513,  ..., 51.3055, 46.1809, 32.4994],
         [40.8363, 57.9277, 32.6596,  ..., 64.3457, 92.9502, 67.7267],
         [40.5203, 29.9339, 45.7296,  ..., 48.6110, 19.4358, 71.8770],
         ...,
         [39.0498, 36.0480, 24.8613,  ..., 44.9997, 23.3362, 72.1767],
         [50.2753, 49.1318, 70.6954,  ..., 50.0445, 32.8113, 42.3876],
         [58.6064, 61.5207, 56.2034,  ..., 59.1819, 28.4290, 34.1345]],

        [[33.4626, 54.7675, 54.2221,  ..., 30.5864, 40.0469, 50.1327],
         [65.7028, 73.1609, 67.0423,  ..., 68.4425, 45.4227, 44.8203],
         [67.0513, 53.0351, 45.8740,  ..., 51.6351, 56.4419, 55.4173],
         ...,
         [70.8667, 40.1428, 42.9904,  ..., 50.4454, 68.2771, 53.5072],
         [51.1248, 40.6587, 81.4305,  ..., 69.9589, 82.5561, 54.9705],
         [24.2019, 50.2997, 38.0113,  ..., 33.3528, 47.5281, 32.6483]],

        [[43.8000, 42.0806, 45.8512,  ..., 88.0275, 28.4356, 69.5016],
         [39.3049, 39.6446, 42.8704,  ..., 44

In [829]:
model = DLinear.Model(args)

In [830]:
model

Model(
  (decompsition): series_decomp(
    (moving_avg): moving_avg(
      (avg): AvgPool1d(kernel_size=(25,), stride=(1,), padding=(0,))
    )
  )
  (Linear_Seasonal): Linear(in_features=500, out_features=60, bias=True)
  (Linear_Trend): Linear(in_features=500, out_features=60, bias=True)
)

In [831]:
model(b1[0].float())

tensor([[[ 1.7165e+01,  9.3340e+00, -3.3945e-01,  ...,  4.2045e+00,
           3.2258e+01, -8.2107e+00],
         [-3.8288e+01, -3.9523e+01,  1.6036e+01,  ..., -2.7061e+01,
          -1.4023e+01, -2.4650e+01],
         [ 5.5282e-01, -4.6020e+00, -2.2652e+01,  ..., -1.6415e+00,
           7.8541e-01, -1.5511e+01],
         ...,
         [-3.7269e+01,  1.6553e+00, -3.3242e+01,  ...,  2.2123e+00,
          -4.3733e+01, -3.8620e+01],
         [-5.4318e+01, -7.2704e+00, -9.4540e+00,  ..., -5.2228e+01,
          -4.0321e+01, -3.7331e+00],
         [ 1.6410e+01,  1.8340e+01, -7.5531e-01,  ..., -1.7320e+01,
          -3.3644e+01,  1.4176e+01]],

        [[ 2.1554e+01,  9.6288e-01,  1.2074e+01,  ...,  6.5817e+00,
           2.0214e+01,  4.2867e-01],
         [-9.3766e+00, -6.3090e+00, -3.0387e+01,  ..., -9.2536e+00,
          -3.4545e+01, -3.3312e+01],
         [-3.8409e+00, -1.7512e+01, -3.7807e+00,  ..., -5.9622e+00,
          -4.4367e+00,  6.2254e+00],
         ...,
         [-8.5296e-01, -2

In [688]:
b1[0].shape

torch.Size([5, 500, 16])

In [689]:
args.pred_len

96

In [690]:
args.label_len

0

In [832]:
# Decoder input: the first label_len target steps, followed by zeros standing in
# for the pred_len steps that are to be predicted.
dec_inp = torch.zeros_like(b1[1][:, -args.pred_len:, :]).float()
dec_inp = torch.cat([b1[1][:, :args.label_len, :], dec_inp], dim=1).float()

In [833]:
b1[1].shape

torch.Size([5, 100, 16])

In [834]:
dec_inp.shape

torch.Size([5, 100, 16])

In [835]:
b1[1][:, -args.pred_len:, :].shape

torch.Size([5, 60, 16])

In [836]:
b1[1][:, :args.label_len, :].shape

torch.Size([5, 40, 16])

In [837]:
b1[1][:,:,:4].shape

torch.Size([5, 100, 4])

In [838]:
dec_inp.shape

torch.Size([5, 100, 16])

In [839]:
model = Autoformer.Model(args)
model

Model(
  (decomp): series_decomp(
    (moving_avg): moving_avg(
      (avg): AvgPool1d(kernel_size=(25,), stride=(1,), padding=(0,))
    )
  )
  (enc_embedding): DataEmbedding_wo_pos(
    (value_embedding): TokenEmbedding(
      (tokenConv): Conv1d(16, 512, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
    )
    (position_embedding): PositionalEmbedding()
    (temporal_embedding): TimeFeatureEmbedding(
      (embed): Linear(in_features=4, out_features=512, bias=False)
    )
    (dropout): Dropout(p=0.05, inplace=False)
  )
  (dec_embedding): DataEmbedding_wo_pos(
    (value_embedding): TokenEmbedding(
      (tokenConv): Conv1d(16, 512, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
    )
    (position_embedding): PositionalEmbedding()
    (temporal_embedding): TimeFeatureEmbedding(
      (embed): Linear(in_features=4, out_features=512, bias=False)
    )
    (dropout): Dropout(p=0.05, inplace=False)
  )
  (encoder): Enco

In [840]:
model(b1[0], b1[0][:,:,:4], dec_inp, b1[1][:,:,:4])

tensor([[[71.6197, 61.6116, 21.0257,  ..., 71.5852, 73.0065, 55.5396],
         [72.8775, 63.7794, 17.5068,  ..., 74.1997, 67.0087, 56.9475],
         [79.0788, 56.7229, 19.2600,  ..., 74.6488, 63.7263, 55.1225],
         ...,
         [81.0119, 62.4430, 47.9587,  ..., 71.7357, 74.6668, 70.6535],
         [84.9704, 64.7731, 49.5155,  ..., 75.2043, 75.1146, 70.0724],
         [59.4920, 77.6923, 48.7098,  ..., 70.8696, 46.5217, 57.5186]],

        [[64.5330, 67.2606, 16.4378,  ..., 76.3374, 63.5895, 64.3684],
         [73.6335, 60.0841, 14.4786,  ..., 80.4406, 58.5424, 62.2762],
         [70.3385, 64.0664, 24.4106,  ..., 80.6096, 56.6966, 62.5893],
         ...,
         [73.3683, 64.1530, 27.1453,  ..., 87.9009, 53.7951, 73.0177],
         [74.7011, 65.3769, 30.4544,  ..., 86.9556, 53.9701, 75.0075],
         [65.3242, 85.1730, 13.4185,  ..., 91.0981, 49.7809, 55.8396]],

        [[73.0341, 53.6389, 19.8400,  ..., 87.8904, 70.5910, 59.1613],
         [72.4065, 58.6468, 27.5749,  ..., 87

In [598]:
model = Informer.Model(args)
model

Model(
  (enc_embedding): DataEmbedding(
    (value_embedding): TokenEmbedding(
      (tokenConv): Conv1d(16, 512, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
    )
    (position_embedding): PositionalEmbedding()
    (temporal_embedding): TimeFeatureEmbedding(
      (embed): Linear(in_features=4, out_features=512, bias=False)
    )
    (dropout): Dropout(p=0.05, inplace=False)
  )
  (dec_embedding): DataEmbedding(
    (value_embedding): TokenEmbedding(
      (tokenConv): Conv1d(16, 512, kernel_size=(3,), stride=(1,), padding=(1,), bias=False, padding_mode=circular)
    )
    (position_embedding): PositionalEmbedding()
    (temporal_embedding): TimeFeatureEmbedding(
      (embed): Linear(in_features=4, out_features=512, bias=False)
    )
    (dropout): Dropout(p=0.05, inplace=False)
  )
  (encoder): Encoder(
    (attn_layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (attention): AttentionLayer(
          (inner_attention): ProbAttention(
  

In [572]:
model(b1[0], b1[0][:,:,:4], dec_inp, b1[1][:,:,:4])

tensor([[[-0.0570, -0.4157, -0.1510,  ...,  0.0500,  0.3376,  0.3915],
         [-0.0348, -0.3373, -0.2179,  ..., -0.1039,  0.2944,  0.4456],
         [ 0.0469, -0.3929, -0.2783,  ...,  0.1064,  0.2877,  0.5842],
         ...,
         [-0.7475, -0.9854, -0.2337,  ...,  0.9094, -0.1976, -0.2089],
         [-0.8493, -1.1082, -0.3114,  ...,  0.4866, -0.2084,  0.1452],
         [-0.6693, -0.9205, -0.2268,  ...,  0.6960, -0.5058,  0.1586]],

        [[-0.0638, -0.5629, -0.2790,  ...,  0.0374,  0.4097,  0.5809],
         [ 0.1585, -0.3637, -0.1344,  ..., -0.4755,  0.1717,  0.5125],
         [-0.0787, -0.2398, -0.0304,  ...,  0.0221,  0.3117,  0.4799],
         ...,
         [-0.9184, -0.9270, -0.0674,  ...,  0.2648, -0.4298, -0.2003],
         [-0.5262, -0.9525, -0.2406,  ...,  0.3022, -0.2528, -0.4428],
         [-0.6395, -1.0149,  0.0353,  ...,  0.5275, -0.4536, -0.2185]],

        [[ 0.2208, -0.5444, -0.0981,  ...,  0.2882,  0.6022,  0.5731],
         [-0.0925, -0.2530,  0.1887,  ...,  0

In [583]:
model = Transformer.Model(args)

In [584]:
o = model(b1[0], b1[0][:,:,:4], dec_inp, b1[1][:,:,:4])
o

tensor([[[ 0.0626, -0.1851, -1.5028,  ...,  0.2717, -0.1957, -0.8799],
         [ 0.0770, -0.3594, -0.4376,  ...,  0.0685, -0.8484, -0.9470],
         [ 0.2978,  0.4614, -0.6564,  ..., -0.1430, -1.1055, -0.2751],
         ...,
         [ 0.0898, -0.3941, -0.7998,  ...,  0.3140, -0.0287,  0.2289],
         [ 0.1967, -0.1721, -1.0142,  ...,  0.4170, -0.4950,  0.0120],
         [ 0.1658, -0.0521,  0.3124,  ...,  1.1811, -0.5614, -0.6409]],

        [[ 0.2112,  0.0224, -0.5463,  ...,  0.6994,  0.2194, -0.5117],
         [ 0.1603,  0.0161, -0.8348,  ...,  0.0037, -1.1548, -0.1709],
         [ 0.2592, -0.1675, -0.9611,  ..., -0.5422, -0.8645, -0.6270],
         ...,
         [ 0.6859,  0.1990, -0.1051,  ..., -0.2792, -0.9675,  0.1758],
         [ 0.7582,  0.7192,  0.1024,  ...,  0.8222, -0.7673, -0.2134],
         [ 0.0285,  0.3575, -0.4119,  ...,  0.0299, -0.2290, -0.5249]],

        [[-0.3877,  0.4745, -1.1034,  ...,  0.1650, -0.7193, -0.5817],
         [ 0.3174,  0.2920, -0.6730,  ..., -0

In [585]:
o = o[:, -args.pred_len:, :]
by = b1[1][:, -args.pred_len:, :]

In [586]:
print(o.shape, by.shape)

torch.Size([5, 96, 16]) torch.Size([5, 96, 16])


In [596]:
# models = {'PatchTST': PatchTST, 'Informer': Informer, 'Autoformer': Autoformer, 'DLinear': DLinear,
#          'Transformer': Transformer}

In [579]:
# optimizer= optim.Adam(model.parameters(), lr=args.learning_rate)

In [17]:
device = torch.device('mps')

In [29]:
len(s_dl)

24

In [112]:
# Benchmark: train each model for one pass over the stacked loader (s_dl) without
# moving model or data to an accelerator, and report mean batch loss and wall time.
models = {'PatchTST': PatchTST, 'Informer': Informer}
# models = {'PatchTST': PatchTST, 'Informer': Informer, 'Autoformer': Autoformer, 'DLinear': DLinear,
#          'Transformer': Transformer}
loss_fn = nn.MSELoss()
for m in models.keys():
    # args.model holds the model *module*; a fresh model and optimizer are built per entry.
    args.model = models[m]
    model = args.model.Model(args)
    running_loss = 0.
    optimizer= optim.Adam(model.parameters(), lr=args.learning_rate)
    start = time.time()
    for i, data in enumerate(s_dl):
        # Every data instance is an input + label pair
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Decoder input: first label_len target steps, then zeros for the pred_len horizon
        dec_inp = torch.zeros_like(labels[:, -args.pred_len:, :]).float()
        dec_inp = torch.cat([labels[:, :args.label_len, :], dec_inp], dim=1).float()

        # Models whose name contains 'Linear' or 'TST' are called with the input only
        if 'Linear' in m or 'TST' in m:
            outputs = model(inputs)
        else:
            # NOTE(review): the first four channels of inputs/labels are passed in the
            # 2nd and 4th argument positions - presumably stand-ins for time features; confirm
            outputs = model(inputs, inputs[:,:,:4], dec_inp, labels[:,:,:4])

        # Score only the last pred_len steps of prediction and target
        outputs = outputs[:, -args.pred_len:, :]
        labels = labels[:, -args.pred_len:, :]

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
    stop = time.time()
    # Mean loss per batch over the single pass
    avg_loss = running_loss / len(s_dl)
    print(m, ': ', avg_loss, ' | duration: ', stop-start)

PatchTST :  851.3411118527677  | duration:  16324.859073162079
Informer :  3057.6708257147607  | duration:  2242.310148715973


In [606]:
args.model

<module 'models.Transformer' from '/Users/gift/github/PatchTST/PatchTST_supervised/models/Transformer.py'>

In [17]:
torch.backends.mps.is_available()

True

In [13]:
# Same one-pass benchmark over the stacked loader (s_dl), but with the model and
# every batch moved to the MPS device.
models = {'PatchTST': PatchTST, 'Informer': Informer}
# models = {'PatchTST': PatchTST, 'Informer': Informer, 'Autoformer': Autoformer, 'DLinear': DLinear,
#          'Transformer': Transformer}
loss_fn = nn.MSELoss()
device = torch.device('mps')
for m in models.keys():
    # args.model holds the model *module*; a fresh model and optimizer are built per entry.
    args.model = models[m]
    model = args.model.Model(args).to(device)
    running_loss = 0.
    optimizer= optim.Adam(model.parameters(), lr=args.learning_rate)
    start = time.time()
    for i, data in enumerate(s_dl):
        # Every data instance is an input + label pair; s_dl yields tensors that
        # the dataset did not move to the device, so the batch is transferred here
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Decoder input: first label_len target steps, then zeros for the pred_len horizon
        dec_inp = torch.zeros_like(labels[:, -args.pred_len:, :]).float()
        dec_inp = torch.cat([labels[:, :args.label_len, :], dec_inp], dim=1).float()

        # Models whose name contains 'Linear' or 'TST' are called with the input only
        if 'Linear' in m or 'TST' in m:
            outputs = model(inputs)
        else:
            # NOTE(review): the first four channels of inputs/labels are passed in the
            # 2nd and 4th argument positions - presumably stand-ins for time features; confirm
            outputs = model(inputs, inputs[:,:,:4], dec_inp, labels[:,:,:4])

        # Score only the last pred_len steps of prediction and target
        outputs = outputs[:, -args.pred_len:, :]
        labels = labels[:, -args.pred_len:, :]

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
    stop = time.time()
    # Mean loss per batch over the single pass
    avg_loss = running_loss / len(s_dl)
    print(m, ': ', avg_loss, ' | duration: ', stop-start)

PatchTST :  923.2179946899414  | duration:  56.7196569442749
Informer :  3104.5015970865884  | duration:  37.66055178642273


In [20]:
torch.mps.empty_cache()
# One-pass PatchTST benchmark over the stacked loader (s_dl) on MPS, wrapped in
# the MPS profiler (started just before the loop, stopped right after it).
models = {'PatchTST': PatchTST}
# models = {'PatchTST': PatchTST, 'Informer': Informer}
# models = {'PatchTST': PatchTST, 'Informer': Informer, 'Autoformer': Autoformer, 'DLinear': DLinear,
#          'Transformer': Transformer}
loss_fn = nn.MSELoss()
device = torch.device('mps')
torch.mps.profiler.start()
for m in models.keys():
    # args.model holds the model *module*; a fresh model and optimizer are built per entry.
    args.model = models[m]
    model = args.model.Model(args).to(device)
    running_loss = 0.
    optimizer= optim.Adam(model.parameters(), lr=args.learning_rate)
    start = time.time()
    for i, data in enumerate(s_dl):
        # Every data instance is an input + label pair; moved to the device per batch
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Decoder input: first label_len target steps, then zeros for the pred_len horizon
        dec_inp = torch.zeros_like(labels[:, -args.pred_len:, :]).float()
        dec_inp = torch.cat([labels[:, :args.label_len, :], dec_inp], dim=1).float()

        # Models whose name contains 'Linear' or 'TST' are called with the input only
        if 'Linear' in m or 'TST' in m:
            outputs = model(inputs)
        else:
            # NOTE(review): the first four channels of inputs/labels are passed in the
            # 2nd and 4th argument positions - presumably stand-ins for time features; confirm
            outputs = model(inputs, inputs[:,:,:4], dec_inp, labels[:,:,:4])

        # Score only the last pred_len steps of prediction and target
        outputs = outputs[:, -args.pred_len:, :]
        labels = labels[:, -args.pred_len:, :]

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
    stop = time.time()
    # Mean loss per batch over the single pass
    avg_loss = running_loss / len(s_dl)
    print(m, ': ', avg_loss, ' | duration: ', stop-start)
torch.mps.profiler.stop()

PatchTST :  925.5157216389974  | duration:  58.14438796043396


In [31]:
torch.mps.empty_cache()

In [11]:
# models = {'PatchTST': PatchTST, 'Informer': Informer, 'Autoformer': Autoformer, 'DLinear': DLinear,
#          'Transformer': Transformer}
# One-pass PatchTST benchmark over the sliding-window loader (w_dl) on MPS,
# wrapped in the MPS profiler. The MPS cache is cleared first.
torch.mps.empty_cache()
models = {'PatchTST': PatchTST}
loss_fn = nn.MSELoss()
device = torch.device('mps')

torch.mps.profiler.start()
for m in models.keys():
    # args.model holds the model *module*; a fresh model and optimizer are built per entry.
    args.model = models[m]
    model = args.model.Model(args).to(device)
    running_loss = 0.
    optimizer= optim.Adam(model.parameters(), lr=args.learning_rate)
    start = time.time()
    for i, data in enumerate(w_dl):
        # Every data instance is an input + label pair. No .to(device) here:
        # DummyPretrainDataset.__getitem__ already moves its tensors to `device`
        inputs, labels = data
        # inputs = inputs
        # labels = labels

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Decoder input: first label_len target steps, then zeros for the pred_len horizon
        dec_inp = torch.zeros_like(labels[:, -args.pred_len:, :]).float()
        dec_inp = torch.cat([labels[:, :args.label_len, :], dec_inp], dim=1).float()

        # Models whose name contains 'Linear' or 'TST' are called with the input only
        if 'Linear' in m or 'TST' in m:
            outputs = model(inputs)
        else:
            # NOTE(review): the first four channels of inputs/labels are passed in the
            # 2nd and 4th argument positions - presumably stand-ins for time features; confirm
            outputs = model(inputs, inputs[:,:,:4], dec_inp, labels[:,:,:4])

        # Score only the last pred_len steps of prediction and target
        outputs = outputs[:, -args.pred_len:, :]
        labels = labels[:, -args.pred_len:, :]

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
    stop = time.time()
    # Mean loss per batch over the single pass
    avg_loss = running_loss / len(w_dl)
    print(m, ': ', avg_loss, ' | duration: ', stop-start)
torch.mps.profiler.stop()
# torch.cuda.memory._record_memory_history(enabled=None)
# torch.cuda.memory._dump_snapshot(f"{file_prefix}.pickle")
# torch.cuda.memory._dump_snapshot("my_snapshot.pickle")

PatchTST :  916.0164006551107  | duration:  58.74329495429993


In [16]:
torch.cuda.is_available()

False

In [None]:
torch.mps.profiler

In [18]:
# One-pass benchmark over the sliding-window loader (w_dl) on MPS for several models.
#
# The recorded run of this cell ended in "MPS backend out of memory". Unlike the
# sibling benchmark cells it never cleared the MPS cache, and it kept the previous
# model, optimizer and autograd graph alive while building the next model. Both
# are addressed below.
# NOTE(review): whether this is enough for Informer at this batch/sequence size
# has not been verified.
models = {'PatchTST': PatchTST, 'Informer': Informer}
loss_fn = nn.MSELoss()
device = torch.device('mps')
for m in models.keys():
    # Drop references left over from a previous model (or a previous run of this
    # cell) so their device memory can be released, then empty the MPS cache.
    model = optimizer = outputs = loss = None
    torch.mps.empty_cache()

    # args.model holds the model *module*; a fresh model and optimizer are built per entry.
    args.model = models[m]
    model = args.model.Model(args).to(device)
    running_loss = 0.
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    start = time.time()
    for i, data in enumerate(w_dl):
        # Every data instance is an input + label pair. No .to(device) here:
        # DummyPretrainDataset.__getitem__ already moves its tensors to `device`
        inputs, labels = data

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Decoder input: first label_len target steps, then zeros for the pred_len horizon
        dec_inp = torch.zeros_like(labels[:, -args.pred_len:, :]).float()
        dec_inp = torch.cat([labels[:, :args.label_len, :], dec_inp], dim=1).float()

        # Models whose name contains 'Linear' or 'TST' are called with the input only
        if 'Linear' in m or 'TST' in m:
            outputs = model(inputs)
        else:
            # NOTE(review): the first four channels of inputs/labels are passed in the
            # 2nd and 4th argument positions - presumably stand-ins for time features; confirm
            outputs = model(inputs, inputs[:,:,:4], dec_inp, labels[:,:,:4])

        # Score only the last pred_len steps of prediction and target
        outputs = outputs[:, -args.pred_len:, :]
        labels = labels[:, -args.pred_len:, :]

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
    stop = time.time()
    # Mean loss per batch over the single pass
    avg_loss = running_loss / len(w_dl)
    print(m, ': ', avg_loss, ' | duration: ', stop-start)

RuntimeError: MPS backend out of memory (MPS allocated: 27.01 GB, other allocations: 4.82 GB, max allowed: 36.27 GB). Tried to allocate 4.73 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
def train_one_epoch(epoch_index, tb_writer):
    """Run one optimisation pass over ``training_loader`` and return the mean batch loss.

    Relies on the module-level ``training_loader``, ``model``, ``optimizer`` and
    ``loss_fn``.

    Previously this returned ``last_loss``, which was never updated once the
    periodic reporting was disabled, so the result was always 0.0 while the
    computed average was thrown away.

    Args:
        epoch_index: Index of the current epoch. Currently unused; it was only
            needed by the disabled TensorBoard reporting.
        tb_writer: TensorBoard writer. Currently unused for the same reason.

    Returns:
        The mean of ``loss.item()`` over all batches, or 0.0 when the loader
        yields no batches.
    """
    running_loss = 0.
    num_batches = 0

    for inputs, labels in training_loader:
        # Gradients accumulate by default, so reset them for every batch.
        optimizer.zero_grad()

        # Forward pass, loss and backward pass for this batch.
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Apply the parameter update.
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    if num_batches == 0:
        return 0.
    return running_loss / num_batches

In [408]:
from data_provider.data_loader import Dataset_ETT_hour as etth

In [410]:
# Directory handed to the ETT-hour dataset class (relative path).
root_path = '../dataset/'
# size = [params.context_points, 0, params.target_points]
# dls = DataLoaders(
#         datasetCls=Dataset_ETT_hour,
#         dataset_kwargs={
#         'root_path': root_path,
#         'data_path': 'ETTh2.csv',
#         'features': params.features,
#         'scale': True,
#         'size': size,
#         'use_time_features': False
#         },
#         batch_size=params.batch_size,
#         workers=params.num_workers,
#         )
# Build the dataset with only root_path given; every other constructor argument
# keeps its default.
d = etth(root_path=root_path)

In [412]:
len(d)

8161

In [414]:
len(d[0])

4

In [415]:
d[0][0].shape, d[0][1].shape, d[0][2].shape, d[0][3].shape

((384, 1), (192, 1), (384, 4), (192, 4))