## multi-step
- https://github.com/oliverguhr/transformer-time-series-prediction/blob/master/transformer-multistep.py

<div style="text-align: right"> <b>Author : Kwang Myung Yu</b></div>
<div style="text-align: right"> Initial upload: 2023.11.06</div>
<div style="text-align: right"> Last update: 2023.11.06</div>

In [1]:
import datetime
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline
# print(plt.stype.available)

# Options for pandas
pd.options.display.max_columns = 30

In [2]:
import torch
import torch.nn as nn
import time
import math
from sklearn.preprocessing import MinMaxScaler

In [3]:
data = pd.read_csv("daily-min-temperature.csv")
data.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [4]:
data.shape

(3650, 2)

In [5]:
torch.manual_seed(0)
np.random.seed(0)

In [6]:
# This concept is also called teacher forceing. 
# The flag decides if the loss will be calculted over all 
# or just the predicted values.
calculate_loss_over_all_values = False

In [7]:
# S is 소스 시퀀스 길이
# T is 타겟 시퀀스 길이
# N is 배치 사이즈
# E is 피처수

#src = torch.rand((10, 32, 512)) # (S,N,E) 
#tgt = torch.rand((20, 32, 512)) # (T,N,E)
#out = transformer_model(src, tgt)

In [8]:
input_window = 100 # number of input steps
output_window = 5 # number of prediction steps, in this model its fixed to one
block_len = input_window + output_window # for one input-output pair
batch_size = 10
train_size = 0.8

In [9]:
device = 'mps'

### 1. 포지션 인코딩
- single step과 약간 다름, 확인해보기

하나씩 실행해보기

In [10]:
d_model = 10
max_len = 5000

In [11]:
pe = torch.zeros(max_len, d_model)
pe.shape

torch.Size([5000, 10])

In [12]:
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
position.shape

torch.Size([5000, 1])

In [13]:
position

tensor([[0.0000e+00],
        [1.0000e+00],
        [2.0000e+00],
        ...,
        [4.9970e+03],
        [4.9980e+03],
        [4.9990e+03]])

In [14]:
# single step과 다름
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
div_term.shape# single step은 10, 여기서는 5

torch.Size([5])

In [15]:
div_term

tensor([1.0000e+00, 1.5849e-01, 2.5119e-02, 3.9811e-03, 6.3096e-04])

In [16]:
# 기존식: single step
# pe[:, 0::2] = torch.sin(position * div_term[0::2])
# pe[:, 1::2] = torch.cos(position * div_term[1::2])

pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

In [17]:
pe.shape

torch.Size([5000, 10])

In [18]:
pe.unsqueeze(0).shape

torch.Size([1, 5000, 10])

In [19]:
pe = pe.unsqueeze(0).transpose(0, 1) # batch last로 구현함
pe.shape

torch.Size([5000, 1, 10])

- batch last

In [20]:
batch_size = 32
seq_len = 50
embedding_dim = 10

# 임의의 수로 채워진 텐서를 생성합니다.
dummy_input = torch.randn(seq_len, batch_size, embedding_dim)

In [21]:
dummy_input.shape

torch.Size([50, 32, 10])

In [22]:
pe[:dummy_input.size(0), :].shape

torch.Size([50, 1, 10])

- repeat 안해도 되는지? (확인요)

In [23]:
dummy_input + pe[:dummy_input.size(0), :]

tensor([[[-1.1258, -0.1524, -0.2506,  ..., -1.1152,  0.3223, -0.2633],
         [ 0.3500,  1.3081,  0.1198,  ..., -0.6959,  0.5667,  1.7935],
         [ 0.5988, -0.5551, -0.3414,  ...,  1.1835,  1.3894,  2.5863],
         ...,
         [-0.1847,  0.2682, -0.0807,  ...,  1.8200, -0.6332,  2.2948],
         [ 1.4628,  0.3796,  0.9884,  ...,  1.7870,  0.1076, -0.0715],
         [-0.1166, -0.0170, -1.1980,  ...,  0.9668, -0.4186,  0.7444]],

        [[ 0.7122,  0.4857,  0.5662,  ...,  0.5665, -1.2420,  2.2845],
         [ 1.0852,  1.0707,  0.1433,  ...,  4.9300, -0.1238,  1.2953],
         [ 1.2241, -0.0094, -0.8362,  ..., -1.0689,  0.9101,  0.3054],
         ...,
         [ 1.8200,  0.0989, -0.1032,  ...,  0.5250, -0.4946,  0.8016],
         [ 3.0564,  0.4036, -0.8603,  ...,  0.2504, -0.0943,  2.1009],
         [ 2.1519,  0.2475, -0.6305,  ...,  1.2290,  1.2839, -0.3792]],

        [[ 1.4501, -1.3640,  0.5138,  ...,  0.6620,  0.4059,  1.8931],
         [-0.5448,  0.7713,  0.0122,  ...,  3

In [24]:
dummy_input + pe[:dummy_input.size(0), :].repeat(1,dummy_input.shape[1],1)

tensor([[[-1.1258, -0.1524, -0.2506,  ..., -1.1152,  0.3223, -0.2633],
         [ 0.3500,  1.3081,  0.1198,  ..., -0.6959,  0.5667,  1.7935],
         [ 0.5988, -0.5551, -0.3414,  ...,  1.1835,  1.3894,  2.5863],
         ...,
         [-0.1847,  0.2682, -0.0807,  ...,  1.8200, -0.6332,  2.2948],
         [ 1.4628,  0.3796,  0.9884,  ...,  1.7870,  0.1076, -0.0715],
         [-0.1166, -0.0170, -1.1980,  ...,  0.9668, -0.4186,  0.7444]],

        [[ 0.7122,  0.4857,  0.5662,  ...,  0.5665, -1.2420,  2.2845],
         [ 1.0852,  1.0707,  0.1433,  ...,  4.9300, -0.1238,  1.2953],
         [ 1.2241, -0.0094, -0.8362,  ..., -1.0689,  0.9101,  0.3054],
         ...,
         [ 1.8200,  0.0989, -0.1032,  ...,  0.5250, -0.4946,  0.8016],
         [ 3.0564,  0.4036, -0.8603,  ...,  0.2504, -0.0943,  2.1009],
         [ 2.1519,  0.2475, -0.6305,  ...,  1.2290,  1.2839, -0.3792]],

        [[ 1.4501, -1.3640,  0.5138,  ...,  0.6620,  0.4059,  1.8931],
         [-0.5448,  0.7713,  0.0122,  ...,  3

- 동일함(브로드 캐스트)

In [25]:
# class PositionalEncoding(nn.Module):

#     def __init__(self, d_model, max_len=5000):
#         super(PositionalEncoding, self).__init__()       
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         # div_term = torch.exp(
#         #     torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
#         # )
#         div_term = 1 / (10000 ** ((2 * np.arange(d_model)) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term[0::2])
#         pe[:, 1::2] = torch.cos(position * div_term[1::2])

#         pe = pe.unsqueeze(0).transpose(0, 1) # [5000, 1, d_model],so need seq-len <= 5000
#         #pe.requires_grad = False
#         self.register_buffer('pe', pe)

#     def forward(self, x):
#         # print(self.pe[:x.size(0), :].repeat(1,x.shape[1],1).shape ,'---',x.shape)
#         # dimension 1 maybe inequal batchsize
#         return x + self.pe[:x.size(0), :].repeat(1,x.shape[1],1)

In [26]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()       
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        #pe.requires_grad = False
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

In [27]:
pe = PositionalEncoding(
    d_model=10
)

In [28]:
dummy_input.shape

torch.Size([50, 32, 10])

In [29]:
pe(dummy_input).shape

torch.Size([50, 32, 10])

### 2. Model

In [30]:
class TransAm(nn.Module): # single과 비슷하나 embedding을 하지 않았음
    def __init__(self,feature_size=250,num_layers=1,dropout=0.1):
        super(TransAm, self).__init__()
        self.model_type = 'Transformer'
        
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(feature_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=10, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)        
        self.decoder = nn.Linear(feature_size,1)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1    
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self,src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.pos_encoder(src)
        output = self.transformer_encoder(src,self.src_mask)#, self.src_mask)
        output = self.decoder(output)
        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
        
# if window is 100 and prediction step is 1
# in -> [0..99]
# target -> [1..100]

### 입력 데이터 만들기 함수

In [31]:
def create_inout_sequences(
    input_data,
    output_window,
    tw
):
    inout_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = np.append(input_data[i:i+tw][:-output_window] , output_window * [0])
        train_label = input_data[i:i+tw]
        #train_label = input_data[i+output_window:i+tw+output_window]
        inout_seq.append((train_seq ,train_label))
    return torch.FloatTensor(inout_seq)

In [32]:
data.head()

Unnamed: 0,Date,Temp
0,1981-01-01,20.7
1,1981-01-02,17.9
2,1981-01-03,18.8
3,1981-01-04,14.6
4,1981-01-05,15.8


In [33]:
input_window = 100 # number of input steps
output_window = 10 # number of prediction steps, in this model its fixed to one
tw = input_window + output_window # for one input-output pair

In [34]:
inout_seq = create_inout_sequences(
    data['Temp'].values,
    output_window=output_window,
    tw=tw
)

In [35]:
inout_seq

tensor([[[20.7000, 17.9000, 18.8000,  ...,  0.0000,  0.0000,  0.0000],
         [20.7000, 17.9000, 18.8000,  ..., 13.2000, 13.8000, 10.6000]],

        [[17.9000, 18.8000, 14.6000,  ...,  0.0000,  0.0000,  0.0000],
         [17.9000, 18.8000, 14.6000,  ..., 13.8000, 10.6000,  9.0000]],

        [[18.8000, 14.6000, 15.8000,  ...,  0.0000,  0.0000,  0.0000],
         [18.8000, 14.6000, 15.8000,  ..., 10.6000,  9.0000, 10.0000]],

        ...,

        [[10.5000, 14.6000, 12.6000,  ...,  0.0000,  0.0000,  0.0000],
         [10.5000, 14.6000, 12.6000,  ..., 14.6000, 14.0000, 13.6000]],

        [[14.6000, 12.6000,  9.8000,  ...,  0.0000,  0.0000,  0.0000],
         [14.6000, 12.6000,  9.8000,  ..., 14.0000, 13.6000, 13.5000]],

        [[12.6000,  9.8000,  7.2000,  ...,  0.0000,  0.0000,  0.0000],
         [12.6000,  9.8000,  7.2000,  ..., 13.6000, 13.5000, 15.7000]]])

In [36]:
inout_seq[0][0].shape

torch.Size([110])

In [37]:
inout_seq[0][1].shape

torch.Size([110])

In [38]:
inout_seq[0][0]

tensor([20.7000, 17.9000, 18.8000, 14.6000, 15.8000, 15.8000, 15.8000, 17.4000,
        21.8000, 20.0000, 16.2000, 13.3000, 16.7000, 21.5000, 25.0000, 20.7000,
        20.6000, 24.8000, 17.7000, 15.5000, 18.2000, 12.1000, 14.4000, 16.0000,
        16.5000, 18.7000, 19.4000, 17.2000, 15.5000, 15.1000, 15.4000, 15.3000,
        18.8000, 21.9000, 19.9000, 16.6000, 16.8000, 14.6000, 17.1000, 25.0000,
        15.0000, 13.7000, 13.9000, 18.3000, 22.0000, 22.1000, 21.2000, 18.4000,
        16.6000, 16.1000, 15.7000, 16.6000, 16.5000, 14.4000, 14.4000, 18.5000,
        16.9000, 17.5000, 21.2000, 17.8000, 18.6000, 17.0000, 16.0000, 13.3000,
        14.3000, 11.4000, 16.3000, 16.1000, 11.8000, 12.2000, 14.7000, 11.8000,
        11.3000, 10.6000, 11.7000, 14.2000, 11.2000, 16.9000, 16.7000,  8.1000,
         8.0000,  8.8000, 13.4000, 10.9000, 13.4000, 11.0000, 15.0000, 15.7000,
        14.5000, 15.8000, 16.7000, 16.8000, 17.5000, 17.1000, 18.1000, 16.6000,
        10.0000, 14.9000, 15.9000, 13.00

In [39]:
inout_seq[0][1]

tensor([20.7000, 17.9000, 18.8000, 14.6000, 15.8000, 15.8000, 15.8000, 17.4000,
        21.8000, 20.0000, 16.2000, 13.3000, 16.7000, 21.5000, 25.0000, 20.7000,
        20.6000, 24.8000, 17.7000, 15.5000, 18.2000, 12.1000, 14.4000, 16.0000,
        16.5000, 18.7000, 19.4000, 17.2000, 15.5000, 15.1000, 15.4000, 15.3000,
        18.8000, 21.9000, 19.9000, 16.6000, 16.8000, 14.6000, 17.1000, 25.0000,
        15.0000, 13.7000, 13.9000, 18.3000, 22.0000, 22.1000, 21.2000, 18.4000,
        16.6000, 16.1000, 15.7000, 16.6000, 16.5000, 14.4000, 14.4000, 18.5000,
        16.9000, 17.5000, 21.2000, 17.8000, 18.6000, 17.0000, 16.0000, 13.3000,
        14.3000, 11.4000, 16.3000, 16.1000, 11.8000, 12.2000, 14.7000, 11.8000,
        11.3000, 10.6000, 11.7000, 14.2000, 11.2000, 16.9000, 16.7000,  8.1000,
         8.0000,  8.8000, 13.4000, 10.9000, 13.4000, 11.0000, 15.0000, 15.7000,
        14.5000, 15.8000, 16.7000, 16.8000, 17.5000, 17.1000, 18.1000, 16.6000,
        10.0000, 14.9000, 15.9000, 13.00

In [40]:
def get_data(
    output_window,
    tw
    ):
    time = np.arange(0, 400, 0.1)
    amplitude   = np.sin(time) + np.sin(time*0.05) +np.sin(time*0.12) *np.random.normal(-0.2, 0.2, len(time))
    
    #from pandas import read_csv
    #series = read_csv('daily-min-temperatures.csv', header=0, index_col=0, parse_dates=True, squeeze=True)
    
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(-1, 1)) 
    #amplitude = scaler.fit_transform(series.to_numpy().reshape(-1, 1)).reshape(-1)
    amplitude = scaler.fit_transform(amplitude.reshape(-1, 1)).reshape(-1)
    
    
    sampels = 2800
    train_data = amplitude[:sampels]
    test_data = amplitude[sampels:]

    # convert our train data into a pytorch train tensor
    #train_tensor = torch.FloatTensor(train_data).view(-1)
    # todo: add comment.. 
    train_sequence = create_inout_sequences(train_data,output_window, tw)
    train_sequence = train_sequence[:-output_window] #todo: fix hack?

    #test_data = torch.FloatTensor(test_data).view(-1) 
    test_data = create_inout_sequences(test_data,output_window, tw)
    test_data = test_data[:-output_window] #todo: fix hack?

    return train_sequence.to(device),test_data.to(device)

In [41]:
train_test, test_test = get_data(output_window, tw)

In [42]:

len(train_test)

2680

In [43]:
len(test_test)

1080

배치로 구현? : 확인요

In [44]:
def get_batch(source, i,batch_size):
    seq_len = min(batch_size, len(source) - 1 - i)
    data = source[i:i+seq_len]    
    input = torch.stack(torch.stack([item[0] for item in data]).chunk(input_window,1)) # 1 is feature size
    target = torch.stack(torch.stack([item[1] for item in data]).chunk(input_window,1))
    return input, target

### 3. 학습루프, 시각화, 평가

In [45]:
def train(train_data,
          model,
          optimizer,
          criterion,
          epoch,
          scheduler,
          batch_size = 10
          ):
    model.train() # Turn on the train mode \o/
    total_loss = 0.
    start_time = time.time()

    for batch, i in enumerate(range(0, len(train_data), batch_size)):  # Now len-1 is not necessary
        # data and target are the same shape with (input_window,batch_len,1)
        data, targets = get_batch(train_data, i , batch_size)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.7)
        optimizer.step()

        total_loss += loss.item()
        log_interval = int(len(train_data) / batch_size / 5)
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.6f} | {:5.2f} ms | '
                  'loss {:5.5f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // batch_size, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

In [46]:
def plot_and_loss(eval_model, data_source, criterion, epoch):
    eval_model.eval() 
    total_loss = 0.
    test_result = torch.Tensor(0)    
    truth = torch.Tensor(0)
    with torch.no_grad():
        for i in range(0, len(data_source) - 1):
            data, target = get_batch(data_source, i,1)
            # look like the model returns static values for the output window
            output = eval_model(data)    
            if calculate_loss_over_all_values:                                
                total_loss += criterion(output, target).item()
            else:
                total_loss += criterion(output[-output_window:], target[-output_window:]).item()
            
            test_result = torch.cat((test_result, output[-1].view(-1).cpu()), 0) #todo: check this. -> looks good to me
            truth = torch.cat((truth, target[-1].view(-1).cpu()), 0)
            
    #test_result = test_result.cpu().numpy()
    len(test_result)

    plt.plot(test_result,color="red")
    plt.plot(truth[:500],color="blue")
    plt.plot(test_result-truth,color="green")
    plt.grid(True, which='both')
    plt.axhline(y=0, color='k')
    plt.savefig('graph/transformer-epoch%d.png'%epoch)
    plt.close()
    
    return total_loss / i

In [47]:
def predict_future(eval_model, data_source,steps):
    eval_model.eval() 
    total_loss = 0.
    test_result = torch.Tensor(0)    
    truth = torch.Tensor(0)
    _ , data = get_batch(data_source, 0,1)
    with torch.no_grad():
        for i in range(0, steps,1):
            input = torch.clone(data[-input_window:])
            input[-output_window:] = 0     
            output = eval_model(data[-input_window:])                        
            data = torch.cat((data, output[-1:]))
            
    data = data.cpu().view(-1)
    

    plt.plot(data,color="red")       
    plt.plot(data[:input_window],color="blue")
    plt.grid(True, which='both')
    plt.axhline(y=0, color='k')
    plt.savefig('graph/transformer-future%d.png'%steps)
    plt.close()

In [48]:
def evaluate(
    eval_model, 
    data_source,
    criterion    
    ):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    eval_batch_size = 1000
    with torch.no_grad():
        # for i in range(0, len(data_source) - 1, eval_batch_size): # Now len-1 is not necessary
        for i in range(0, len(data_source), eval_batch_size):
            data, targets = get_batch(data_source, i,eval_batch_size)
            output = eval_model(data)            
            total_loss += len(data[0]) * criterion(output, targets).cpu().item()
    return total_loss / len(data_source)

### 4. 모델 학습해보기

In [49]:
input_window = 100 # number of input steps
output_window = 5 # number of prediction steps, in this model its fixed to one
block_len = input_window + output_window # for one input-output pair
batch_size = 10
train_size = 0.8

criterion = nn.MSELoss()
lr = 0.005 

In [50]:
model = TransAm().to(device)

In [51]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.98)

In [52]:
best_val_loss = float("inf")
epochs = 100 # The number of epochs
best_model = None

In [53]:
train_data, val_data = get_data(
    output_window=output_window,
    tw=input_window
)

In [54]:
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(
        train_data=train_data,
        model = model,
        optimizer=optimizer,
        criterion=criterion,
        epoch=epoch,
        scheduler=scheduler
    )
    
    if ( epoch % 5 == 0 ):
        val_loss = plot_and_loss(
            model, 
            val_data,
            criterion,
            epoch,
            )
        predict_future(
            model, 
            val_data,
            200)
    else:
        val_loss = evaluate(model, 
                            val_data,
                            criterion
                            )
   
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.5f} | valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    #if val_loss < best_val_loss:
    #    best_val_loss = val_loss
    #    best_model = model

    scheduler.step() 

    
    

| epoch   1 |    53/  269 batches | lr 0.005000 | 30.42 ms | loss 5.98159 | ppl   396.07
| epoch   1 |   106/  269 batches | lr 0.005000 | 20.04 ms | loss 0.06615 | ppl     1.07
| epoch   1 |   159/  269 batches | lr 0.005000 | 20.29 ms | loss 0.04159 | ppl     1.04
| epoch   1 |   212/  269 batches | lr 0.005000 | 20.82 ms | loss 0.02925 | ppl     1.03
| epoch   1 |   265/  269 batches | lr 0.005000 | 21.35 ms | loss 0.01835 | ppl     1.02
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  6.80s | valid loss 0.03010 | valid ppl     1.03
-----------------------------------------------------------------------------------------
| epoch   2 |    53/  269 batches | lr 0.004802 | 21.68 ms | loss 0.02304 | ppl     1.02
| epoch   2 |   106/  269 batches | lr 0.004802 | 21.15 ms | loss 0.01545 | ppl     1.02
| epoch   2 |   159/  269 batches | lr 0.004802 | 21.15 ms | loss 0.02073 | ppl     1.02
| epoch   2 |   212/  269 batche