## a simple example to define transformer in pytorch

In [44]:
import torch
import torch.nn as nn

# Define the hyperparameters
d_model = 512
nhead = 8
num_layers = 6
dropout = 0.1

# Create the Transformer model
transformer = nn.Transformer(d_model, nhead, num_layers, dropout=dropout)

# Example input data (batch_size=2, seq_length=10, d_model=512)
src = torch.randn(10, 2, 512)
tgt = torch.randn(10, 2, 512)

# Pass the input data through the Transformer model
output = transformer(src, tgt)
output

tensor([[[ 0.1493, -1.3924, -0.4691,  ...,  1.3503,  0.0981,  0.5888],
         [-2.2525, -0.6865,  0.3882,  ...,  0.0939, -0.7536, -0.2908]],

        [[-1.0448, -1.9382,  0.3898,  ...,  1.5998, -0.3502,  0.0884],
         [-1.3929, -0.1478, -0.3485,  ...,  0.6100, -0.9397,  0.3129]],

        [[-0.0511, -0.9572, -0.3308,  ...,  1.1702,  0.1238,  0.1381],
         [-1.5637, -0.3723, -0.0174,  ...,  0.9533, -1.2683,  0.2479]],

        ...,

        [[-0.9024, -0.7733,  0.1455,  ...,  0.7555, -1.4278,  0.4268],
         [-0.6650, -0.2154, -0.2301,  ...,  0.2651, -1.4691,  0.0540]],

        [[-0.1727, -0.9760,  0.7051,  ...,  0.5820, -0.9048,  0.1648],
         [-1.2072, -0.7008,  0.0726,  ...,  0.3305, -1.2804, -0.8328]],

        [[-0.3538, -1.7924, -0.7605,  ...,  0.6110, -0.1426, -0.1498],
         [-0.9802,  0.1632, -0.3538,  ...,  0.6591, -0.2645, -0.1160]]],
       grad_fn=<NativeLayerNormBackward0>)

## stock price example

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Generate dummy stock price data
num_days = 200
stock_prices = np.random.rand(num_days) * 100

# Preprocess the data
input_seq_len = 10
output_seq_len = 5
num_samples = num_days - input_seq_len - output_seq_len + 1

src_data = torch.tensor([stock_prices[i:i+input_seq_len] for i in range(num_samples)]).unsqueeze(-1).float()
tgt_data = torch.tensor([stock_prices[i+input_seq_len:i+input_seq_len+output_seq_len] for i in range(num_samples)]).unsqueeze(-1).float()

# Create a custom Transformer model
class StockPriceTransformer(nn.Module):
    def __init__(self, d_model, nhead, num_layers, dropout):
        super(StockPriceTransformer, self).__init__()
        self.input_linear = nn.Linear(1, d_model)
        self.transformer = nn.Transformer(d_model, nhead, num_layers, dropout=dropout)
        self.output_linear = nn.Linear(d_model, 1)

    def forward(self, src, tgt):
        src = self.input_linear(src)
        tgt = self.input_linear(tgt)
        output = self.transformer(src, tgt)
        output = self.output_linear(output)
        return output

d_model = 64
nhead = 4
num_layers = 2
dropout = 0.1

model = StockPriceTransformer(d_model, nhead, num_layers, dropout=dropout)

# Training parameters
epochs = 100
lr = 0.001
batch_size = 16

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training loop
for epoch in range(epochs):
    for i in range(0, num_samples, batch_size):
        src_batch = src_data[i:i+batch_size].transpose(0, 1)
        tgt_batch = tgt_data[i:i+batch_size].transpose(0, 1)
        
        optimizer.zero_grad()
        output = model(src_batch, tgt_batch[:-1])
        loss = criterion(output, tgt_batch[1:])
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")



Epoch 1/100, Loss: 3398.44091796875
Epoch 2/100, Loss: 3316.48193359375
Epoch 3/100, Loss: 3222.744140625
Epoch 4/100, Loss: 3129.048828125
Epoch 5/100, Loss: 3029.943359375
Epoch 6/100, Loss: 2930.379150390625
Epoch 7/100, Loss: 2826.78173828125
Epoch 8/100, Loss: 2720.460693359375
Epoch 9/100, Loss: 2614.76806640625
Epoch 10/100, Loss: 2508.346435546875
Epoch 11/100, Loss: 2402.098388671875
Epoch 12/100, Loss: 2296.591552734375
Epoch 13/100, Loss: 2194.035400390625
Epoch 14/100, Loss: 2093.64111328125
Epoch 15/100, Loss: 1995.6507568359375
Epoch 16/100, Loss: 1901.906005859375
Epoch 17/100, Loss: 1812.193359375
Epoch 18/100, Loss: 1727.382568359375
Epoch 19/100, Loss: 1646.742919921875
Epoch 20/100, Loss: 1572.1201171875
Epoch 21/100, Loss: 1502.7958984375
Epoch 22/100, Loss: 1437.83447265625
Epoch 23/100, Loss: 1379.832763671875
Epoch 24/100, Loss: 1326.247314453125
Epoch 25/100, Loss: 1278.2840576171875
Epoch 26/100, Loss: 1235.185546875
Epoch 27/100, Loss: 1197.093994140625
Epoch 

## prediction in autogregressive fashion

In [40]:
# Predict the next 5 days of stock prices one at a time
src = torch.tensor(stock_prices[-input_seq_len:]).unsqueeze(-1).unsqueeze(1).float()
tgt = torch.zeros(output_seq_len, 1, 1)

with torch.no_grad():
    for i in range(output_seq_len):
        prediction = model(src, tgt[:i+1])
        tgt[i] = prediction[-1]

output = tgt.squeeze().tolist()
print("Next 5 days of stock prices:", output)

Next 5 days of stock prices: [51.60749435424805, 50.53663635253906, 50.62848663330078, 51.24516296386719, 50.89353942871094]


## prediction entire output sequence in a single forward pass

In [41]:
# Predict the next 5 days of stock prices at one time
src = torch.tensor(stock_prices[-input_seq_len:]).unsqueeze(-1).unsqueeze(1).float()
tgt = torch.zeros(output_seq_len, 1, 1)

with torch.no_grad():
     prediction = model(src, tgt)

output = prediction.squeeze().tolist()
print("Next 5 days of stock prices:", output)

Next 5 days of stock prices: [51.00934982299805, 51.10614013671875, 51.138946533203125, 50.764610290527344, 50.81841278076172]


## difference

The difference between `model(src, tgt[:i+1])` and `model(src, tgt)` lies in the target input tensor passed to the model during the prediction loop.

1. `model(src, tgt[:i+1])`: In this case, we pass a partial target sequence to the model, which includes the target values up to the current step `i`. This is used in the autoregressive decoding approach, where the model generates the output sequence one step at a time, using its own predictions from previous steps as input for the next steps. This approach is useful when the model needs to generate an output sequence step by step, and the output at each step depends on the previous outputs.

2. `model(src, tgt)`: In this case, we pass the entire target sequence to the model at once. This is used when the model is trained to generate the entire output sequence in a single forward pass, given the complete target sequence as input. This approach is useful when the model is trained to generate the output sequence in parallel, and the output at each step does not depend on the previous outputs.

In the context of predicting the next 5 days of stock prices at one time, we use the autoregressive decoding approach (`model(src, tgt[:i+1])`) to generate the output sequence step by step, as the output at each step depends on the previous outputs.