# Imports

In [1]:
import torch
import torch.nn as nn
from architecture.seq2seq import LSTMSeq2Seq, AttentionLSTMSeq2Seq, TransformerSeq2Seq

# Configuration
Supports $(B,L_{in},C_{in})\xrightarrow{network}(B,L_{out},C_{out})$ operations, where  
$\begin{aligned}
B&=\text{batch\_size}\\
L_{in}&=\text{input\_sequence\_length (variable)}\\
C_{in}&=\text{input\_embedding\_size}\\
L_{out}&=\text{output\_sequence\_length (variable)}\\
C_{out}&=\text{output\_embedding\_size}\\
\end{aligned}$  
- ```hidden_size``` Hidden state size of LSTM encoder. Equivalent to ```d_model``` in ```TransformerSeq2Seq```  
- ```num_layers``` Number of encoder and decoder layers (used by both the LSTM and the Transformer models).
- ```bidirectional``` Whether to use bidirectional LSTM encoder.  
- ```dropout``` Dropout rate. Applies to:  
  - Residual drop path in 1DCNN in ```architecture.cnn```  
  - Hidden state dropout in LSTM encoder/decoder (for every time step). Unlike ```torch.nn.LSTM```, dropout is applied from the first LSTM layer.  
  - The same dropout as in the vanilla Transformer of Vaswani et al.
- ```layernorm``` Layer normalization in LSTM encoder and decoder.  

All network parameters are randomly initialized from $\mathcal{N}(0,0.01^2)$, except for biases, which are initialized with ```torch.zeros```, and normalization layer weights, which are initialized with ```torch.ones```. See ```architecture.init```.


In [2]:
# Data dimensions
batch_size = 32
length_x, length_y = 40, 60  # input sequence length, output label sequence length
input_size, output_size = 27, 6  # input feature size, output feature size

# Network hyperparameters
hidden_size = 128  # LSTM encoder hidden state size; equivalent to d_model in TransformerSeq2Seq
num_layers = 3  # number of LSTM and Transformer encoder, decoder layers
dropout = 0.1  # dropout rate
bidirectional = True  # whether the LSTM encoder is bidirectional
layernorm = True  # layer normalization in LSTM encoder and decoder

# Create dummy input/output

In [3]:
# Seed the global RNG before drawing anything so that a fresh
# "Restart Kernel -> Run All" reproduces the same dummy tensors (and the same
# random state for whatever runs afterwards). The value 0 is arbitrary.
torch.manual_seed(0)

# Dummy network input, shaped (batch_size, length_x, input_size).
x = torch.randn(batch_size, length_x, input_size)
# Dummy output label, shaped (batch_size, length_y, output_size).
y = torch.randn(batch_size, length_y, output_size)

# Forward operation
- ```x``` Input to the network. Supports $(B,L_{in},C_{in})$ only.  
- ```teacher_forcing``` Teacher forcing ratio $\in [0,1]$. Defaults to ```-1```, a value outside this range that selects fully autoregressive decoding.  
- ```y``` Output label for teacher forcing. Supports $(B,*,C_{out})$ only. Defaults to ```None``` (fully autoregressive).  
- ```trg_len``` Target sequence length to generate. Defaults to ```1```.

### LSTM Encoder-Decoder

In [4]:
# Build the plain LSTM encoder-decoder from the values set in the configuration cell.
# NOTE(review): `layernorm` is defined in the configuration cell but is not
# passed to the constructor here — confirm whether that is intentional.
model = LSTMSeq2Seq(
    input_size=input_size, output_size=output_size,
    hidden_size=hidden_size, num_layers=num_layers,
    bidirectional=bidirectional, dropout=dropout,
)

# Call both forward entry points: forward_auto with x and 100, then
# forward_labeled with x and the labels y; print the two output shapes.
out_1, out_2 = model.forward_auto(x, 100), model.forward_labeled(x, y)
print(*(out.shape for out in (out_1, out_2)))

torch.Size([32, 100, 6]) torch.Size([32, 60, 6])


### LSTM Encoder-Decoder with Bahdanau style attention

In [5]:
# Build the attention-equipped LSTM encoder-decoder from the configuration values.
# NOTE(review): `layernorm` is defined in the configuration cell but is not
# passed to the constructor here — confirm whether that is intentional.
model = AttentionLSTMSeq2Seq(
    input_size=input_size, output_size=output_size,
    hidden_size=hidden_size, num_layers=num_layers,
    bidirectional=bidirectional, dropout=dropout,
)

# Call both forward entry points: forward_auto with x and 100, then
# forward_labeled with x and the labels y; print the two output shapes.
out_1, out_2 = model.forward_auto(x, 100), model.forward_labeled(x, y)
print(*(out.shape for out in (out_1, out_2)))

torch.Size([32, 100, 6]) torch.Size([32, 60, 6])


### Vanilla Transformer Encoder-Decoder

In [6]:
# Build the Transformer encoder-decoder; hidden_size from the configuration
# cell is used as d_model, and the feed-forward width is four times that.
model = TransformerSeq2Seq(
    input_size=input_size, output_size=output_size,
    num_layers=num_layers, n_heads=4,
    d_model=hidden_size, d_ff=hidden_size * 4,
    dropout=dropout,
)

# Call both forward entry points: forward_auto with x and 100, then
# forward_labeled with x and the labels y; print the two output shapes.
out_1, out_2 = model.forward_auto(x, 100), model.forward_labeled(x, y)
print(*(out.shape for out in (out_1, out_2)))

torch.Size([32, 100, 6]) torch.Size([32, 60, 6])


# Accessing model properties
- Parameters can be counted by ```model.count_params()```
- Properties are accessed using ```model.model_info``` attribute.  
- Another model instance can be created by ```ModelClass(**model.model_init_args)```.  

These features are provided by the ```architecture.skeleton.Skeleton``` class.

In [7]:
# Report the parameter count of the model currently bound to `model`.
model.count_params()

# Read the descriptive properties and the stored constructor arguments,
# then show the properties.
model_info, model_init_args = model.model_info, model.model_init_args
print(model_info)

# Create a second instance of the same class from the stored constructor arguments.
another_model_instance = TransformerSeq2Seq(**model_init_args)

Number of trainable parameters: 1,393,798
{'bidirectional': True, 'd_ff': 512, 'd_model': 128, 'dropout': 0.1, 'hidden_size': 256, 'input_size': 27, 'layernorm': False, 'n_heads': 4, 'num_hl': 0, 'num_layers': 3, 'output_size': 6}
