(Reference: [The Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html))
<br><br>
## Table of Contents
- [Model Architecture](#scrollTo=7THo9Qx9nVPw)
  - Encoder and Decoder Stacks
    - Encoder
    - Decoder
    - Attention
    - Applications of Attention in our Model
  - Position-wise Feed-Forward Networks
  - Embeddings and Softmax
  - Positional Encoding
  - Full Model

- [Training](#scrollTo=99wO8gdYnX1G)
  - Batches and Masking
  - Training Loop
  - Training Data and Batching
  - Hardware and Schedule
  - Optimizer
  - Regularization
    - Label Smoothing


In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
# Select seaborn's 'talk' plotting-context preset for figures drawn later in the notebook.
seaborn.set_context(context='talk')

( [Table of Contents](#scrollTo=OCXuQtk-lXC0)\ )
## Model Architecture

![ModalNet-21.png](https://nlp.seas.harvard.edu/images/the-annotated-transformer_14_0.png)

In [None]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds five caller-supplied components (encoder, decoder, source and
    target embedders, and a generator) and wires the first four together.
    The generator is only stored here; ``forward`` does not apply it.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator) -> None:
        super().__init__()
        # Assignment order is kept (encoder, decoder, src_embed, tgt_embed,
        # generator) so submodule/parameter ordering is unchanged.
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.tgt_embed = src_embed, tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode the masked source, then decode the masked target against it."
        return self.decode(self.encode(src, src_mask), tgt, src_mask, tgt_mask)

    def encode(self, src, src_mask):
        "Embed ``src`` and run the encoder on it with ``src_mask``."
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, tgt, src_mask, tgt_mask):
        "Embed ``tgt`` and run the decoder on it with ``memory`` and both masks."
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


class Generator(nn.Module):
    "Final generation step: a linear projection followed by log-softmax."
    def __init__(self, d_model, vocab) -> None:
        super().__init__()
        # Linear map from d_model input features to vocab output features.
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Project ``x`` and return log-probabilities over the last dimension."
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

### Encoder and Decoder Stacks

### Full Model

In [None]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Helper: Construct a model from hyperparameters.

    Builds one attention module, one position-wise feed-forward module and
    one positional-encoding module as prototypes, then deep-copies them into
    every place they are used so that no two components share parameters.

    Args:
        src_vocab: Forwarded to the source ``Embeddings`` (presumably the
            source vocabulary size -- confirm against ``Embeddings``).
        tgt_vocab: Forwarded to the target ``Embeddings`` and to ``Generator``
            as its output dimension.
        N: Forwarded to ``Encoder`` and ``Decoder`` (presumably the number of
            stacked layers -- confirm against those classes).
        d_model: Model width shared by attention, feed-forward, positional
            encoding, embeddings and the generator.
        d_ff: Forwarded to ``PositionwiseFeedForward`` (presumably its hidden
            width -- confirm).
        h: Forwarded to ``MultiHeadAttention`` (presumably the head count --
            confirm).
        dropout: Dropout value forwarded to the feed-forward, positional
            encoding, encoder-layer and decoder-layer constructors.

    Returns:
        An ``EncoderDecoder`` instance whose parameters with more than one
        dimension have been Xavier-uniform initialized.
    """
    dpcp = copy.deepcopy
    attn = MultiHeadAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    # Each use site gets its own deep copy of the prototype modules.
    enc_l = EncoderLayer(d_model, dpcp(attn), dpcp(ff), dropout)
    encoder = Encoder(enc_l, N)
    dec_l = DecoderLayer(d_model, dpcp(attn), dpcp(attn), dpcp(ff), dropout)
    decoder = Decoder(dec_l, N)
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), dpcp(position))
    tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), dpcp(position))
    generator = Generator(d_model, tgt_vocab)

    # generate model instance
    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

    # Initialize parameters with Glorot / fan_avg.
    # Only tensors with more than one dimension are touched; 1-D parameters
    # keep their default initialization.
    # `xavier_uniform_` is the supported in-place initializer; the
    # un-suffixed `xavier_uniform` is a deprecated alias that emits a warning.
    for params in model.parameters():
        if params.dim() > 1:
            nn.init.xavier_uniform_(params)
    return model

( [Table of Contents](#scrollTo=OCXuQtk-lXC0)\ )
## Training