## Transformer

In [None]:
# Environment setup: switch into the project directory and extend the import path.
%cd /playground/sgd_deep_learning/sgd_nlp/
import sys 
# './python' is resolved relative to the directory selected by %cd above;
# presumably this is where the sgd_nlp package imported below lives — confirm.
sys.path.append('./python')

In [None]:
import torch
from torch import nn
from sgd_nlp.transformer import MultiheadAttention

### 模块1: MultiheadAttention

In [None]:
# Problem size: N sequences of length T, model width d, `heads` attention heads.
heads, N = 4, 10
T, d = 100, 64
# Additive causal mask: -inf strictly above the diagonal, 0 on and below it.
M = torch.full((T, T), float("-inf")).triu(diagonal=1)
X = torch.randn(N, T, d)

# Reference implementation from PyTorch (self-attention on X).
attn_ = nn.MultiheadAttention(d, heads, bias=False, batch_first=True)
Y_, A_ = attn_(X, X, X, attn_mask=M)

# sgd_nlp implementation.
attn = MultiheadAttention(d, heads, bias=False)
# Copy the reference weights so both modules compute with identical parameters.
# The stacked input projection is cut into three (d x d) chunks along dim 0 and
# loaded in the order proj_k, proj_q, proj_v.
# NOTE(review): PyTorch documents the packing of in_proj_weight as q, k, v, so this
# order presumably relies on how sgd_nlp names/uses proj_k and proj_q — confirm
# against sgd_nlp.transformer.
chunk_0, chunk_1, chunk_2 = attn_.in_proj_weight.detach().split(d, dim=0)
attn.proj_k.weight.data = chunk_0
attn.proj_q.weight.data = chunk_1
attn.proj_v.weight.data = chunk_2
attn.out_proj.weight.data = attn_.out_proj.weight.detach()

Y, A = attn(X, X, X, attn_mask=M)

In [None]:
# Shapes of the outputs and attention maps from both implementations.
for ours, ref in ((Y, Y_), (A, A_)):
    print(ours.shape, ref.shape)

# Distance between the two implementations (values near 0 mean they agree).
# A is reduced with mean(1) before being compared against PyTorch's A_.
for ours, ref in ((Y, Y_), (A.mean(1), A_)):
    print(torch.dist(ours.detach(), ref.detach()))

### 模块2: TransformerEncoder

In [None]:
from sgd_nlp.transformer import TransformerEncoder, TransformerEncoderLayer

In [None]:
# Shape smoke test for a single encoder layer and for a stack of encoder layers.
heads, num_layers = 4, 2

N, T, d = 10, 100, 64
# Causal mask; defined here but not passed to the encoder calls in this cell.
M = torch.full((T, T), float("-inf")).triu(diagonal=1)
X = torch.randn(N, T, d)

encoder_kwargs = dict(d_model=d, nhead=heads, dim_feedforward=128)

# One layer.
encoder_layer = TransformerEncoderLayer(**encoder_kwargs)
out_layer = encoder_layer(X)
print("encoder layer output", out_layer.shape)

# A stack of `num_layers` layers.
encoder = TransformerEncoder(num_layers, **encoder_kwargs)
out_encoder = encoder(X)
print("encoder stack output", out_encoder.shape)

### 模块3: TransformerDecoder

In [None]:
from sgd_nlp.transformer import TransformerDecoder, TransformerDecoderLayer

In [None]:
# Shape smoke test for the decoder; reuses X, out_encoder, T, d, heads and
# num_layers from the encoder cell.
# Causal target mask: -inf strictly above the diagonal, 0 elsewhere.
tgt_mask = torch.full((T, T), float("-inf")).triu(diagonal=1)

decoder_kwargs = dict(d_model=d, nhead=heads, dim_feedforward=128)

# One layer: (target input, encoder output, target mask).
decoder_layer = TransformerDecoderLayer(**decoder_kwargs)
out_layer = decoder_layer(X, out_encoder, tgt_mask)
print("decoder layer output", out_layer.shape)

# A stack of `num_layers` layers.
decoder = TransformerDecoder(num_layers, **decoder_kwargs)
out_decoder = decoder(X, out_encoder, tgt_mask)
print("decoder stack output", out_decoder.shape)

### 模块4: 位置编码
模拟二进制

位置  0    1    2     3     4     5    6   7

编码  000  001  010   011   100   101  110 111


行代表词元在序列中的位置，列代表位置编码的不同维度。
每个列维度使用不同的频率，且频率随维度升高依次降低，类似于二进制编码中高位数字变化较慢（例如 100 的最高位）。
对于每个位置（行），各列维度交替使用 sin 和 cos 来生成对应的具体数值。

In [None]:
import matplotlib.pyplot as plt
from sgd_nlp.transformer import PositionalEncoding

# Build a positional encoding and plot a few of its columns against position.
batchs, num_steps, encoding_dim = 1, 200, 31
# NOTE(review): encoding_dim is odd — confirm PositionalEncoding supports odd sizes.
pos_encoding = PositionalEncoding(num_hiddens=encoding_dim, max_len=num_steps)
pos_encoding.eval()  # presumably disables any dropout inside the module — confirm

# An all-zero input is used so that the output should show the encoding itself
# (assumes the module adds the encoding to its input — confirm).
X = torch.zeros((batchs, num_steps, encoding_dim))
xe = pos_encoding(X)
print(xe.shape)

# x axis: position in the sequence; y axis: value of the selected encoding column.
xx = torch.arange(num_steps)
plotted_dims = (4, 10, 12, 20)
yy1, yy2, yy3, yy4 = (xe[0, :, dim].reshape(-1) for dim in plotted_dims)

# Label every curve so the figure can be read on its own.
fig, ax = plt.subplots()
for dim, yy in zip(plotted_dims, (yy1, yy2, yy3, yy4)):
    ax.plot(xx, yy, label=f"dim {dim}")
ax.set(xlabel="position", ylabel="encoding value",
       title="Positional encoding by dimension")
ax.legend()
plt.show()


### 模块整合 Transformer

In [None]:
from sgd_nlp.transformer import Transformer

# Shape smoke test for the full encoder-decoder Transformer.
heads = 4
N, T, d = 10, 100, 64
T_ = 60  # target sequence length (the source length is T)

# M and X are created here but are not passed to the Transformer call in this cell.
M = torch.full((T, T), float("-inf")).triu(diagonal=1)
X = torch.randn(N, T, d)

# Random source / target batches of shape (N, T, d) and (N, T_, d).
src, tgt = torch.randn(N, T, d), torch.randn(N, T_, d)

transformer = Transformer(
    d_model=d,
    nhead=heads,
    num_encoder_layers=6,
    num_decoder_layers=4,
    dim_feedforward=d * 4,
    dropout=0.1,
)
out = transformer(src, tgt)
print("transformer out", out.shape)

### 训练语言模型

使用transformer encoder训练语言模型（类似 ./apps/RNN/word_level_language_model.ipynb）

In [None]:
# Download the datasets
import urllib.request
import os

# Download the Penn Treebank (PTB) splits into ./data/ptb, skipping files that
# are already present.
ptb_dir = './data/ptb'
os.makedirs(ptb_dir, exist_ok=True)  # pure-Python equivalent of `mkdir -p`

# Downloading raw files from GitHub can fail on some networks; in that case
# fetch the three files manually and place them in ./data/ptb.
# (The github.com/.../blob/... URL serves an HTML page, so the raw URL is used.)
ptb_data = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb."
for f in ['train.txt', 'test.txt', 'valid.txt']:
    target = os.path.join(ptb_dir, f)
    if os.path.exists(target):
        continue
    print(ptb_data + f)
    # Download to a temporary name and rename only on success, so an interrupted
    # download never leaves a truncated file that the existence check above
    # would accept on the next run.
    partial = target + '.part'
    try:
        urllib.request.urlretrieve(ptb_data + f, partial)
        os.replace(partial, target)
    finally:
        if os.path.exists(partial):
            os.remove(partial)

In [None]:
import torch
from torch import nn

import sgd_nlp
import numpy as np

from sgd_nlp.simple_training import train_ptb, evaluate_ptb

In [None]:
from sgd_nlp.transformer import PositionalEncoding

class LanguageModel(nn.Module):
    """Word-level language model built on the sgd_nlp TransformerEncoder.

    Pipeline: token embedding -> positional encoding -> encoder stack ->
    linear projection to `output_size` scores per token.
    """

    def __init__(self, d_model, n_heads, output_size,  num_encoder_layers= 6,
                num_decoder_layers=4, dim_feedforward = 1024, dropout=0.1,
                device=None, dtype=torch.float32):
        """
        Args:
            d_model: embedding / model width.
            n_heads: forwarded to the encoder as `nhead`.
            output_size: size of the embedding table and of the output layer.
            num_encoder_layers: number of encoder layers.
            num_decoder_layers: accepted but not used by this class.
            dim_feedforward: forwarded to the encoder.
            dropout: forwarded to the positional encoding and the encoder.
            device, dtype: forwarded to the embedding, the encoder and the
                output layer (not to the positional encoding).
        """
        super().__init__()
        placement = dict(device=device, dtype=dtype)

        self.pos_emb = PositionalEncoding(d_model, dropout=dropout)
        self.emb = nn.Embedding(output_size, d_model, **placement)
        self.d_model = d_model

        self.seq_model = TransformerEncoder(
            num_layers=num_encoder_layers,
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            **placement,
        )

        self.linear = nn.Linear(d_model, output_size, **placement)

    def forward(self, x, h=None):
        """
        Score every position of a batch of token sequences.

        Inputs:
        x of shape (seq_len, bs), token indices
        h is accepted but ignored
        Returns:
        tensor of shape (seq_len*bs, output_size); rows are ordered
        time step first, then batch element
        """
        seq_len, batch = x.shape
        # The model works batch-first, the data arrives time-first.
        tokens = x.transpose(0, 1)  # [B,T]
        src = self.pos_emb(self.emb(tokens))  # [B,T,C]
        # NOTE(review): no attention mask is passed here, so unless the encoder
        # masks internally each position may attend to later tokens — confirm
        # this is intended for next-word prediction.
        encoded = self.seq_model(src)  # [B,T,C]
        time_major = encoded.transpose(0, 1)  # [T,B,C]

        flat = time_major.reshape(seq_len * batch, self.d_model)
        return self.linear(flat)  # [T*B, C_out]

In [None]:
# 官方的实现

from sgd_nlp.transformer import PositionalEncoding

class LanguageModel_Official(nn.Module):
    """Word-level language model built from PyTorch's nn.TransformerEncoderLayer.

    Pipeline: token embedding -> positional encoding -> stacked encoder layers
    (with a causal attention mask) -> linear projection to `output_size` scores.
    """

    def __init__(self, d_model, n_heads, output_size,  num_encoder_layers= 6,
                num_decoder_layers=4, dim_feedforward = 1024, dropout=0.1,
                device=None, dtype=torch.float32):
        """
        Args:
            d_model: embedding / model width.
            n_heads: number of attention heads per layer.
            output_size: size of the embedding table and of the output layer.
            num_encoder_layers: number of stacked encoder layers.
            num_decoder_layers: accepted but not used by this class.
            dim_feedforward: hidden width of each layer's feed-forward block.
            dropout: forwarded to the positional encoding and every layer.
            device, dtype: forwarded to the embedding, the layers and the
                output layer (not to the positional encoding).
        """
        factory_kwargs = {'device': device, 'dtype': dtype}
        super(LanguageModel_Official, self).__init__()

        self.pos_emb = PositionalEncoding(d_model, dropout=dropout)
        self.emb = nn.Embedding(num_embeddings=output_size, embedding_dim=d_model, **factory_kwargs)
        self.d_model = d_model

        layer_factory_kwargs = {'d_model':d_model, 'nhead':n_heads, 'dim_feedforward':dim_feedforward,
                    'dropout':dropout, 'device': device, 'dtype': dtype}

        # The layers are stacked by hand in an nn.Sequential (rather than wrapped
        # in nn.TransformerEncoder); forward() iterates over them so that the
        # attention mask can be passed to each layer.
        self.seq_model = nn.Sequential(*[nn.TransformerEncoderLayer(batch_first=True, **layer_factory_kwargs) for _ in range(num_encoder_layers)])

        self.linear = nn.Linear(d_model, output_size, **factory_kwargs)

    def forward(self, x, h=None):
        """
        Score every position of a batch of token sequences.

        Inputs:
        x of shape (seq_len, bs), token indices
        h is accepted but ignored
        Returns:
        tensor of shape (seq_len*bs, output_size); rows are ordered
        time step first, then batch element
        """
        T, B = x.shape
        # The layers are batch-first, the data arrives time-first.
        x = x.swapaxes(0, 1) # [B,T]
        out = self.pos_emb(self.emb(x)) #[B,T,C]

        # Causal mask (-inf strictly above the diagonal): position t may only
        # attend to positions <= t. Without it the model can read token t+1,
        # which is exactly the next-word target it is trained to predict.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), device=out.device, dtype=out.dtype),
            diagonal=1,
        )
        # nn.Sequential.__call__ cannot forward extra arguments, so apply the
        # layers one by one.
        for layer in self.seq_model:
            out = layer(out, src_mask=causal_mask) #[B,T,C]
        out = out.swapaxes(0, 1) #[T,B,C]

        y =  self.linear(out.reshape((T*B, self.d_model))) #[T*B, C_out]
        return y

In [None]:
# 定义具体训练过程

from sgd_nlp.data import get_batch
from sgd_nlp.simple_training import accuracy

### PTB training ###
def epoch_transformer(data, model, seq_len, loss_fn=nn.CrossEntropyLoss(), opt=None,
        clip=None, device=None, dtype=None):
    """
    Run one pass over `data`, training when an optimizer is given and
    evaluating otherwise.

    If `opt` is not None the model is put in train mode and its parameters are
    updated after every batch. If `opt` is None the model is put in eval mode
    and the forward passes run under torch.no_grad().

    Args:
        data: data of shape (nbatch, batch_size) given from batchify function
        model: LanguageModel instance
        seq_len: i.e. bptt, sequence length
        loss_fn: callable mapping (model output, targets) to a scalar loss
        opt: Optimizer instance (optional)
        clip: max norm of gradients (optional). Applied through
            `opt.clip_grad_norm` when the optimizer provides it, otherwise
            through torch.nn.utils.clip_grad_norm_.
        device, dtype: forwarded to get_batch

    Returns:
        avg_acc: accumulated `accuracy(out, batch_y)` values divided by the
            total number of target samples
        avg_loss: average loss per target sample over the dataset
    """
    np.random.seed(4)
    training = opt is not None
    if training:
        model.train()
    else:
        model.eval()
    nbatch, batch_size = data.shape
    accum_loss = 0
    accum_acc = 0
    sum_samples = 0

    # A mean-reduced loss must be weighted by the batch's sample count before it
    # is accumulated, otherwise dividing by sum_samples at the end shrinks the
    # reported loss by roughly the batch size. A sum-reduced loss is already a
    # per-batch total. Plain functions without a `reduction` attribute are
    # treated as mean-reduced.
    loss_is_sum = getattr(loss_fn, 'reduction', 'mean') == 'sum'

    for i in range(0, nbatch - 1, seq_len):
        batch_x, batch_y = get_batch(data, i, seq_len, device=device, dtype=dtype)
        n_samples = batch_y.shape[0]
        sum_samples += n_samples

        if training:
            opt.zero_grad()
            out = model(batch_x)
            loss = loss_fn(out, batch_y)
            loss.backward()
            if getattr(opt, 'clip_grad_norm', None) is not None:
                # Optimizers that expose their own clipping method.
                if clip is not None:
                    opt.clip_grad_norm(clip)
                else:
                    opt.clip_grad_norm()
            elif clip is not None:
                # torch.optim optimizers have no clip_grad_norm method; without
                # this fallback a requested `clip` would be silently ignored.
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()
        else:
            # No gradients are needed for evaluation; skip building the graph.
            with torch.no_grad():
                out = model(batch_x)
                loss = loss_fn(out, batch_y)

        cur_batch_loss = loss.detach()
        cur_batch_succ = accuracy(out, batch_y)
        accum_loss += cur_batch_loss if loss_is_sum else cur_batch_loss * n_samples
        accum_acc += cur_batch_succ
    return accum_acc*(1.0/sum_samples), accum_loss * (1.0/sum_samples)

In [None]:
# Training hyper-parameters
# Prefer the first GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA instead of failing at the first tensor allocation.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

n_heads = 8
num_layers=2 # number of Transformer layers
n_epochs=100 # number of passes over the data

# embedding_size=400 # word embedding dimension
d_model = 1024  # hidden dim

seq_len = 21 # sequence truncation length (same role as the truncated-BPTT length)
batch_size = 256 # number of sequences per batch

optimizer=torch.optim.Adam
lr=1e-4 # learning rate
weight_decay=0
loss_fn=torch.nn.CrossEntropyLoss()

# Load the training data
corpus = sgd_nlp.data.Corpus("data/ptb", max_lines=2000)
train_data = sgd_nlp.data.batchify(corpus.train, batch_size=batch_size, device=device, dtype=np.float32)
print(train_data.shape)


# To train the variant built from PyTorch layers instead, construct
# LanguageModel_Official with the same keyword arguments.
model = LanguageModel(d_model=d_model,
                    n_heads=n_heads,
                    output_size=len(corpus.dictionary),
                    num_encoder_layers= num_layers,
                    num_decoder_layers= num_layers,
                    dim_feedforward = d_model*2,
                    dropout=0.1,
                    device=device,
                    dtype=torch.float32)

print('---------------')


In [None]:
# Arguments shared by the training and the evaluation call.
shared_args = dict(seq_len=seq_len, device=device, epoch_func=epoch_transformer)

# Train the model with the hyper-parameters configured above.
train_ptb(
    model,
    train_data,
    n_epochs=n_epochs,
    optimizer=optimizer,
    lr=lr,
    weight_decay=weight_decay,
    loss_fn=loss_fn,
    **shared_args,
)

# Note: evaluation runs on the same train_data that was used for training.
evaluate_ptb(model, train_data, **shared_args)

实验小结：

使用时注意 BTC的顺序是否满足
加上positional encoding，模型收敛速度要快得多

整体的计算效率确实要比RNN高，但两者的计算复杂度不同：RNN 为 $O(Td^2)$，而自注意力为 $O(T^2d)$。
如果涉及非常长的时间序列依赖，T 会远超 d，此时自注意力的开销反而更大。d 一般取 1024、2048，再大就很难算了。

当初写Transformer、bert的人想不到后续能有这么大的影响力。
Adam 的学习率取 1e-4；学习率太大时 loss 降不下去。

In [None]:
# Scratch cell: prints the four values separated by spaces.
print(1, 2, 3, 4)

完整的transformer seq2seq训练过程，参考./apps/Transformer/pos_tagging.ipynb