# import


In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import math

In [2]:
# Draw a 4x4 tensor of uniform samples from [0, 1) and print it
# (quick check that torch can create tensors in this environment).
random_torch = torch.rand((4, 4))
print(random_torch)

tensor([[0.5580, 0.4346, 0.6972, 0.3066],
        [0.2133, 0.8577, 0.0920, 0.9061],
        [0.8073, 0.1437, 0.0165, 0.9705],
        [0.0129, 0.6599, 0.1882, 0.1539]])


In [3]:
import sys

# Show which interpreter is running this notebook: version string first,
# executable path on the following line.
print(sys.version, sys.executable, sep="\n")

3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]
d:\ProgramData\anaconda3\envs\didl\python.exe


# Embedding


In [4]:
from torch import Tensor


class TokenEmbedding(nn.Embedding):
    """Map vocabulary indices to dense vectors of size ``d_model``.

    Thin subclass of ``nn.Embedding`` that fixes the padding index.

    Args:
        vocab_size: Number of entries in the vocabulary (rows of the table).
        d_model: Dimension of each embedding vector.
        padding_idx: Index of the padding token. Defaults to 1, the value
            that was previously hard-coded. Per ``nn.Embedding``, that row is
            initialised to zeros and receives no gradient updates.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1) -> None:
        super(TokenEmbedding, self).__init__(
            vocab_size, d_model, padding_idx=padding_idx
        )

In [5]:
class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding table.

    The table has shape ``(max_len, d_model)``. Even columns hold
    ``sin(pos / 10000 ** (2i / d_model))`` and odd columns hold the matching
    ``cos`` term. The table is a plain tensor attribute (not a parameter), so
    it is never trained.

    Args:
        d_model: Width of each positional vector.
        max_len: Number of positions to precompute.
        device: Device on which the table is allocated.
    """

    def __init__(self, d_model, max_len, device) -> None:
        super(PositionalEmbedding, self).__init__()

        self.encoding = torch.zeros(max_len, d_model, device=device)
        self.encoding.requires_grad = False

        # Column vector of positions, shape (max_len, 1).
        # Fix: the original called the non-existent ``torch.float()`` here.
        pos = torch.arange(0, max_len, device=device)
        pos = pos.float().unsqueeze(dim=1)

        # Even column indices 0, 2, 4, ... used as the exponent numerator.
        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # Broadcasts to shape (max_len, ceil(d_model / 2)).
        angle = pos / (10000 ** (_2i / d_model))

        self.encoding[:, 0::2] = torch.sin(angle)
        # With an odd d_model there is one fewer odd column than even ones,
        # so trim the angles to the number of odd columns.
        self.encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])

    def forward(self, x):
        """Return the encodings for the first ``seq_len`` positions.

        Args:
            x: 2-D tensor of shape ``(batch_size, seq_len)``; only its shape
                is read.

        Returns:
            Tensor of shape ``(seq_len, d_model)`` sliced from the table.
        """
        _, seq_len = x.size()
        return self.encoding[:seq_len, :]

In [6]:
class TransformerEmbedding(nn.Module):
    """Sum of token and positional embeddings, followed by dropout.

    Args:
        vocab_size: Vocabulary size passed to ``TokenEmbedding``.
        d_model: Embedding width shared by both sub-embeddings.
        max_len: Number of positions passed to ``PositionalEmbedding``.
        drop_prob: Dropout probability applied to the summed embedding.
        device: Device passed to ``PositionalEmbedding``.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEmbedding(d_model, max_len, device)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """Embed ``x`` with both sub-modules, add the results, apply dropout."""
        combined = self.tok_emb(x) + self.pos_emb(x)
        return self.drop_out(combined)

# Multi-Head Attention


In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into ``n_head``
    heads of width ``d_model // n_head``, attended per head, then merged and
    passed through a final linear layer.

    Args:
        d_model: Width of the input and output features.
        n_head: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, d_model, n_head) -> None:
        super(MultiHeadAttention, self).__init__()

        # Fail early with a clear message instead of a reshape error later.
        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )

        self.n_head = n_head
        self.d_model = d_model

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        """Compute attention of ``q`` over ``k``/``v``.

        Args:
            q: Query tensor of shape ``(batch, q_len, d_model)``.
            k: Key tensor of shape ``(batch, k_len, d_model)``.
            v: Value tensor of shape ``(batch, k_len, d_model)``.
            mask: Optional mask. A tensor is applied as given: it must
                broadcast to ``(batch, n_head, q_len, k_len)``, and positions
                where it equals 0 are suppressed. Any other non-``None`` value
                selects a lower-triangular (causal) mask, which was the only
                behaviour available before.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch, q_len, dimension = q.shape
        # Keys/values may have a different length than queries (cross-attention).
        k_len = k.size(1)
        n_d = self.d_model // self.n_head

        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # Split the feature dimension into n_head * n_d and swap the time and
        # head axes, giving (batch, n_head, time, n_d).
        q = q.view(batch, q_len, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, k_len, self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, k_len, self.n_head, n_d).permute(0, 2, 1, 3)

        # Transpose the last two axes of k: scores are (batch, n_head, q_len, k_len).
        score = q @ k.transpose(2, 3) / math.sqrt(n_d)

        if mask is not None:
            if torch.is_tensor(mask):
                # Honour the caller's mask; keep it on the same device as the scores.
                mask = mask.to(score.device)
            else:
                # Non-tensor flag: build the causal mask on the scores' device.
                mask = torch.tril(
                    torch.ones(q_len, k_len, dtype=torch.bool, device=score.device)
                )
            # Large negative fill instead of float('-inf').
            score = score.masked_fill(mask == 0, -10000)

        score = self.softmax(score) @ v

        # Merge the heads back into (batch, q_len, d_model).
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, q_len, dimension)
        output = self.w_combine(score)
        return output

In [8]:
# Smoke test: self-attention over a random batch, using the same tensor as
# query, key and value.
d_model = 512
n_head = 8
x = torch.randn(128, 64, d_model)  # (batch, time, dimension)
print(x.shape)
attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
out = attention(q=x, k=x, v=x)
print(out)

torch.Size([128, 64, 512])
tensor([[[ 0.0751, -0.0150,  0.0540,  ..., -0.0519,  0.0027,  0.0660],
         [ 0.1109, -0.0110,  0.0643,  ..., -0.0395, -0.0152,  0.0795],
         [ 0.0807,  0.0110,  0.0320,  ..., -0.0340, -0.0156,  0.0649],
         ...,
         [ 0.0918, -0.0204,  0.0507,  ..., -0.0828,  0.0026,  0.0800],
         [ 0.0912, -0.0044,  0.0434,  ..., -0.0449, -0.0045,  0.0481],
         [ 0.1021, -0.0036,  0.0480,  ..., -0.0416, -0.0109,  0.0660]],

        [[ 0.0714, -0.0472,  0.0120,  ...,  0.0563, -0.0239,  0.0662],
         [ 0.0714, -0.0579, -0.0357,  ...,  0.0893, -0.0078,  0.0600],
         [ 0.0994, -0.0387, -0.0058,  ...,  0.0652, -0.0308,  0.0486],
         ...,
         [ 0.0659, -0.0595, -0.0206,  ...,  0.0566, -0.0110,  0.0203],
         [ 0.0929, -0.0305, -0.0274,  ...,  0.0717, -0.0372,  0.0535],
         [ 0.0776, -0.0703,  0.0142,  ...,  0.0877, -0.0025,  0.0265]],

        [[-0.0139,  0.0194,  0.0465,  ...,  0.0348, -0.0694,  0.0988],
         [-0.0285,

In [9]:
import numpy as np

# Define the softmax function


def softmax(x, axis=None):
    """Numerically stable softmax.

    Args:
        x: Array-like of scores.
        axis: Axis along which to normalise. ``None`` (the default)
            normalises over all elements, matching the original behaviour;
            pass e.g. ``-1`` to normalise each row of a batch independently.

    Returns:
        ``numpy.ndarray`` with the same shape as ``x`` whose entries along
        ``axis`` sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the maximum keeps exp() from overflowing without changing
    # the result; keepdims lets the reduction broadcast back against x.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)


# Example

x = np.asarray([2.0, 1.0, 0.1])
print(softmax(x))  # prints: [0.65900114 0.24243297 0.09856589]

[0.65900114 0.24243297 0.09856589]
