# **16.2 셀프 어텐션 메커니즘 소개**
**16.2.1 기본적인 형태의 셀프 어텐션**

In [1]:
import torch

In [2]:
# (word, token id) pairs for the example sentence; only the ids are
# packed into the tensor, the words are kept for readability.
word_ids = [
    ("can", 0),
    ("you", 7),
    ("help", 1),
    ("me", 2),
    ("to", 5),
    ("translate", 6),
    ("this", 4),
    ("sentence", 3),
]
sentence = torch.tensor([token_id for _, token_id in word_ids])
sentence

tensor([0, 7, 1, 2, 5, 6, 4, 3])

In [3]:
# Fix the seed so the randomly initialised embedding table is reproducible.
torch.manual_seed(123)
# Lookup table with 10 entries, each a 16-dimensional vector.
embed = torch.nn.Embedding(num_embeddings=10, embedding_dim=16)
# One embedding row per token id; detach() drops the autograd graph.
embedded_sentence = embed(sentence).detach()

In [4]:
# Display the dimensions of the embedded sentence.
embedded_sentence.size()

torch.Size([8, 16])

In [5]:
# Unnormalised attention scores: omega[i, j] is the dot product of the
# i-th and j-th token embeddings. The matrix size is taken from the input
# rather than hard-coded, so the cell still works for other sentence lengths.
seq_len = embedded_sentence.shape[0]
omega = torch.empty(seq_len, seq_len)
for i, x_i in enumerate(embedded_sentence):
    for j, x_j in enumerate(embedded_sentence):
        omega[i, j] = torch.dot(x_i, x_j)

In [6]:
# Same pairwise dot products as the double loop, as one matrix product.
omega_mat = embedded_sentence @ embedded_sentence.T

In [7]:
# The vectorised scores must match the loop-based ones up to float tolerance.
omega_mat.allclose(omega)

True

In [8]:
import torch.nn.functional as F

# Softmax over the last axis of the score matrix: each row becomes a
# set of attention weights over all tokens.
attention_weights = F.softmax(omega, dim=-1)
attention_weights.size()

torch.Size([8, 8])

In [9]:
# Sum the attention weights along each row (dim=1).
torch.sum(attention_weights, dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [10]:
# Context vector for the second token (index 1): the sum of all token
# embeddings, each scaled by the attention weight that row 1 assigns to it.
x_2 = embedded_sentence[1]
context_vec_2 = torch.zeros(x_2.size())
for weight, x_j in zip(attention_weights[1], embedded_sentence):
    context_vec_2 += weight * x_j
context_vec_2

tensor([-9.3975e-01, -4.6856e-01,  1.0311e+00, -2.8192e-01,  4.9373e-01,
        -1.2896e-02, -2.7327e-01, -7.6358e-01,  1.3958e+00, -9.9543e-01,
        -7.1287e-04,  1.2449e+00, -7.8077e-02,  1.2765e+00, -1.4589e+00,
        -2.1601e+00])

In [11]:
# All context vectors at once: row i is the attention-weighted sum of
# the embeddings using the weights in attention_weights[i].
context_vectors = attention_weights @ embedded_sentence
context_vectors.size()

torch.Size([8, 16])

In [12]:
# Sanity check: the loop-computed context vector for the second token must
# match row 1 of the matrix-product result. (Comparing context_vectors with
# itself would be True regardless and verify nothing.)
torch.allclose(context_vec_2, context_vectors[1])

True

In [13]:
# import numpy as np

# # Input sequence (4 tokens, each of dimension 3)
# X = np.array([
#     [1, 0, 1],   # x1
#     [0, 1, 1],   # x2
#     [1, 1, 0],   # x3
#     [0, 0, 1]    # x4
# ], dtype=float)

# def softmax(v):
#     e = np.exp(v - np.max(v))  # numerically stable softmax
#     return e / e.sum()

# i = 0  # compute attention for x1

# # 1. Score computation (x1^T * xj for all j)
# scores = np.dot(X, X[i])   # (4,)
# print("점수:", scores)

# # 2. Softmax normalization
# attn_weights = softmax(scores)  # (4,)
# print("어텐션 가중치:", attn_weights)

# # 3. Weighted sum
# Z_i = np.sum(attn_weights[:, None] * X, axis=0)
# print("최종 Z(1):", Z_i)

# print("원래 X(1):", X[i])

**16.2.2 훈련 가능한 셀프 어텐션 메커니즘: 스케일드 점곱 어텐션**

In [32]:
# Reseed, then draw the query, key and value projection matrices
# (each d x d) in that order so the random values are reproducible.
torch.manual_seed(123)
d = embedded_sentence.size(1)
U_query, U_key, U_value = (torch.rand(d, d) for _ in range(3))
# Show the embedding of the second token for reference.
embedded_sentence[1]

tensor([-9.4053e-01, -4.6806e-01,  1.0322e+00, -2.8300e-01,  4.9275e-01,
        -1.4078e-02, -2.7466e-01, -7.6409e-01,  1.3966e+00, -9.9491e-01,
        -1.5822e-03,  1.2471e+00, -7.7105e-02,  1.2774e+00, -1.4596e+00,
        -2.1595e+00])

In [33]:
# Embedding of the second token (index 1). Binding it to x_2 here keeps the
# cell self-contained instead of relying on an x_2 left over from an
# earlier cell.
x_2 = embedded_sentence[1]
# Project the token embedding into query space.
query_2 = U_query.matmul(x_2)

In [34]:
# Print the shapes involved in the query projection, one per line.
for tensor in (embedded_sentence, x_2, U_query, query_2):
    print(tensor.shape)

torch.Size([8, 16])
torch.Size([16])
torch.Size([16, 16])
torch.Size([16])


In [35]:
# Key and value projections of the second token's embedding.
key_2 = U_key @ x_2
value_2 = U_value @ x_2

In [36]:
# Project every token at once: multiply by the transposed embeddings
# (one token per column), then transpose back so row i belongs to token i.
keys = torch.matmul(U_key, embedded_sentence.T).T
values = torch.matmul(U_value, embedded_sentence.T).T

In [37]:
# Both batched projections must agree with the single-token results.
# Combined into one expression because a notebook cell only displays its
# last expression, so a separate first check would be silently discarded.
torch.allclose(key_2, keys[1]) and torch.allclose(value_2, values[1])

True

In [38]:
# Unnormalised attention score between the second token's query and the
# third token's key.
omega_23 = torch.dot(query_2, keys[2])
omega_23

tensor(14.3667)

In [39]:
# keys[2] is a 1-D tensor, so no transpose is needed for the dot product;
# `.T` on tensors that are not 2-D is deprecated in PyTorch and would be a
# no-op here anyway.
omega_23 = query_2.dot(keys[2])
omega_23

tensor(14.3667)

In [40]:
# Scores of the second token's query against every key in the sentence.
omega_2 = query_2 @ keys.T
omega_2

tensor([-25.1623,   9.3602,  14.3667,  32.1482,  53.8976,  46.6626,  -1.2131,
        -32.9392])

In [41]:
# Scaled dot-product attention: divide the scores by sqrt(d) before the
# softmax so the weights for the second token sum to one.
scale = d ** 0.5
attention_weights_2 = F.softmax(omega_2 / scale, dim=0)
attention_weights_2

tensor([2.2317e-09, 1.2499e-05, 4.3696e-05, 3.7242e-03, 8.5596e-01, 1.4026e-01,
        8.8897e-07, 3.1935e-10])

In [42]:
# Context vector of the second token: attention-weighted sum of the values.
context_vector_2 = attention_weights_2 @ values
context_vector_2

tensor([-1.2226, -3.4387, -4.3928, -5.2125, -1.1249, -3.3041, -1.4316, -3.2765,
        -2.5114, -2.6105, -1.5793, -2.8433, -2.4142, -0.3998, -1.9917, -3.3499])

**16.3.1 멀티 헤드 어텐션으로 문맥 임베딩 인코딩하기**

In [45]:
# Reseed, then draw a single query projection of shape (dk, d), mapping
# d-dimensional embeddings to dk-dimensional queries.
torch.manual_seed(123)
d = embedded_sentence.size(1)
dk, dv = 32, 64
one_U_query = torch.rand(dk, d)

In [46]:
# One projection matrix per head, stacked along the first axis; drawn in
# query, key, value order. Queries and keys use dk rows, values use dv rows.
h = 8
multihead_U_query, multihead_U_key, multihead_U_value = (
    torch.rand(h, rows, d) for rows in (dk, dk, dv)
)