In [2]:
import numpy as np

# Sample prompt
prompt = "Once upon a time"

# Simple tokenization by splitting the text into words
tokens = prompt.split()
print("Tokens:", tokens)

# Define a simple vocabulary
vocab = {
    "Once": 0,
    "upon": 1,
    "a": 2,
    "time": 3
}

# Convert tokens to IDs
token_ids = [vocab[token] for token in tokens]
print("Token IDs:", token_ids)

Tokens: ['Once', 'upon', 'a', 'time']
Token IDs: [0, 1, 2, 3]


In [3]:
# Define the embedding matrix (for simplicity, using random initialization)
embedding_dim = 8  # Dimension of the embedding vectors
vocab_size = len(vocab)
embedding_matrix = np.random.rand(vocab_size, embedding_dim)

# Convert token IDs to embeddings
embeddings = np.array([embedding_matrix[token_id] for token_id in token_ids])
print("Embeddings:\n", embeddings)

Embeddings:
 [[0.06377145 0.3878672  0.08188198 0.21533448 0.40588766 0.46398555
  0.05341908 0.85256056]
 [0.31828137 0.31786979 0.18692775 0.92403407 0.72886036 0.75928042
  0.13488984 0.49474832]
 [0.40731337 0.11683353 0.15344824 0.83145745 0.65755591 0.45402358
  0.72777958 0.34526426]
 [0.08423454 0.94702216 0.91144005 0.96876322 0.40983828 0.4385876
  0.40125965 0.81762265]]


In [4]:
embeddings[0]

array([0.06377145, 0.3878672 , 0.08188198, 0.21533448, 0.40588766,
       0.46398555, 0.05341908, 0.85256056])

In [5]:
# Function to get positional encodings
def get_positional_encoding(seq_len, d_model):
    positional_encoding = np.zeros((seq_len, d_model))
    for pos in range(seq_len):
        for i in range(0, d_model, 2):
            positional_encoding[pos, i] = np.sin(pos / (10000 ** ((2 * i) / d_model)))
            if i + 1 < d_model:
                positional_encoding[pos, i + 1] = np.cos(pos / (10000 ** ((2 * i) / d_model)))
    return positional_encoding

# Get positional encodings
positional_encodings = get_positional_encoding(len(token_ids), embedding_dim)

# Add positional encodings to embeddings
final_embeddings = embeddings + positional_encodings
print("Final Embeddings with Positional Encodings:\n", final_embeddings)

Final Embeddings with Positional Encodings:
 [[ 0.06377145  1.3878672   0.08188198  1.21533448  0.40588766  1.46398555
   0.05341908  1.85256056]
 [ 1.15975236  0.8581721   0.19692758  1.92398407  0.72896036  1.75928041
   0.13489084  1.49474832]
 [ 1.3166108  -0.2993133   0.17344691  1.83125745  0.65775591  1.45402356
   0.72778158  1.34526426]
 [ 0.22535454 -0.04297034  0.94143555  1.96831325  0.41013828  1.43858755
   0.40126265  1.81762265]]


In [6]:
# Define the attention mechanism
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

def attention(query, key, value, mask=None):
    scores = np.dot(query, key.T) / np.sqrt(query.shape[-1])
    if mask is not None:
        scores += mask
    weights = softmax(scores)
    return np.dot(weights, value)

# Example weights for the attention mechanism (random initialization for simplicity)
weights_query = np.random.rand(embedding_dim, embedding_dim)
weights_key = np.random.rand(embedding_dim, embedding_dim)
weights_value = np.random.rand(embedding_dim, embedding_dim)

# Create query, key, and value matrices
query = np.dot(final_embeddings, weights_query)
key = np.dot(final_embeddings, weights_key)
value = np.dot(final_embeddings, weights_value)

# Apply the attention mechanism
attention_output = attention(query, key, value)
print("Attention Output:\n", attention_output)

Attention Output:
 [[6.43078014 5.64968091 3.0423906  2.50005347 3.537634   3.14750561
  4.57006251 3.87397084]
 [6.44167115 5.66056728 3.04814643 2.50455417 3.54761774 3.14936102
  4.571603   3.87610748]
 [6.43810953 5.65702969 3.04639234 2.50305936 3.54440886 3.14881096
  4.57106977 3.87532509]
 [6.43512108 5.65407702 3.04492533 2.50183819 3.54169492 3.14843514
  4.57068588 3.87471638]]


In [7]:
attention_output.shape

(4, 8)