In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

In [2]:
vocab = "$" + ''.join(chr(ord('a') + i) for i in range(26))
vocab_size = len(vocab)

char_to_index = {}
index_to_char = {}
for index, char in enumerate(vocab):
    char_to_index[char] = index
    index_to_char[index] = char

In [3]:
# =============================== PREVIOUS CODE ===============================
vocab = "$" + ''.join(chr(ord('a') + i) for i in range(26))
vocab_size = len(vocab)
# Create dictionaries for character to index and index to character
ch_to_i = {char: i for i, char in enumerate(vocab)}
i_to_ch = {i: char for i, char in enumerate(vocab)}

# Define encode and decode functions
encode = lambda word: torch.tensor([ch_to_i[c] for c in word])
decode = lambda tensor_i: ''.join(i_to_ch[i.item()] for i in tensor_i)

# Get data
names = []
with open('../data/p2ch9/names_2022.txt', 'r') as file:
    for line in file:
        name, _, _= line.lower().strip().split(',')
        names.append(name)

# filter out names 10 characters and longer
names = [name for name in names if len(name) < 10]
# Add special character boundary to names
names = ['$' + name + '$' for name in names]

# Length of max name
max_name_length = max(len(name) for name in names)
longest_name = max(names, key=len)
longest_name_index = names.index(longest_name)

# Create a function to get a batch of data
names_index = [torch.tensor([ch_to_i[char] for char in name]) for name in names]
targets_index = [name_index[1:] for name_index in names_index]
names_index[0], targets_index[0]

from torch.nn.utils.rnn import pad_sequence
X = pad_sequence(names_index, batch_first=True, padding_value=0)
# trick to pad Y with with -1 and the same size as X
targets_index.append(X[0])
Y = pad_sequence(targets_index, batch_first=True, padding_value=-1)[:-1]

def get_batch(batch_size=64):
    random_idx = torch.randint(0, X.size(0), (batch_size,))
    batch = X[random_idx]
    labels = Y[random_idx]
    return batch, labels
batch, labels = get_batch()

def train(model, optimizer, num_steps=10_001, loss_report_interval=1_000):
    losses = []
    for i in range(1, num_steps):
        inputs, labels = get_batch()
        optimizer.zero_grad()
        logits = model(inputs)
        loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), labels.view(-1), ignore_index=-1)
        losses.append(loss.item())
        if i % loss_report_interval == 0:
            print(f'Average loss at step {i}: {sum(losses[-loss_report_interval:]) / loss_report_interval:.4f}')
        loss.backward()
        optimizer.step()

def generate_samples(model, num_samples=1, max_len=max_name_length):
    sequences = torch.zeros((num_samples, 1)).int()
    for _ in range(max_len):
        logits = model(sequences)
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        sequences = torch.cat((sequences, idx_next), dim=1)

    for sequence in sequences:
        indices = torch.where(sequence == 0)[0]
        end = indices[1] if len(indices) > 1 else max_len
        sequence = sequence[1:end]
        print(decode(sequence))
# =============================== FINISH PREVIOUS CODE ===============================

In [None]:
all_names_to_indices = []
all_targets_to_indices = []
for name in names:
    name_to_indices = []
    for char in name:
        name_to_indices.append(char_to_index[char])
    all_names_to_indices.append(torch.tensor(name_to_indices))
    all_targets_to_indices.append(torch.tensor(name_to_indices[1:]))

for i in range(5):
    print(f"name {i}: {all_names_to_indices[i]}")
    print(f"target {i}: {all_targets_to_indices[i]}")


In [None]:
from torch.nn.utils.rnn import pad_sequence
X = pad_sequence(all_names_to_indices, batch_first=True)
print(X.shape)
X[:5]

In [None]:
Y = pad_sequence(
    all_targets_to_indices + [X[-1]], 
    batch_first=True, 
    padding_value=-1)
Y = Y[:-1]
print(Y.shape)
Y[:5]

In [8]:
t = torch.arange(0, 24).view(2, 3, 4)
print(t)
sum_t = t.sum(dim=-1)
print(sum_t.shape)
sum_t

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
torch.Size([2, 3])


tensor([[ 6, 22, 38],
        [54, 70, 86]])

In [6]:
t.sum(dim=1)

tensor([[12, 15, 18, 21]])

In [17]:
torch.manual_seed(0)
x = torch.rand(1, 4, 3) # <1>
batch_size, sequence_length, feature_size = x.shape

query = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
key = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
value = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))

def scaled_dot_product_causal_attention(q, k, v):
    # assumes batch dimension is present
    attn_weights = q @ k.transpose(1, 2) # <2>
    # create a mask to prevent the model from attending to future tokens
    mask = torch.tril(torch.ones(attn_weights.shape[1:]), diagonal=0) # <3>
    attn_weights = attn_weights.masked_fill(mask == 0, value=float('-inf'))
    # normalize the attention weights
    attn_weights = attn_weights / torch.sqrt(torch.tensor(k.shape[-1]).float()) # <4>
    attn_weights = F.softmax(attn_weights, dim=-1)
    output = attn_weights @ v
    return output, attn_weights

output, attn_weights = scaled_dot_product_causal_attention(query, key, value)
print(output)

# Add an additional dimension for the heads
num_heads = 2

query = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
key = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
value = F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))

# Split the transformed sequence across multiple heads
query = query.view(x.shape[0], num_heads, x.shape[1] // num_heads, x.shape[2])
key = key.view(x.shape[0], num_heads, x.shape[1] // num_heads, x.shape[2])
value = value.view(x.shape[0], num_heads, x.shape[1] // num_heads, x.shape[2])

def scaled_dot_product_causal_attention(q, k, v):
    # assumes batch dimension is present
    attn_weights = q @ k.transpose(-2, -1)  # <2>
    # create a mask to prevent the model from attending to future tokens
    mask = torch.tril(torch.ones(attn_weights.shape[-2:]), diagonal=0)  # <3>
    attn_weights = attn_weights.masked_fill(mask == 0, value=float('-inf'))
    # normalize the attention weights
    attn_weights = attn_weights / torch.sqrt(torch.tensor(k.shape[-1]).float())  # <4>
    attn_weights = F.softmax(attn_weights, dim=-1)
    output = attn_weights @ v
    return output, attn_weights

output, attn_weights = scaled_dot_product_causal_attention(query, key, value)
# Combine the outputs from all heads
output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, feature_size)
print(output)

expected_output = F.scaled_dot_product_attention(query, key, value, is_causal=True)
expected_output = expected_output.transpose(1, 2).contiguous().view(batch_size, sequence_length, feature_size)
expected_output

tensor([[[1.1238, 1.6097, 1.1210],
         [1.0748, 1.4402, 1.0530],
         [1.2922, 1.6018, 1.1130],
         [1.2236, 1.5888, 1.1344]]])
tensor([[[2.0750, 1.8136, 0.4190],
         [2.5313, 1.9086, 0.4521],
         [2.0095, 1.5816, 0.3130],
         [2.3691, 1.8376, 0.4426]]])


tensor([[[2.0750, 1.8136, 0.4190],
         [2.5313, 1.9086, 0.4521],
         [2.0095, 1.5816, 0.3130],
         [2.3691, 1.8376, 0.4426]]])

In [4]:
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, num_heads=4, n_hidden=64):
        super().__init__()
        assert n_embd % num_heads == 0, "Embedding dimension must be divisible by the number of heads"

        self.num_heads = num_heads
        self.head_dim = n_embd // num_heads

        self.query_proj = nn.Linear(n_embd, n_embd)
        self.key_proj = nn.Linear(n_embd, n_embd)
        self.value_proj = nn.Linear(n_embd, n_embd)

        self.mlp = nn.Sequential(
            nn.Linear(n_embd, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_embd)
        )

        # Layernorms
        self.norm_1 = nn.LayerNorm(n_embd)
        self.norm_2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        batch_size, sequence_length, _ = x.shape

        q = self.query_proj(x)
        k = self.key_proj(x)
        v = self.value_proj(x)

        # multiheaded attention
        q = q.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, sequence_length, self.num_heads, self.head_dim).transpose(1, 2)

        # attention
        attn_weights = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # multiple heads concatenation
        attn_weights = attn_weights.transpose(1, 2).contiguous().view(batch_size, sequence_length, -1)

        # norm and residual connections here
        x = self.norm_1(x + attn_weights)
        x = self.norm_2(x + self.mlp(x))
        return x

In [5]:
class Transformer(nn.Module):
    def __init__(self, n_embd, vocab_size, block_size, num_blocks=6):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, n_embd)
        self.positional_embedding = nn.Embedding(block_size, n_embd)

        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(n_embd) for _ in range(num_blocks)]
        )

        self.output_proj = nn.Linear(n_embd, vocab_size)
    
    def forward(self, x):
        _, seq_len = x.shape

        pos_embd = self.positional_embedding(torch.arange(seq_len)) #  <1>
        char_embd = self.char_embedding(x)
        x = char_embd + pos_embd
        x = self.transformer_blocks(x)
        x = self.output_proj(x)

        return x
    
n_embd = 64
model = Transformer(n_embd, vocab_size, block_size=max_name_length)
optimizer = optim.SGD(model.parameters(), lr=0.1)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
x, y = get_batch()


tensor([0, 1, 3, 9, 5, 0, 0, 0, 0, 0, 0])

In [8]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
writer.add_graph(model, x)
writer.close()

In [10]:
!tensorboard --logdir=runs

TensorFlow installation not found - running with reduced feature set.

[1mSanta[0m

This Application is not yet enabled on this computerFor security, this application must be enabled first before it can be opened.Click "Next Steps ..." if you want to run this application. Otherwise, click "Ignore" to dismiss this message.If you keep seeing this message a minute... ...After allowing, you need to manually sync Santa rules...After ignoring, you have a daemon/cron repeatedly restartingIf you are not sure what this prompt is about, please check this announcement. Learn more about this workflow. If you are still blocked please contact helpdesk from fmenu at top right of your screen or phone

[1mPath:      [0m /Users/howardhuang/Code/my_own/DeepLearningWithPytorch-SecondEdition/.venv/lib/python3.10/site-packages/tensorboard_data_server/bin/server
[1mIdentifier:[0m 186b1eb640607bd4f14d1714e906a2f5b8f0091f9e76a2169476ffdb7f87f5d8
[1mParent:    [0m Python (21900)

More info:
https://www.