## seq

In [54]:
import torch

# Build dataset
names = ["S" + name.strip().lower() + "E" for name in open("data/names.txt", "r").readlines()]
names = list(filter(lambda x: "-" not in x, names))
alphabet = {chr(i): i - 97 for i in range(97, 123)}
alphabet[" "] = 26; alphabet["S"] = 27; alphabet["E"] = 28
rev_alphabet = {v: k for k, v in alphabet.items()}

# Convert names to tensors
names = [torch.tensor([alphabet[char] for char in name]) for name in names]

counts = torch.zeros(len(alphabet), len(alphabet), dtype=torch.float)
for name in names:
    for bigram in zip(name, name[1:]):
        counts[bigram[0], bigram[1]] += 1

# Normalize the rows of counts
counts = counts / counts.sum(dim=1, keepdim=True)

In [55]:
def name_prob(name: list[int]) -> float:
    prob = 1.0
    for bigram in zip(name, name[1:]):
        prob *= counts[bigram[0], bigram[1]].item()
    return prob

def name_log_prob(name: list[int]) -> float:
    bigrams = zip(name, name[1:])
    # Vectorize
    return -torch.sum(torch.log(0.01 + counts[name[:-1], name[1:]])).item()

In [56]:
def generate_name_bigram(counts: torch.TensorType):
    name = [alphabet["S"]]
    while name[-1] != alphabet["E"]:
        row = counts[name[-1], :]
        next_letter_idx = torch.multinomial(row, 1).item()
        name.append(next_letter_idx)
    return name[1:-1]

def display_name(name: list[int]):
    return "".join([rev_alphabet[idx] for idx in name])

for i in range(5):
    name = generate_name_bigram(counts)
    print(display_name(name))
    print(name_log_prob(name))
print("\n=====REAL NAMES=====")
for i in range(5):
    print(display_name(names[i].tolist()))
    print(name_log_prob(names[i]))

kiled
8.731794357299805
ge
1.620269775390625
gy
3.860867977142334
pardofoun
19.488985061645508
st
1.675158977508545

=====REAL NAMES=====
SaamirE
16.701412200927734
SaaronE
12.982748031616211
SabbeyE
13.205779075622559
SabbieE
14.282242774963379
SabbotE
15.989633560180664


In [57]:
# Figure out the counts using NN
from torch import nn
class BigramModel(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        # self.embedding = nn.Embedding(vocab_size, hidden_size) # Would use embedding if you had a larger vocab
        self.linear = nn.Linear(vocab_size, vocab_size)
        self.softmax = nn.Softmax(dim=0) # Why dim=1?

    def forward(self, input: torch.TensorType) -> torch.TensorType:
        output = self.linear(input)
        return self.softmax(output)

def generate_name_nn(model: BigramModel):
    name = [alphabet["S"]]
    i = 0
    while name[-1] != alphabet["E"] and i < 20:
        # One-hot encoding of last letter
        input = torch.zeros(model.vocab_size)
        input[name[-1]] += 1
        # Outputs probabilities row
        probs = model.forward(input)
        # Sample next letter
        next_letter_idx = torch.multinomial(probs, 1).item()
        name.append(next_letter_idx)
        i += 1

    return name[1:-1]

model = BigramModel(len(alphabet))
for i in range(5):
    name = generate_name_nn(model)
    print(display_name(name))
    print(name_log_prob(name))

qphvmfosjfmg
43.074161529541016
ys
3.807882785797119
nmrsuzgyx jfypfvezn
nan
sh abqrqprnkirrzgvw
nan
npcrbwybjiebo ttxge
nan


In [58]:
# Train the model!
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.01)

for name in names:
    # Reset gradient
    model.zero_grad()
    # Compute loss
    loss = 0
    for bigram in zip(name, name[1:]):
        input = torch.zeros(model.vocab_size)
        input[bigram[0]] += 1
        probs = model.forward(input)
        loss += -torch.log(probs[bigram[1]]) # Negative log likelihood loss
    # Backprop
    loss.backward()
    # Update parameters
    optimizer.step()

In [59]:
for i in range(5):
    name = generate_name_nn(model)
    print(display_name(name))
    print(name_log_prob(name))

ajstedn
19.755199432373047

-0.0
almv
10.270076751708984
saily
10.595558166503906
shil
6.770891189575195


In [64]:
# Should try doing a trigram model. There are probably many increasingly clever ways to do this, worth making a few up before googling the canonical solutions.
# Keep a delayed bigram and use this along with the normal bigram (average the rows, then take multinomial sample)
# Simply use a 3D tensor (fine for trigram, not for much more)
# Make an embedding of the last n letters
# Use a recurrent neural network
# You should try to make your owv version of the attention mechanism from memory, or whatever you think the word "attention" SHOULD mean. Then see how this later compares to an actual implementation

def create_delayed_bigram(delay: int):
    counts = torch.zeros(len(alphabet), len(alphabet), dtype=torch.float)
    for name in names:
        for bigram in zip(name, name[delay:]):
            counts[bigram[0], bigram[1]] += 1
    counts = counts / counts.sum(dim=1, keepdim=True)
    return counts


delay_counts = []
max_delay = 3
for i in range(1, max_delay + 1):
    delay_counts.append(create_delayed_bigram(i))

def generate_name_delay(delay_counts):
    name = [alphabet["S"]]
    while name[-1] != alphabet["E"]:
        row = delay_counts[0][name[-1], :]
        for i in range(1, len(delay_counts)):
            if len(name) > i:
                row += delay_counts[i][name[-(i + 1)], :]
        next_letter_idx = torch.multinomial(row, 1).item()
        name.append(next_letter_idx)
    return name[1:-1]


delay_sum = 0
bigram_sum = 0
for i in range(100):
    name = generate_name_delay(delay_counts)
    # print(display_name(name))
    delay_sum += name_log_prob(name)
    bigram_sum += name_log_prob(generate_name_bigram(counts))
print("Average Log Probs:")
print("Delay: ", delay_sum / 100.0)
print("Bigram: ", bigram_sum / 100.0)

Average Log Probs:
Delay:  12.633674325942993
Bigram:  14.3560506772995


In [77]:
# Delay is doing worse, but this is because we are checking it against the model created by bigram, which isn't fair
# How do you fairly compare the two? How about finding the loss of all the names
# in the original dataset.

def delay_name_log_prob(name: list[int]):
    total = 0
    for i in range(len(delay_counts)):
        pairs = list(zip(name, name[i + 1:]))
        a = [el[0] for el in pairs]
        b = [el[1] for el in pairs]
        total += -torch.sum(torch.log(0.01 + delay_counts[i][a, b])).item()

    return total

delay_sum = 0
bigram_sum = 0
for i in range(100):
    name = names[i]
    delay_sum += delay_name_log_prob(name)
    bigram_sum += name_log_prob(name)
print("Average Delay Log Prob: ", delay_sum / 100.0)
print("Average Bigram Log Prob: ", bigram_sum / 100.0)

Average Delay Log Prob:  22.67301644742489
Average Bigram Log Prob:  16.118463459014894


In [78]:
# Looks like the delay is doing worse.
# Next idea though is to try learning which of the delays to pay attention to in which situations
# Perhaps you want to make a NN: previous n letters -> softmax over the delays
# Then dot the delays with the output of this NN to get the row which you sample from

class NateAttentionModel:
    def __init__(self, vocab_size: int, max_delay: int):
        self.vocab_size = vocab_size
        self.max_delay = max_delay
        self.linear = nn.Linear(vocab_size * max_delay, max_delay)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, input: torch.TensorType) -> torch.TensorType:
        output = self.linear(input)
        return self.softmax(output)


In [None]:
# Something else you have to take into account is that if you are using a max_delay
# of n, you should be prefixing your names with n S's.

In [61]:
# Now try with RNN - This is a prebuilt model that you can test against. But should build your own.
class RNNModel(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        # self.hidden_size = hidden_size
        # self.embedding = nn.Embedding(vocab_size, hidden_size) # Would use embedding if you had a larger vocab
        self.rnn = nn.RNN(vocab_size, vocab_size)
        self.linear = nn.Linear(vocab_size, vocab_size)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, input: torch.TensorType, hidden: torch.TensorType) -> torch.TensorType:
        output, hidden = self.rnn(output, hidden)
        output = self.linear(output)
        return self.softmax(output), hidden