## seq

In [68]:
import torch

# Build dataset
names = ["S" + name.strip().lower() + "E" for name in open("data/names.txt", "r").readlines()]
names = list(filter(lambda x: "-" not in x, names))
alphabet = {chr(i): i - 97 for i in range(97, 123)}
alphabet[" "] = 26; alphabet["S"] = 27; alphabet["E"] = 28
rev_alphabet = {v: k for k, v in alphabet.items()}

# Convert names to tensors
names = [torch.tensor([alphabet[char] for char in name]) for name in names]

counts = torch.zeros(len(alphabet), len(alphabet), dtype=torch.float)
for name in names:
    for bigram in zip(name, name[1:]):
        counts[bigram[0], bigram[1]] += 1

# Normalize the rows of counts
counts = counts / counts.sum(dim=1, keepdim=True)

In [73]:
def name_prob(name: list[int]) -> float:
    prob = 1.0
    for bigram in zip(name, name[1:]):
        prob *= counts[bigram[0], bigram[1]].item()
    return prob

def name_log_prob(name: list[int]) -> float:
    bigrams = zip(name, name[1:])
    # Vectorize
    return -torch.sum(torch.log(counts[name[:-1], name[1:]])).item()

In [78]:
def generate_name_bigram(counts: torch.TensorType):
    name = [alphabet["S"]]
    while name[-1] != alphabet["E"]:
        row = counts[name[-1], :]
        next_letter_idx = torch.multinomial(row, 1).item()
        name.append(next_letter_idx)
    return name[1:-1]

def display_name(name: list[int]):
    return "".join([rev_alphabet[idx] for idx in name])

for i in range(5):
    name = generate_name_bigram(counts)
    print(display_name(name))
    print(name_log_prob(name))
print("\n=====REAL NAMES=====")
for i in range(5):
    print(display_name(names[i].tolist()))
    print(name_log_prob(names[i]))

anyaminamselerkeonrd
52.25434112548828
avondy
13.929759979248047
iles
7.177988052368164
selen
8.928884506225586
gantther
14.41200065612793

=====REAL NAMES=====
SaamirE
18.659114837646484
SaaronE
14.553874969482422
SabbeyE
14.094061851501465
SabbieE
15.160078048706055
SabbotE
17.108291625976562


In [84]:
# Figure out the counts using NN
from torch import nn
class BigramModel(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        # self.embedding = nn.Embedding(vocab_size, hidden_size) # Would use embedding if you had a larger vocab
        self.linear = nn.Linear(vocab_size, vocab_size)
        self.softmax = nn.Softmax(dim=0) # Why dim=1?

    def forward(self, input: torch.TensorType) -> torch.TensorType:
        output = self.linear(input)
        return self.softmax(output)

def generate_name_nn(model: BigramModel):
    name = [alphabet["S"]]
    i = 0
    while name[-1] != alphabet["E"] and i < 20:
        # One-hot encoding of last letter
        input = torch.zeros(model.vocab_size)
        input[name[-1]] += 1
        # Outputs probabilities row
        probs = model.forward(input)
        # Sample next letter
        next_letter_idx = torch.multinomial(probs, 1).item()
        name.append(next_letter_idx)
        i += 1

    return name[1:-1]

model = BigramModel(len(alphabet))
for i in range(5):
    name = generate_name_nn(model)
    print(display_name(name))
    print(name_log_prob(name))

ogjgycgnjmhwlnonlfS
inf
gwnaSkdlp odmrmnvbS
nan
bmSfhmkw
inf
n
-0.0
cfufymrm
inf


In [85]:
# Train the model!
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.01)

for name in names:
    # Reset gradient
    model.zero_grad()
    # Compute loss
    loss = 0
    for bigram in zip(name, name[1:]):
        input = torch.zeros(model.vocab_size)
        input[bigram[0]] += 1
        probs = model.forward(input)
        loss += -torch.log(probs[bigram[1]]) # Negative log likelihood loss
    # Backprop
    loss.backward()
    # Update parameters
    optimizer.step()

In [87]:
for i in range(5):
    name = generate_name_nn(model)
    print(display_name(name))
    print(name_log_prob(name))

waiyqtdeoiepaajleha
inf
baiterchndwt
32.55701446533203
tl
4.352035999298096
otarru
14.201091766357422
n
-0.0


In [None]:
# Should try doing a trigram model. There are probably many increasingly clever ways to do this, worth making a few up before googling the canonical solutions.
# Keep a delayed bigram and use this along with the normal bigram (average the rows, then take multinomial sample)
# Simply use a 3D tensor (fine for trigram, not for much more)
# Make an embedding of the last n letters
# Use a recurrent neural network
# You should try to make your owv version of the attention mechanism from memory, or whatever you think the word "attention" SHOULD mean. Then see how this later compares to an actual implementation

In [None]:
# Now try with RNN - This is a prebuilt model that you can test against. But should build your own.
class RNNModel(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        # self.hidden_size = hidden_size
        # self.embedding = nn.Embedding(vocab_size, hidden_size) # Would use embedding if you had a larger vocab
        self.rnn = nn.RNN(vocab_size, vocab_size)
        self.linear = nn.Linear(vocab_size, vocab_size)
        self.softmax = nn.Softmax(dim=0)

    def forward(self, input: torch.TensorType, hidden: torch.TensorType) -> torch.TensorType:
        output, hidden = self.rnn(output, hidden)
        output = self.linear(output)
        return self.softmax(output), hidden