In [334]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [5]:
with open("../names.txt", "r") as f:
    names = f.read().splitlines()

In [14]:
letters = sorted(list(set("".join(names))))
letters.append(".")
ltoi = {l:i for i, l in enumerate(letters)}
itol = {i:l for l, i in ltoi.items()}

In [145]:
# bigrams
xs_bigram, ys_bigram = [], []
for n in names:
    n = "." + n + "."
    for first, second in zip(n, n[1:]):
        xs_bigram.append(ltoi[first])
        ys_bigram.append(ltoi[second])
        
xs_bigram = torch.tensor(xs_bigram)
ys_bigram = torch.tensor(ys_bigram)

In [146]:
W = torch.randn((27, 27), requires_grad=True)

In [147]:
# forward
inputs = torch.nn.functional.one_hot(xs_bigram).float()
logits = inputs @ W 
probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()

In [148]:
print(loss)

tensor(3.8877, grad_fn=<NegBackward0>)


In [149]:
# backward
W.grad = None
loss.backward()

In [150]:
# updates
W.data += -50 * W.grad

In [151]:
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward
    inputs = torch.nn.functional.one_hot(xs_bigram).float()
    logits = inputs @ W 
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_bigram.shape[0]), ys_bigram].log().mean()
    
    print(i, loss)
    
    # backward
    W.grad = None
    loss.backward()
    
    # updates
    W.data += -50 * W.grad

0 tensor(3.7675, grad_fn=<NegBackward0>)
1 tensor(3.3568, grad_fn=<NegBackward0>)
2 tensor(3.1371, grad_fn=<NegBackward0>)
3 tensor(3.0079, grad_fn=<NegBackward0>)
4 tensor(2.9158, grad_fn=<NegBackward0>)
5 tensor(2.8464, grad_fn=<NegBackward0>)
6 tensor(2.7934, grad_fn=<NegBackward0>)
7 tensor(2.7524, grad_fn=<NegBackward0>)
8 tensor(2.7200, grad_fn=<NegBackward0>)
9 tensor(2.6940, grad_fn=<NegBackward0>)
10 tensor(2.6726, grad_fn=<NegBackward0>)
11 tensor(2.6547, grad_fn=<NegBackward0>)
12 tensor(2.6395, grad_fn=<NegBackward0>)
13 tensor(2.6263, grad_fn=<NegBackward0>)
14 tensor(2.6148, grad_fn=<NegBackward0>)
15 tensor(2.6046, grad_fn=<NegBackward0>)
16 tensor(2.5956, grad_fn=<NegBackward0>)
17 tensor(2.5876, grad_fn=<NegBackward0>)
18 tensor(2.5805, grad_fn=<NegBackward0>)
19 tensor(2.5740, grad_fn=<NegBackward0>)
20 tensor(2.5682, grad_fn=<NegBackward0>)
21 tensor(2.5629, grad_fn=<NegBackward0>)
22 tensor(2.5581, grad_fn=<NegBackward0>)
23 tensor(2.5536, grad_fn=<NegBackward0>)
24

# Exercises
E01: train a trigram language model, i.e. take two characters as an input to
predict the 3rd one. Feel free to use either counting or a neural net. Evaluate
the loss; Did it improve over a bigram model?

In [331]:
# trigrams
xs_trigram, ys_trigram = [], []
for n in names:
    n = "." * 2 + n + "."
    for first, second, third in zip(n, n[1:], n[2:]):
        # print(first, second, third)
        xs_trigram.append([ltoi[first], ltoi[second]])
        ys_trigram.append(ltoi[third])
        
xs_trigram = torch.tensor(xs_trigram)
ys_trigram = torch.tensor(ys_trigram)

In [332]:
inputs = torch.nn.functional.one_hot(xs_trigram).float()

In [333]:
W1 = torch.randn((27, 27), requires_grad=True)
W2 = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    logits = inputs[:, 0, :] @ W1 + inputs[:, 1, :] @ W2

    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(ys_trigram.shape[0]), ys_trigram].log().mean()

    print(i, loss)
    
    # backward
    W1.grad = None
    W2.grad = None
    loss.backward()
    
    # updates
    W1.data += -20 * W1.grad
    W2.data += -20 * W2.grad

0 tensor(4.1870, grad_fn=<NegBackward0>)
1 tensor(3.7626, grad_fn=<NegBackward0>)
2 tensor(3.5288, grad_fn=<NegBackward0>)
3 tensor(3.3593, grad_fn=<NegBackward0>)
4 tensor(3.2326, grad_fn=<NegBackward0>)
5 tensor(3.1353, grad_fn=<NegBackward0>)
6 tensor(3.0582, grad_fn=<NegBackward0>)
7 tensor(2.9955, grad_fn=<NegBackward0>)
8 tensor(2.9438, grad_fn=<NegBackward0>)
9 tensor(2.9003, grad_fn=<NegBackward0>)
10 tensor(2.8631, grad_fn=<NegBackward0>)
11 tensor(2.8310, grad_fn=<NegBackward0>)
12 tensor(2.8029, grad_fn=<NegBackward0>)
13 tensor(2.7780, grad_fn=<NegBackward0>)
14 tensor(2.7558, grad_fn=<NegBackward0>)
15 tensor(2.7358, grad_fn=<NegBackward0>)
16 tensor(2.7176, grad_fn=<NegBackward0>)
17 tensor(2.7011, grad_fn=<NegBackward0>)
18 tensor(2.6859, grad_fn=<NegBackward0>)
19 tensor(2.6719, grad_fn=<NegBackward0>)
20 tensor(2.6590, grad_fn=<NegBackward0>)
21 tensor(2.6470, grad_fn=<NegBackward0>)
22 tensor(2.6358, grad_fn=<NegBackward0>)
23 tensor(2.6253, grad_fn=<NegBackward0>)
24

Rewritten as a class below - training looks to be equivalent

In [None]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))
        
    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
        
        
        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()

        return probs, loss

In [None]:
m = Model()

In [None]:
for i in range(50):
    probs, loss = m.forward(xs_trigram, targets=ys_trigram)
    
    print(i, loss)
    m.zero_grad()
    loss.backward()
    # updates
    m.w1.data += -20 * m.w1.grad
    m.w2.data += -20 * m.w2.grad
    
    

0 tensor(4.3345, grad_fn=<NegBackward0>)
1 tensor(3.9514, grad_fn=<NegBackward0>)
2 tensor(3.6622, grad_fn=<NegBackward0>)
3 tensor(3.4633, grad_fn=<NegBackward0>)
4 tensor(3.3159, grad_fn=<NegBackward0>)
5 tensor(3.1994, grad_fn=<NegBackward0>)
6 tensor(3.1078, grad_fn=<NegBackward0>)
7 tensor(3.0346, grad_fn=<NegBackward0>)
8 tensor(2.9750, grad_fn=<NegBackward0>)
9 tensor(2.9253, grad_fn=<NegBackward0>)
10 tensor(2.8830, grad_fn=<NegBackward0>)
11 tensor(2.8464, grad_fn=<NegBackward0>)
12 tensor(2.8145, grad_fn=<NegBackward0>)
13 tensor(2.7862, grad_fn=<NegBackward0>)
14 tensor(2.7610, grad_fn=<NegBackward0>)
15 tensor(2.7385, grad_fn=<NegBackward0>)
16 tensor(2.7181, grad_fn=<NegBackward0>)
17 tensor(2.6997, grad_fn=<NegBackward0>)
18 tensor(2.6830, grad_fn=<NegBackward0>)
19 tensor(2.6677, grad_fn=<NegBackward0>)
20 tensor(2.6538, grad_fn=<NegBackward0>)
21 tensor(2.6410, grad_fn=<NegBackward0>)
22 tensor(2.6292, grad_fn=<NegBackward0>)
23 tensor(2.6183, grad_fn=<NegBackward0>)
24

E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test
set. Train the bigram and trigram models only on the training set. Evaluate them
on dev and test splits. What can you see?

In [176]:
def split_data(array):
    frac = array.shape[0] * 0.1 // 1
    train = array[:int(frac * 8)]
    val = array[int(frac * 8): int(frac * 9)]
    test = array[int(frac * 9):]
    assert train.shape[0] + test.shape[0] + val.shape[0] == array.shape[0]
    return train, val, test

In [185]:
train_xs_bigram, val_xs_bigram, test_xs_bigram = split_data(xs_bigram)
train_ys_bigram, val_ys_bigram, test_ys_bigram = split_data(ys_bigram)

In [189]:
# init and training for bigram model
W = torch.randn((27, 27), requires_grad=True)

for i in range(50):
    # forward
    inputs = torch.nn.functional.one_hot(train_xs_bigram).float()
    logits = inputs @ W 
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    loss = -probs[torch.arange(train_ys_bigram.shape[0]), train_ys_bigram].log().mean()
    
    val_inputs = torch.nn.functional.one_hot(val_xs_bigram).float()
    logits = val_inputs @ W
    probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
    val_loss = -probs[torch.arange(val_ys_bigram.shape[0]), val_ys_bigram].log().mean()
    
    
    print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}, {test_loss.data.item()=:.4f}")
    
    # backward
    W.grad = None
    loss.backward()
    
    # updates
    W.data += -50 * W.grad

0 loss.data.item()=3.7033, val_loss.data.item()=3.6566, test_loss.data.item()=3.6431
1 loss.data.item()=3.2771, val_loss.data.item()=3.3136, test_loss.data.item()=3.3068
2 loss.data.item()=3.0436, val_loss.data.item()=3.1405, test_loss.data.item()=3.1388
3 loss.data.item()=2.9152, val_loss.data.item()=3.0434, test_loss.data.item()=3.0454
4 loss.data.item()=2.8324, val_loss.data.item()=2.9776, test_loss.data.item()=2.9817
5 loss.data.item()=2.7750, val_loss.data.item()=2.9334, test_loss.data.item()=2.9378
6 loss.data.item()=2.7319, val_loss.data.item()=2.8967, test_loss.data.item()=2.9011
7 loss.data.item()=2.6978, val_loss.data.item()=2.8670, test_loss.data.item()=2.8714
8 loss.data.item()=2.6699, val_loss.data.item()=2.8417, test_loss.data.item()=2.8460
9 loss.data.item()=2.6468, val_loss.data.item()=2.8203, test_loss.data.item()=2.8246
10 loss.data.item()=2.6274, val_loss.data.item()=2.8018, test_loss.data.item()=2.8060
11 loss.data.item()=2.6109, val_loss.data.item()=2.7859, test_lo

E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [318]:
train_xs_trigram, val_xs_trigram, test_xs_trigram = split_data(xs_trigram)
train_ys_trigram, val_ys_trigram, test_ys_trigram = split_data(ys_trigram)

In [448]:
def generate_name(model):
    trigram_input = ".."
    output = ""
    while True:
        model_input = torch.tensor([[ltoi[x] for x in trigram_input]])
        probs, _ = model(model_input)
        next_letter = itol[torch.multinomial(probs, 1).item()]
        if next_letter == ".":
            break
        output += next_letter
        trigram_input = output[-2:]
        
        if len(trigram_input) < 2:
            trigram_input = "." + trigram_input
            
    return output

In [450]:
class Model(nn.Module):
    def __init__(self, reg_weight=0):
        super().__init__()
        self.w1 = nn.Parameter(torch.randn((27, 27)))
        self.w2 = nn.Parameter(torch.randn((27, 27)))
        self.reg_weight = reg_weight
        
    def forward(self, x, targets=None):
        inputs = torch.nn.functional.one_hot(x, num_classes=27).float()
        logits = inputs[:, 0, :] @ self.w1 + inputs[:, 1, :] @ self.w2
        probs = torch.exp(logits) / torch.exp(logits).sum(1, keepdims=True)
        
        
        loss = None
        if targets is not None:
            loss = -probs[torch.arange(targets.shape[0]), targets].log().mean()
            
            if self.reg_weight:
                loss += self.reg_weight * torch.cat((self.w1**2, self.w2**2)).sum()

        return probs, loss
    
    def update(self, lr):
        self.w1.data += -lr * self.w1.grad
        self.w2.data += -lr * self.w2.grad

In [430]:
m = Model(1e-3)

In [437]:
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
m.zero_grad()
print(loss)

tensor(4.5851, grad_fn=<AddBackward0>)


In [435]:
loss.backward()

In [436]:
m.update(20)

In [414]:
loss

tensor(3.5196, grad_fn=<NegBackward0>)

In [404]:
probs, loss = m(train_xs_trigram, targets=train_ys_trigram)


In [407]:
loss.data.item()

3.7461421489715576

In [472]:
for reg in [1e-3]: #[0, 1e-10, 1e-5, 1e-3]:
    m = Model(reg_weight=reg)
    for i in range(50):
        probs, loss = m(train_xs_trigram, targets=train_ys_trigram)
        _, val_loss = m(val_xs_trigram, targets=val_ys_trigram)
        _, test_loss = m(test_xs_trigram, targets=test_ys_trigram)
        
        # print(i, f"{loss.data.item()=:.4f}, {val_loss.data.item()=:.4f}")
        
        m.zero_grad()
        loss.backward()
        m.update(20)

    print(f"{reg=}", f"{loss.data.item()=:.4f}", f"{val_loss.data.item()=:.4f}", f"{test_loss.data.item()=:.4f}")

reg=0.001 loss.data.item()=2.6747 val_loss.data.item()=2.8234 test_loss.data.item()=2.8281


In [547]:
generate_name(m)

'daca'

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?

E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?

E06: meta-exercise! Think of a fun/interesting exercise and complete it.