##### Sampling from a multinomial distribution using pytorch

In [1]:
import torch 
g = torch.Generator().manual_seed(2147483647)
p = torch.rand(3, generator=g)
p = p / p.sum()
print(p)

torch.multinomial(p, num_samples=100, replacement=True, generator=g)

tensor([0.6064, 0.3033, 0.0903])


tensor([1, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0,
        0, 1, 1, 1])

##### Exercises
##### E01 - Trigram Language Model

In [32]:
words = open('names.txt', 'r').read().splitlines()
len(words)

32033

In [33]:
# create trigrams 
b = {}
for w in words:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        trigram = (ch1, ch2, ch3)
        # print(trigram)
        b[trigram] = b.get(trigram, 0) + 1 

In [34]:
# sort by the count
sorted(b.items(), key = lambda kv: -kv[1]) # kv[1] gives the count. -kv[1] for descending order.

[(('a', 'h', '<E>'), 1714),
 (('n', 'a', '<E>'), 1673),
 (('a', 'n', '<E>'), 1509),
 (('o', 'n', '<E>'), 1503),
 (('<S>', 'm', 'a'), 1453),
 (('<S>', 'j', 'a'), 1255),
 (('<S>', 'k', 'a'), 1254),
 (('e', 'n', '<E>'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '<E>'), 953),
 (('a', 'r', 'i'), 950),
 (('i', 'a', '<E>'), 903),
 (('i', 'e', '<E>'), 858),
 (('a', 'n', 'n'), 825),
 (('e', 'l', 'l'), 822),
 (('a', 'n', 'a'), 804),
 (('i', 'a', 'n'), 790),
 (('m', 'a', 'r'), 776),
 (('i', 'n', '<E>'), 766),
 (('e', 'l', '<E>'), 727),
 (('y', 'a', '<E>'), 716),
 (('a', 'n', 'i'), 703),
 (('<S>', 'd', 'a'), 700),
 (('l', 'a', '<E>'), 684),
 (('e', 'r', '<E>'), 683),
 (('i', 'y', 'a'), 669),
 (('l', 'a', 'n'), 647),
 (('<S>', 'b', 'r'), 646),
 (('n', 'n', 'a'), 633),
 (('<S>', 'a', 'l'), 632),
 (('<S>', 'c', 'a'), 628),
 (('r', 'a', '<E>'), 627),
 (('n', 'i', '<E>'), 625),
 (('<S>', 'a', 'n'), 623),
 (('n', 'n', '<E>'), 619),
 (('n', 'e', '<E>'), 607),
 (('e', 'e', '<E>'), 605),
 (('e', 'y', '<

In [35]:
# sort list of characters from a to z
chars = sorted(list(set(''.join(words)))) 
# map characters to integers
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
# reverse mapping
itos = {i:s for s,i in stoi.items()}

In [36]:
# create the training set of trigrams (x, y)
xs, ys = [], []

for w in words[:1]:
  chs = ['.'] + list(w) + ['.']   # <S> and <E> tokens replaced with dot
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    print(ch1, ch2, ch3)
    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)


. e m
e m m
m m a
m a .


In [37]:
print(xs, xs.shape, "\n")

print(ys, ys.shape)

tensor([[ 0,  5],
        [ 5, 13],
        [13, 13],
        [13,  1]]) torch.Size([4, 2]) 

tensor([13, 13,  1,  0]) torch.Size([4])


In [38]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).float()
xenc

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0.,

In [39]:
xenc.shape

torch.Size([4, 2, 27])

##### Weights Matrix:
- Since `xenc` is `[4, 2, 27]`, you need to transform this into a shape suitable for matrix multiplication.
- You can reshape `xenc` to `[4, 54]` by flattening the last two dimensions, because each sample (2 characters) now has a combined representation of `2 * 27 = 54` features.
- `W` should be shaped `[54, 27]` to map the 54-dimensional input to the 27-dimensional output (logits for the next character).

In [40]:
xenc_reshaped = xenc.view(4, -1)  # shape: [4, 54]
xenc_reshaped.shape

torch.Size([4, 54])

In [41]:
# randomly initialize 27 neurons' weights. Each neuron receives 27 inputs
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g)  # shape: [54, 27]
logits = xenc_reshaped @ W  # shape: [4, 27]
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)

In [42]:
probs[0].sum()

tensor(1.)

In [43]:
probs[0]

tensor([0.0237, 0.0177, 0.0107, 0.0049, 0.0223, 0.0096, 0.0111, 0.0090, 0.0071,
        0.0424, 0.0704, 0.0511, 0.0196, 0.0240, 0.2683, 0.0824, 0.0320, 0.0058,
        0.1061, 0.0203, 0.0267, 0.0060, 0.0026, 0.0565, 0.0026, 0.0264, 0.0407])

In [44]:
### Training using Neural Network

In [45]:
# create the training set of trigrams (x, y)
xs, ys = [], []

for w in words:
  chs = ['.'] + list(w) + ['.']   # <S> and <E> tokens replaced with dot
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    # print(ch1, ch2, ch3)
    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = ys.nelement()
print('number of examples: ', num)


# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g, requires_grad=True)

number of examples:  196113


In [46]:
xs.shape

torch.Size([196113, 2])

In [47]:
xenc = F.one_hot(xs, num_classes=27).float()

In [48]:
xenc.shape

torch.Size([196113, 2, 27])

In [49]:
xenc.view(num, -1).shape

torch.Size([196113, 54])

In [50]:
# gradient descent
for k in range(100):
  
  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
  xenc_reshaped = xenc.view(num, -1)  # shape: [196113, 54]
  logits = xenc_reshaped @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

  if k%10 == 0:
    print(loss.item())

4.195971488952637
2.510397434234619
2.386066198348999
2.3393924236297607
2.3152151107788086
2.300726890563965
2.2911765575408936
2.2844254970550537
2.2794032096862793
2.275524616241455


##### E02 - Train, Dev and Test set

In [21]:
# create the training set of trigrams (x, y)
xs, ys = [], []

for w in words:
  chs = ['.'] + list(w) + ['.']   # <S> and <E> tokens replaced with dot
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    # print(ch1, ch2, ch3)
    xs.append([ix1, ix2])
    ys.append(ix3)

xs = torch.tensor(xs)
print("shape of xs: ", xs.shape)
ys = torch.tensor(ys)
print("shape of ys: ", ys.shape)
num = ys.nelement()
print('number of examples: ', num)

shape of xs:  torch.Size([196113, 2])
shape of ys:  torch.Size([196113])
number of examples:  196113


In [22]:
from sklearn.model_selection import train_test_split

# Initial split into train and temp
xs_train, xs_temp, ys_train, ys_temp = train_test_split(xs, ys, test_size=0.2, random_state=42)

# Split temp into dev and test
xs_dev, xs_test, ys_dev, ys_test = train_test_split(xs_temp, ys_temp, test_size=0.5, random_state=42)

# Verify shapes
print(xs_train.shape, ys_train.shape)
print(xs_dev.shape, ys_dev.shape)
print(xs_test.shape, ys_test.shape)


torch.Size([156890, 2]) torch.Size([156890])
torch.Size([19611, 2]) torch.Size([19611])
torch.Size([19612, 2]) torch.Size([19612])


In [23]:
# training trigram model only on Train set

# initialize the 'network'
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g, requires_grad=True)

xenc_train = F.one_hot(xs_train, num_classes=27).float()
xenc_train.shape

torch.Size([156890, 2, 27])

In [25]:
# gradient descent
for k in range(100):
  
  # forward pass
  xenc_train = F.one_hot(xs_train, num_classes=27).float() # input to the network: one-hot encoding
  xenc_train_reshaped = xenc_train.view(len(xs_train), -1)  # shape: [196113, 54]
  logits = xenc_train_reshaped @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  loss = -probs[torch.arange(len(xs_train)), ys_train].log().mean() + 0.01*(W**2).mean()
  
  # backward pass
  W.grad = None # set to zero the gradient
  loss.backward()
  
  # update
  W.data += -50 * W.grad

  if k%10 == 0:
    print(loss.item())

4.195178985595703
2.5115621089935303
2.3871910572052
2.3404290676116943
2.316173791885376
2.3016040325164795
2.291987180709839
2.285188674926758
2.280135154724121
2.2762370109558105


In [26]:
# Function to compute loss on a given dataset
def compute_loss(xs, ys, W):
    xenc = F.one_hot(xs, num_classes=27).float()  # one-hot encode input
    xenc_reshaped = xenc.view(len(xs), -1)  # reshape to [N, 54]
    logits = xenc_reshaped @ W  # compute logits
    counts = logits.exp()  # compute counts
    probs = counts / counts.sum(1, keepdims=True)  # compute probabilities
    loss = -probs[torch.arange(len(xs)), ys].log().mean() + 0.01 * (W**2).mean()  # compute loss
    return loss.item()

# Compute loss on dev set
dev_loss = compute_loss(xs_dev, ys_dev, W)
print(f'Dev loss: {dev_loss}')

# Compute loss on test set
test_loss = compute_loss(xs_test, ys_test, W)
print(f'Test loss: {test_loss}')

Dev loss: 2.2670748233795166
Test loss: 2.2757017612457275


##### E03 - Tuning regularization coefficient

In [27]:
# Find the best regularization coefficients

best_reg = None
best_loss = float('inf')
for reg in [0.001, 0.01, 0.1, 1.0]:  # Example regularization values
    # Train with current regularization coefficient
    for k in range(100):
        xenc_train = F.one_hot(xs_train, num_classes=27).float()
        xenc_train_reshaped = xenc_train.view(len(xs_train), -1)
        logits = xenc_train_reshaped @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        loss = -probs[torch.arange(len(xs_train)), ys_train].log().mean() + reg * (W**2).mean()

        W.grad = None
        loss.backward()
        W.data += -50 * W.grad

    dev_loss = compute_loss(xs_dev, ys_dev, W)
    if dev_loss < best_loss:
        best_loss = dev_loss
        best_reg = reg

print(f'Best regularization coefficient: {best_reg}')


Best regularization coefficient: 0.01


In [28]:
# Train the model on the combined dataset with the best 
# regularization coefficient

# Combine train and dev sets
xs_combined = torch.cat((xs_train, xs_dev), dim=0)
ys_combined = torch.cat((ys_train, ys_dev), dim=0)


for k in range(100):
    xenc_combined = F.one_hot(xs_combined, num_classes=27).float()
    xenc_combined_reshaped = xenc_combined.view(len(xs_combined), -1)
    logits = xenc_combined_reshaped @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(len(xs_combined)), ys_combined].log().mean() + best_reg * (W**2).mean()

    W.grad = None
    loss.backward()
    W.data += -50 * W.grad

In [29]:
# Evaluate on the test set
test_loss = compute_loss(xs_test, ys_test, W)
print(f'Test loss: {test_loss}')

Test loss: 2.265357255935669


In [51]:
##### E04 - Indexing W (without one hot vectors) is in makemore.ipynb file
