##  Load the text file

In [1]:
# Load the names dataset: one name per line of the text file.
# A context manager is used so the file handle is closed as soon as reading finishes.
with open('data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [2]:
# Number of entries loaded (one per line of the input file)
len(words)

32033

# Bigram count model

## Build the bigram model

In [3]:
import torch

### Build empty bigram counter and the mapping

In [4]:
# Marker used for both the beginning and the end of a word
SPECIAL_CHAR = '.'
# The position of a character in this string is its integer ID (the marker gets ID 0)
CHAR_TO_ID_STR = SPECIAL_CHAR + 'abcdefghijklmnopqrstuvwxyz'

# ID -> character and character -> ID lookups, both derived from CHAR_TO_ID_STR
id_to_char = list(CHAR_TO_ID_STR)
char_to_id = {char: char_id for char_id, char in enumerate(id_to_char)}

# bigram_count[i, j] = number of occurrences where character i is followed by character j
bigram_count = torch.zeros((len(id_to_char), len(id_to_char)), dtype=torch.int32)

### Fill in the bigram counter

In [5]:
# Reset the counter first so that re-running this cell does not double-count every bigram.
bigram_count.zero_()
for word in words:
    if not word:
        # A blank line has no characters to count; skip it instead of failing on word[0].
        continue
    # Pad with the boundary marker so the start and end transitions are counted
    # exactly like any other pair of adjacent characters.
    padded = SPECIAL_CHAR + word + SPECIAL_CHAR
    for prev_char, next_char in zip(padded, padded[1:]):
        bigram_count[char_to_id[prev_char], char_to_id[next_char]] += 1

In [6]:
# Row 0 belongs to the boundary marker (ID 0), so entry j is how many words start with character j
bigram_count[0]

tensor([   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
        1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
         134,  535,  929], dtype=torch.int32)

## Sample from the bigram counter

In [17]:
def get_sample(bigram_count, generator=None):
    """Sample one word from the bigram count table.

    Starts at the boundary marker and repeatedly draws the next character from
    the current character's row of counts, stopping when the marker is drawn again.

    Args:
        bigram_count: 2-D tensor where entry [i, j] counts character i followed by character j.
        generator: optional torch.Generator for reproducible sampling. When None,
            PyTorch's default global random state is used.

    Returns:
        The sampled word as a string, without the boundary markers.
    """
    boundary_id = char_to_id[SPECIAL_CHAR]
    result = []
    cur_char_id = boundary_id
    while True:
        # torch.multinomial treats its input as weights, so the raw counts
        # can be used directly without normalizing them to sum to 1.
        weights = bigram_count[cur_char_id].float()
        cur_char_id = torch.multinomial(weights, num_samples=1, replacement=True, generator=generator).item()
        if cur_char_id == boundary_id:
            break
        result.append(id_to_char[cur_char_id])
    return ''.join(result)

# Fixed seed so the printed samples are reproducible; the generator is passed in
# explicitly rather than being read from a global inside get_sample.
g = torch.Generator().manual_seed(2147483647)
num_samples = 20
for sample_i in range(num_samples):
    print(get_sample(bigram_count, generator=g))

junide
janasah
p
cony
a
nn
kohin
tolian
juee
ksahnaauranilevias
dedainrwieta
ssonielylarte
faveumerifontume
phynslenaruani
core
yaenon
ka
jabdinerimikimaynin
anaasn
ssorionsush


## Compute loss

In [39]:
# Normalize each row of counts so that
# P[i, j] = P(next char is j | previous char is i) = count[i then j] / (total count of bigrams starting with i)
P = bigram_count / bigram_count.sum(dim = 1, keepdim=True)
P.shape

torch.Size([27, 27])

In [42]:
# Per-row totals (the denominators used for P): total number of bigrams that start with each character
bigram_count.sum(dim = 1, keepdim=True)

tensor([[32033],
        [33885],
        [ 2645],
        [ 3532],
        [ 5496],
        [20423],
        [  905],
        [ 1927],
        [ 7616],
        [17701],
        [ 2900],
        [ 5040],
        [13958],
        [ 6642],
        [18327],
        [ 7934],
        [ 1026],
        [  272],
        [12700],
        [ 8106],
        [ 5570],
        [ 3135],
        [ 2573],
        [  929],
        [  697],
        [ 9776],
        [ 2398]])

In [43]:
# Sanity check: a normalized row should sum to 1
P[0].sum()

tensor(1.)

In [55]:
def get_proba(P, word, index):
    """Return P(word[index + 1] | word[index]) looked up in the bigram matrix P.

    Args:
        P: 2-D tensor of transition probabilities, indexed [previous char ID, next char ID].
        word: string whose characters are all keys of char_to_id.
        index: position of the left character of the transition; index + 1 must be in range.

    Returns:
        A 0-dim tensor holding the transition probability.
    """
    return P[char_to_id[word[index]], char_to_id[word[index + 1]]]

# log_l = sum of the log-likelihoods of every transition in the dataset
# (it is negated and averaged below to obtain the loss)
log_l = 0
num_transitions = 0
for word in words:
    # Pad with the boundary marker so the start and end transitions are scored too
    padded = SPECIAL_CHAR + word + SPECIAL_CHAR
    for left in range(len(padded) - 1):
        log_l += torch.log(get_proba(P, padded, left))
        num_transitions += 1

# loss = average negative log-likelihood per transition
loss = -log_l / num_transitions
print(log_l)
print(loss)

tensor(-559891.7500)
tensor(2.4541)


# Single-linear-layer (logit) NN model

## Dataset
Just focus on the first word, which has 5 transitions:

In [144]:
# The first word of the dataset, used as the only training example in this section
words[0]

'emma'

In [145]:
# Pad the word with the boundary marker on both sides, then pair every
# character (input) with the character that follows it (label).
training_word = SPECIAL_CHAR + words[0] + SPECIAL_CHAR

# xs: IDs of the previous characters; ys: IDs of the characters that follow them
xs = torch.tensor([char_to_id[prev_char] for prev_char in training_word[:-1]])
ys = torch.tensor([char_to_id[next_char] for next_char in training_word[1:]])
print(xs)
print(ys)

tensor([ 0,  5, 13, 13,  1])
tensor([ 5, 13, 13,  1,  0])


## Model and loss (step by step)

In [166]:
# 0. Input = 5 integers, i.e. an input batch of size 5 where each item is the previous character's ID.
# 1. One-hot encode each ID into a vector with one slot per character in char_to_id.
import torch.nn.functional as F
# The .float() cast is needed: without it, xs_encoded @ W fails because the
# operand types are incompatible.
xs_encoded = F.one_hot(xs, num_classes = len(char_to_id)).float()
print(xs_encoded)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])


In [167]:
# 2. linear layer.
W = torch.randn((27, 27))

In [168]:
# One row of logits (unnormalized scores, one per possible next character) for each input character
logits = xs_encoded @ W
print(logits)

tensor([[ 2.4334,  0.7718, -0.7763, -0.5231,  3.8199,  2.1374, -1.0169,  0.3291,
         -0.2065,  0.3236, -0.7128,  0.4365,  0.7066, -0.7783, -2.0312,  1.1396,
         -0.6401, -1.2754, -0.6002,  0.8198, -0.4853,  0.1190, -0.2811,  1.3456,
          0.7820,  1.3204,  0.3893],
        [-1.6037, -1.2664, -0.6748,  0.2757, -0.1345, -0.0398,  1.3545, -1.3284,
         -0.4246,  1.0885, -0.7935,  0.3824, -0.2161,  1.5545, -0.7669, -0.6485,
         -0.1323,  1.3386,  1.3518,  1.0557, -0.6167,  1.2291,  0.7124,  0.2874,
         -0.6726, -0.2079, -0.0234],
        [-2.3104,  0.0854, -0.1798, -0.5887, -0.3187, -0.0692,  1.2934, -0.1856,
         -1.2556,  2.6381, -1.0469, -1.0034,  0.8970, -0.5868,  0.2359, -0.0210,
         -1.0383, -0.8930,  0.0863, -0.1942, -0.7802,  1.3803, -1.1363,  0.8544,
          1.8015,  0.2029,  0.9579],
        [-2.3104,  0.0854, -0.1798, -0.5887, -0.3187, -0.0692,  1.2934, -0.1856,
         -1.2556,  2.6381, -1.0469, -1.0034,  0.8970, -0.5868,  0.2359, -0.0210

In [170]:
# 3. Softmax the logits to get probabilities. This is the distribution the next character is sampled from.
probas = F.softmax(logits, dim=1)
print(probas)
# dim=1 normalizes across the columns of each row, so every row probas[i, :] sums to 1.
print(probas[2,:].sum())
print(probas[1,:].sum())

tensor([[0.1166, 0.0221, 0.0047, 0.0061, 0.4663, 0.0867, 0.0037, 0.0142, 0.0083,
         0.0141, 0.0050, 0.0158, 0.0207, 0.0047, 0.0013, 0.0320, 0.0054, 0.0029,
         0.0056, 0.0232, 0.0063, 0.0115, 0.0077, 0.0393, 0.0224, 0.0383, 0.0151],
        [0.0049, 0.0068, 0.0123, 0.0318, 0.0211, 0.0232, 0.0936, 0.0064, 0.0158,
         0.0717, 0.0109, 0.0354, 0.0195, 0.1143, 0.0112, 0.0126, 0.0212, 0.0921,
         0.0933, 0.0694, 0.0130, 0.0825, 0.0492, 0.0322, 0.0123, 0.0196, 0.0236],
        [0.0020, 0.0224, 0.0172, 0.0114, 0.0150, 0.0192, 0.0750, 0.0171, 0.0059,
         0.2876, 0.0072, 0.0075, 0.0504, 0.0114, 0.0260, 0.0201, 0.0073, 0.0084,
         0.0224, 0.0169, 0.0094, 0.0818, 0.0066, 0.0483, 0.1246, 0.0252, 0.0536],
        [0.0020, 0.0224, 0.0172, 0.0114, 0.0150, 0.0192, 0.0750, 0.0171, 0.0059,
         0.2876, 0.0072, 0.0075, 0.0504, 0.0114, 0.0260, 0.0201, 0.0073, 0.0084,
         0.0224, 0.0169, 0.0094, 0.0818, 0.0066, 0.0483, 0.1246, 0.0252, 0.0536],
        [0.0458, 0.0535,

In [159]:
# The model above is already fully defined. For training we additionally need a loss
# computed from its output, so that gradients can be back-propagated to W.
# First step: look at the gold labels ys, whose probabilities will be selected next.
ys

tensor([ 5, 13, 13,  1,  0])

In [160]:
# Row indices 0..4, one per training example
torch.arange(5)

tensor([0, 1, 2, 3, 4])

In [161]:
# For each example i, pick the probability the model assigned to its gold label ys[i].
# The row indices are derived from len(ys) instead of a hard-coded 5 so this
# keeps working when the batch size changes.
selected_probas = probas[torch.arange(len(ys)), ys]
print(selected_probas)

tensor([0.0263, 0.0134, 0.0364, 0.0558, 0.0076])


In [162]:
# Now just do avg negative LL
-selected_probas.log().mean()

tensor(3.8055)

## Gradient descent

In [163]:
step_size = 0.1
W = torch.randn(27,27, requires_grad = True)

# The inputs never change between steps, so encode them (and build the row
# indices used to pick the gold-label probabilities) once, outside the loop.
xs_enc = F.one_hot(xs, num_classes = 27).float()
row_ids = torch.arange(len(xs))

for rep in range(10000):
    # forward pass: logits -> probabilities -> average negative log-likelihood
    logits = xs_enc @ W
    probas = F.softmax(logits, dim = 1)
    selected_probas = probas[row_ids, ys]
    loss = -selected_probas.log().mean()

    # backward pass: clear the old gradient, then back-propagate the loss to W
    W.grad = None
    loss.backward()

    # Parameter update. It runs under no_grad so the update itself is not
    # recorded by autograd, and it uses step_size rather than a repeated literal.
    with torch.no_grad():
        W -= step_size * W.grad

    if rep % 50 == 0:
        print(f"Loss at {rep} = {loss:.4f}")
    

Loss at 0 = 3.6588
Loss at 50 = 2.6739
Loss at 100 = 1.8711
Loss at 150 = 1.3066
Loss at 200 = 0.9578
Loss at 250 = 0.7530
Loss at 300 = 0.6300
Loss at 350 = 0.5522
Loss at 400 = 0.5000
Loss at 450 = 0.4633
Loss at 500 = 0.4363
Loss at 550 = 0.4157
Loss at 600 = 0.3996
Loss at 650 = 0.3867
Loss at 700 = 0.3762
Loss at 750 = 0.3674
Loss at 800 = 0.3600
Loss at 850 = 0.3537
Loss at 900 = 0.3483
Loss at 950 = 0.3436
Loss at 1000 = 0.3394
Loss at 1050 = 0.3357
Loss at 1100 = 0.3324
Loss at 1150 = 0.3295
Loss at 1200 = 0.3269
Loss at 1250 = 0.3245
Loss at 1300 = 0.3223
Loss at 1350 = 0.3203
Loss at 1400 = 0.3185
Loss at 1450 = 0.3168
Loss at 1500 = 0.3152
Loss at 1550 = 0.3138
Loss at 1600 = 0.3125
Loss at 1650 = 0.3112
Loss at 1700 = 0.3101
Loss at 1750 = 0.3090
Loss at 1800 = 0.3080
Loss at 1850 = 0.3070
Loss at 1900 = 0.3061
Loss at 1950 = 0.3053
Loss at 2000 = 0.3045
Loss at 2050 = 0.3037
Loss at 2100 = 0.3030
Loss at 2150 = 0.3023
Loss at 2200 = 0.3017
Loss at 2250 = 0.3011
Loss at 230

## Sample

In [164]:
# Upper bound on the generated length, in case the model never emits the boundary marker
max_len = 50
boundary_id = char_to_id[SPECIAL_CHAR]
for sample_i in range(10):
    cur_char_id = boundary_id
    word = []
    while True:
        # Forward pass, but without the loss. No gradients are needed when
        # sampling, so skip autograd tracking even though W requires grad.
        with torch.no_grad():
            input_raw = torch.tensor([cur_char_id])
            input_enc = F.one_hot(input_raw, num_classes = 27).float()
            logits = input_enc @ W
            probas = F.softmax(logits, dim = 1)

        # Sample the next character from the predicted distribution
        assert probas.shape == (1,27), probas.shape
        cur_char_id = torch.multinomial(probas[0], num_samples = 1, replacement=True).item()
        if cur_char_id == boundary_id:
            break
        word.append(id_to_char[cur_char_id])
        if len(word) >= max_len:
            break
    print(''.join(word))

emma
emmma
emmma
emma
ema
ema
emmmma
emmmma
ema
ema
