In [1]:
import torch
import matplotlib.pyplot as plt

In [2]:
# Load the dataset: one name per line.
# The context manager closes the file handle as soon as the read is done,
# rather than leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()  # split the text on line breaks into a list of strings

In [3]:
# Vocabulary: every distinct character that occurs in the names, sorted
chars = sorted(set(''.join(words)))

In [4]:
# Character -> integer index. Characters are numbered from 1 so that index 0
# stays free for the '.' token assigned just below.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
# Reverse lookup (integer index -> character), built by inverting stoi
itos = {idx: ch for ch, idx in stoi.items()}

In [6]:
# Build bigram training pairs (x -> y) from the first word only
xs, ys = [], []

for word in words[:1]:
    # wrap the word in '.' so the first and last characters also form bigrams
    padded = ['.'] + list(word) + ['.']
    for cur, nxt in zip(padded, padded[1:]):
        ix_in, ix_out = stoi[cur], stoi[nxt]
        print(cur, nxt)
        xs.append(ix_in)   # input: index of the current character
        ys.append(ix_out)  # label: index of the character that follows

# convert the Python lists of indices into tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)

. e
e m
m m
m a
a .


In [7]:
# inputs: index of the first character of each bigram
xs

tensor([ 0,  5, 13, 13,  1])

In [8]:
# labels: index of the second character of each bigram
ys

tensor([ 5, 13, 13,  1,  0])

In [9]:
# One-hot encode each input index as a 27-wide row, then cast to float so the
# rows can be matrix-multiplied with the float weight matrices used below
xenc = torch.nn.functional.one_hot(xs, num_classes=27).float()
xenc

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [10]:
# one row per bigram, one column per possible character
xenc.shape

torch.Size([5, 27])

In [11]:
# confirm the .float() cast took effect
xenc.dtype

torch.float32

In [12]:
# A single neuron: 27 input weights, 1 output, randomly initialised
W = torch.randn((27, 1))
xenc[0] @ W # one neuron output for the first example: (27,) @ (27, 1) -> (1,)

tensor([-1.2097])

In [13]:
# All examples at once: (5, 27) @ (27, 1) -> (5, 1), one activation per example
xenc @ W #Vectorized

tensor([[-1.2097],
        [ 0.5425],
        [-0.8667],
        [-0.8667],
        [ 1.2896]])

In [14]:
# Widen the layer to 27 outputs per example (one per character).
# requires_grad=True asks autograd to track W so loss.backward() can fill W.grad later.
W = torch.randn((27, 27), requires_grad=True) # 729 parameters
xenc @ W # (5, 27) @ (27, 27) -> (5, 27)

tensor([[ 0.5113, -1.2889,  0.6511,  1.4577, -1.8795, -1.8569, -0.6685, -0.3807,
          0.1299, -0.5953,  0.3240, -0.6966,  0.1884,  1.3102, -0.8770, -1.6961,
          0.0131, -2.0565,  0.7665,  0.7568,  0.0504, -0.5717, -0.0621,  1.3506,
          0.2363, -0.2872, -0.9544],
        [-0.4569, -1.6088,  0.2555,  0.2293,  0.9758, -0.0554, -1.0687, -0.3408,
          0.3882,  1.2667, -1.9132,  1.0240,  0.0253, -0.5618, -0.1042,  0.0165,
         -1.6488,  0.9745,  0.7305, -1.1469, -0.5620,  0.4836,  0.0942,  0.8132,
          0.5246, -0.1325,  0.5245],
        [ 1.1489, -0.5758,  1.1291,  0.8534,  0.1623, -0.6800, -0.3093,  1.9961,
          0.2742,  0.0604,  0.7210, -0.7238,  0.5377,  1.1540,  0.9461, -0.1802,
         -0.0296,  0.1697, -0.7552,  0.1991, -0.2870, -0.6825,  0.6623, -0.5634,
          0.6878,  0.7256, -0.9141],
        [ 1.1489, -0.5758,  1.1291,  0.8534,  0.1623, -0.6800, -0.3093,  1.9961,
          0.2742,  0.0604,  0.7210, -0.7238,  0.5377,  1.1540,  0.9461, -0.1802

In [15]:
# Softmax written out by hand: exponentiate the logits, then normalise each row
logits = xenc @ W                              # treated as log-counts; (5, 27) @ (27, 27) -> (5, 27)
counts = torch.exp(logits)                     # strictly positive "counts" (5, 27)
row_totals = counts.sum(dim=1, keepdim=True)   # per-example normaliser (5, 1)
probs = counts / row_totals                    # next-character probabilities, each row sums to 1 (5, 27)
probs

tensor([[0.0505, 0.0084, 0.0581, 0.1302, 0.0046, 0.0047, 0.0155, 0.0207, 0.0345,
         0.0167, 0.0419, 0.0151, 0.0366, 0.1124, 0.0126, 0.0056, 0.0307, 0.0039,
         0.0652, 0.0646, 0.0319, 0.0171, 0.0285, 0.1170, 0.0384, 0.0227, 0.0117],
        [0.0183, 0.0058, 0.0374, 0.0364, 0.0768, 0.0274, 0.0099, 0.0206, 0.0427,
         0.1027, 0.0043, 0.0806, 0.0297, 0.0165, 0.0261, 0.0294, 0.0056, 0.0767,
         0.0601, 0.0092, 0.0165, 0.0470, 0.0318, 0.0653, 0.0489, 0.0254, 0.0489],
        [0.0713, 0.0127, 0.0700, 0.0531, 0.0266, 0.0115, 0.0166, 0.1665, 0.0298,
         0.0240, 0.0465, 0.0110, 0.0387, 0.0717, 0.0583, 0.0189, 0.0220, 0.0268,
         0.0106, 0.0276, 0.0170, 0.0114, 0.0439, 0.0129, 0.0450, 0.0467, 0.0091],
        [0.0713, 0.0127, 0.0700, 0.0531, 0.0266, 0.0115, 0.0166, 0.1665, 0.0298,
         0.0240, 0.0465, 0.0110, 0.0387, 0.0717, 0.0583, 0.0189, 0.0220, 0.0268,
         0.0106, 0.0276, 0.0170, 0.0114, 0.0439, 0.0129, 0.0450, 0.0467, 0.0091],
        [0.0258, 0.0142,

In [16]:
# one 27-way distribution per example
probs.shape

torch.Size([5, 27])

In [17]:
# predicted next-character distribution for the first example
probs[0]

tensor([0.0505, 0.0084, 0.0581, 0.1302, 0.0046, 0.0047, 0.0155, 0.0207, 0.0345,
        0.0167, 0.0419, 0.0151, 0.0366, 0.1124, 0.0126, 0.0056, 0.0307, 0.0039,
        0.0652, 0.0646, 0.0319, 0.0171, 0.0285, 0.1170, 0.0384, 0.0227, 0.0117],
       grad_fn=<SelectBackward0>)

In [18]:
# a single row: one probability per character
probs[0].shape

torch.Size([27])

In [19]:
# sanity check: the normalised row should sum to 1
probs[0].sum()

tensor(1., grad_fn=<SumBackward0>)

In [20]:
# Walk through the first bigram by hand: what the net predicts vs. the label
x, y = xs[0].item(), ys[0].item()  # input / label character indices
p_correct = probs[0, y]            # probability the net gives to the true next character
log_lik = torch.log(p_correct).item()
print(f'bigram example: {itos[x]}{itos[y]} (indexes {x},{y})')
print('input: ', x)
print('output:\n', probs[0])
print('label: ', y)
print('probability assigned by the net to the the correct character: ', p_correct.item())
print('log likelihood:', log_lik)
print('negative log likelihood:', -log_lik)

bigram example: .e (indexes 0,5)
input:  0
output:
 tensor([0.0505, 0.0084, 0.0581, 0.1302, 0.0046, 0.0047, 0.0155, 0.0207, 0.0345,
        0.0167, 0.0419, 0.0151, 0.0366, 0.1124, 0.0126, 0.0056, 0.0307, 0.0039,
        0.0652, 0.0646, 0.0319, 0.0171, 0.0285, 0.1170, 0.0384, 0.0227, 0.0117],
       grad_fn=<SelectBackward0>)
label:  5
probability assigned by the net to the the correct character:  0.0047335815615952015
log likelihood: -5.3530731201171875
negative log likelihood: 5.3530731201171875


In [21]:
# Same walkthrough for the fourth bigram (row 3)
x, y = xs[3].item(), ys[3].item()  # input / label character indices
p_correct = probs[3, y]            # probability the net gives to the true next character
log_lik = torch.log(p_correct).item()
print(f'bigram example: {itos[x]}{itos[y]} (indexes {x},{y})')
print('input: ', x)
print('output:\n', probs[3])
print('label: ', y)
print('probability assigned by the net to the the correct character: ', p_correct.item())
print('log likelihood:', log_lik)
print('negative log likelihood:', -log_lik)

bigram example: ma (indexes 13,1)
input:  13
output:
 tensor([0.0713, 0.0127, 0.0700, 0.0531, 0.0266, 0.0115, 0.0166, 0.1665, 0.0298,
        0.0240, 0.0465, 0.0110, 0.0387, 0.0717, 0.0583, 0.0189, 0.0220, 0.0268,
        0.0106, 0.0276, 0.0170, 0.0114, 0.0439, 0.0129, 0.0450, 0.0467, 0.0091],
       grad_fn=<SelectBackward0>)
label:  1
probability assigned by the net to the the correct character:  0.012716551311314106
log likelihood: -4.364850997924805
negative log likelihood: 4.364850997924805


In [22]:
# Repeat the walkthrough for all five bigrams and average their negative
# log likelihoods into a single loss value
nlls = torch.zeros(5)
for i in range(5):
    x, y = xs[i].item(), ys[i].item()  # input / label character indices
    p_correct = probs[i, y]            # probability the net gives to the true next character
    log_lik = torch.log(p_correct).item()
    print('--------')
    print(f'bigram example: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input: ', x)
    print('output:\n', probs[i])
    print('label: ', y)
    print('probability assigned by the net to the the correct character: ', p_correct.item())
    print('log likelihood:', log_lik)
    print('negative log likelihood:', -log_lik)
    nlls[i] = -log_lik

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

--------
bigram example: .e (indexes 0,5)
input:  0
output:
 tensor([0.0505, 0.0084, 0.0581, 0.1302, 0.0046, 0.0047, 0.0155, 0.0207, 0.0345,
        0.0167, 0.0419, 0.0151, 0.0366, 0.1124, 0.0126, 0.0056, 0.0307, 0.0039,
        0.0652, 0.0646, 0.0319, 0.0171, 0.0285, 0.1170, 0.0384, 0.0227, 0.0117],
       grad_fn=<SelectBackward0>)
label:  5
probability assigned by the net to the the correct character:  0.0047335815615952015
log likelihood: -5.3530731201171875
negative log likelihood: 5.3530731201171875
--------
bigram example: em (indexes 5,13)
input:  5
output:
 tensor([0.0183, 0.0058, 0.0374, 0.0364, 0.0768, 0.0274, 0.0099, 0.0206, 0.0427,
        0.1027, 0.0043, 0.0806, 0.0297, 0.0165, 0.0261, 0.0294, 0.0056, 0.0767,
        0.0601, 0.0092, 0.0165, 0.0470, 0.0318, 0.0653, 0.0489, 0.0254, 0.0489],
       grad_fn=<SelectBackward0>)
label:  13
probability assigned by the net to the the correct character:  0.016507331281900406
log likelihood: -4.103950500488281
negative log likelihoo

In [23]:
# Build bigram training pairs (x -> y) from every word in the dataset
xs, ys = [], []

for word in words:
    # wrap the word in '.' so the first and last characters also form bigrams
    padded = ['.'] + list(word) + ['.']
    for cur, nxt in zip(padded, padded[1:]):
        ix_in, ix_out = stoi[cur], stoi[nxt]
        xs.append(ix_in)   # input: index of the current character
        ys.append(ix_out)  # label: index of the character that follows

# convert the Python lists of indices into tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)

# total number of (input, label) pairs
num = xs.numel()
print('number of examples: ', num)

number of examples:  228146


In [24]:
# Re-encode the inputs now that xs was rebuilt from all words: one row per bigram
xenc = torch.nn.functional.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
xenc.shape

torch.Size([228146, 27])

In [25]:
# Forward pass over the full dataset, softmax written out by hand
logits = xenc @ W                              # (228146, 27) @ (27, 27) -> (228146, 27)
counts = torch.exp(logits)                     # (228146, 27)
row_totals = counts.sum(dim=1, keepdim=True)   # (228146, 1)
probs = counts / row_totals                    # (228146, 27) / (228146, 1) -> (228146, 27)

In [26]:
# spot check: probability given to the labelled next character for the first five bigrams
probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0]

(tensor(0.0047, grad_fn=<SelectBackward0>),
 tensor(0.0165, grad_fn=<SelectBackward0>),
 tensor(0.0717, grad_fn=<SelectBackward0>),
 tensor(0.0127, grad_fn=<SelectBackward0>),
 tensor(0.0258, grad_fn=<SelectBackward0>))

In [27]:
# same five values in one indexing call: the two lists are paired up as (row, column);
# .data displays the values without the grad_fn annotation
probs[[0, 1, 2, 3, 4], [5, 13, 13, 1, 0]].data

tensor([0.0047, 0.0165, 0.0717, 0.0127, 0.0258])

In [28]:
# same lookup with the indices generated instead of typed out: rows 0..4, columns from the labels
probs[torch.arange(5), ys[:5]].data

tensor([0.0047, 0.0165, 0.0717, 0.0127, 0.0258])

In [29]:
# mean negative log likelihood over the first five examples, computed without a Python loop
-probs[torch.arange(5), ys[:5]].log().mean().data #Vectorized 

tensor(4.0225)

In [30]:
# one probability per training example: the value the net assigns to that example's label
probs[torch.arange(num), ys].shape

torch.Size([228146])

In [31]:
# Gradient descent on the single-layer bigram model.
# Each step: forward pass -> loss (mean NLL + L2 penalty) -> backward -> update W.
learning_rate = 50    # step size of the plain gradient-descent update
reg_strength = 0.01   # weight of the L2 penalty on W

for _ in range(100):

    # forward pass
    logits = xenc @ W # (228146, 27) @ (27, 27) -> (228146, 27)
    counts = logits.exp() # (228146, 27)
    probs = counts / counts.sum(1, keepdims=True) # (228146, 27) / (228146, 1) -> (228146, 27)
    loss = -probs[torch.arange(num), ys].log().mean() + reg_strength*(W**2).mean() #L2 regularization
    print(loss.item())

    # backward pass
    W.grad = None # clear the previous gradient so backward() starts fresh
    loss.backward()

    # update: take the step under no_grad so the in-place change is not
    # recorded by autograd (the supported alternative to mutating W.data)
    with torch.no_grad():
        W -= learning_rate * W.grad

# Report the data term alone, evaluated on the weights *after* the final
# update, and as a plain float rather than a tensor repr with grad_fn.
with torch.no_grad():
    final_counts = (xenc @ W).exp()
    final_probs = final_counts / final_counts.sum(1, keepdims=True)
    final_nll = -final_probs[torch.arange(num), ys].log().mean()
print(f'loss without regularization: {final_nll.item()}')

3.6674909591674805
3.316964626312256
3.112459897994995
2.9850077629089355
2.8959949016571045
2.8305578231811523
2.781200647354126
2.7433338165283203
2.7136714458465576
2.6898412704467773
2.670196056365967
2.653632164001465
2.639416456222534
2.6270511150360107
2.6161880493164062
2.6065728664398193
2.5980124473571777
2.5903570652008057
2.5834829807281494
2.5772883892059326
2.5716865062713623
2.5666022300720215
2.5619726181030273
2.5577425956726074
2.5538647174835205
2.5502989292144775
2.5470094680786133
2.543966770172119
2.541144371032715
2.538520574569702
2.5360753536224365
2.53379225730896
2.53165602684021
2.529653787612915
2.5277740955352783
2.5260064601898193
2.5243418216705322
2.5227720737457275
2.521289348602295
2.519888401031494
2.518561601638794
2.5173046588897705
2.5161123275756836
2.514979600906372
2.513902425765991
2.5128777027130127
2.51190185546875
2.5109713077545166
2.5100831985473633
2.509235382080078
2.5084245204925537
2.5076489448547363
2.506906747817993
2.50619530677795

In [32]:
# Continue gradient descent from the current W with a smaller step size.
# Each step: forward pass -> loss (mean NLL + L2 penalty) -> backward -> update W.
learning_rate = 25    # step size of the plain gradient-descent update
reg_strength = 0.01   # weight of the L2 penalty on W

for _ in range(100):

    # forward pass
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + reg_strength*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # clear the previous gradient so backward() starts fresh
    loss.backward()

    # update: take the step under no_grad so the in-place change is not
    # recorded by autograd (the supported alternative to mutating W.data)
    with torch.no_grad():
        W -= learning_rate * W.grad

# Report the data term alone, evaluated on the weights *after* the final
# update, and as a plain float rather than a tensor repr with grad_fn.
with torch.no_grad():
    final_counts = (xenc @ W).exp()
    final_probs = final_counts / final_counts.sum(1, keepdims=True)
    final_nll = -final_probs[torch.arange(num), ys].log().mean()
print(f'loss without regularization: {final_nll.item()}')

2.4901773929595947
2.490097761154175
2.4900197982788086
2.4899418354034424
2.48986554145813
2.4897899627685547
2.489715099334717
2.4896411895751953
2.4895682334899902
2.4894959926605225
2.489424228668213
2.489353656768799
2.489284038543701
2.489215135574341
2.4891467094421387
2.489079475402832
2.4890124797821045
2.4889466762542725
2.4888813495635986
2.488816738128662
2.488753080368042
2.48868989944458
2.4886276721954346
2.488565683364868
2.4885048866271973
2.4884443283081055
2.488384485244751
2.488325357437134
2.488266944885254
2.4882090091705322
2.488151788711548
2.488095283508301
2.488039255142212
2.487983465194702
2.487928628921509
2.4878742694854736
2.487820625305176
2.487767219543457
2.4877147674560547
2.4876625537872314
2.4876110553741455
2.4875600337982178
2.487509250640869
2.487459182739258
2.4874095916748047
2.487360715866089
2.4873123168945312
2.4872639179229736
2.4872162342071533
2.4871692657470703
2.4871222972869873
2.4870760440826416
2.4870307445526123
2.486985206604004
2.

In [33]:
# Sample 40 names from the trained model.
#
# The next-character distribution has to be looked up by *character index*.
# The training-time `probs` tensor has one row per training example, so
# indexing it with a character index returns the prediction for an unrelated
# example. Build the per-character table from W instead: a one-hot row times W
# selects row i of W, so a row-wise softmax of W gives a (27, 27) table whose
# row i is the distribution over the character that follows character i.
with torch.no_grad():
    char_probs = torch.softmax(W, dim=1)

for _ in range(40):
    out = []
    i = 0  # start from the '.' token (index 0)

    while True:
        i = torch.multinomial(char_probs[i], num_samples=1, replacement=True).item()
        out.append(itos[i])
        if i == 0:  # sampled '.', which ends the name
            break

    print(''.join(out)[:-1])  # drop the trailing '.'

myeazljzorhna
s
h
zknekniiiayid
ralra
cba
alpg
dcirtroimniatumzalaenesqmmdyeav
snijalegebearktezemhr
ros
eialm
aa
dvlsrharsiarlnyaaiaxrtamhrhegemnqnia
dym
mr
jtroadyreagianialoni
ds
asnyiemulmneemrliear
rlaaa
a
a
ayrsw
iia
htefeo
z
anyltdnejymzyjr
j
h
km
ljgoemriiiiiabatacanaelayddia
kiadco
a
smngiiiiekrha
munreblebalcfvebalzemhnetoniiyjednwemyklebiarhmcisfniyad
pmk
oneoz
dxlba
ba
nonvyahreaj
jmvulbania
