<a href="https://colab.research.google.com/github/sganguly3000ai/makemore/blob/main/makemore_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bigram character level language model using neural network**

In [2]:
import torch
import numpy as np
import string
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [3]:
# Character vocabulary: '.' is the start/end marker at index 0 and the
# lowercase letters 'a'..'z' take indices 1..26.
chars = list(string.ascii_lowercase)
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup (index -> character), built from the finished stoi table.
itos = dict((idx, ch) for ch, idx in stoi.items())

In [398]:
# Toy corpus of first names from which the bigram training set is built.
words = ['sumit','sorit','swapna','suraj','sujit', 'sujoy', 'manoj', 'amit',
         'kajol', 'steve', 'andrew','bob', 'cindy', 'ebrahim', 'daku', 'johny',
         'timmy', 'david', 'tarun', 'nancy', 'edith', 'jason', 'kyle', 'joseph',
         'kimberly', 'emma', 'irene', 'michael', 'scott', 'valentino']

# 27x27 integer table for bigram counts; this cell allocates it but never fills it.
N = torch.zeros((27,27) , dtype = torch.int32)

# Gather (current character, next character) index pairs.
# Only the first word is used here; '.' is placed on both sides of the word
# as the start/end marker.
input_ids, label_ids = [], []
for w in words[:1]:
  chs = ['.', *w, '.']
  for pos in range(len(chs) - 1):
    input_ids.append(stoi[chs[pos]])
    label_ids.append(stoi[chs[pos + 1]])

xs = torch.tensor(input_ids)  # inputs
ys = torch.tensor(label_ids)  # labels
num = xs.nelement()           # number of bigrams collected
print('number of examples or inputs: ', num)

# -----------------------------------------------------------------------------
# INITIALIZE THE NEURAL NETWORK WITH 27 NEURONS IN 1 LAYER
# -----------------------------------------------------------------------------
# Fixed seed so the random weights are the same on every run.
g = torch.Generator().manual_seed(2147483647)

# requires_grad=True tells PyTorch to track gradients with respect to the weights.
# W0: 27 inputs feeding 27 neurons, drawn from a standard normal distribution.
W0 = torch.randn((27,27), generator=g, requires_grad=True)
# W1 is drawn from the same generator right after W0; the only reference to it
# in the training cell is commented out.
W1 = torch.randn((10,27), generator=g, requires_grad=True)


number of examples or inputs:  6


In [399]:
# Ten steps of plain gradient descent on the single-layer bigram network.
learning_rate = .01
for step in range(10):
  # ------------------------------ FORWARD PASS ------------------------------
  # A network cannot take a bare integer index as input, so every index in xs
  # is turned into a 27-wide one-hot row. one_hot returns integers, and the
  # matrix multiply below needs floats, hence .float().
  xenc = F.one_hot(xs, num_classes=27).float()
  # plt.imshow(xenc)  uncomment to visualize in a plot

  # All examples pass through the 27 neurons at once: (num, 27) @ (27, 27).
  logits = xenc @ W0  # interpreted as log-counts
  #logits1 = logits @ W1

  # Softmax written out in two steps: exponentiate to get count-like positive
  # numbers, then normalize each row so it sums to 1.
  counts = logits.exp()
  row_totals = counts.sum(dim=1, keepdim=True)
  probs = counts / row_totals  # probabilities for the next character (27 options)

  # ---------------------------------- LOSS ----------------------------------
  # Classification, so the loss is the mean negative log likelihood: for each
  # example pick the probability given to its true next character (ys).
  # (For a regression problem mean squared error would be used instead.)
  example_ids = torch.arange(num)
  loss = -probs[example_ids, ys].log().mean()

  # ------------------------------ BACKWARD PASS -----------------------------
  # Clear the old gradient first, then let autograd walk back from the loss
  # node and compute the derivative of the loss with respect to W0.
  W0.grad = None
  loss.backward()

  # -------------------------------- UPDATE W --------------------------------
  # Step the weights against the gradient, scaled by the learning rate.
  W0.data += -learning_rate * W0.grad

In [311]:
# Row 0 of W0: the weights selected when the one-hot input is '.' (index 0).
W0[0]

tensor([ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
         0.0791,  0.9046, -0.4713,  0.7868, -0.3284, -0.4330,  1.3729,  2.9334,
         1.5618, -1.6261,  0.6772, -0.8404,  0.9849, -0.1484, -1.4795,  0.4483,
        -0.0707,  2.4968,  2.4448], grad_fn=<SelectBackward0>)

In [318]:
# One-hot encoding of the first training input.
xenc[0]

tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [407]:
# Recompute `xenc @ W0` with explicit loops so the matrix multiply can be
# checked by hand: xnew[j, k] = sum over l of xenc[j, l] * W0[l, k].
xnew = torch.zeros(num,27)
out = []  # one debug string per finished row
for j in range(num):      # one row per training example
  for k in range(27):     # one column per output neuron
    for l in range(27):   # dot product across the 27 one-hot positions
      xnew[j,k] += xenc[j,l]*W0[l,k]
  # `for` already advances j, k and l, so no manual increments are needed.
  # Record the last column index reached and the row that was just completed.
  out.append('k='+ str(k) + 'j='+ str(j))

xnew.shape

torch.Size([6, 27])

In [415]:
# First two rows of the loop-computed xnew.
xnew[0:2]

tensor([[ 1.5664, -0.2375, -0.0276, -1.1008,  0.2856, -0.0298, -1.5471,  0.6045,
          0.0789,  0.9041, -0.4714,  0.7864, -0.3286, -0.4331,  1.3721,  2.9294,
          1.5608, -1.6261,  0.6767, -0.8238,  0.9843, -0.1486, -1.4796,  0.4480,
         -0.0709,  2.4942,  2.4424],
        [-0.6287, -0.4427,  0.5683,  1.2792, -0.5541,  1.1169, -0.6007, -0.5864,
         -0.2830,  0.5334, -0.9940, -1.6997,  1.8342,  0.4197, -0.6875, -0.3508,
          0.7553, -0.9364, -0.0844, -1.6362,  1.0215,  1.0890, -0.5747,  0.0493,
          0.7231,  0.5969,  2.6919]], grad_fn=<SliceBackward0>)

In [417]:
# Shape of logits: (number of examples, 27), since xenc is (num, 27) and W0 is (27, 27).
logits.shape

torch.Size([6, 27])

In [392]:
# Display the list `out`.
# NOTE(review): several cells rebind `out`, so what is shown depends on which ran last.
out

['k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'k=26',
 'j=26']

In [312]:
# Logits (pre-softmax scores) for the first training example.
logits[0]

tensor([ 0.9968, -0.3565, -0.1725, -1.1527,  0.0923, -0.1744, -1.5808,  0.3473,
        -0.0810,  0.5706, -0.5667,  0.4849, -0.4378, -0.5320,  0.8823,  1.5431,
         0.9936, -1.6571,  0.4027,  7.5458,  0.6274, -0.2780, -1.5156,  0.2242,
        -0.2101,  1.4154,  1.3974], grad_fn=<SelectBackward0>)

In [208]:
# Loss left over from the most recent training step, as a plain Python float.
print(loss.item())

0.42016875743865967


In [285]:
# Shape of logits: (number of examples, 27).
# NOTE(review): the saved output shows 25 rows, while the training cell as written
# (`words[:1]`) yields 6 -- presumably from a run with a different slice; confirm.
logits.shape

torch.Size([25, 27])

In [286]:
# Shape of the one-hot inputs: (number of examples, 27).
xenc.shape

torch.Size([25, 27])

In [167]:
# Predicted next-character distribution for training example 19.
# NOTE(review): needs at least 20 examples; the training cell as written uses only
# `words[:1]` -- confirm which slice this was run with.
probs[19]

tensor([4.3009e-04, 1.1767e-04, 1.4087e-04, 5.3663e-05, 1.8221e-04, 1.4060e-04,
        3.5063e-05, 2.3298e-04, 1.5399e-04, 2.8837e-04, 9.5723e-05, 2.6577e-04,
        1.0866e-04, 9.9055e-05, 3.8671e-04, 7.0442e-04, 4.2883e-04, 3.2512e-05,
        2.4569e-04, 9.9380e-01, 3.0433e-04, 1.2708e-04, 3.7395e-05, 2.0697e-04,
        1.3579e-04, 6.2917e-04, 6.1923e-04], grad_fn=<SelectBackward0>)

In [166]:
# Draw 27 next-character indices (with replacement) from row 18 of probs.
# NOTE(review): needs at least 19 training examples -- confirm the slice used above.
torch.multinomial(probs[18], num_samples=27, replacement=True, generator=g)

tensor([16, 16, 10,  0, 16, 16,  0, 10, 16, 10, 16, 10, 10, 16,  0, 10, 14,  0,
         0,  0, 10, 16, 16, 10, 16, 16, 10])

In [None]:
"""
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

                     VISUALIZING WHERE THE LOSS IS COMING FROM

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

"""

# Per-bigram breakdown of the loss: for every training example, print the
# network's output distribution, the probability it gave to the true next
# character, and that example's negative log likelihood.
n_examples = xs.nelement()      # follow the data instead of assuming 6 bigrams
nlls = torch.zeros(n_examples)  # negative log likelihood of each bigram

for i in range(n_examples):
  # i-th bigram:
  x = xs[i].item() # input character index
  y = ys[i].item() # label character index
  print('================================================================\n')
  print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
  print('input to the neural net:', x)
  print('output probabilities from the neural net:', probs[i])
  print('label (actual next character):', y)
  p = probs[i, y]
  print('probability assigned by the network to the correct character:', p.item())
  logp = torch.log(p)
  print('log likelihood:', logp.item())
  nll = -logp
  print(f'negative log likelihood: {nll.item()} \n')
  nlls[i] = nll
  print('================================================================\n')
  # Running average over the bigrams seen so far only; averaging the whole
  # buffer would count the still-empty (zero) slots and understate the loss.
  print(f'average negative log likelihood after each bigram, i.e. loss = {nlls[:i+1].mean().item()} \n')
  print('================================================================\n')

print(f'average negative log likelihood after all inputs, i.e. loss = {nlls.mean().item()} \n')

In [132]:
# Input character indices of the training bigrams.
xs

tensor([ 0, 19, 21, 13,  9, 20,  0, 19, 15, 18,  9, 20,  0, 19, 23,  1, 16, 14,
         1,  0, 19, 21, 18,  1, 10,  0, 19, 21, 10,  9, 20])

# Sampling


*   `torch.multinomial` draws integer samples according to the probabilities passed to it

*   Each sampled integer is an index into the probability tensor P


*   The `out` list collects the character corresponding to each index that the trained model generates. Generation stops when the sampled index is 0, which marks the end of a generated name.









In [209]:
# Sample one next character from each row of probs, i.e. one prediction per
# training input. The characters are gathered into `out`, which is printed and
# reset every time the end marker (index 0) is drawn.
g = torch.Generator().manual_seed(2147483647)
print('GENERATED NAMES BY THE BIGRAM MODEL')
print('===================================\n')
out = []
elemnum = 0
for i in range(num):
    elemnum = i + 1
    row = probs[i]
    ix = torch.multinomial(row, num_samples=1, replacement=True, generator=g).item()
    given = itos[xs[i].item()]  # the character that was fed in for this row
    print (f'Predicted character=  {itos[ix]} : ix = {ix} : Element number = {i} : Imput = {given}')
    out.append(itos[ix])
    if ix != 0:
      continue
    # End marker drawn: report the collected characters and start a new name.
    print (f'Predicted name =  {out} : Element number = {i} \n')
    out = []


GENERATED NAMES BY THE BIGRAM MODEL

Predicted character=  s : ix = 19 : Element number = 0 : Imput = .
Predicted character=  u : ix = 21 : Element number = 1 : Imput = s
Predicted character=  m : ix = 13 : Element number = 2 : Imput = u
Predicted character=  i : ix = 9 : Element number = 3 : Imput = m
Predicted character=  t : ix = 20 : Element number = 4 : Imput = i
Predicted character=  . : ix = 0 : Element number = 5 : Imput = t
Predicted name =  ['s', 'u', 'm', 'i', 't', '.'] : Element number = 5 

Predicted character=  s : ix = 19 : Element number = 6 : Imput = .
Predicted character=  u : ix = 21 : Element number = 7 : Imput = s
Predicted character=  r : ix = 18 : Element number = 8 : Imput = o
Predicted character=  i : ix = 9 : Element number = 9 : Imput = r
Predicted character=  t : ix = 20 : Element number = 10 : Imput = i
Predicted character=  . : ix = 0 : Element number = 11 : Imput = t
Predicted name =  ['s', 'u', 'r', 'i', 't', '.'] : Element number = 11 

Predicted charac

In [200]:
# Predicted next-character distribution for training example 18.
# NOTE(review): needs at least 19 examples -- confirm the slice used in the training cell.
probs[18]

tensor([3.2537e-01, 4.6815e-04, 1.2669e-03, 5.2630e-04, 1.5165e-03, 1.1149e-03,
        1.8147e-03, 6.4590e-04, 6.2343e-04, 2.8486e-04, 3.2537e-01, 4.6600e-04,
        1.7717e-03, 5.7201e-04, 9.4775e-04, 8.8209e-04, 3.2537e-01, 4.2930e-04,
        1.7263e-03, 1.4412e-03, 1.3600e-03, 1.1166e-03, 3.0206e-04, 1.1537e-03,
        1.5588e-03, 1.4089e-03, 4.8941e-04], grad_fn=<SelectBackward0>)

In [435]:
# Small demonstration of torch.multinomial on a 5-way distribution.
g = torch.Generator()
g.manual_seed(2147483647)
# Five positive numbers (absolute values of normal draws) ...
t = torch.randn(5, generator=g).abs()
# ... rescaled in place so they sum to 1 and form a probability vector.
t.div_(t.sum())
# 27 draws with replacement; each value is an index into t.
torch.multinomial(t, num_samples=27, replacement=True, generator=g)

tensor([4, 0, 4, 4, 4, 1, 4, 4, 4, 4, 4, 3, 1, 4, 0, 2, 4, 1, 3, 4, 0, 1, 4, 0,
        1, 0, 0])

In [434]:
# The 5-element vector t, which the multinomial demo normalizes to sum to 1.
t

tensor([0.1908, 0.3229, 0.0111, 0.0664, 0.4088])

In [202]:
# Draw 27 samples from row 18 of probs, keep the draw at position 18, and map
# that index to its character.
itos[torch.multinomial(probs[18], num_samples=27, replacement=True, generator=g)[18].item()]

'p'

In [436]:
# Matrix-multiply demonstration: (6, 27) @ (27, 6) -> (6, 6).
# Both operands start as all ones; a few entries are changed so their effect
# can be traced in the product.
a = torch.ones(6,27)
a[0,2] = 3.
a[0,3] = 4.
a[1,4] = 2.

b = torch.ones(27,6)
for r, col in ((0,0), (4,0), (0,3)):
  b[r,col] = 2.

print(f'{a}\n{b}')
c = a.matmul(b)
print(f'{c}')
c.shape

tensor([[1., 1., 3., 4., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[2., 1., 1., 2., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [2., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],

RuntimeError: mat1 and mat2 shapes cannot be multiplied (6x27 and 10x6)

In [240]:
# Display `a`.
# NOTE(review): the saved output is a 3x2 integer tensor, which does not match the
# 6x27 `a` built in the matmul demo cell -- presumably from an earlier experiment; confirm.
a

tensor([[1, 2],
        [3, 4],
        [4, 5]])

# Find the loss

In [12]:
# Average negative log likelihood of the count-based model over every word.
# Reads the probability table P, which has to exist already; in this notebook
# it is built in the smoothing and complete-model cells further down.
log_likelihood = 0.0
n = 0
for w in words:
  padded = '.' + w + '.'   # start/end marker on both sides
  for ch1, ch2 in zip(padded, padded[1:]):
    prob = P[stoi[ch1], stoi[ch2]]
    logprob = prob.log()
    log_likelihood = log_likelihood + logprob
    n = n + 1
    print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

print (f'{log_likelihood=}')
# negative log likelihood
nll = -log_likelihood
print (f'{nll=}')
print (f'{nll/n}') # loss

.s: 0.2667 -1.3218
su: 0.4000 -0.9163
um: 0.1667 -1.7918
mi: 0.3000 -1.2040
it: 0.3846 -0.9555
t.: 0.4545 -0.7885
.s: 0.2667 -1.3218
so: 0.2000 -1.6094
or: 0.1000 -2.3026
ri: 0.1429 -1.9459
it: 0.3846 -0.9555
t.: 0.4545 -0.7885
.s: 0.2667 -1.3218
sw: 0.1000 -2.3026
wa: 0.5000 -0.6931
ap: 0.0625 -2.7726
pn: 0.5000 -0.6931
na: 0.1667 -1.7918
a.: 0.1250 -2.0794
.s: 0.2667 -1.3218
su: 0.4000 -0.9163
ur: 0.1667 -1.7918
ra: 0.2857 -1.2528
aj: 0.1250 -2.0794
j.: 0.2500 -1.3863
.s: 0.2667 -1.3218
su: 0.4000 -0.9163
uj: 0.3333 -1.0986
ji: 0.1250 -2.0794
it: 0.3846 -0.9555
t.: 0.4545 -0.7885
.s: 0.2667 -1.3218
su: 0.4000 -0.9163
uj: 0.3333 -1.0986
jo: 0.5000 -0.6931
oy: 0.1000 -2.3026
y.: 0.8571 -0.1542
.m: 0.0667 -2.7081
ma: 0.2000 -1.6094
an: 0.1875 -1.6740
no: 0.1667 -1.7918
oj: 0.1000 -2.3026
j.: 0.2500 -1.3863
.a: 0.0667 -2.7081
am: 0.0625 -2.7726
mi: 0.3000 -1.2040
it: 0.3846 -0.9555
t.: 0.4545 -0.7885
.k: 0.1000 -2.3026
ka: 0.2500 -1.3863
aj: 0.1250 -2.0794
jo: 0.5000 -0.6931
ol: 0.1000 -

# Check the probability and loss for any given word

In [25]:
# Probability of each bigram in one chosen word under the table P, plus the
# word's total and average negative log likelihood.
log_likelihood = 0.0
n = 0
w = 'steve'
chs = ['.', *w, '.']   # start/end marker on both sides
for ch1, ch2 in zip(chs[:-1], chs[1:]):
  prob = P[stoi[ch1], stoi[ch2]]
  logprob = prob.log()
  log_likelihood = log_likelihood + logprob
  n = n + 1
  print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

print (f'{log_likelihood=}')
# negative log likelihood
nll = -log_likelihood
print (f'{nll=}')
print (f'{nll/n}') # loss

.s: 0.2667 -1.3218
st: 0.1000 -2.3026
te: 0.0909 -2.3979
ev: 0.0769 -2.5649
ve: 0.3333 -1.0986
e.: 0.2308 -1.4663
log_likelihood=tensor(-11.1521)
nll=tensor(11.1521)
1.858689308166504


# Model smoothing


*   The loss is infinite for any name that contains a bigram to which the model assigns probability 0, because log(0) = −∞
*   To avoid this infinite loss, the model can be smoothed by adding a small positive value to every bigram count before the counts are normalized into probabilities



In [29]:

# Add-one smoothing: every bigram count is raised by 1 (any positive number
# would do) before the rows are normalized, so no bigram ends up with
# probability 0. N is expected to hold the bigram counts already.
P = (N+1).float()
D = P.sum(1, keepdim = True)
# Any row total of 0 is replaced by 1 so the division below cannot divide by 0.
D[D == 0.0] = 1.0
P /= D

# Sample ten names: start at '.', repeatedly draw the next character from the
# current character's row of P, and stop when '.' is drawn again.
g = torch.Generator().manual_seed(2147483647)
print('GENERATED NAMES BY THE BIGRAM MODEL')
print('===================================\n')
for i in range(10):
  ix = 0
  out = []
  while True:
    ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print (''.join(out))

# Loss of a single word under the smoothed table.
log_likelihood = 0.0
n = 0
w = 'chico'
chs = ['.', *w, '.']
for ch1, ch2 in zip(chs[:-1], chs[1:]):
  prob = P[stoi[ch1], stoi[ch2]]
  logprob = prob.log()
  log_likelihood = log_likelihood + logprob
  n = n + 1
  print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

print (f'{log_likelihood=}')
# negative log likelihood
nll = -log_likelihood
print (f'{nll=}')
print (f'{nll/n}') # loss

GENERATED NAMES BY THE BIGRAM MODEL

juwjdvditkaqaz.
p.
cfqywacny.
kuitrltohcogsjgwzvudahntauy.
bilevhajkdbdainrwimbl.
snjyinaylaftezffvmumthyfodtumj.
pfytsuwjhruanq.
core.
ysezocfky.
jabdywebfmiifmwyfin.
.c: 0.0351 -3.3499
ch: 0.0645 -2.7408
hi: 0.0625 -2.7726
ic: 0.0500 -2.9957
co: 0.0645 -2.7408
o.: 0.0541 -2.9178
log_likelihood=tensor(-17.5177)
nll=tensor(17.5177)
2.9196126461029053


# Complete model code

In [None]:
# Import all required packages

import torch
import numpy as np
import string
import matplotlib.pyplot as plt

# Character <-> integer lookup tables. Tensors cannot hold strings, so every
# character is represented by an index: '.' is 0 and 'a'..'z' are 1..26.
chars = list(string.ascii_lowercase)
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
itos = dict((idx, ch) for ch, idx in stoi.items())

# A short list of common names. A real dataset of names would be preferable.
words = ['sumit','sorit','swapna','suraj','sujit', 'sujoy', 'manoj', 'amit',
         'kajol', 'steve', 'andrew','bob', 'cindy', 'ebrahim', 'daku', 'johny',
         'timmy', 'david', 'tarun', 'nancy', 'edith', 'jason', 'kyle', 'joseph',
         'kimberly', 'emma', 'irene', 'michael', 'scott', 'valentino']

# N[i, j] counts how many times character j follows character i.
N = torch.zeros((27,27) , dtype = torch.int32)
for w in words:
  padded = '.' + w + '.'   # start/end marker on both sides
  for ch1, ch2 in zip(padded, padded[1:]):
    N[stoi[ch1], stoi[ch2]] += 1

# Heat map of the counts, with the bigram and its count written in each cell.
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Reds')

for row in range(27):
  for col in range(27):
    label = itos[row] + itos[col]
    plt.text(col, row, label, ha="center", va="bottom", color="gray")
    plt.text(col, row, N[row,col].item(), ha="center", va="top", color="gray")
plt.axis('off')

# Turn each row of counts into a probability distribution.
P = N.float()
D = P.sum(1, keepdim = True)
# Rows that were never seen sum to 0; use 1 instead to avoid dividing by 0.
D[D == 0.0] = 1.0
P /= D

# Sample ten names: start at '.', repeatedly draw the next character from the
# current character's row of P, and stop when '.' is drawn again.
g = torch.Generator().manual_seed(2147483647)
print('GENERATED NAMES BY THE BIGRAM MODEL')
print('===================================\n')
for i in range(10):
  ix = 0
  out = []
  while True:
    ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print (''.join(out))

# Average negative log likelihood of the training words under P.
log_likelihood = 0.0
n = 0
for w in words:
  padded = '.' + w + '.'
  for ch1, ch2 in zip(padded, padded[1:]):
    prob = P[stoi[ch1], stoi[ch2]]
    logprob = prob.log()
    log_likelihood = log_likelihood + logprob
    n = n + 1
    print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')

print (f'{log_likelihood=}')
# negative log likelihood
nll = -log_likelihood
print (f'{nll=}')
print (f'{nll/n}') # loss