In [1]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_ic.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_ic.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [2]:
import logging, math, os
from pathlib import Path

In [3]:
import torch
from torch import tensor

In [4]:
from corpus import TaggedCorpus
from eval import model_cross_entropy, viterbi_error_rate, write_tagging
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

Set up logging.

In [5]:
# Configure the root logger for INFO-level output, then create this notebook's own logger.
logging.basicConfig(level=logging.INFO)  # could change INFO to DEBUG
logging.root.setLevel(logging.INFO)      # set explicitly: basicConfig does nothing if handlers already exist
log = logging.getLogger("test_ic")       # For usage, see findsim.py in earlier assignment.
# torch.autograd.set_detect_anomaly(True)    # uncomment to improve error messages from .backward(), but slows down

Switch working directory to the directory where the data live.  You may want to edit this line.

In [6]:
# Later cells open corpora by relative path (e.g. Path("icsup")), so those paths
# resolve against the directory selected here.
os.chdir("../data")

Get vocabulary and tagset from a supervised corpus.

In [7]:
# Read the supervised (word/tag) ice cream corpus; its vocabulary and tagset are reused
# by every model and corpus built later in this notebook.
# NOTE(review): add_oov=False presumably keeps an out-of-vocabulary word type out of the
# vocabulary (none appears in the vocabulary logged below) -- confirm in corpus.py.
icsup = TaggedCorpus(Path("icsup"), add_oov=False)
log.info(f"Ice cream vocabulary: {list(icsup.vocab)}")
log.info(f"Ice cream tagset: {list(icsup.tagset)}")

INFO:corpus:Read 40 tokens from icsup
INFO:corpus:Created 4 tag types
INFO:corpus:Created 5 word types
INFO:test_ic:Ice cream vocabulary: ['1', '2', '3', '_EOS_WORD_', '_BOS_WORD_']
INFO:test_ic:Ice cream tagset: ['C', 'H', '_EOS_TAG_', '_BOS_TAG_']


Two ways to look at the corpus ...

In [8]:
# os.system returns the command's exit status, which the notebook displays as the cell's
# value (0 on success) after the file contents.
os.system("cat icsup")   # call the shell to look at the file directly

1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C


0

In [9]:
# Logging the corpus object renders it as text: one tagged sentence per line, as in the file.
log.info(icsup)          # print the TaggedCorpus python object we constructed from it

INFO:test_ic:1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C
1/C 1/C 1/C 1/C 1/C 1/C 1/C 2/C 2/C 3/H
1/H 2/H 2/H 3/H 3/H 3/H 3/H 3/H 3/H 3/C


Make an HMM.

In [10]:
log.info("*** Hidden Markov Model (HMM) test\n")
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab)
# Change the transition/emission initial probabilities to match the ice cream spreadsheet,
# and test your implementation of the Viterbi algorithm.  Note that the spreadsheet
# uses transposed versions of these matrices.
# Rows of both matrices follow the tagset order shown by printAB(): C, H, _EOS_TAG_, _BOS_TAG_.
# B has one column per ordinary word (1, 2, 3); A has one column per tag, in the same order as its rows.
# NOTE(review): A appears to be indexed [previous tag, next tag] (the _BOS_TAG_ column is all
# zero while the _BOS_TAG_ row is not) -- confirm against hmm.py.
hmm.B = tensor([[0.7000, 0.2000, 0.1000],    # emission probabilities
                [0.1000, 0.2000, 0.7000],
                [0.0000, 0.0000, 0.0000],    # the _EOS_TAG_ and _BOS_TAG_ rows are all zero
                [0.0000, 0.0000, 0.0000]])
hmm.A = tensor([[0.8000, 0.1000, 0.1000, 0.0000],   # transition probabilities
                [0.1000, 0.8000, 0.1000, 0.0000],
                [0.0000, 0.0000, 0.0000, 0.0000],   # _EOS_TAG_ row is all zero
                [0.5000, 0.5000, 0.0000, 0.0000]])  # _BOS_TAG_ row: C and H equally likely
log.info("*** Current A, B matrices (using initalizations from the ice cream spreadsheet)")
hmm.printAB()

INFO:test_ic:*** Hidden Markov Model (HMM) test

INFO:test_ic:*** Current A, B matrices (using initalizations from the ice cream spreadsheet)


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.800	0.100	0.100	0.000
H	0.100	0.800	0.100	0.000
_EOS_TAG_	0.000	0.000	0.000	0.000
_BOS_TAG_	0.500	0.500	0.000	0.000

Emission matrix B:
	1	2	3
C	0.700	0.200	0.100
H	0.100	0.200	0.700
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




Try it out on the raw data from the spreadsheet, available in `icraw`.

In [11]:
log.info("*** Viterbi results on icraw with hard coded parameters")
# Read the raw ice cream sequence, reusing the tagset and vocabulary taken from icsup.
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(hmm, icraw, Path("icraw_hmm.output"))  # calls hmm.viterbi_tagging on each sentence
os.system("cat icraw_hmm.output")   # print the file we just created (it is left on disk, not removed)

INFO:test_ic:*** Viterbi results on icraw with hard coded parameters
100%|██████████| 1/1 [00:00<00:00, 162.44it/s]

2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H





0

Did the parameters that we guessed above get the "correct" answer, 
as revealed in `icdev`?

In [12]:
# Load the reference tagging of the same sequence, reusing icsup's tagset and vocabulary.
icdev = TaggedCorpus(Path("icdev"), tagset=icsup.tagset, vocab=icsup.vocab)
log.info(f"*** Compare to icdev corpus:\n{icdev}")
# Last expression of the cell, so the returned error rate is displayed as the cell's value.
viterbi_error_rate(hmm, icdev, show_cross_entropy=False)

INFO:test_ic:*** Compare to icdev corpus:
2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/C 3/C 3/C 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/H 3/H 3/H 2/H 3/H 2/H 2/H
100%|██████████| 1/1 [00:00<00:00, 368.15it/s]
INFO:eval:Tagging accuracy: all: 90.909%, seen: 90.909%, novel: nan%


0.09090909090909094

Now let's try your training code, running it on supervised data.
To test this, we'll restart from a random initialization.
(You could also try creating this new model with `unigram=True`, 
which will affect the rest of the notebook.)

In [13]:
# Rebind `hmm` to a brand-new model, discarding the hand-set spreadsheet parameters above,
# so that training in the next cell starts from the model's own initialization.
hmm = HiddenMarkovModel(icsup.tagset, icsup.vocab)
log.info("*** A, B matrices as randomly initialized close to uniform")
hmm.printAB()

INFO:test_ic:*** A, B matrices as randomly initialized close to uniform


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.334	0.334	0.332	0.000
H	0.334	0.332	0.334	0.000
_EOS_TAG_	0.334	0.333	0.333	0.000
_BOS_TAG_	0.333	0.334	0.334	0.000

Emission matrix B:
	1	2	3
C	0.333	0.335	0.332
H	0.333	0.333	0.334
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




In [14]:
log.info("*** Supervised training on icsup")

def cross_entropy_loss(model):
    """Loss used to monitor training: the cross-entropy of `model` on the icsup corpus."""
    return model_cross_entropy(model, icsup)

# Train on the supervised corpus, passing the loss above and a tolerance of 0.0001.
hmm.train(corpus=icsup, loss=cross_entropy_loss, tolerance=0.0001)
log.info("*** A, B matrices after training on icsup (should "
         "match initial params on spreadsheet [transposed])")
hmm.printAB()

INFO:test_ic:*** Supervised training on icsup
100%|██████████| 4/4 [00:00<00:00, 243.77it/s]
INFO:eval:Cross-entropy: 2.0979 nats (= perplexity 8.149)


tensor([0.1109, 0.0000, 0.0000, 0.0000])
tensor([0.0123, 0.0000, 0.0000, 0.0000])
tensor([0.0014, 0.0000, 0.0000, 0.0000])
tensor([0.0002, 0.0000, 0.0000, 0.0000])
tensor([1.6996e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([1.8909e-06, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.1037e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.3491e-08, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.6232e-09, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.9240e-10, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 9.7567e-11, 0.0000e+00])
tensor([0.0000, 0.1111, 0.0000, 0.0000])
tensor([0.0000, 0.0123, 0.0000, 0.0000])
tensor([0.0000, 0.0014, 0.0000, 0.0000])
tensor([0.0000, 0.0002, 0.0000, 0.0000])
tensor([0.0000e+00, 1.6689e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 1.8484e-06, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.0471e-07, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.2672e-08, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.5110e-09, 0.0000e+00, 0

100%|██████████| 4/4 [00:00<00:00, 151.32it/s]


tensor([0.1109, 0.0000, 0.0000, 0.0000])
tensor([0.0123, 0.0000, 0.0000, 0.0000])
tensor([0.0014, 0.0000, 0.0000, 0.0000])
tensor([0.0002, 0.0000, 0.0000, 0.0000])
tensor([1.6996e-05, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([1.8909e-06, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.1037e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.3491e-08, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([2.6232e-09, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.9240e-10, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 9.7567e-11, 0.0000e+00])
tensor([0.0000, 0.1111, 0.0000, 0.0000])
tensor([0.0000, 0.0123, 0.0000, 0.0000])
tensor([0.0000, 0.0014, 0.0000, 0.0000])
tensor([0.0000, 0.0002, 0.0000, 0.0000])
tensor([0.0000e+00, 1.6689e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 1.8484e-06, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.0471e-07, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.2672e-08, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 2.5110e-09, 0.0000e+00, 0

100%|██████████| 4/4 [00:00<00:00, 243.82it/s]
INFO:eval:Cross-entropy: 1.3729 nats (= perplexity 3.947)


tensor([0.3500, 0.0000, 0.0000, 0.0000])
tensor([0.1960, 0.0000, 0.0000, 0.0000])
tensor([0.1098, 0.0000, 0.0000, 0.0000])
tensor([0.0615, 0.0000, 0.0000, 0.0000])
tensor([0.0344, 0.0000, 0.0000, 0.0000])
tensor([0.0193, 0.0000, 0.0000, 0.0000])
tensor([0.0108, 0.0000, 0.0000, 0.0000])
tensor([0.0017, 0.0000, 0.0000, 0.0000])
tensor([0.0003, 0.0000, 0.0000, 0.0000])
tensor([0.0000e+00, 1.9344e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 1.9344e-06, 0.0000e+00])
tensor([0.0000, 0.0500, 0.0000, 0.0000])
tensor([0.0000, 0.0080, 0.0000, 0.0000])
tensor([0.0000, 0.0013, 0.0000, 0.0000])
tensor([0.0000, 0.0007, 0.0000, 0.0000])
tensor([0.0000, 0.0004, 0.0000, 0.0000])
tensor([0.0000, 0.0002, 0.0000, 0.0000])
tensor([0.0000, 0.0001, 0.0000, 0.0000])
tensor([0.0000e+00, 7.0493e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 3.9476e-05, 0.0000e+00, 0.0000e+00])
tensor([3.9476e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 3.9476e-08, 0.0000e+00])
te

100%|██████████| 4/4 [00:00<00:00, 200.24it/s]


tensor([0.3500, 0.0000, 0.0000, 0.0000])
tensor([0.1960, 0.0000, 0.0000, 0.0000])
tensor([0.1098, 0.0000, 0.0000, 0.0000])
tensor([0.0615, 0.0000, 0.0000, 0.0000])
tensor([0.0344, 0.0000, 0.0000, 0.0000])
tensor([0.0193, 0.0000, 0.0000, 0.0000])
tensor([0.0108, 0.0000, 0.0000, 0.0000])
tensor([0.0017, 0.0000, 0.0000, 0.0000])
tensor([0.0003, 0.0000, 0.0000, 0.0000])
tensor([0.0000e+00, 1.9344e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 1.9344e-06, 0.0000e+00])
tensor([0.0000, 0.0500, 0.0000, 0.0000])
tensor([0.0000, 0.0080, 0.0000, 0.0000])
tensor([0.0000, 0.0013, 0.0000, 0.0000])
tensor([0.0000, 0.0007, 0.0000, 0.0000])
tensor([0.0000, 0.0004, 0.0000, 0.0000])
tensor([0.0000, 0.0002, 0.0000, 0.0000])
tensor([0.0000, 0.0001, 0.0000, 0.0000])
tensor([0.0000e+00, 7.0493e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 3.9476e-05, 0.0000e+00, 0.0000e+00])
tensor([3.9476e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 3.9476e-08, 0.0000e+00])
te

100%|██████████| 4/4 [00:00<00:00, 267.20it/s]
INFO:eval:Cross-entropy: 1.3729 nats (= perplexity 3.947)
INFO:hmm:Saving model to my_hmm.pkl
INFO:hmm:Saved model to my_hmm.pkl
INFO:test_ic:*** A, B matrices after training on icsup (should match initial params on spreadsheet [transposed])


tensor([0.3500, 0.0000, 0.0000, 0.0000])
tensor([0.1960, 0.0000, 0.0000, 0.0000])
tensor([0.1098, 0.0000, 0.0000, 0.0000])
tensor([0.0615, 0.0000, 0.0000, 0.0000])
tensor([0.0344, 0.0000, 0.0000, 0.0000])
tensor([0.0193, 0.0000, 0.0000, 0.0000])
tensor([0.0108, 0.0000, 0.0000, 0.0000])
tensor([0.0017, 0.0000, 0.0000, 0.0000])
tensor([0.0003, 0.0000, 0.0000, 0.0000])
tensor([0.0000e+00, 1.9343e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 1.9343e-06, 0.0000e+00])
tensor([0.0000, 0.0500, 0.0000, 0.0000])
tensor([0.0000, 0.0080, 0.0000, 0.0000])
tensor([0.0000, 0.0013, 0.0000, 0.0000])
tensor([0.0000, 0.0007, 0.0000, 0.0000])
tensor([0.0000, 0.0004, 0.0000, 0.0000])
tensor([0.0000, 0.0002, 0.0000, 0.0000])
tensor([0.0000, 0.0001, 0.0000, 0.0000])
tensor([0.0000e+00, 7.0494e-05, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 3.9476e-05, 0.0000e+00, 0.0000e+00])
tensor([3.9476e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 3.9476e-08, 0.0000e+00])
te

Now that we've reached the spreadsheet's starting guess, let's again tag
the spreadsheet "sentence" (that is, the sequence of ice creams) using the
Viterbi algorithm.

In [15]:
log.info("*** Viterbi results on icraw")
# Re-read icraw so this cell does not depend on the earlier Viterbi cell having been run.
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
write_tagging(hmm, icraw, Path("icraw_hmm.output"))  # calls hmm.viterbi_tagging on each sentence
os.system("cat icraw_hmm.output")   # print the file we just created (it is left on disk, not removed)

INFO:test_ic:*** Viterbi results on icraw
100%|██████████| 1/1 [00:00<00:00, 360.43it/s]

2/H 3/H 3/H 2/H 3/H 2/H 3/H 2/H 2/H 3/H 1/H 3/H 3/H 1/C 1/C 1/C 2/C 1/C 1/C 1/C 3/C 1/C 2/C 1/C 1/C 1/C 2/C 3/H 3/H 2/H 3/H 2/H 2/H





0

Next let's use the forward algorithm to see what the model thinks about 
the probability of the spreadsheet "sentence."

In [16]:
log.info("*** Forward algorithm on icraw (should approximately match iteration 0 "
         "on spreadsheet)")
# Report each sentence's probability under the current model.
for sentence in icraw:
    logprob = hmm.logprob(sentence, icraw)   # log-probability of the sentence
    prob = math.exp(logprob)                 # convert back to a plain probability for display
    log.info(f"{prob} = p({sentence})")

INFO:test_ic:*** Forward algorithm on icraw (should approximately match iteration 0 on spreadsheet)


tensor([0.1000, 0.1000, 0.0000, 0.0000])
tensor([0.0090, 0.0630, 0.0000, 0.0000])
tensor([0.0013, 0.0359, 0.0000, 0.0000])
tensor([0.0009, 0.0058, 0.0000, 0.0000])
tensor([0.0001, 0.0033, 0.0000, 0.0000])
tensor([8.7155e-05, 5.3034e-04, 0.0000e+00, 0.0000e+00])
tensor([1.2276e-05, 3.0309e-04, 0.0000e+00, 0.0000e+00])
tensor([8.0259e-06, 4.8740e-05, 0.0000e+00, 0.0000e+00])
tensor([2.2589e-06, 7.9589e-06, 0.0000e+00, 0.0000e+00])
tensor([2.6030e-07, 4.6151e-06, 0.0000e+00, 0.0000e+00])
tensor([4.6883e-07, 3.7181e-07, 0.0000e+00, 0.0000e+00])
tensor([4.1224e-08, 2.4103e-07, 0.0000e+00, 0.0000e+00])
tensor([5.7082e-09, 1.3786e-07, 0.0000e+00, 0.0000e+00])
tensor([1.2847e-08, 1.1086e-08, 0.0000e+00, 0.0000e+00])
tensor([7.9704e-09, 1.0154e-09, 0.0000e+00, 0.0000e+00])
tensor([4.5345e-09, 1.6093e-10, 0.0000e+00, 0.0000e+00])
tensor([7.2873e-10, 1.1644e-10, 0.0000e+00, 0.0000e+00])
tensor([4.1624e-10, 1.6602e-11, 0.0000e+00, 0.0000e+00])
tensor([2.3426e-10, 5.4906e-12, 0.0000e+00, 0.0000e+00

INFO:test_ic:9.127346070247585e-19 = p(2 3 3 2 3 2 3 2 2 3 1 3 3 1 1 1 2 1 1 1 3 1 2 1 1 1 2 3 3 2 3 2 2)


tensor([1.0855e-12, 2.8798e-13, 0.0000e+00, 0.0000e+00])
tensor([6.2804e-13, 3.3894e-14, 0.0000e+00, 0.0000e+00])
tensor([3.5408e-13, 8.9920e-15, 0.0000e+00, 0.0000e+00])
tensor([1.9891e-13, 4.2601e-15, 0.0000e+00, 0.0000e+00])
tensor([3.1911e-14, 4.6599e-15, 0.0000e+00, 0.0000e+00])
tensor([2.5995e-15, 4.8433e-15, 0.0000e+00, 0.0000e+00])
tensor([2.5639e-16, 2.8942e-15, 0.0000e+00, 0.0000e+00])
tensor([9.8907e-17, 4.6820e-16, 0.0000e+00, 0.0000e+00])
tensor([1.2595e-17, 2.6912e-16, 0.0000e+00, 0.0000e+00])
tensor([7.3974e-18, 4.3310e-17, 0.0000e+00, 0.0000e+00])
tensor([2.0498e-18, 7.0776e-18, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 9.1273e-19, 0.0000e+00])


Finally, let's reestimate on the icraw data, as the spreadsheet does.
We'll evaluate as we go along on the *training* perplexity, and stop
when that has more or less converged.

In [17]:
log.info("*** Reestimating on icraw (perplexity should improve on every iteration)")

def negative_log_likelihood(model):
    """Loss used to monitor re-estimation: the cross-entropy of `model` on icraw itself."""
    return model_cross_entropy(model, icraw)  # evaluate on icraw itself

hmm.train(corpus=icraw, loss=negative_log_likelihood, tolerance=0.0001)




INFO:test_ic:*** Reestimating on icraw (perplexity should improve on every iteration)
100%|██████████| 1/1 [00:00<00:00, 83.47it/s]
INFO:eval:Cross-entropy: 1.2217 nats (= perplexity 3.393)


tensor([0.1000, 0.1000, 0.0000, 0.0000])
tensor([0.0090, 0.0630, 0.0000, 0.0000])
tensor([0.0013, 0.0359, 0.0000, 0.0000])
tensor([0.0009, 0.0058, 0.0000, 0.0000])
tensor([0.0001, 0.0033, 0.0000, 0.0000])
tensor([8.7155e-05, 5.3034e-04, 0.0000e+00, 0.0000e+00])
tensor([1.2276e-05, 3.0309e-04, 0.0000e+00, 0.0000e+00])
tensor([8.0259e-06, 4.8740e-05, 0.0000e+00, 0.0000e+00])
tensor([2.2589e-06, 7.9589e-06, 0.0000e+00, 0.0000e+00])
tensor([2.6030e-07, 4.6151e-06, 0.0000e+00, 0.0000e+00])
tensor([4.6883e-07, 3.7181e-07, 0.0000e+00, 0.0000e+00])
tensor([4.1224e-08, 2.4103e-07, 0.0000e+00, 0.0000e+00])
tensor([5.7082e-09, 1.3786e-07, 0.0000e+00, 0.0000e+00])
tensor([1.2847e-08, 1.1086e-08, 0.0000e+00, 0.0000e+00])
tensor([7.9704e-09, 1.0154e-09, 0.0000e+00, 0.0000e+00])
tensor([4.5345e-09, 1.6093e-10, 0.0000e+00, 0.0000e+00])
tensor([7.2873e-10, 1.1644e-10, 0.0000e+00, 0.0000e+00])
tensor([4.1624e-10, 1.6602e-11, 0.0000e+00, 0.0000e+00])
tensor([2.3426e-10, 5.4906e-12, 0.0000e+00, 0.0000e+00

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([0.1000, 0.1000, 0.0000, 0.0000])


100%|██████████| 1/1 [00:00<00:00, 64.96it/s]


tensor([0.0090, 0.0630, 0.0000, 0.0000])
tensor([0.0013, 0.0359, 0.0000, 0.0000])
tensor([0.0009, 0.0058, 0.0000, 0.0000])
tensor([0.0001, 0.0033, 0.0000, 0.0000])
tensor([8.7155e-05, 5.3034e-04, 0.0000e+00, 0.0000e+00])
tensor([1.2276e-05, 3.0309e-04, 0.0000e+00, 0.0000e+00])
tensor([8.0259e-06, 4.8740e-05, 0.0000e+00, 0.0000e+00])
tensor([2.2589e-06, 7.9589e-06, 0.0000e+00, 0.0000e+00])
tensor([2.6030e-07, 4.6151e-06, 0.0000e+00, 0.0000e+00])
tensor([4.6883e-07, 3.7181e-07, 0.0000e+00, 0.0000e+00])
tensor([4.1224e-08, 2.4103e-07, 0.0000e+00, 0.0000e+00])
tensor([5.7082e-09, 1.3786e-07, 0.0000e+00, 0.0000e+00])
tensor([1.2847e-08, 1.1086e-08, 0.0000e+00, 0.0000e+00])
tensor([7.9704e-09, 1.0154e-09, 0.0000e+00, 0.0000e+00])
tensor([4.5345e-09, 1.6093e-10, 0.0000e+00, 0.0000e+00])
tensor([7.2873e-10, 1.1644e-10, 0.0000e+00, 0.0000e+00])
tensor([4.1624e-10, 1.6602e-11, 0.0000e+00, 0.0000e+00])
tensor([2.3426e-10, 5.4906e-12, 0.0000e+00, 0.0000e+00])
tensor([1.3157e-10, 2.7818e-12, 0.0000

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([0.0282, 0.3702, 0.0000, 0.0000])
tensor([0.0062, 0.1670, 0.0000, 0.0000])
tensor([0.0022, 0.0750, 0.0000, 0.0000])
tensor([0.0019, 0.0277, 0.0000, 0.0000])
tensor([0.0004, 0.0125, 0.0000, 0.0000])
tensor([0.0003, 0.0046, 0.0000, 0.0000])
tensor([7.5638e-05, 2.0793e-03, 0.0000e+00, 0.0000e+00])
tensor([5.6590e-05, 7.6821e-04, 0.0000e+00, 0.0000e+00])
tensor([2.6396e-05, 2.8514e-04, 0.0000e+00, 0.0000e+00])
tensor([5.1813e-06, 1.2891e-04, 0.0000e+00, 0.0000e+00])
tensor([1.1138e-05, 6.5433e-06, 0.0000e+00, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 76.11it/s]
INFO:eval:Cross-entropy: 1.0807 nats (= perplexity 2.947)


tensor([1.0844e-06, 3.5510e-06, 0.0000e+00, 0.0000e+00])
tensor([1.3380e-07, 1.6479e-06, 0.0000e+00, 0.0000e+00])
tensor([1.8241e-07, 8.4074e-08, 0.0000e+00, 0.0000e+00])
tensor([1.1333e-07, 5.4060e-09, 0.0000e+00, 0.0000e+00])
tensor([6.7479e-08, 9.9381e-10, 0.0000e+00, 0.0000e+00])
tensor([1.2951e-08, 3.4909e-09, 0.0000e+00, 0.0000e+00])
tensor([7.8912e-09, 2.5867e-10, 0.0000e+00, 0.0000e+00])
tensor([4.6913e-09, 6.3253e-11, 0.0000e+00, 0.0000e+00])
tensor([2.7833e-09, 3.3032e-11, 0.0000e+00, 0.0000e+00])
tensor([2.5546e-10, 1.7141e-10, 0.0000e+00, 0.0000e+00])
tensor([1.6208e-10, 1.0281e-11, 0.0000e+00, 0.0000e+00])
tensor([3.1267e-11, 1.1288e-11, 0.0000e+00, 0.0000e+00])
tensor([1.9230e-11, 7.6892e-13, 0.0000e+00, 0.0000e+00])
tensor([1.1441e-11, 1.6114e-13, 0.0000e+00, 0.0000e+00])
tensor([6.7881e-12, 8.0904e-14, 0.0000e+00, 0.0000e+00])
tensor([1.3024e-12, 3.4416e-13, 0.0000e+00, 0.0000e+00])
tensor([1.2273e-13, 2.2711e-13, 0.0000e+00, 0.0000e+00])
tensor([1.3450e-14, 1.0840e-13,

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([0.0282, 0.3702, 0.0000, 0.0000])
tensor([0.0062, 0.1670, 0.0000, 0.0000])
tensor([0.0022, 0.0750, 0.0000, 0.0000])
tensor([0.0019, 0.0277, 0.0000, 0.0000])
tensor([0.0004, 0.0125, 0.0000, 0.0000])


100%|██████████| 1/1 [00:00<00:00, 64.92it/s]


tensor([0.0003, 0.0046, 0.0000, 0.0000])
tensor([7.5638e-05, 2.0793e-03, 0.0000e+00, 0.0000e+00])
tensor([5.6590e-05, 7.6821e-04, 0.0000e+00, 0.0000e+00])
tensor([2.6396e-05, 2.8514e-04, 0.0000e+00, 0.0000e+00])
tensor([5.1813e-06, 1.2891e-04, 0.0000e+00, 0.0000e+00])
tensor([1.1138e-05, 6.5433e-06, 0.0000e+00, 0.0000e+00])
tensor([1.0844e-06, 3.5510e-06, 0.0000e+00, 0.0000e+00])
tensor([1.3380e-07, 1.6479e-06, 0.0000e+00, 0.0000e+00])
tensor([1.8241e-07, 8.4074e-08, 0.0000e+00, 0.0000e+00])
tensor([1.1333e-07, 5.4060e-09, 0.0000e+00, 0.0000e+00])
tensor([6.7479e-08, 9.9381e-10, 0.0000e+00, 0.0000e+00])
tensor([1.2951e-08, 3.4909e-09, 0.0000e+00, 0.0000e+00])
tensor([7.8912e-09, 2.5867e-10, 0.0000e+00, 0.0000e+00])
tensor([4.6913e-09, 6.3253e-11, 0.0000e+00, 0.0000e+00])
tensor([2.7833e-09, 3.3032e-11, 0.0000e+00, 0.0000e+00])
tensor([2.5546e-10, 1.7141e-10, 0.0000e+00, 0.0000e+00])
tensor([1.6208e-10, 1.0281e-11, 0.0000e+00, 0.0000e+00])
tensor([3.1267e-11, 1.1288e-11, 0.0000e+00, 0.0

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([0.0021, 0.4582, 0.0000, 0.0000])
tensor([0.0049, 0.1979, 0.0000, 0.0000])
tensor([0.0026, 0.0857, 0.0000, 0.0000])
tensor([0.0015, 0.0347, 0.0000, 0.0000])
tensor([0.0005, 0.0150, 0.0000, 0.0000])


100%|██████████| 1/1 [00:00<00:00, 91.24it/s]
INFO:eval:Cross-entropy: 1.0576 nats (= perplexity 2.879)


tensor([0.0003, 0.0061, 0.0000, 0.0000])
tensor([9.4840e-05, 2.6448e-03, 0.0000e+00, 0.0000e+00])
tensor([4.9476e-05, 1.0718e-03, 0.0000e+00, 0.0000e+00])
tensor([2.1762e-05, 4.3483e-04, 0.0000e+00, 0.0000e+00])
tensor([6.9757e-06, 1.8872e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4482e-05, 6.6381e-06, 0.0000e+00, 0.0000e+00])
tensor([1.7895e-06, 3.5386e-06, 0.0000e+00, 0.0000e+00])
tensor([2.4855e-07, 1.6107e-06, 0.0000e+00, 0.0000e+00])
tensor([2.4270e-07, 5.7369e-08, 0.0000e+00, 0.0000e+00])
tensor([1.5599e-07, 2.9255e-09, 0.0000e+00, 0.0000e+00])
tensor([9.8440e-08, 6.9091e-10, 0.0000e+00, 0.0000e+00])
tensor([1.5268e-08, 4.5576e-09, 0.0000e+00, 0.0000e+00])
tensor([9.8636e-09, 2.1727e-10, 0.0000e+00, 0.0000e+00])
tensor([6.2265e-09, 4.4820e-11, 0.0000e+00, 0.0000e+00])
tensor([3.9257e-09, 2.5058e-11, 0.0000e+00, 0.0000e+00])
tensor([4.6719e-10, 1.9327e-10, 0.0000e+00, 0.0000e+00])
tensor([3.0470e-10, 8.5338e-12, 0.0000e+00, 0.0000e+00])
tensor([4.7342e-11, 1.6689e-11, 0.0000e+00, 0.0

100%|██████████| 1/1 [00:00<00:00, 67.72it/s]


tensor([0.0021, 0.4582, 0.0000, 0.0000])
tensor([0.0049, 0.1979, 0.0000, 0.0000])
tensor([0.0026, 0.0857, 0.0000, 0.0000])
tensor([0.0015, 0.0347, 0.0000, 0.0000])
tensor([0.0005, 0.0150, 0.0000, 0.0000])
tensor([0.0003, 0.0061, 0.0000, 0.0000])
tensor([9.4840e-05, 2.6448e-03, 0.0000e+00, 0.0000e+00])
tensor([4.9476e-05, 1.0718e-03, 0.0000e+00, 0.0000e+00])
tensor([2.1762e-05, 4.3483e-04, 0.0000e+00, 0.0000e+00])
tensor([6.9757e-06, 1.8872e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4482e-05, 6.6381e-06, 0.0000e+00, 0.0000e+00])
tensor([1.7895e-06, 3.5386e-06, 0.0000e+00, 0.0000e+00])
tensor([2.4855e-07, 1.6107e-06, 0.0000e+00, 0.0000e+00])
tensor([2.4270e-07, 5.7369e-08, 0.0000e+00, 0.0000e+00])
tensor([1.5599e-07, 2.9255e-09, 0.0000e+00, 0.0000e+00])
tensor([9.8440e-08, 6.9091e-10, 0.0000e+00, 0.0000e+00])
tensor([1.5268e-08, 4.5576e-09, 0.0000e+00, 0.0000e+00])
tensor([9.8636e-09, 2.1727e-10, 0.0000e+00, 0.0000e+00])
tensor([6.2265e-09, 4.4820e-11, 0.0000e+00, 0.0000e+00])
tensor([3.925

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([1.0555e-04, 4.8891e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0057, 0.2065, 0.0000, 0.0000])
tensor([0.0032, 0.0874, 0.0000, 0.0000])
tensor([0.0014, 0.0374, 0.0000, 0.0000])


100%|██████████| 1/1 [00:00<00:00, 88.55it/s]
INFO:eval:Cross-entropy: 1.0486 nats (= perplexity 2.854)


tensor([0.0006, 0.0159, 0.0000, 0.0000])
tensor([0.0003, 0.0068, 0.0000, 0.0000])
tensor([0.0001, 0.0029, 0.0000, 0.0000])
tensor([4.9334e-05, 1.2321e-03, 0.0000e+00, 0.0000e+00])
tensor([2.0895e-05, 5.2737e-04, 0.0000e+00, 0.0000e+00])
tensor([9.2132e-06, 2.2356e-04, 0.0000e+00, 0.0000e+00])
tensor([1.6645e-05, 5.1273e-06, 0.0000e+00, 0.0000e+00])
tensor([2.5379e-06, 2.8173e-06, 0.0000e+00, 0.0000e+00])
tensor([4.1051e-07, 1.2893e-06, 0.0000e+00, 0.0000e+00])
tensor([3.1975e-07, 3.0327e-08, 0.0000e+00, 0.0000e+00])
tensor([2.0167e-07, 1.3701e-09, 0.0000e+00, 0.0000e+00])
tensor([1.2634e-07, 4.5843e-10, 0.0000e+00, 0.0000e+00])
tensor([1.8202e-08, 5.1897e-09, 0.0000e+00, 0.0000e+00])
tensor([1.1649e-08, 1.5712e-10, 0.0000e+00, 0.0000e+00])
tensor([7.3013e-09, 2.8261e-11, 0.0000e+00, 0.0000e+00])
tensor([4.5728e-09, 1.6109e-11, 0.0000e+00, 0.0000e+00])
tensor([6.8111e-10, 1.8584e-10, 0.0000e+00, 0.0000e+00])
tensor([4.3550e-10, 5.6885e-12, 0.0000e+00, 0.0000e+00])
tensor([6.2790e-11, 1.

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([1.0555e-04, 4.8891e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0057, 0.2065, 0.0000, 0.0000])
tensor([0.0032, 0.0874, 0.0000, 0.0000])
tensor([0.0014, 0.0374, 0.0000, 0.0000])
tensor([0.0006, 0.0159, 0.0000, 0.0000])
tensor([0.0003, 0.0068, 0.0000, 0.0000])
tensor([0.0001, 0.0029, 0.0000, 0.0000])
tensor([4.9334e-05, 1.2321e-03, 0.0000e+00, 0.0000e+00])
tensor([2.0895e-05, 5.2737e-04, 0.0000e+00, 0.0000e+00])
tensor([9.2132e-06, 2.2356e-04, 0.0000e+00, 0.0000e+00])
tensor([1.6645e-05, 5.1273e-06, 0.0000e+00, 0.0000e+00])
tensor([2.5379e-06, 2.8173e-06, 0.0000e+00, 0.0000e+00])
tensor([4.1051e-07, 1.2893e-06, 0.0000e+00, 0.0000e+00])
tensor([3.1975e-07, 3.0327e-08, 0.0000e+00, 0.0000e+00])
tensor([2.0167e-07, 1.3701e-09, 0.0000e+00, 0.0000e+00])
tensor([1.2634e-07, 4.5843e-10, 0.0000e+00, 0.0000e+00])
tensor([1.8202e-08, 5.1897e-09, 0.0000e+00, 0.0000e+00])
tensor([1.1649e-08, 1.5712e-10, 0.0000e+00, 0.0000e+00])
tensor([7.3013e-09, 2.8261e-11, 0.0000e+00, 0.0000e+00])
tensor([4.572

100%|██████████| 1/1 [00:00<00:00, 60.23it/s]


tensor([6.8111e-10, 1.8584e-10, 0.0000e+00, 0.0000e+00])
tensor([4.3550e-10, 5.6885e-12, 0.0000e+00, 0.0000e+00])
tensor([6.2790e-11, 1.9641e-11, 0.0000e+00, 0.0000e+00])
tensor([4.0269e-11, 5.8174e-13, 0.0000e+00, 0.0000e+00])
tensor([2.5241e-11, 9.8577e-14, 0.0000e+00, 0.0000e+00])
tensor([1.5809e-11, 5.5711e-14, 0.0000e+00, 0.0000e+00])
tensor([2.2776e-12, 6.4868e-13, 0.0000e+00, 0.0000e+00])
tensor([3.4666e-13, 3.6315e-13, 0.0000e+00, 0.0000e+00])
tensor([5.5823e-14, 1.6696e-13, 0.0000e+00, 0.0000e+00])
tensor([9.9089e-15, 7.3405e-14, 0.0000e+00, 0.0000e+00])
tensor([2.3248e-15, 3.1392e-14, 0.0000e+00, 0.0000e+00])
tensor([6.8618e-16, 1.3479e-14, 0.0000e+00, 0.0000e+00])
tensor([2.4968e-16, 5.7752e-15, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 3.2879e-16, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 95.10it/s]

tensor([4.6573e-06, 5.0880e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0067, 0.2107, 0.0000, 0.0000])
tensor([0.0039, 0.0875, 0.0000, 0.0000])
tensor([0.0015, 0.0389, 0.0000, 0.0000])
tensor([0.0008, 0.0162, 0.0000, 0.0000])
tensor([0.0003, 0.0072, 0.0000, 0.0000])
tensor([0.0001, 0.0030, 0.0000, 0.0000])
tensor([5.1848e-05, 1.3250e-03, 0.0000e+00, 0.0000e+00])
tensor([2.1456e-05, 5.8835e-04, 0.0000e+00, 0.0000e+00])
tensor([1.1388e-05, 2.4442e-04, 0.0000e+00, 0.0000e+00])
tensor([1.8453e-05, 3.2109e-06, 0.0000e+00, 0.0000e+00])
tensor([3.2113e-06, 1.9732e-06, 0.0000e+00, 0.0000e+00])
tensor([5.7737e-07, 9.2915e-07, 0.0000e+00, 0.0000e+00])
tensor([3.9842e-07, 1.2795e-08, 0.0000e+00, 0.0000e+00])
tensor([2.4549e-07, 6.0631e-10, 0.0000e+00, 0.0000e+00])
tensor([1.5092e-07, 2.7836e-10, 0.0000e+00, 0.0000e+00])
tensor([2.1187e-08, 5.7460e-09, 0.0000e+00, 0.0000e+00])
tensor([1.3292e-08, 9.8530e-11, 0.0000e+00, 0.0000e+00])
tensor([8.1747e-09, 1.5931e-11, 0.0000e+00, 0.0000e+00])
tensor([5.025


INFO:eval:Cross-entropy: 1.0438 nats (= perplexity 2.840)
100%|██████████| 1/1 [00:00<00:00, 58.22it/s]


tensor([4.6573e-06, 5.0880e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0067, 0.2107, 0.0000, 0.0000])
tensor([0.0039, 0.0875, 0.0000, 0.0000])
tensor([0.0015, 0.0389, 0.0000, 0.0000])
tensor([0.0008, 0.0162, 0.0000, 0.0000])
tensor([0.0003, 0.0072, 0.0000, 0.0000])
tensor([0.0001, 0.0030, 0.0000, 0.0000])
tensor([5.1848e-05, 1.3250e-03, 0.0000e+00, 0.0000e+00])
tensor([2.1456e-05, 5.8835e-04, 0.0000e+00, 0.0000e+00])
tensor([1.1388e-05, 2.4442e-04, 0.0000e+00, 0.0000e+00])
tensor([1.8453e-05, 3.2109e-06, 0.0000e+00, 0.0000e+00])
tensor([3.2113e-06, 1.9732e-06, 0.0000e+00, 0.0000e+00])
tensor([5.7737e-07, 9.2915e-07, 0.0000e+00, 0.0000e+00])
tensor([3.9842e-07, 1.2795e-08, 0.0000e+00, 0.0000e+00])
tensor([2.4549e-07, 6.0631e-10, 0.0000e+00, 0.0000e+00])
tensor([1.5092e-07, 2.7836e-10, 0.0000e+00, 0.0000e+00])
tensor([2.1187e-08, 5.7460e-09, 0.0000e+00, 0.0000e+00])
tensor([1.3292e-08, 9.8530e-11, 0.0000e+00, 0.0000e+00])
tensor([8.1747e-09, 1.5931e-11, 0.0000e+00, 0.0000e+00])
tensor([5.025

100%|██████████| 1/1 [00:00<00:00, 93.08it/s]
INFO:eval:Cross-entropy: 1.0414 nats (= perplexity 2.833)


tensor([1.9265e-07, 5.2144e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0074, 0.2130, 0.0000, 0.0000])
tensor([0.0044, 0.0873, 0.0000, 0.0000])
tensor([0.0015, 0.0397, 0.0000, 0.0000])
tensor([0.0008, 0.0162, 0.0000, 0.0000])
tensor([0.0003, 0.0074, 0.0000, 0.0000])
tensor([0.0002, 0.0030, 0.0000, 0.0000])
tensor([5.4028e-05, 1.3748e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2091e-05, 6.2403e-04, 0.0000e+00, 0.0000e+00])
tensor([1.2897e-05, 2.5568e-04, 0.0000e+00, 0.0000e+00])
tensor([1.9652e-05, 1.7169e-06, 0.0000e+00, 0.0000e+00])
tensor([3.6610e-06, 1.3438e-06, 0.0000e+00, 0.0000e+00])
tensor([6.9645e-07, 6.6869e-07, 0.0000e+00, 0.0000e+00])
tensor([4.5335e-07, 4.8449e-09, 0.0000e+00, 0.0000e+00])
tensor([2.7518e-07, 2.7497e-10, 0.0000e+00, 0.0000e+00])
tensor([1.6691e-07, 1.4908e-10, 0.0000e+00, 0.0000e+00])
tensor([2.3205e-08, 6.1093e-09, 0.0000e+00, 0.0000e+00])
tensor([1.4357e-08, 5.3276e-11, 0.0000e+00, 0.0000e+00])
tensor([8.7100e-09, 8.0380e-12, 0.0000e+00, 0.0000e+00])
tensor([5.283

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([1.9265e-07, 5.2144e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0074, 0.2130, 0.0000, 0.0000])
tensor([0.0044, 0.0873, 0.0000, 0.0000])
tensor([0.0015, 0.0397, 0.0000, 0.0000])
tensor([0.0008, 0.0162, 0.0000, 0.0000])
tensor([0.0003, 0.0074, 0.0000, 0.0000])
tensor([0.0002, 0.0030, 0.0000, 0.0000])
tensor([5.4028e-05, 1.3748e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2091e-05, 6.2403e-04, 0.0000e+00, 0.0000e+00])
tensor([1.2897e-05, 2.5568e-04, 0.0000e+00, 0.0000e+00])
tensor([1.9652e-05, 1.7169e-06, 0.0000e+00, 0.0000e+00])
tensor([3.6610e-06, 1.3438e-06, 0.0000e+00, 0.0000e+00])
tensor([6.9645e-07, 6.6869e-07, 0.0000e+00, 0.0000e+00])
tensor([4.5335e-07, 4.8449e-09, 0.0000e+00, 0.0000e+00])
tensor([2.7518e-07, 2.7497e-10, 0.0000e+00, 0.0000e+00])
tensor([1.6691e-07, 1.4908e-10, 0.0000e+00, 0.0000e+00])
tensor([2.3205e-08, 6.1093e-09, 0.0000e+00, 0.0000e+00])
tensor([1.4357e-08, 5.3276e-11, 0.0000e+00, 0.0000e+00])
tensor([8.7100e-09, 8.0380e-12, 0.0000e+00, 0.0000e+00])
tensor([5.283

100%|██████████| 1/1 [00:00<00:00, 59.32it/s]
100%|██████████| 1/1 [00:00<00:00, 93.70it/s]
INFO:eval:Cross-entropy: 1.0402 nats (= perplexity 2.830)


tensor([7.7533e-09, 5.2825e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0078, 0.2142, 0.0000, 0.0000])
tensor([0.0046, 0.0871, 0.0000, 0.0000])
tensor([0.0016, 0.0400, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0030, 0.0000, 0.0000])
tensor([5.5351e-05, 1.3990e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2505e-05, 6.4221e-04, 0.0000e+00, 0.0000e+00])
tensor([1.3741e-05, 2.6113e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0310e-05, 8.3772e-07, 0.0000e+00, 0.0000e+00])
tensor([3.9062e-06, 9.8279e-07, 0.0000e+00, 0.0000e+00])
tensor([7.6336e-07, 5.2221e-07, 0.0000e+00, 0.0000e+00])
tensor([4.8368e-07, 1.8589e-09, 0.0000e+00, 0.0000e+00])
tensor([2.9130e-07, 1.2661e-10, 0.0000e+00, 0.0000e+00])
tensor([1.7539e-07, 7.3079e-11, 0.0000e+00, 0.0000e+00])
tensor([2.4299e-08, 6.3014e-09, 0.0000e+00, 0.0000e+00])
tensor([1.4921e-08, 2.6195e-11, 0.0000e+00, 0.0000e+00])
tensor([8.9847e-09, 3.8061e-12, 0.0000e+00, 0.0000e+00])
tensor([5.409

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([7.7533e-09, 5.2825e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0078, 0.2142, 0.0000, 0.0000])
tensor([0.0046, 0.0871, 0.0000, 0.0000])
tensor([0.0016, 0.0400, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0030, 0.0000, 0.0000])
tensor([5.5351e-05, 1.3990e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2505e-05, 6.4221e-04, 0.0000e+00, 0.0000e+00])
tensor([1.3741e-05, 2.6113e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0310e-05, 8.3772e-07, 0.0000e+00, 0.0000e+00])
tensor([3.9062e-06, 9.8279e-07, 0.0000e+00, 0.0000e+00])
tensor([7.6336e-07, 5.2221e-07, 0.0000e+00, 0.0000e+00])
tensor([4.8368e-07, 1.8589e-09, 0.0000e+00, 0.0000e+00])
tensor([2.9130e-07, 1.2661e-10, 0.0000e+00, 0.0000e+00])
tensor([1.7539e-07, 7.3079e-11, 0.0000e+00, 0.0000e+00])
tensor([2.4299e-08, 6.3014e-09, 0.0000e+00, 0.0000e+00])
tensor([1.4921e-08, 2.6195e-11, 0.0000e+00, 0.0000e+00])
tensor([8.9847e-09, 3.8061e-12, 0.0000e+00, 0.0000e+00])
tensor([5.409

100%|██████████| 1/1 [00:00<00:00, 67.78it/s]
100%|██████████| 1/1 [00:00<00:00, 94.56it/s]
INFO:eval:Cross-entropy: 1.0396 nats (= perplexity 2.828)


tensor([3.0853e-10, 5.3157e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0079, 0.2148, 0.0000, 0.0000])
tensor([0.0048, 0.0870, 0.0000, 0.0000])
tensor([0.0016, 0.0402, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6041e-05, 1.4100e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2728e-05, 6.5078e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4161e-05, 2.6363e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0634e-05, 3.9065e-07, 0.0000e+00, 0.0000e+00])
tensor([4.0265e-06, 8.0169e-07, 0.0000e+00, 0.0000e+00])
tensor([7.9657e-07, 4.4953e-07, 0.0000e+00, 0.0000e+00])
tensor([4.9858e-07, 7.5416e-10, 0.0000e+00, 0.0000e+00])
tensor([2.9914e-07, 5.7940e-11, 0.0000e+00, 0.0000e+00])
tensor([1.7947e-07, 3.4181e-11, 0.0000e+00, 0.0000e+00])
tensor([2.4832e-08, 6.3938e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5192e-08, 1.2266e-11, 0.0000e+00, 0.0000e+00])
tensor([9.1146e-09, 1.7497e-12, 0.0000e+00, 0.0000e+00])
tensor([5.468

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([3.0853e-10, 5.3157e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0079, 0.2148, 0.0000, 0.0000])
tensor([0.0048, 0.0870, 0.0000, 0.0000])
tensor([0.0016, 0.0402, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6041e-05, 1.4100e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2728e-05, 6.5078e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4161e-05, 2.6363e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0634e-05, 3.9065e-07, 0.0000e+00, 0.0000e+00])
tensor([4.0265e-06, 8.0169e-07, 0.0000e+00, 0.0000e+00])
tensor([7.9657e-07, 4.4953e-07, 0.0000e+00, 0.0000e+00])
tensor([4.9858e-07, 7.5416e-10, 0.0000e+00, 0.0000e+00])
tensor([2.9914e-07, 5.7940e-11, 0.0000e+00, 0.0000e+00])
tensor([1.7947e-07, 3.4181e-11, 0.0000e+00, 0.0000e+00])
tensor([2.4832e-08, 6.3938e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5192e-08, 1.2266e-11, 0.0000e+00, 0.0000e+00])
tensor([9.1146e-09, 1.7497e-12, 0.0000e+00, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 61.34it/s]


tensor([5.4681e-09, 1.0415e-12, 0.0000e+00, 0.0000e+00])
tensor([1.0655e-09, 1.7105e-10, 0.0000e+00, 0.0000e+00])
tensor([6.4709e-10, 3.7386e-13, 0.0000e+00, 0.0000e+00])
tensor([8.9540e-11, 2.3169e-11, 0.0000e+00, 0.0000e+00])
tensor([5.4784e-11, 4.4397e-14, 0.0000e+00, 0.0000e+00])
tensor([3.2868e-11, 6.3097e-15, 0.0000e+00, 0.0000e+00])
tensor([1.9719e-11, 3.7556e-15, 0.0000e+00, 0.0000e+00])
tensor([2.7285e-12, 7.0252e-13, 0.0000e+00, 0.0000e+00])
tensor([5.4216e-13, 3.6896e-13, 0.0000e+00, 0.0000e+00])
tensor([1.1116e-13, 1.6598e-13, 0.0000e+00, 0.0000e+00])
tensor([1.7143e-14, 8.0323e-14, 0.0000e+00, 0.0000e+00])
tensor([4.5417e-15, 3.2985e-14, 0.0000e+00, 0.0000e+00])
tensor([9.7872e-16, 1.5339e-14, 0.0000e+00, 0.0000e+00])
tensor([2.9832e-16, 7.0929e-15, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 4.4519e-16, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 96.04it/s]
INFO:eval:Cross-entropy: 1.0394 nats (= perplexity 2.828)


tensor([1.2221e-11, 5.3311e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0080, 0.2150, 0.0000, 0.0000])
tensor([0.0048, 0.0870, 0.0000, 0.0000])
tensor([0.0016, 0.0403, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6377e-05, 1.4150e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2839e-05, 6.5469e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4359e-05, 2.6475e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0787e-05, 1.7840e-07, 0.0000e+00, 0.0000e+00])
tensor([4.0827e-06, 7.1632e-07, 0.0000e+00, 0.0000e+00])
tensor([8.1216e-07, 4.1545e-07, 0.0000e+00, 0.0000e+00])
tensor([5.0554e-07, 3.2068e-10, 0.0000e+00, 0.0000e+00])
tensor([3.0279e-07, 2.6290e-11, 0.0000e+00, 0.0000e+00])
tensor([1.8134e-07, 1.5635e-11, 0.0000e+00, 0.0000e+00])
tensor([2.5081e-08, 6.4366e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5318e-08, 5.6129e-12, 0.0000e+00, 0.0000e+00])
tensor([9.1742e-09, 7.9381e-13, 0.0000e+00, 0.0000e+00])
tensor([5.494

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([1.2221e-11, 5.3311e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0080, 0.2150, 0.0000, 0.0000])
tensor([0.0048, 0.0870, 0.0000, 0.0000])
tensor([0.0016, 0.0403, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0075, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6377e-05, 1.4150e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2839e-05, 6.5469e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4359e-05, 2.6475e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0787e-05, 1.7840e-07, 0.0000e+00, 0.0000e+00])
tensor([4.0827e-06, 7.1632e-07, 0.0000e+00, 0.0000e+00])
tensor([8.1216e-07, 4.1545e-07, 0.0000e+00, 0.0000e+00])
tensor([5.0554e-07, 3.2068e-10, 0.0000e+00, 0.0000e+00])
tensor([3.0279e-07, 2.6290e-11, 0.0000e+00, 0.0000e+00])
tensor([1.8134e-07, 1.5635e-11, 0.0000e+00, 0.0000e+00])
tensor([2.5081e-08, 6.4366e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5318e-08, 5.6129e-12, 0.0000e+00, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 64.05it/s]


tensor([9.1742e-09, 7.9381e-13, 0.0000e+00, 0.0000e+00])
tensor([5.4945e-09, 4.7371e-13, 0.0000e+00, 0.0000e+00])
tensor([1.0785e-09, 1.7051e-10, 0.0000e+00, 0.0000e+00])
tensor([6.5375e-10, 1.7005e-13, 0.0000e+00, 0.0000e+00])
tensor([9.0420e-11, 2.3257e-11, 0.0000e+00, 0.0000e+00])
tensor([5.5224e-11, 2.0270e-14, 0.0000e+00, 0.0000e+00])
tensor([3.3075e-11, 2.8619e-15, 0.0000e+00, 0.0000e+00])
tensor([1.9809e-11, 1.7079e-15, 0.0000e+00, 0.0000e+00])
tensor([2.7398e-12, 7.0311e-13, 0.0000e+00, 0.0000e+00])
tensor([5.4836e-13, 3.6850e-13, 0.0000e+00, 0.0000e+00])
tensor([1.1319e-13, 1.6561e-13, 0.0000e+00, 0.0000e+00])
tensor([1.7415e-14, 8.0407e-14, 0.0000e+00, 0.0000e+00])
tensor([4.6313e-15, 3.2968e-14, 0.0000e+00, 0.0000e+00])
tensor([9.9100e-16, 1.5372e-14, 0.0000e+00, 0.0000e+00])
tensor([3.0047e-16, 7.1258e-15, 0.0000e+00, 0.0000e+00])
tensor([0.0000e+00, 0.0000e+00, 4.4890e-16, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 94.98it/s]
INFO:eval:Cross-entropy: 1.0393 nats (= perplexity 2.827)


tensor([4.8315e-13, 5.3381e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0081, 0.2151, 0.0000, 0.0000])
tensor([0.0049, 0.0869, 0.0000, 0.0000])
tensor([0.0016, 0.0403, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0076, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6534e-05, 1.4172e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2891e-05, 6.5646e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4451e-05, 2.6525e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0858e-05, 8.0707e-08, 0.0000e+00, 0.0000e+00])
tensor([4.1085e-06, 6.7716e-07, 0.0000e+00, 0.0000e+00])
tensor([8.1931e-07, 3.9987e-07, 0.0000e+00, 0.0000e+00])
tensor([5.0872e-07, 1.4020e-10, 0.0000e+00, 0.0000e+00])
tensor([3.0445e-07, 1.1864e-11, 0.0000e+00, 0.0000e+00])
tensor([1.8220e-07, 7.0785e-12, 0.0000e+00, 0.0000e+00])
tensor([2.5195e-08, 6.4562e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5375e-08, 2.5417e-12, 0.0000e+00, 0.0000e+00])
tensor([9.2014e-09, 3.5806e-13, 0.0000e+00, 0.0000e+00])
tensor([5.506

  0%|          | 0/1 [00:00<?, ?it/s]

tensor([4.8315e-13, 5.3381e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0081, 0.2151, 0.0000, 0.0000])
tensor([0.0049, 0.0869, 0.0000, 0.0000])
tensor([0.0016, 0.0403, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0076, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6534e-05, 1.4172e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2891e-05, 6.5646e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4451e-05, 2.6525e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0858e-05, 8.0707e-08, 0.0000e+00, 0.0000e+00])
tensor([4.1085e-06, 6.7716e-07, 0.0000e+00, 0.0000e+00])
tensor([8.1931e-07, 3.9987e-07, 0.0000e+00, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 59.96it/s]

tensor([5.0872e-07, 1.4020e-10, 0.0000e+00, 0.0000e+00])
tensor([3.0445e-07, 1.1864e-11, 0.0000e+00, 0.0000e+00])
tensor([1.8220e-07, 7.0785e-12, 0.0000e+00, 0.0000e+00])
tensor([2.5195e-08, 6.4562e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5375e-08, 2.5417e-12, 0.0000e+00, 0.0000e+00])
tensor([9.2014e-09, 3.5806e-13, 0.0000e+00, 0.0000e+00])
tensor([5.5066e-09, 2.1393e-13, 0.0000e+00, 0.0000e+00])
tensor([1.0844e-09, 1.7028e-10, 0.0000e+00, 0.0000e+00])
tensor([6.5678e-10, 7.6792e-14, 0.0000e+00, 0.0000e+00])
tensor([9.0821e-11, 2.3296e-11, 0.0000e+00, 0.0000e+00])
tensor([5.5424e-11, 9.1693e-15, 0.0000e+00, 0.0000e+00])
tensor([3.3169e-11, 1.2907e-15, 0.0000e+00, 0.0000e+00])
tensor([1.9850e-11, 7.7118e-16, 0.0000e+00, 0.0000e+00])
tensor([2.7449e-12, 7.0338e-13, 0.0000e+00, 0.0000e+00])
tensor([5.5119e-13, 3.6829e-13, 0.0000e+00, 0.0000e+00])
tensor([1.1412e-13, 1.6545e-13, 0.0000e+00, 0.0000e+00])
tensor([1.7540e-14, 8.0447e-14, 0.0000e+00, 0.0000e+00])
tensor([4.6726e-15, 3.2961e-14,


  0%|          | 0/1 [00:00<?, ?it/s]

tensor([1.9085e-14, 5.3413e-01, 0.0000e+00, 0.0000e+00])
tensor([0.0081, 0.2152, 0.0000, 0.0000])
tensor([0.0049, 0.0869, 0.0000, 0.0000])
tensor([0.0016, 0.0403, 0.0000, 0.0000])
tensor([0.0009, 0.0163, 0.0000, 0.0000])
tensor([0.0003, 0.0076, 0.0000, 0.0000])
tensor([0.0002, 0.0031, 0.0000, 0.0000])
tensor([5.6606e-05, 1.4182e-03, 0.0000e+00, 0.0000e+00])
tensor([2.2915e-05, 6.5725e-04, 0.0000e+00, 0.0000e+00])
tensor([1.4492e-05, 2.6547e-04, 0.0000e+00, 0.0000e+00])
tensor([2.0889e-05, 3.6356e-08, 0.0000e+00, 0.0000e+00])
tensor([4.1201e-06, 6.5941e-07, 0.0000e+00, 0.0000e+00])
tensor([8.2254e-07, 3.9281e-07, 0.0000e+00, 0.0000e+00])
tensor([5.1015e-07, 6.2165e-11, 0.0000e+00, 0.0000e+00])
tensor([3.0520e-07, 5.3392e-12, 0.0000e+00, 0.0000e+00])


100%|██████████| 1/1 [00:00<00:00, 87.52it/s]
INFO:eval:Cross-entropy: 1.0392 nats (= perplexity 2.827)
INFO:hmm:Saving model to my_hmm.pkl
INFO:hmm:Saved model to my_hmm.pkl


tensor([1.8258e-07, 3.1898e-12, 0.0000e+00, 0.0000e+00])
tensor([2.5246e-08, 6.4650e-09, 0.0000e+00, 0.0000e+00])
tensor([1.5401e-08, 1.1455e-12, 0.0000e+00, 0.0000e+00])
tensor([9.2134e-09, 1.6108e-13, 0.0000e+00, 0.0000e+00])
tensor([5.5118e-09, 9.6295e-14, 0.0000e+00, 0.0000e+00])
tensor([1.0870e-09, 1.7017e-10, 0.0000e+00, 0.0000e+00])
tensor([6.5811e-10, 3.4565e-14, 0.0000e+00, 0.0000e+00])
tensor([9.0999e-11, 2.3314e-11, 0.0000e+00, 0.0000e+00])
tensor([5.5512e-11, 4.1303e-15, 0.0000e+00, 0.0000e+00])
tensor([3.3210e-11, 5.8062e-16, 0.0000e+00, 0.0000e+00])
tensor([1.9867e-11, 3.4710e-16, 0.0000e+00, 0.0000e+00])
tensor([2.7471e-12, 7.0348e-13, 0.0000e+00, 0.0000e+00])
tensor([5.5243e-13, 3.6819e-13, 0.0000e+00, 0.0000e+00])
tensor([1.1453e-13, 1.6537e-13, 0.0000e+00, 0.0000e+00])
tensor([1.7596e-14, 8.0461e-14, 0.0000e+00, 0.0000e+00])
tensor([4.6910e-15, 3.2956e-14, 0.0000e+00, 0.0000e+00])
tensor([9.9925e-16, 1.5393e-14, 0.0000e+00, 0.0000e+00])
tensor([3.0193e-16, 7.1473e-15,

In [18]:
# Log a header and then print the HMM's transition (A) and emission (B)
# matrices so they can be compared against the spreadsheet's final parameters.
# The message is written as two adjacent string literals, which Python joins
# with no separator, so the first piece has to end with an explicit space.
log.info("*** A, B matrices after reestimation on icraw "
         "(should match final params on spreadsheet [transposed])")
hmm.printAB()

INFO:test_ic:*** A, B matrices after reestimation on icrawshould match final params on spreadsheet [transposed])


Transition matrix A:
	C	H	_EOS_TAG_	_BOS_TAG_
C	0.934	0.066	0.000	0.000
H	0.072	0.865	0.063	0.000
_EOS_TAG_	0.333	0.333	0.333	0.000
_BOS_TAG_	0.000	1.000	0.000	0.000

Emission matrix B:
	1	2	3
C	0.641	0.148	0.211
H	0.000	0.534	0.466
_EOS_TAG_	0.000	0.000	0.000
_BOS_TAG_	0.000	0.000	0.000




Now let's try out a randomly initialized CRF on the ice cream data. Notice how
the initialized A and B matrices now hold non-negative potentials,
rather than probabilities that sum to 1.

In [19]:
# Announce the CRF part of the walkthrough (the message deliberately ends in "\n").
log.info("*** Conditional Random Field (CRF) test\n")
# Build a CRF over the same tagset and vocabulary as the supervised ice cream corpus.
crf = ConditionalRandomField(icsup.tagset, icsup.vocab)
log.info("*** Current A, B matrices (potentials from small random parameters)")
# Print the newly constructed model's A and B matrices.
# NOTE(review): the saved output of this cell is a NotImplementedError, so some CRF
# code exercised here appears to be unimplemented in this checkout -- confirm.
crf.printAB()

INFO:test_ic:*** Conditional Random Field (CRF) test



NotImplementedError: 

Now let's try your training code, running it on supervised data. To test this,
we'll restart from a random initialization. 

Note that the logger reports the CRF's *conditional* cross-entropy, which is
-log p(tags | words) / n.  This is much lower than the HMM's *joint* cross-entropy,
which is -log p(tags, words) / n, but that doesn't mean the CRF
is better at tagging.  The CRF is just predicting less information.

In [None]:
# Train the CRF on the tagged corpus icsup, using a loss that is evaluated on
# that same corpus, then print the resulting A and B matrices.
log.info("*** Supervised training on icsup")


def cross_entropy_loss(model):
    """Evaluate `model` on icsup via model_cross_entropy."""
    return model_cross_entropy(model, icsup)


crf.train(
    corpus=icsup,
    loss=cross_entropy_loss,
    lr=0.1,
    tolerance=0.0001,
)
log.info("*** A, B matrices after training on icsup")
crf.printAB()

Let's again tag the spreadsheet "sentence" (that is, the sequence of ice creams) 
using the Viterbi algorithm (the resulting tagging may not match the HMM's).

In [None]:
# Tag the untagged ice cream sequence with the CRF and show the result.
log.info("*** Viterbi results on icraw with trained parameters")
# Reuse the supervised corpus's tagset and vocab when reading icraw.
icraw = TaggedCorpus(Path("icraw"), tagset=icsup.tagset, vocab=icsup.vocab)
# Pass the CRF here, not the HMM from the earlier cells: this section reports
# the CRF's tagging and writes it to a file named for the CRF.
write_tagging(crf, icraw, Path("icraw_crf.output"))  # calls crf.viterbi_tagging on each sentence
os.system("cat icraw_crf.output")   # print the file we just created (it is left on disk)