In [1]:
#!/usr/bin/env python3

In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: True


This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [3]:
import logging
import math
import os
from pathlib import Path

In [5]:
%cd /kaggle/input/testing-run

/kaggle/input/testing-run


In [6]:
pip install typeguard

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install more_itertools

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install jaxtyping

Collecting jaxtyping
  Downloading jaxtyping-0.3.3-py3-none-any.whl.metadata (7.8 kB)
Collecting wadler-lindig>=0.1.3 (from jaxtyping)
  Downloading wadler_lindig-0.1.7-py3-none-any.whl.metadata (17 kB)
Downloading jaxtyping-0.3.3-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading wadler_lindig-0.1.7-py3-none-any.whl (20 kB)
Installing collected packages: wadler-lindig, jaxtyping
Successfully installed jaxtyping-0.3.3 wadler-lindig-0.1.7
Note: you may need to restart the kernel to use updated packages.


In [9]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

Set up logging.

In [10]:
logging.root.setLevel(level=logging.INFO)
log = logging.getLogger("test_en")       # For usage, see findsim.py in earlier assignment.
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG

Switch working directory to the directory where the data live.  You may need to edit this line.

In [11]:
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # all training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996


In [12]:
known_vocab = TaggedCorpus(Path("ensup")).vocab    # words seen with supervised tags; used in evaluation
log.info(f"Tagset: f{list(entrain.tagset)}")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']


Make an HMM.  Let's do some pre-training to approximately maximize the
regularized log-likelihood on supervised training data.  In other words, the
probabilities at the M step will just be supervised count ratios.

On each epoch, you will see two progress bars: first it collects counts from
all the sentences (E step), and then after the M step, it evaluates the loss
function, which is the (unregularized) cross-entropy on the training set.

The parameters don't actually matter during the E step because there are no
hidden tags to impute.  The first M step will jump right to the optimal
solution.  The code will try a second epoch with the revised parameters, but
the result will be identical, so it will detect convergence and stop.

We arbitrarily choose λ=1 for our add-λ smoothing at the M step, but it would
be better to search for the best value of this hyperparameter.

In [13]:
log.info("*** Hidden Markov Model (HMM)")
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
loss_sup = lambda model: model_cross_entropy(model, eval_corpus=ensup)
hmm.train(corpus=ensup, loss=loss_sup, λ=1.0,
          save_path="/kaggle/working/ensup_hmm.pkl") 

INFO : *** Hidden Markov Model (HMM)
100%|██████████| 4051/4051 [00:34<00:00, 118.42it/s]
INFO : Cross-entropy: 12.6440 nats (= perplexity 309911.781)
100%|██████████| 4051/4051 [01:19<00:00, 50.97it/s]


M-step updated A values:
0: tensor([9.3953e-20, 3.7944e-02, 1.1729e-01, 1.1570e-02, 4.7851e-01, 1.4794e-02,
        7.0937e-02, 6.3251e-03, 7.8292e-02, 1.2216e-01, 1.0616e-03, 3.1725e-03,
        4.5349e-02, 9.3953e-20, 9.3953e-20, 1.0480e-02, 1.0546e-03, 9.3953e-20,
        1.0523e-03, 9.3953e-20, 9.3953e-20, 9.3953e-20, 9.3953e-20, 9.3953e-20,
        9.3953e-20, 0.0000e+00], device='cuda:0')
1: tensor([7.4254e-04, 7.7334e-02, 7.0333e-01, 3.4510e-02, 8.6227e-03, 7.4773e-02,
        2.6821e-03, 3.3887e-02, 2.9589e-04, 2.6703e-03, 2.3387e-02, 1.3252e-20,
        4.7409e-03, 1.3418e-03, 4.1924e-03, 1.9703e-02, 2.4028e-03, 3.7428e-03,
        1.6399e-03, 1.3252e-20, 1.3252e-20, 1.3252e-20, 1.3252e-20, 1.3252e-20,
        1.3252e-20, 0.0000e+00], device='cuda:0')
2: tensor([9.3278e-04, 9.4217e-04, 2.0974e-02, 4.1360e-03, 1.0905e-02, 1.4146e-02,
        6.1158e-04, 1.0185e-02, 1.1621e-03, 2.9302e-03, 7.7825e-03, 5.5597e-06,
        1.6216e-03, 1.2802e-04, 3.3420e-04, 2.1991e-03, 1.9465e-05

100%|██████████| 4051/4051 [00:33<00:00, 119.43it/s]
INFO : Cross-entropy: 7.9735 nats (= perplexity 2903.063)
100%|██████████| 4051/4051 [01:18<00:00, 51.28it/s]


M-step updated A values:
0: tensor([2.9345e-23, 1.1219e-03, 2.6877e-03, 6.7343e-04, 4.0064e-01, 1.9617e-03,
        2.8489e-01, 5.7740e-03, 1.5882e-01, 1.2369e-01, 1.6008e-04, 8.4995e-04,
        2.6478e-03, 2.9345e-23, 2.9345e-23, 1.5855e-02, 1.5741e-04, 2.9345e-23,
        6.9611e-05, 2.9345e-23, 2.9345e-23, 2.9345e-23, 2.9345e-23, 2.9345e-23,
        2.9345e-23, 0.0000e+00], device='cuda:0')
1: tensor([1.7625e-05, 5.6893e-03, 1.5978e-01, 4.2607e-02, 2.4524e-04, 1.3938e-01,
        4.8374e-04, 3.6069e-01, 9.7988e-07, 1.4906e-04, 1.5997e-01, 9.0042e-24,
        7.4469e-05, 5.4440e-04, 5.0978e-03, 1.2165e-01, 1.6512e-03, 1.6228e-03,
        3.4855e-04, 9.0042e-24, 9.0042e-24, 9.0042e-24, 9.0042e-24, 9.0042e-24,
        9.0042e-24, 0.0000e+00], device='cuda:0')
2: tensor([6.0873e-04, 2.3061e-05, 1.5660e-03, 1.2658e-02, 8.1598e-03, 1.0193e-01,
        3.9546e-04, 4.1846e-01, 1.0041e-03, 6.7955e-03, 2.3962e-01, 7.3366e-08,
        1.3747e-04, 5.9901e-05, 4.2036e-04, 1.9493e-02, 1.0774e-06

100%|██████████| 4051/4051 [00:33<00:00, 120.02it/s]
INFO : Cross-entropy: 8.1743 nats (= perplexity 3548.742)
INFO : Saved model to /kaggle/working/ensup_hmm.pkl


Now let's throw in the unsupervised training data as well, and continue
training as before, in order to increase the regularized log-likelihood on
this larger, semi-supervised training set.  It's now the *incomplete-data*
log-likelihood.

This time, we'll use a different evaluation loss function: we'll stop when the
*tagging error rate* on a held-out dev set stops getting better.  Also, the
implementation of this loss function (`viterbi_error_rate`) includes a helpful
side effect: it logs the *cross-entropy* on the held-out dataset as well, just
for your information.

We hope that held-out tagging accuracy will go up for a little bit before it
goes down again (see Merialdo 1994). (Log-likelihood on training data will
continue to improve, and that improvement may generalize to held-out
cross-entropy.  But getting accuracy to increase is harder.)

In [14]:
hmm = HiddenMarkovModel.load("/kaggle/working/ensup_hmm.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, λ=1.0,
          save_path="/kaggle/working/entrain_hmm.pkl")

INFO : Loaded model from /kaggle/working/ensup_hmm.pkl
  0%|          | 0/996 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

You can also retry the above workflow where you start with a worse supervised
model (like Merialdo).  Does EM help more in that case?  It's easiest to rerun
exactly the code above, but first make the `ensup` file smaller by copying
`ensup-tiny` over it.  `ensup-tiny` is only 25 sentences (that happen to cover
all tags in `endev`).  Back up your old `ensup` and your old `*.pkl` models
before you do this.

More detailed look at the first 10 sentences in the held-out corpus,
including Viterbi tagging.

In [None]:
def look_at_your_data(model, dev, N):
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        
        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")

In [None]:
look_at_your_data(hmm, endev, 10)

Now let's try supervised training of a CRF (this doesn't use the unsupervised
part of the data, so it is comparable to the supervised pre-training we did
for the HMM).  We will use SGD to approximately maximize the regularized
log-likelihood. 

As with the semi-supervised HMM training, we'll periodically evaluate the
tagging accuracy (and also print the cross-entropy) on a held-out dev set.
We use the default `eval_interval` and `tolerance`.  If you want to stop
sooner, then you could increase the `tolerance` so the training method decides
sooner that it has converged.

We arbitrarily choose reg = 1.0 for L2 regularization, learning rate = 0.05,
and a minibatch size of 10, but it would be better to search for the best
value of these hyperparameters.

Note that the logger reports the CRF's *conditional* cross-entropy, log p(tags
| words) / n.  This is much lower than the HMM's *joint* cross-entropy log
p(tags, words) / n, but that doesn't mean the CRF is worse at tagging.  The
CRF is just predicting less information.

In [16]:
log.info("*** Conditional Random Field (CRF)\n")
crf = ConditionalRandomField(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
crf.train(corpus=ensup, loss=loss_dev, reg=1.0, lr=0.05, minibatch_size=10,
          save_path="/kaggle/working/ensup_crf.pkl")

INFO : *** Conditional Random Field (CRF)

100%|██████████| 996/996 [00:04<00:00, 212.53it/s]
INFO : Cross-entropy: 3.0501 nats (= perplexity 21.118)
100%|██████████| 996/996 [02:56<00:00,  5.65it/s]
INFO : Tagging accuracy: all: 5.800%, known: 6.245%, seen: 3.704%, novel: 0.198%
100%|██████████| 500/500 [00:07<00:00, 70.98it/s]
100%|██████████| 996/996 [00:04<00:00, 216.99it/s]
INFO : Cross-entropy: 2.9038 nats (= perplexity 18.244)
100%|██████████| 996/996 [02:55<00:00,  5.68it/s]
INFO : Tagging accuracy: all: 33.513%, known: 31.336%, seen: 54.209%, novel: 56.803%
  1%|          | 6/500 [00:00<00:09, 53.11it/s]INFO : Saved model to /kaggle/working/ensup_crf-510.pkl
100%|██████████| 500/500 [00:07<00:00, 69.81it/s]
100%|██████████| 996/996 [00:04<00:00, 211.21it/s]
INFO : Cross-entropy: 2.8704 nats (= perplexity 17.645)
100%|██████████| 996/996 [02:56<00:00,  5.63it/s]
INFO : Tagging accuracy: all: 33.722%, known: 31.564%, seen: 54.209%, novel: 56.803%
100%|██████████| 500/500 [00:07<

Let's examine how the CRF does on individual sentences. 
(Do you see any error patterns here that would inspire additional CRF features?)

In [17]:
look_at_your_data(crf, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/N We/N 're/N strongly/N _OOV_/N that/N anyone/N who/N has/N eaten/N in/N the/N cafeteria/N this/N month/N have/N the/N shot/N ,/N ''/N Mr./N Mattausch/N added/N ,/N ``/N and/N that/N means/N virtually/N everyone/N who/N works/N here/N ./.
INFO : Loss:    26/34
INFO : Cross-entropy: 4.020375211200853 nats (= perplexity 16.227571539277715)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/N was/N _OOV_/N to/N read/N the/N _OOV_/N of/N facts/N in/N your/N Oct./N 13/N editorial/N ``/N _OOV_/N 's/N _OOV_/N _OOV_/N ./. ''/.
INFO : Loss:    13/21
INFO : Cross-entropy: 3.8831678415781927 nats (= perp

### CRF with PyTorch backprop
Train the autograd-enabled CRF to verify `ConditionalRandomFieldBackprop` behaves like the manual-gradient version.

In [18]:
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)

In [None]:
from crf_backprop import ConditionalRandomFieldBackprop

loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)

log.info("*** Conditional Random Field (autograd/backprop)")
crf_backprop = ConditionalRandomFieldBackprop(entrain.tagset, entrain.vocab)
crf_backprop.train(
    corpus=ensup,
    loss=loss_dev,
    reg=1.0,
    lr=0.05,
    minibatch_size=10,
    eval_interval=200,
    max_steps=5000,
    save_path="/kaggle/working/ensup_crf_backprop.pkl",
)
look_at_your_data(crf_backprop, endev, 5)

INFO : *** Conditional Random Field (autograd/backprop)
INFO : Parameters: 480610 = 26*18459 + 26*26
100%|██████████| 996/996 [00:04<00:00, 212.18it/s]
INFO : Cross-entropy: 3.0511 nats (= perplexity 21.139)
100%|██████████| 996/996 [02:58<00:00,  5.57it/s]
INFO : Tagging accuracy: all: 5.558%, known: 5.993%, seen: 3.535%, novel: 0.066%
100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
INFO : Average learning speed: 0.12 (estimated training loss reduction per example)
100%|██████████| 996/996 [00:04<00:00, 208.53it/s]
INFO : Cross-entropy: 2.3221 nats (= perplexity 10.197)
100%|██████████| 996/996 [02:56<00:00,  5.63it/s]
INFO : Tagging accuracy: all: 33.513%, known: 31.336%, seen: 54.209%, novel: 56.803%
  4%|▍         | 9/200 [00:01<00:22,  8.46it/s]INFO : Saved model to /kaggle/working/ensup_crf_backprop-210.pkl
100%|██████████| 200/200 [00:34<00:00,  5.73it/s]
INFO : Average learning speed: 0.078 (estimated training loss reduction per example)
100%|██████████| 996/996 [00:04<00:00,

KeyboardInterrupt: 

### Neural biRNN-CRF
Run the neural CRF with a simple one-hot lexicon to sanity-check `ConditionalRandomFieldNeural`.

In [21]:
from crf_neural import ConditionalRandomFieldNeural
from lexicon import build_lexicon
import torch

log.info("*** Neural biRNN-CRF (autograd/backprop)")
lexicon = build_lexicon(entrain, embeddings_file = Path("words-10.txt"),
                       newvocab=known_vocab)
lexicon = lexicon.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"), dtype=torch.float64)
crf_neural = ConditionalRandomFieldNeural(
    entrain.tagset,
    entrain.vocab,
    lexicon=lexicon,
    rnn_dim=2,
)
crf_neural.train(
    corpus=ensup,
    loss=loss_dev,
    reg=0.5,
    lr=0.01,
    minibatch_size=5,
    eval_interval=100,
    max_steps=2000,
    save_path="/kaggle/working/ensup_crf_neural.pkl",
)
look_at_your_data(crf_neural, endev, 3)


INFO : *** Neural biRNN-CRF (autograd/backprop)
INFO : From words-10.txt, got embeddings for 10420 of 18461 previously known types + 0 new seen types
INFO : Parameters: 352 = 2*13 + 2*13 + 3*57 + 3*41 + 3 + 3
100%|██████████| 996/996 [00:52<00:00, 18.89it/s]
INFO : Cross-entropy: 3.0508 nats (= perplexity 21.132)
100%|██████████| 996/996 [04:15<00:00,  3.90it/s]
INFO : Tagging accuracy: all: 0.000%, known: 0.000%, seen: 0.000%, novel: 0.000%
  4%|▍         | 4/100 [00:02<01:11,  1.35it/s]


RuntimeError: File ensup_crf_neural-5.pkl cannot be opened.

In [None]:
from crf_neural import ConditionalRandomFieldNeural
from lexicon import build_lexicon
MAX_STEPS=1500
EVAL_INTERVAL=200
BATCH=8
REG=0.5
dims = [0,1,3,5]
lrs = [0.01, 0.001]
log.info("*** Neural biRNN-CRF (autograd/backprop)")
lexicon = build_lexicon(entrain, embeddings_file = Path("words-100.txt"),
                       newvocab=known_vocab)
lexicon = lexicon.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"), dtype=torch.float64)
for d in dims:
    for lr in lrs:
        crf_neural = ConditionalRandomFieldNeural(
            entrain.tagset,
            entrain.vocab,
            lexicon=lexicon,
            rnn_dim=d,
        )
        crf_neural.train(
            corpus=ensup,
            loss=loss_dev,
            reg=REG,
            lr=lr,
            minibatch_size=BATCH,
            eval_interval=EVAL_INTERVAL,
            max_steps=MAX_STEPS,
            save_path=f"/kaggle/working/ensup_crf_{d}_{lr}_neural.pkl",
        )
        # ---- CAPTURE OUTPUT OF THE EVALUATION ----
        buffer = io.StringIO()
        with redirect_stdout(buffer):
            look_at_your_data(crf_neural, endev, 5)

        # ---- SAVE TO FILE ----
        out_file = f"/kaggle/working/output_d={d}_lr={lr}.eval"
        with open(out_file, "w") as f:
            f.write(buffer.getvalue())

        print(f"Saved evaluation output to {out_file}")
        

INFO : *** Neural biRNN-CRF (autograd/backprop)
INFO : From words-100.txt, got embeddings for 10420 of 18461 previously known types + 0 new seen types
INFO : Parameters: 182 = 0*0 + 0*0 + 1*53 + 1*127 + 1 + 1
100%|██████████| 996/996 [00:35<00:00, 28.18it/s]
INFO : Cross-entropy: 3.0512 nats (= perplexity 21.140)
 61%|██████    | 605/996 [08:58<04:19,  1.51it/s]