In [1]:
#!/usr/bin/env python3

In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())

CUDA available: False


This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [3]:
import logging
import math
import os
from pathlib import Path

In [4]:
%cd /kaggle/input/testing-run

/kaggle/input/testing-run


In [5]:
pip install typeguard

Collecting typeguard
  Downloading typeguard-4.4.4-py3-none-any.whl.metadata (3.3 kB)
Downloading typeguard-4.4.4-py3-none-any.whl (34 kB)
Installing collected packages: typeguard
Successfully installed typeguard-4.4.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install more_itertools

Collecting more_itertools
  Downloading more_itertools-10.8.0-py3-none-any.whl.metadata (39 kB)
Downloading more_itertools-10.8.0-py3-none-any.whl (69 kB)
Installing collected packages: more_itertools
Successfully installed more_itertools-10.8.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install jaxtyping

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel
from crf import ConditionalRandomField

Set up logging.

In [9]:
logging.root.setLevel(level=logging.INFO)
log = logging.getLogger("test_en")       # For usage, see findsim.py in earlier assignment.
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG

Switch working directory to the directory where the data live.  You may need to edit this line.

In [10]:
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # all training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996


In [11]:
known_vocab = TaggedCorpus(Path("ensup")).vocab    # words seen with supervised tags; used in evaluation
log.info(f"Tagset: f{list(entrain.tagset)}")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']


Make an HMM.  Let's do some pre-training to approximately maximize the
regularized log-likelihood on supervised training data.  In other words, the
probabilities at the M step will just be supervised count ratios.

On each epoch, you will see two progress bars: first it collects counts from
all the sentences (E step), and then after the M step, it evaluates the loss
function, which is the (unregularized) cross-entropy on the training set.

The parameters don't actually matter during the E step because there are no
hidden tags to impute.  The first M step will jump right to the optimal
solution.  The code will try a second epoch with the revised parameters, but
the result will be identical, so it will detect convergence and stop.

We arbitrarily choose λ=1 for our add-λ smoothing at the M step, but it would
be better to search for the best value of this hyperparameter.

In [12]:
log.info("*** Hidden Markov Model (HMM)")
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
loss_sup = lambda model: model_cross_entropy(model, eval_corpus=ensup)
hmm.train(corpus=ensup, loss=loss_sup, λ=1.0,
          save_path="/kaggle/working/ensup_hmm.pkl") 

INFO : *** Hidden Markov Model (HMM)
100%|██████████| 4051/4051 [00:10<00:00, 378.24it/s]
INFO : Cross-entropy: 12.6440 nats (= perplexity 309893.787)
100%|██████████| 4051/4051 [00:25<00:00, 158.77it/s]


M-step updated A values:
0: tensor([9.4107e-20, 3.8028e-02, 1.1793e-01, 1.1624e-02, 4.7782e-01, 1.4787e-02,
        7.0548e-02, 6.3396e-03, 7.8161e-02, 1.2242e-01, 1.0630e-03, 3.1572e-03,
        4.5436e-02, 9.4107e-20, 9.4107e-20, 1.0552e-02, 1.0641e-03, 9.4107e-20,
        1.0652e-03, 9.4107e-20, 9.4107e-20, 9.4107e-20, 9.4107e-20, 9.4107e-20,
        9.4107e-20, 0.0000e+00])
1: tensor([7.4188e-04, 7.6821e-02, 7.0454e-01, 3.4448e-02, 8.6307e-03, 7.4635e-02,
        2.6699e-03, 3.3698e-02, 2.9666e-04, 2.6706e-03, 2.3364e-02, 1.3196e-20,
        4.7442e-03, 1.3276e-03, 4.1474e-03, 1.9553e-02, 2.3648e-03, 3.7073e-03,
        1.6382e-03, 1.3196e-20, 1.3196e-20, 1.3196e-20, 1.3196e-20, 1.3196e-20,
        1.3196e-20, 0.0000e+00])
2: tensor([9.1840e-04, 9.4116e-04, 2.0798e-02, 4.0924e-03, 1.0871e-02, 1.4010e-02,
        6.1269e-04, 1.0102e-02, 1.1552e-03, 2.9211e-03, 7.7810e-03, 5.4938e-06,
        1.6006e-03, 1.2607e-04, 3.3026e-04, 2.1775e-03, 1.9236e-05, 8.3759e-04,
        4.6005e-04, 

100%|██████████| 4051/4051 [00:10<00:00, 380.31it/s]
INFO : Cross-entropy: 7.9752 nats (= perplexity 2907.821)
100%|██████████| 4051/4051 [00:25<00:00, 158.20it/s]


M-step updated A values:
0: tensor([2.9404e-23, 1.1266e-03, 2.7079e-03, 6.7795e-04, 4.0086e-01, 1.9647e-03,
        2.8390e-01, 5.7989e-03, 1.5887e-01, 1.2420e-01, 1.6061e-04, 8.4755e-04,
        2.6582e-03, 2.9404e-23, 2.9404e-23, 1.5996e-02, 1.5915e-04, 2.9404e-23,
        7.0605e-05, 2.9404e-23, 2.9404e-23, 2.9404e-23, 2.9404e-23, 2.9404e-23,
        2.9404e-23, 0.0000e+00])
1: tensor([1.7667e-05, 5.6702e-03, 1.6058e-01, 4.2670e-02, 2.4628e-04, 1.3958e-01,
        4.8314e-04, 3.5987e-01, 9.8570e-07, 1.4957e-04, 1.6034e-01, 9.0339e-24,
        7.4767e-05, 5.4041e-04, 5.0597e-03, 1.2112e-01, 1.6305e-03, 1.6128e-03,
        3.4933e-04, 9.0339e-24, 9.0339e-24, 9.0339e-24, 9.0339e-24, 9.0339e-24,
        9.0339e-24, 0.0000e+00])
2: tensor([6.0219e-04, 2.3146e-05, 1.5602e-03, 1.2584e-02, 8.1722e-03, 1.0143e-01,
        3.9806e-04, 4.1702e-01, 1.0028e-03, 6.8067e-03, 2.4071e-01, 7.2841e-08,
        1.3634e-04, 5.9268e-05, 4.1737e-04, 1.9393e-02, 1.0698e-06, 1.0627e-03,
        3.4171e-04, 

100%|██████████| 4051/4051 [00:10<00:00, 378.22it/s]
INFO : Cross-entropy: 8.1748 nats (= perplexity 3550.281)
INFO : Saved model to /kaggle/working/ensup_hmm.pkl


Now let's throw in the unsupervised training data as well, and continue
training as before, in order to increase the regularized log-likelihood on
this larger, semi-supervised training set.  It's now the *incomplete-data*
log-likelihood.

This time, we'll use a different evaluation loss function: we'll stop when the
*tagging error rate* on a held-out dev set stops getting better.  Also, the
implementation of this loss function (`viterbi_error_rate`) includes a helpful
side effect: it logs the *cross-entropy* on the held-out dataset as well, just
for your information.

We hope that held-out tagging accuracy will go up for a little bit before it
goes down again (see Merialdo 1994). (Log-likelihood on training data will
continue to improve, and that improvement may generalize to held-out
cross-entropy.  But getting accuracy to increase is harder.)

In [13]:
hmm = HiddenMarkovModel.load("/kaggle/working/ensup_hmm.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, λ=1.0,
          save_path="/kaggle/working/entrain_hmm.pkl")

INFO : Loaded model from /kaggle/working/ensup_hmm.pkl
100%|██████████| 996/996 [00:02<00:00, 340.80it/s]
INFO : Cross-entropy: 12.8301 nats (= perplexity 373293.208)
100%|██████████| 996/996 [03:29<00:00,  4.76it/s]
INFO : Tagging accuracy: all: 87.010%, known: 94.144%, seen: 10.943%, novel: 13.937%
100%|██████████| 8064/8064 [00:51<00:00, 155.47it/s]


M-step updated A values:
0: tensor([7.1401e-24, 2.7224e-05, 2.7295e-05, 1.0635e-05, 1.4372e-01, 8.6812e-05,
        6.5150e-01, 1.9480e-03, 1.1894e-01, 7.1687e-02, 6.0119e-06, 1.7833e-04,
        8.1839e-05, 3.3717e-14, 7.1396e-24, 1.1780e-02, 5.7791e-06, 7.1407e-24,
        1.2788e-06, 7.1396e-24, 7.1396e-24, 2.5997e-23, 7.1396e-24, 7.1396e-24,
        7.1396e-24, 0.0000e+00])
1: tensor([2.9077e-05, 9.5703e-05, 5.4691e-03, 8.4964e-03, 2.5434e-06, 4.3518e-02,
        4.9274e-05, 6.0417e-01, 7.7672e-08, 2.3388e-06, 1.9878e-01, 8.3890e-25,
        5.7893e-06, 3.3594e-05, 8.9551e-04, 1.3808e-01, 2.7364e-04, 9.9931e-05,
        7.6689e-06, 3.0194e-15, 8.3893e-25, 8.7526e-25, 8.3890e-25, 8.3890e-25,
        8.3890e-25, 0.0000e+00])
2: tensor([1.8782e-04, 8.4539e-08, 4.8486e-06, 1.5446e-03, 2.5386e-04, 3.0922e-02,
        2.1227e-05, 6.5559e-01, 3.6527e-05, 6.0966e-04, 3.0227e-01, 5.6531e-10,
        4.5489e-06, 1.3415e-06, 1.9592e-05, 7.3666e-03, 2.6440e-09, 5.7523e-05,
        8.1900e-06, 

100%|██████████| 996/996 [00:02<00:00, 340.97it/s]
INFO : Cross-entropy: 14.0351 nats (= perplexity 1245583.460)
100%|██████████| 996/996 [03:25<00:00,  4.84it/s]
INFO : Tagging accuracy: all: 84.822%, known: 91.905%, seen: 10.269%, novel: 11.889%
INFO : Saved model to /kaggle/working/entrain_hmm.pkl


You can also retry the above workflow where you start with a worse supervised
model (like Merialdo).  Does EM help more in that case?  It's easiest to rerun
exactly the code above, but first make the `ensup` file smaller by copying
`ensup-tiny` over it.  `ensup-tiny` is only 25 sentences (that happen to cover
all tags in `endev`).  Back up your old `ensup` and your old `*.pkl` models
before you do this.

More detailed look at the first 10 sentences in the held-out corpus,
including Viterbi tagging.

In [14]:
def look_at_your_data(model, dev, N):
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        
        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")

In [15]:
look_at_your_data(hmm, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/R _OOV_/, that/D anyone/N who/W has/V eaten/V in/I the/D cafeteria/` this/D month/N have/V the/D shot/V ,/, ''/D Mr./N Mattausch/P added/V ,/, ``/` and/C that/W means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Loss:    7/34
INFO : Cross-entropy: 18.20014945645551 nats (= perplexity 301155.57826512575)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/R to/T read/V the/D _OOV_/J of/I facts/N in/I your/P Oct./N 13/C editorial/J ``/` _OOV_/E 's/P _OOV_/M _OOV_/R ./, ''/'
INFO : Loss:    7/21
INFO : Cross-entropy: 30.1527424287503 nats (= perplexi

Now let's try supervised training of a CRF (this doesn't use the unsupervised
part of the data, so it is comparable to the supervised pre-training we did
for the HMM).  We will use SGD to approximately maximize the regularized
log-likelihood. 

As with the semi-supervised HMM training, we'll periodically evaluate the
tagging accuracy (and also print the cross-entropy) on a held-out dev set.
We use the default `eval_interval` and `tolerance`.  If you want to stop
sooner, then you could increase the `tolerance` so the training method decides
sooner that it has converged.

We arbitrarily choose reg = 1.0 for L2 regularization, learning rate = 0.05,
and a minibatch size of 10, but it would be better to search for the best
value of these hyperparameters.

Note that the logger reports the CRF's *conditional* cross-entropy, log p(tags
| words) / n.  This is much lower than the HMM's *joint* cross-entropy log
p(tags, words) / n, but that doesn't mean the CRF is worse at tagging.  The
CRF is just predicting less information.

In [16]:
log.info("*** Conditional Random Field (CRF)\n")
crf = ConditionalRandomField(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
crf.train(corpus=ensup, loss=loss_dev, reg=1.0, lr=0.05, minibatch_size=10,
          save_path="/kaggle/working/ensup_crf.pkl")

INFO : *** Conditional Random Field (CRF)

100%|██████████| 996/996 [00:04<00:00, 216.14it/s]
INFO : Cross-entropy: 3.0501 nats (= perplexity 21.118)
100%|██████████| 996/996 [02:57<00:00,  5.62it/s]
INFO : Tagging accuracy: all: 5.800%, known: 6.245%, seen: 3.704%, novel: 0.198%
100%|██████████| 500/500 [00:06<00:00, 71.89it/s]
100%|██████████| 996/996 [00:04<00:00, 214.42it/s]
INFO : Cross-entropy: 2.9038 nats (= perplexity 18.244)
100%|██████████| 996/996 [02:55<00:00,  5.66it/s]
INFO : Tagging accuracy: all: 33.513%, known: 31.336%, seen: 54.209%, novel: 56.803%
  1%|          | 6/500 [00:00<00:08, 56.59it/s]INFO : Saved model to /kaggle/working/ensup_crf-510.pkl
100%|██████████| 500/500 [00:07<00:00, 67.30it/s]
100%|██████████| 996/996 [00:04<00:00, 214.59it/s]
INFO : Cross-entropy: 2.8704 nats (= perplexity 17.645)
100%|██████████| 996/996 [02:56<00:00,  5.63it/s]
INFO : Tagging accuracy: all: 33.722%, known: 31.564%, seen: 54.209%, novel: 56.803%
100%|██████████| 500/500 [00:07<

Let's examine how the CRF does on individual sentences. 
(Do you see any error patterns here that would inspire additional CRF features?)

In [17]:
look_at_your_data(crf, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/N We/N 're/N strongly/N _OOV_/N that/N anyone/N who/N has/N eaten/N in/N the/N cafeteria/N this/N month/N have/N the/N shot/N ,/N ''/N Mr./N Mattausch/N added/N ,/N ``/N and/N that/N means/N virtually/N everyone/N who/N works/N here/N ./.
INFO : Loss:    26/34
INFO : Cross-entropy: 4.020375211200853 nats (= perplexity 16.227571539277715)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/N was/N _OOV_/N to/N read/N the/N _OOV_/N of/N facts/N in/N your/N Oct./N 13/N editorial/N ``/N _OOV_/N 's/N _OOV_/N _OOV_/N ./. ''/.
INFO : Loss:    13/21
INFO : Cross-entropy: 3.8831678415781927 nats (= perp

### CRF with PyTorch backprop
Train the autograd-enabled CRF to verify `ConditionalRandomFieldBackprop` behaves like the manual-gradient version.

In [13]:
from crf_backprop import ConditionalRandomFieldBackprop

loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)

log.info("*** Conditional Random Field (autograd/backprop)")
crf_backprop = ConditionalRandomFieldBackprop(entrain.tagset, entrain.vocab)
crf_backprop.train(
    corpus=ensup,
    loss=loss_dev,
    reg=1.0,
    lr=0.05,
    minibatch_size=10,
    eval_interval=200,
    max_steps=5000,
    save_path="/kaggle/working/ensup_crf_backprop.pkl",
)
look_at_your_data(crf_backprop, endev, 5)

INFO : *** Conditional Random Field (autograd/backprop)
INFO : Parameters: 480610 = 26*18459 + 26*26
100%|██████████| 996/996 [00:04<00:00, 200.27it/s]
INFO : Cross-entropy: 3.0506 nats (= perplexity 21.129)
100%|██████████| 996/996 [03:04<00:00,  5.41it/s]
INFO : Tagging accuracy: all: 4.451%, known: 4.707%, seen: 6.061%, novel: 0.132%
100%|██████████| 200/200 [00:33<00:00,  6.03it/s]
INFO : Average learning speed: 0.12 (estimated training loss reduction per example)
100%|██████████| 996/996 [00:04<00:00, 200.98it/s]
INFO : Cross-entropy: 2.3296 nats (= perplexity 10.274)
100%|██████████| 996/996 [03:04<00:00,  5.41it/s]
INFO : Tagging accuracy: all: 33.513%, known: 31.336%, seen: 54.209%, novel: 56.803%
  4%|▍         | 9/200 [00:00<00:22,  8.53it/s]INFO : Saved model to /kaggle/working/ensup_crf_backprop-210.pkl
100%|██████████| 200/200 [00:32<00:00,  6.20it/s]
INFO : Average learning speed: 0.075 (estimated training loss reduction per example)
100%|██████████| 996/996 [00:05<00:00,

NameError: name 'look_at_your_data' is not defined

In [None]:
from crf_backprop import ConditionalRandomFieldBackprop

loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)

log.info("*** Conditional Random Field (autograd/backprop)")
crf_backprop = ConditionalRandomFieldBackprop(entrain.tagset, entrain.vocab)
crf_backprop.train(
    corpus=ensup,
    loss=loss_dev,
    reg=1.0,
    lr=0.05,
    minibatch_size=10,
    eval_interval=200,
    max_steps=5000,
    save_path="/kaggle/working/ensup_crf_backprop.pkl",
)
look_at_your_data(crf_backprop, endev, 5)

In [15]:
look_at_your_data(crf_backprop, endev, 5)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/J 're/N strongly/N _OOV_/N that/I anyone/N who/N has/V eaten/N in/I the/D cafeteria/J this/N month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/N ,/, ``/` and/C that/I means/N virtually/N everyone/N who/N works/N here/N ./.
INFO : Loss:    15/34
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")
INFO : Cross-entropy: 1.7643154826952563 nats (= perplexity 3.397127774681285)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'


### Neural biRNN-CRF
Run the neural CRF with a simple one-hot lexicon to sanity-check `ConditionalRandomFieldNeural`.

In [None]:
from crf_neural import ConditionalRandomFieldNeural
from lexicon import build_lexicon
import torch

log.info("*** Neural biRNN-CRF (autograd/backprop)")
lexicon = build_lexicon(entrain, one_hot=True)
lexicon = lexicon.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"), dtype=torch.float64)
crf_neural = ConditionalRandomFieldNeural(
    entrain.tagset,
    entrain.vocab,
    lexicon=lexicon,
    rnn_dim=16,
)
crf_neural.train(
    corpus=ensup,
    loss=loss_dev,
    reg=0.5,
    lr=0.01,
    minibatch_size=5,
    eval_interval=100,
    max_steps=2000,
    save_path="ensup_crf_neural.pkl",
)
look_at_your_data(crf_neural, endev, 3)
