In [None]:
from vowpalwabbit import pyvw

Note: installing the `vowpalwabbit` Python package can fail because of a known issue; see https://github.com/JohnLangford/vowpal_wabbit/issues/1021 for details.

### Generate the dataset from CoNLL 2003 (using code adapted from spaCy):

In [9]:

def _consume_os(tags):
    ## reference: https://github.com/explosion/spaCy/blob/c7d53348d7c0474852dc5ebe5794f2816ef7eb01/spacy/gold.pyx
    while tags and tags[0] == 'O':
        yield tags.pop(0)


def _consume_ent(tags):
    if not tags:
        return []
    tag = tags.pop(0)
    target_in = 'I' + tag[1:]
    target_last = 'L' + tag[1:]
    length = 1
    while tags and tags[0] in {target_in, target_last}:
        length += 1
        tags.pop(0)
    label = tag[2:]
    if length == 1:
        return ['U-' + label]
    else:
        start = 'B-' + label
        end = 'L-' + label
        middle = ['I-%s' % label for _ in range(1, length - 1)]
        return [start] + middle + [end]
    
def iob_to_biluo(tags):
    """Convert a sequence of IOB entity tags to the BILUO scheme.

    The input is copied into a fresh list first, so the caller's sequence is
    never mutated even though the helpers consume their argument in place.

    Args:
        tags: Iterable of tag strings such as 'O', 'B-PER' or 'I-PER'.

    Returns:
        A list of BILUO tag strings with one entry per input tag.
    """
    remaining = list(tags)
    out = []
    # Alternate between stripping a run of 'O' tags and one entity span until
    # every input tag has been consumed.
    while remaining:
        out.extend(_consume_os(remaining))
        out.extend(_consume_ent(remaining))
    return out

def read_conll_ner(input_path):
    """Read a CoNLL-2003 style NER file into a list of document dicts.

    The text is split into documents on the ``-DOCSTART- -X- O O`` marker and
    into sentences on blank lines. Each token line is split on whitespace into
    four columns (word, tag, chunk, IOB entity tag); the entity column of a
    sentence is converted with ``iob_to_biluo``.

    Args:
        input_path: Path of the UTF-8 encoded input file.

    Returns:
        A list with one dict per document of the form
        ``{'id': <running index>, 'paragraphs': [{'sentences': [...]}]}``.
        Every sentence is ``{'tokens': [...]}`` and every token is a dict
        with the keys ``'orth'``, ``'tag'`` and ``'ner'``.

    Raises:
        ValueError: If the columns of a sentence cannot be unpacked into
            exactly four fields (e.g. a token line with fewer than four
            columns).
    """
    ## reference: https://github.com/explosion/spaCy/blob/master/spacy/cli/converters/conll_ner2json.py
    # Read through a context manager so the file handle is always closed.
    with open(input_path, 'r', encoding='utf-8') as conll_file:
        text = conll_file.read()

    delimit_docs = '-DOCSTART- -X- O O'
    output_docs = []
    for doc in text.strip().split(delimit_docs):
        doc = doc.strip()
        if not doc:
            continue
        output_doc = []
        for sent in doc.split('\n\n'):
            sent = sent.strip()
            if not sent:
                continue
            lines = [line.strip() for line in sent.split('\n') if line.strip()]
            # Transpose the per-token rows into per-column tuples; the chunk
            # column is read but not used in the output.
            words, tags, _chunks, iob_ents = zip(*[line.split() for line in lines])
            biluo_ents = iob_to_biluo(iob_ents)
            output_doc.append({'tokens': [
                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
                zip(words, tags, biluo_ents)
            ]})
        output_docs.append({
            'id': len(output_docs),
            'paragraphs': [{'sentences': output_doc}]
        })
    return output_docs

In [11]:
# print(read_conll_ner('CoNLL - 2003/en/test.txt'))

### Here is a basic implementation of sequence labeling:

In [None]:
class SequenceLabeler(pyvw.SearchTask):
    """Search task that predicts one label for every token of a sentence.

    A sentence is a sequence of ``(label, word)`` pairs; the label of each
    pair is handed to the search hook as the oracle for that position.
    """

    def __init__(self, vw, sch, num_actions):
        # The parent initializer is mandatory: it is what makes self.vw and
        # self.sch available, and _run relies on both.
        pyvw.SearchTask.__init__(self, vw, sch, num_actions)

        # Options for this task are set on the search hook here.
        sch.set_options(sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES)

    def _run(self, sentence):
        """Label ``sentence`` token by token and return the list of predictions.

        The leading underscore is a reminder that this method is not meant to
        be called directly.
        """
        predictions = []
        for position, (label, word) in enumerate(sentence):
            # Building the example inside "with" guarantees that it is
            # finished properly once the prediction has been made.
            with self.vw.example({'w': [word]}) as example:
                predictions.append(
                    self.sch.predict(
                        examples=example,
                        my_tag=position + 1,
                        oracle=label,
                        condition=[(position, 'p'), (position - 1, 'q')],
                    )
                )
        return predictions

In [None]:
# Create the Vowpal Wabbit instance that the search task is built from.
# NOTE(review): "--search 4" presumably sets the number of possible labels and
# "--search_task hook" presumably selects a Python-defined task -- confirm
# both against the VW command-line documentation.
vw = pyvw.vw("--search 4 --audit --quiet --search_task hook --ring_size 1024")

In [None]:
# Ask the `vw` instance to build a search task from the SequenceLabeler class.
sequenceLabeler = vw.init_search_task(SequenceLabeler)

In [None]:
# Call learn() on the training data 10 times.
# NOTE(review): `my_dataset` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere this cell raises NameError on a
# fresh kernel. Presumably it is a list of sentences, each a list of
# (label, word) pairs as unpacked by SequenceLabeler._run -- confirm.
for i in range(10):
    sequenceLabeler.learn(my_dataset)

In [None]:
# Build a test sentence as (label, word) pairs; every label slot is filled
# with 0, and the word slot holds one whitespace-separated token.
test_example = [(0, token) for token in "the sandwich ate a monster".split()]
print(test_example)

In [None]:
# Run the labeler on the test sentence and print the result
# (presumably the list of per-token predictions built by _run -- confirm).
out = sequenceLabeler.predict(test_example)
print(out)