In the first part of this week's work, I'm going to apply the pre-trained model described in the [Lample et al.](https://arxiv.org/abs/1603.01360) paper. We've been referring to this as the "Glample" model, so I'll stick with that name. In that paper, the authors use a bi-directional LSTM with a conditional random field on top to output the classes. Performance is roughly on par with other systems, but the model has the advantage of being language-agnostic and does not rely on any external labeled data (e.g. gazetteers). Its performance on Spanish and German surpasses the previous state of the art.

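To evaluate the Glample model, I first need to write its input file: the tokenized sentences from the XML data, one sentence per line with tokens separated by spaces.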

In [45]:
import lxml.etree as etree
from bs4 import BeautifulSoup
import glob
# Dump every sentence of every processed XML file to audible_files.txt:
# one sentence per line, each token followed by a single space.
files = glob.glob('../data/audible/processedText/*.xml')
# The `with` block guarantees the output file is flushed and closed even if
# parsing one of the inputs raises part-way through.
with open('audible_files.txt', 'w') as outdat:
    for f in files:
        # Close each input handle promptly instead of leaving it open until
        # garbage collection.
        with open(f) as xml_file:
            xml_data = xml_file.read()
        soup = BeautifulSoup(xml_data, 'lxml')
        # Each <s> element is treated as one sentence; its <w> and <c>
        # descendants are the tokens written out.
        sentences = soup.find_all('s')
        for s in sentences:
            tokens = s.find_all(['w', 'c'])
            for i in tokens:
                # Token text is unicode; encode explicitly before writing to
                # the byte-oriented Python 2 file object.
                outdat.write(i.text.encode('utf8'))
                outdat.write(' ')
            outdat.write('\n')

In [38]:
# Parse only the first file and keep a small slice of its sentences
# (indices 125 through 149) as a quick test sample.
with open(files[0]) as xml_file:
    # Context manager closes the handle as soon as the contents are read.
    xml_data = xml_file.read()
soup = BeautifulSoup(xml_data, 'lxml')
sentences = soup.find_all('s')[125:150]

In [39]:
# Overwrite audible_files.txt with the sentences currently held in
# `sentences`: one sentence per line, each token followed by a single space.
# The `with` block closes the file even if a write fails part-way through.
with open('audible_files.txt', 'w') as outdat:
    for s in sentences:
        tokens = s.find_all(['w', 'c'])
        for i in tokens:
            # Token text is unicode; encode explicitly before writing to the
            # byte-oriented Python 2 file object.
            outdat.write(i.text.encode('utf8'))
            outdat.write(' ')
        outdat.write('\n')

In [40]:
# Display the UTF-8 encoded text of each element in `tokens`, i.e. the tokens
# of whichever sentence was assigned to `tokens` most recently.
[i.text.encode('utf8') for i in tokens]

['"', 'Is', 'this', 'Mr.', 'David', 'Robicheaux', '?', '"']

In [10]:
from model import Model
from loader import prepare_sentence
# Instantiate the project's Model class, pointing it at the directory
# models/english/ (relative to the notebook's working directory).
model = Model(model_path='models/english/')
# Keep the model's parameter mapping handy; other cells index it with keys
# such as 'lower', 'zeros', 'crf' and 'tag_scheme'.
parameters = model.parameters


In [11]:
# Invert the model's id -> item tables so that words, characters and tags
# can be looked up by value to obtain their ids.
word_to_id = {word: idx for idx, word in model.id_to_word.items()}
char_to_id = {char: idx for idx, char in model.id_to_char.items()}
tag_to_id = {tag: idx for idx, tag in model.id_to_tag.items()}

In [41]:
import codecs
# Preview what prepare_sentence produces for every line of the token file.
# Each line is split on whitespace into tokens and converted using the
# word/char id tables and the model's 'lower' setting; the result is printed.
with codecs.open('audible_files.txt', 'r', 'utf-8') as f_input:
    for line in f_input:
        words = line.rstrip().split()
        sentence = prepare_sentence(words, word_to_id, char_to_id, lower=parameters['lower'])
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(sentence)

{'str_words': [u'"', u'Do', u'you', u'like', u'trouble', u'?', u'"'], 'chars': [[43], [41, 5], [19, 5, 13], [9, 4, 27, 0], [2, 6, 5, 13, 21, 9, 0], [69], [43]], 'words': [13, 12197, 277, 504, 3380, 1706, 13], 'caps': [0, 2, 0, 0, 0, 0, 0]}
{'str_words': [u'she', u'asked', u'.'], 'chars': [[7, 11, 0], [1, 7, 27, 0, 10], [18]], 'words': [163, 499, 1], 'caps': [0, 0, 0]}
{'str_words': [u'"', u'I', u'do', u"n't", u'seek', u'it', u'out', u',', u'"', u'I', u'said', u'.'], 'chars': [[43], [30], [10, 5], [3, 45, 2], [7, 0, 0, 27], [4, 2], [5, 13, 2], [23], [43], [30], [7, 1, 4, 10], [18]], 'words': [13, 62, 165, 190, 1388, 37, 67, 2, 13, 62, 15, 1], 'caps': [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]}
{'str_words': [u'"', u'I', u'heard', u'you', u'were', u'a', u'Twelve-Step', u'person', u'.', u'"'], 'chars': [[43], [30], [11, 0, 1, 6, 10], [19, 5, 13], [20, 0, 6, 0], [1], [28, 20, 0, 9, 24, 0, 22, 25, 2, 0, 16], [16, 0, 6, 7, 5, 3], [18], [43]], 'words': [13, 62, 2093, 277, 40, 9, 0, 2920, 1, 13], 'c

In [44]:
from utils import zero_digits
# Sanity check of zero_digits on a two-digit string; the recorded output
# below shows '11' mapped to '00'.
zero_digits('11')

'00'

In [20]:
# Tag every line of opts.input and write the result to f_output, one sentence
# per line, each token rendered as word + opts.delimiter + tag.
# NOTE(review): opts, f_output, f_eval, create_input, iobes_iob, np, time and
# start are not defined in any cell shown in this notebook; they must be set
# up before this cell can run.
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words = line.rstrip().split()
        # Test the token list rather than the raw line: a line read from a
        # file still carries its newline, so `if line:` was always true and
        # blank lines never reached the else branch below.
        if words:
            # Replace all digits with zeros in the tokens handed to the
            # model. Previously `line` was rewritten after it had already
            # been split, so this setting never reached the model input.
            # The untouched `words` are kept for the output.
            if parameters['zeros']:
                model_words = [zero_digits(w) for w in words]
            else:
                model_words = words
            # Lowercasing is requested through the `lower` argument; the old
            # `line = line.lower()` ran after tokenisation and had no effect.
            # NOTE(review): presumably prepare_sentence lowercases only for
            # the word lookup - confirm against loader.prepare_sentence.
            sentence = prepare_sentence(model_words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            # Named model_input so the builtin `input` is not shadowed.
            model_input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                # Drop the first and last predictions.
                # NOTE(review): presumably the CRF's begin/end padding tags
                # - confirm against the model definition.
                y_preds = np.array(f_eval(*model_input))[1:-1]
            else:
                y_preds = f_eval(*model_input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags alongside the original (un-normalised) tokens
            assert len(y_preds) == len(words)
            f_output.write('%s\n' % ' '.join('%s%s%s' % (w, opts.delimiter, y)
                                             for w, y in zip(words, y_preds)))
        else:
            # Preserve blank lines so output lines stay aligned with input.
            f_output.write('\n')
        count += 1
        # Progress report every 100 lines (blank lines are counted too).
        if count % 100 == 0:
            print(count)

print('---- %i lines tagged in %.4fs ----' % (count, time.time() - start))

But
I
was
all
out
of
Purple
Hearts
and
had
decided
that
Honoria
was
going
to
leave
of
her
own
accord
or
be
picked
up
by
a
cruiser
.
My
determination
suddenly
dissipated
when
I
looked
out
the
front
window
and
saw
the
Chalonses
'
handyman
,
with
his
son
and
Sister
Molly
next
to
him
,
turn
into
my
driveway
.
"
I
'm
going
to
talk
to
some
people
out
front
.
There
's
no
need
for
you
to
leave
right
now
,
"
I
said
to
Honoria
.
"
Too
late
,
my
love
,
"
she
said
.
She
walked
out
the
front
door
and
down
the
street
toward
the
Shadows
,
her
purse
swinging
from
a
shoulder
string
.
I
stood
on
the
gallery
,
barefoot
,
unshaved
,
looking
down
at
Molly
Boyle
,
my
face
burning
.
"
I
should
have
called
first
,
I
guess
,
but
Tee
Bleu
says
he
knows
where
the
boat
is
,
"
she
said
,
speaking
awkwardly
and
too
fast
,
trying
to
hide
her
embarrassment
at
my
situation
.
"
Which
boat
?
"
I
said
.
