In the first part of this week's work, I'm going to apply the pre-trained model described in the [Lample et al.](https://arxiv.org/abs/1603.01360) paper. We've been referring to this as a "Glample" model, so I'll stick with that. In that paper, the authors use a bi-directional LSTM with a conditional random field on top to output the classes. Performance is roughly on par with other systems, but the model has the advantage of being language agnostic and does not rely on any external labeled data (e.g. gazetteers). Performance on Spanish and German surpasses the previous state of the art.

To evaluate the Glample model, I first need to write the Audible sentences out to a text file: one sentence per line, with the tokens separated by spaces.

In [30]:
import lxml.etree as etree
from bs4 import BeautifulSoup
import glob
# Dump every Audible sentence to audible_files.txt, one sentence per line,
# with each <w>/<c> token followed by a single space.
files = glob.glob('data/audible/processedText/*.xml')
# Context managers close both the output file and every input file even if
# parsing fails part-way through (the original leaked one handle per XML file).
with open('audible_files.txt', 'w') as outdat:
    for f in files:
        with open(f) as xml_file:
            xml_data = xml_file.read()
        soup = BeautifulSoup(xml_data, 'lxml')
        for s in soup.find_all('s'):
            for token in s.find_all(['w', 'c']):
                # Token text is encoded to UTF-8 bytes before writing.
                outdat.write(token.text.encode('utf8'))
                outdat.write(' ')
            outdat.write('\n')

Having written this file, I ran it through Glample via its command-line utility. It returned an output file in which each word is followed by `__` and its predicted tag, as shown below:

In [2]:
# Read the tagger output, one tagged sentence per list entry; the context
# manager closes the file handle once the lines are in memory.
with open('tagger-master/output.txt') as f:
    glampledat = f.readlines()

['"__O There__O was__O no__O water__O in__O the__O cat__O \'s__O bowl__O ,__O "__O she__O said__O .__O\n',
 '"__O He__O drinks__O out__O of__O the__O toilet__O ,__O "__O I__O said__O .__O\n',
 '"__O That__O \'s__O disgusting__O .__O "__O\n',
 '"__O That__O \'s__O what__O I__O \'ve__O been__O telling__O him__O ,__O "__O I__O said__O .__O\n',
 'But__O she__O saw__O no__O humor__O in__O my__O remark__O .__O\n',
 'She__O served__O oatmeal__O in__O two__O bowls__O and__O placed__O them__O on__O the__O breakfast__O table__O ,__O then__O began__O hunting__O for__O spoons__O and__O coffee__O cups__O .__O\n',
 'I__O looked__O at__O my__O watch__O .__O\n',
 '"__O I__O \'m__O running__O a__O little__O bit__O late__O for__O Mass__B-ORG ,__O "__O I__O lied__O .__O\n',
 '"__O Where__O \'s__O your__O butter__O dish__O ?__O "__O\n',
 '"__O I__O do__O n\'t__O have__O one__O .__O\n']

In [63]:
# One entry per tagged sentence: a list of (position, token) pairs, where each
# token is still the raw whitespace-separated piece of the tagger output line.
# The pairs are materialized with list() because a bare enumerate object is a
# one-shot iterator: it cannot be indexed, printed, or iterated a second time.
glampletags = [list(enumerate(sentence.split())) for sentence in glampledat]

In [31]:
# Flatten every <w>/<c> token of every Audible file into two parallel lists:
# the token text and its `ner` attribute.
audible_text = []
audible_tag = []
for f in files:
    # Close each XML file as soon as it has been read instead of leaking it.
    with open(f) as xml_file:
        soup = BeautifulSoup(xml_file.read(), 'lxml')
    for s in soup.find_all('s'):
        for w in s.find_all(['w', 'c']):
            audible_text.append(w.text)
            # Tokens without a `ner` attribute get the 'O-' placeholder tag.
            audible_tag.append(w.attrs.get('ner', 'O-'))

In [None]:
# Draft cell: walk every sentence of every Audible XML file next to the tagger
# output and, when the token counts agree, collect parallel word/tag lists for
# both sources.
# NOTE(review): the `In [None]` label suggests this cell was never run, and it
# cannot parse as written (see the empty `else:` at the bottom).
glample_text = []
glample_tag = []
audible_text = []
audible_tag = []
for f in files:
    xml_data = open(f).read()
    soup = BeautifulSoup(xml_data, 'lxml')
    sentences = soup.find_all('s')
    # NOTE(review): j restarts at 0 for every file, so glampletags[j] is indexed
    # from the beginning again for each file — confirm this is intended.
    for j, s in enumerate(sentences):
        glample_sent = glampletags[j]
        # NOTE(review): .split('__') is a string method, but the glampletags
        # entries built earlier in this notebook are not strings — verify.
        glample_words = glample_sent.split('__')
        words = s.find_all(['w', 'c'])
        # `doc` is not read anywhere else in this cell.
        doc = ' '.join(w.text.encode('utf8') for w in words)
        if len(glample_words) == len(words):
            for i, w in enumerate(words):
                # NOTE(review): stanford_words / stanford_text / stanford_tag are
                # not defined in this cell; they look like leftovers from a
                # Stanford-tagger version of this loop. Note also that this branch
                # appends to glample_* while the else branch appends to stanford_*.
                if 'ner' in w.attrs:
                    glample_text.append(stanford_words[i][0])
                    glample_tag.append(stanford_words[i][1])
                    audible_text.append(w.text)
                    audible_tag.append(w.attrs['ner'])
                else:
                    stanford_text.append(stanford_words[i][0])
                    stanford_tag.append(stanford_words[i][1])
                    audible_text.append(w.text)
                    audible_tag.append('O-')
        # NOTE(review): the handling for mismatched token counts was never
        # written; the empty branch below is a syntax error.
        else:
            

In [62]:
# Overall sizes, kept disabled:
#   print(len(audible_text))
#   print(len(glampletags))

# Show the same 25-item window of both lists so their alignment can be
# compared by eye.
window = slice(14850, 14875)
print(audible_text[window])
print(glampletags[window])

[u'yourself', u'. . . .', u'Take', u'care', u'of', u'your', u'tools', u'and', u'they', u"'ll", u'take', u'care', u'of', u'you', u'. . . .', u'Put', u'your', u'shotgun', u'through', u'the', u'fence', u',', u'then', u'crawl', u'after']
[['yourself', 'O'], ['.', 'O'], ['.', 'O'], ['.', 'O'], ['.', 'O'], ['Take', 'O'], ['care', 'O'], ['of', 'O'], ['your', 'O'], ['tools', 'O'], ['and', 'O'], ['they', 'O'], ["'ll", 'O'], ['take', 'O'], ['care', 'O'], ['of', 'O'], ['you', 'O'], ['.', 'O'], ['.', 'O'], ['.', 'O'], ['.', 'O'], ['Put', 'O'], ['your', 'O'], ['shotgun', 'O'], ['through', 'O']]


In [10]:
from model import Model
from loader import prepare_sentence
# Build a Model pointed at the 'models/english/' directory and keep a
# reference to its `parameters` attribute.
# NOTE(review): Model comes from the project-local `model` module, which is not
# shown here; presumably this loads the pre-trained English tagger — confirm.
model = Model(model_path='models/english/')
parameters = model.parameters


In [11]:
def condense_glample_tokens(audible_sentence, glample_words, glample_tag, audible_text, audible_tag):
    """Align one Audible sentence with tagger tokens and accumulate word/tag pairs.

    When the tagger produced more tokens than the sentence has <w>/<c> elements,
    each Audible word is re-tokenized with ``word_tokenize`` and the matching run
    of tagger tokens is glued back together so both sides stay one-to-one.
    Otherwise the sentence is re-tagged with ``stanford_mod`` until there are at
    least as many tagged tokens as words, and the two are paired by position.

    NOTE(review): this looks like an incomplete port of a Stanford-tagger
    helper. It reads and mutates the module-level names ``stanford_tags``,
    ``stanford_text`` and ``stanford_tag`` and calls ``word_tokenize`` and
    ``stanford_mod``, none of which are defined in this function; they must
    exist at module level before it is called.

    Args:
        audible_sentence: object exposing ``find_all`` (presumably a
            BeautifulSoup ``<s>`` tag — confirm against the caller).
        glample_words: tagger tokens for the sentence; only its length is read.
        glample_tag: not used by the function body.
        audible_text: list extended in place with each Audible token's text.
        audible_tag: list extended in place with each token's ``ner`` attribute,
            or ``'O-'`` when the attribute is absent.

    Returns:
        The tuple ``(stanford_text, stanford_tag, audible_text, audible_tag)``.
    """
    words = audible_sentence.find_all(['w', 'c'])
    if len(glample_words) > len(words):
        # Number of extra tagger tokens already absorbed by multi-token words.
        tagger_offset = 0
        for i, w in enumerate(words):
            pieces = word_tokenize(w.text)
            start = i + tagger_offset
            if len(pieces) == 1:
                stanford_text.append(stanford_tags[start][0])
                stanford_tag.append(stanford_tags[start][1])
            else:
                # Concatenate the run of tagger tokens (and their tags) that
                # together correspond to this single Audible word.
                span = stanford_tags[start:start + len(pieces)]
                stanford_text.append(''.join(pair[0] for pair in span))
                stanford_tag.append(''.join(pair[1] for pair in span))
                tagger_offset += len(pieces) - 1
            audible_text.append(w.text)
            audible_tag.append(w.attrs.get('ner', 'O-'))
    else:
        # Index of the first word that has not been covered by a tag yet.
        tagged_upto = len(stanford_tags)
        # NOTE(review): if the tagger ever returns no tokens this loop cannot
        # make progress and will not terminate.
        while len(words) > len(stanford_tags):
            doc = ' '.join(w.text.encode('utf8') for w in words[tagged_upto:])
            try:
                stanford_tags.extend(stanford_mod.tag([doc]))
            except Exception:
                # Retry with non-ASCII bytes dropped. `Exception` rather than
                # a bare except, so Ctrl-C / SystemExit still propagate.
                stanford_tags.extend(stanford_mod.tag([doc.decode('ascii', 'ignore')]))
            tagged_upto = len(stanford_tags)
        for i, pair in enumerate(stanford_tags):
            # Look the word up first so a surplus tagger token raises before
            # anything is appended for it.
            word = words[i]
            stanford_text.append(pair[0])
            stanford_tag.append(pair[1])
            audible_text.append(word.text)
            audible_tag.append(word.attrs.get('ner', 'O-'))

    return (stanford_text, stanford_tag, audible_text, audible_tag)