# Add entities recognized by spaCy to Macbeth as inline annotations

In [1]:
%load_ext autoreload
%autoreload 2

from standoffconverter import Converter as Co

## Download Macbeth and parse it with standoffconverter

In [2]:
import urllib.request
from lxml import etree

# Download Shakespeare's "Macbeth" (First Folio edition) as XML.
url = "https://firstfolio.bodleian.ox.ac.uk/download/xml/F-mac.xml"

# Use a context manager so the HTTP connection is closed as soon as the
# body has been read instead of lingering until garbage collection.
with urllib.request.urlopen(url) as http_response:
    response = http_response.read()  # raw bytes; parsed with lxml in the next cell

In [3]:
# Parse the downloaded XML bytes into an lxml element.
tree = etree.fromstring(response)

# Wrap the parsed tree in a standoffconverter object; later cells query it
# via `macbeth.root_ap` and annotate it via `macbeth.add_annotation`.
macbeth = Co.from_tree(tree)

## Filter only spoken text

In [5]:
# Namespace prefix mapping used by the XPath query below.
tei_namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}

# Select every <sp> element in the document.
speeches = macbeth.root_ap.xpath("//tei:sp", namespaces=tei_namespaces)

## Parse text with spaCy

In [6]:
import spacy

# Load the English pipeline by its full package name. The "en" shortcut link
# is obsolete as of spaCy v3 and fails to load there; "en_core_web_sm" is the
# package that shortcut conventionally pointed to and also loads on spaCy v2
# when the package is installed.
nlp = spacy.load("en_core_web_sm")

In [7]:
import dhspacy
dhnlp = dhspacy.init(nlp)

# Find the first entity in the speeches together with its character offsets.
# Speeches without any entity are skipped; the search stops at the first hit.
# (Previously the outer loop always stopped after the first speech, leaving
# `ent`, `char_begin` and `char_end` undefined if that speech had no entities.)
ent = char_begin = char_end = None
for dhdoc in dhnlp(speeches):
    ent = next(iter(dhdoc.doc.ents), None)
    if ent is not None:
        char_begin, char_end = dhdoc.get_char_inds(ent)
        break

# Fail here with a clear message rather than with a NameError/TypeError in a
# later cell that uses `ent`, `char_begin` or `char_end`.
if ent is None:
    raise RuntimeError("spaCy did not recognize any entity in the speeches")

AttributeError: module 'standoffconverter' has no attribute 'Filter'

In [7]:
# Show the entity text followed by its label.
print(f"{ent} - {ent.label_}")

three - CARDINAL


In [8]:
# Attributes attached to the new <entity> element: the entity label and the
# tool that produced the annotation.
entity_attributes = {
    "label": ent.label_,
    "responsible": "spacy",
}

# Insert an <entity> annotation spanning the character range of `ent`.
macbeth.add_annotation(char_begin, char_end, "entity", attribute=entity_attributes)

In [9]:
# Serialise the annotated tree back to an XML string.
resulting_tree = etree.tostring(macbeth.tree).decode("utf-8")

# Offset of the first <entity> element; `str.index` raises ValueError if the
# annotation is not present in the serialised tree.
pos_of_label = resulting_tree.index('<entity ')

# Print some context on both sides of the annotation. The start is clamped at
# 0 because a negative start would be interpreted as an offset from the END of
# the string and yield an empty or unrelated slice.
context_chars = 400
window_start = max(0, pos_of_label - context_chars)
print(resulting_tree[window_start:pos_of_label + context_chars])

ead>
                        
                            <ns0:stage rend="italic center" type="mixed">Thunder and Lightning. Enter three Witches.</ns0:stage>
                            <ns0:sp who="#F-mac-wit.1">
                                <ns0:speaker>1.</ns0:speaker>
                                <ns0:l n="1">
                        <ns0:c rend="decoratedCapital">W</ns0:c>hen shall we <entity label="CARDINAL" responsible="spacy">three</entity> meet againe?</ns0:l>
                                <ns0:l n="2">In Thunder, Lightning, or in Raine?</ns0:l>
                            </ns0:sp>
                            <ns0:sp who="#F-mac-wit.2">
                                <ns0:speaker>2.</ns0:speaker>
                                <ns0:l n="3">When the Hurley&#8209;burley'
