# Add entities recognized by spacy to Macbeth

In [1]:
%load_ext autoreload
%autoreload 2

from standoffconverter import Standoff, View

## Download Macbeth and parse it with standoffconverter

In [2]:
import urllib.request
from lxml import etree

# download a play by shakespeare from the first folio edition
url = "https://firstfolio.bodleian.ox.ac.uk/download/xml/F-mac.xml"
response = urllib.request.urlopen(url).read()

In [3]:
# parse it into our standof object (sobj)
tree = etree.fromstring(response)
    
macbeth_so = Standoff(tree, namespaces={"tei":"http://www.tei-c.org/ns/1.0"})

## Filter only spoken text

In [4]:
view = View(macbeth_so.table)
view = (view
    .exclude_outside("{http://www.tei-c.org/ns/1.0}:sp")
    .shrink_whitespace()   
)
plain, lookup = view.get_plain()

## Parse text with spacy

In [5]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [6]:
doc = nlp(plain)
for ent in doc.ents:
    if ent.label_ == "PERSON":
        start_ind = lookup.get_pos(ent.start_char)
        end_ind = lookup.get_pos(ent.end_char+1)
        # in this case we stop as soon as we found the first entity
        break;

In [7]:
print(ent, "-", ent.label_)

Witches - PERSON


In [8]:
macbeth_so.add_inline(
    begin=start_ind,
    end=end_ind,
    tag='MyPersonTag',
    depth=None,
    attrib={"resp": "spacy"}
)

In [9]:
resulting_tree = etree.tostring(macbeth_so.tree).decode("utf-8")

pos_of_label = resulting_tree.index('spacy')

print(resulting_tree[pos_of_label-400:pos_of_label+400])

pe="act" n="1">
                        <div type="scene" n="1">
                            <head rend="italic center">Actus Primus. Sc&#339;na Prima.</head>
                                    <head type="supplied">[Act 1, Scene 1]</head>
                        <cb n="1"/>
                            <stage rend="italic center" type="mixed">Thunder and Lightning. Enter three <MyPersonTag resp="spacy">Witches.</MyPersonTag></stage>
                            <sp who="#F-mac-wit.1">
                                <speaker>1.</speaker>
                                <l n="1">
                        <c rend="decoratedCapital">W</c>hen shall we three meet againe?</l>
                                <l n="2">In Thunder, Lightning, or in Raine?</l>
                            </sp>
       
