# Add entities recognized by spaCy to Macbeth

In [1]:
%load_ext autoreload
%autoreload 2

from standoffconverter import Standoff, View

## Download Macbeth and parse it with standoffconverter

In [2]:
import urllib.request
from lxml import etree

# Download the First Folio edition of Macbeth (TEI XML) from the Bodleian.
url = "https://firstfolio.bodleian.ox.ac.uk/download/xml/F-mac.xml"
# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read; `response` holds the raw XML bytes.
with urllib.request.urlopen(url) as http_response:
    response = http_response.read()

In [3]:
# Parse the downloaded XML bytes into an lxml element, then wrap that element
# in a Standoff object together with the TEI namespace mapping.
tei_namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}
tree = etree.fromstring(response)
macbeth_so = Standoff(tree, namespaces=tei_namespaces)

## Filter only spoken text

In [4]:
# Create a View on top of the Standoff object built from the parsed XML.
view = View(macbeth_so)

In [5]:
# Build the filtered view from a fresh View so this cell is idempotent:
# re-running it never stacks the filters on an already-filtered view.
# Exclude everything outside TEI <sp> elements, then shrink whitespace.
view = (
    View(macbeth_so)
    .exclude_outside("{http://www.tei-c.org/ns/1.0}sp")
    .shrink_whitespace()
)

In [6]:
# Plain-text string of the filtered view; spaCy runs on this text and its
# character offsets are mapped back through `view` later.
plain = view.get_plain()

In [7]:
# Preview the first 1000 characters of the extracted text.
plain[:1000]

" 1. When shall we three meet againe? In Thunder, Lightning, or in Raine? 2. When the Hurley‑burley's done, When the Battaile's lost, and wonne. 3. That will be ere the set of Sunne. 1. Where the place? 2. Vpon the Heath. 3. There to meet with Macbeth. 1. I come, Gray‑Malkin. All. Padock calls anon: faire is foule, and foule is faire, Houer through the fogge and filthie ayre. King. What bloody man is that? he can report, As seemeth by his plight, of the Reuolt The newest state Mal. This is the Serieant, Who like a good and hardie Souldier fought 'Gainst my Captiuitie: Haile braue friend; Say to the King, the knowledge of the Broyle, As thou didst leaue it. Cap. Doubtfull it stood, As two spent Swimmers, that doe cling together, And choake their Art: The mercilesse Macdonwald (Worthie to be a Rebell, for to that The multiplying Villanies of Nature Doe swarme vpon him) from the Westerne Isles Of Kernes and Gallowgrosses is supply'd, And Fortune on his damned Quarry smiling, Shew'd like a

## Parse text with spaCy

In [8]:
import spacy
# Load spaCy's small English pipeline; the "en_core_web_sm" model package has
# to be installed in the environment beforehand.
nlp = spacy.load("en_core_web_sm")

In [9]:
# Run the spaCy pipeline over the plain text of the filtered view.
doc = nlp(plain)

# Take the first entity labelled PERSON (only one entity is tagged in this demo).
# `ent` stays bound to that entity because the next cell prints it.
ent = next((candidate for candidate in doc.ents if candidate.label_ == "PERSON"), None)
if ent is None:
    # Fail here with a clear message instead of leaving start_ind/end_ind
    # undefined and `ent` pointing at an unrelated entity.
    raise ValueError("spaCy found no PERSON entity in the text")

# Translate the entity's character offsets in `plain` into positions in the
# standoff table so the annotation can be added to the XML.
# NOTE(review): ent.end_char is exclusive in spaCy — confirm get_table_pos
# expects that convention for the end position.
start_ind = view.get_table_pos(ent.start_char)
end_ind = view.get_table_pos(ent.end_char)

In [10]:
# Show the selected entity's text and its label, separated by " - ".
print(ent, ent.label_, sep=" - ")

Macbeth - PERSON


In [11]:
# Insert an inline <MyPersonTag resp="spacy"> element spanning the table
# positions of the entity found above.
# NOTE(review): this call modifies macbeth_so; re-running the cell presumably
# adds the tag a second time — confirm before re-executing.
person_tag = "MyPersonTag"
person_attrib = {"resp": "spacy"}
macbeth_so.add_inline(
    begin=start_ind, end=end_ind, tag=person_tag, depth=None, attrib=person_attrib
)

In [12]:
# Serialize the modified tree back to an XML string.
resulting_tree = etree.tostring(macbeth_so.tree).decode("utf-8")

# Locate the first occurrence of the annotation's marker text.
pos_of_label = resulting_tree.index('spacy')

# Show roughly 400 characters of context on either side. Clamp the window start
# at 0: a negative start would be interpreted as an offset from the end of the
# string and print the wrong (usually empty) slice.
window_start = max(0, pos_of_label - 400)
print(resulting_tree[window_start:pos_of_label + 400])

                        <sp who="#F-mac-wit.2">
                                <speaker>2.</speaker>
                                <l n="7">Vpon the Heath.</l>
                            </sp>
                            <sp who="#F-mac-mur.3">
                                <speaker>3.</speaker>
                                <l n="8">There to meet with <hi rend="italic"><MyPersonTag resp="spacy">Macbeth</MyPersonTag></hi>.</l>
                            </sp>
                            <sp who="#F-mac-wit.1">
                                <speaker>1.</speaker>
                                <l n="9">I come, <hi rend="italic">Gray&#8209;Malkin</hi>.</l>
                  </sp>
                            <sp who="#F-mac-all">
                                <speaker rend="itali
